diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml new file mode 100644 index 00000000..a7e43d9f --- /dev/null +++ b/.github/workflows/code-check.yml @@ -0,0 +1,34 @@ +name: CI + +on: + push: + pull_request: + workflow_dispatch: + repository_dispatch: + types: [my_event] +jobs: + format-check: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pre-commit pytest pytest-cov + pip install -r requirements.txt + pip install -e . + pip install black==24.10.0 + - name: Run pre-commit + run: pre-commit run --all-files + + # - name: Run unit tests + # run: pushd tests/unit && pytest && popd + diff --git a/.github/workflows/pr-title-check.yml b/.github/workflows/pr-title-check.yml new file mode 100644 index 00000000..ae7befd4 --- /dev/null +++ b/.github/workflows/pr-title-check.yml @@ -0,0 +1,28 @@ +name: "Lint PR" + +on: + pull_request_target: + types: + - opened + - edited + - synchronize + +jobs: + main: + name: Validate PR title + runs-on: ubuntu-latest + steps: + # https://www.conventionalcommits.org/en/v1.0.0/#summary + - uses: amannn/action-semantic-pull-request@v5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + requireScope: true + subjectPattern: ^(?![A-Z]).+$ + # If `subjectPattern` is configured, you can use this property to override + # the default error message that is shown when the pattern doesn't match. + # The variables `subject` and `title` can be used within the message. + subjectPatternError: | + The subject "{subject}" found in the pull request title "{title}" + didn't match the configured pattern. Please ensure that the subject + doesn't start with an uppercase character. 
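The `subjectPattern` above is a negative lookahead that rejects PR subjects beginning with an uppercase character. A quick, purely illustrative way to sanity-check the regex locally (the action itself evaluates it in JavaScript):

import re

subject_pattern = re.compile(r"^(?![A-Z]).+$")

assert subject_pattern.match("add schema-free extractor")        # lowercase subject: accepted
assert not subject_pattern.match("Add schema-free extractor")    # uppercase subject: rejected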
\ No newline at end of file diff --git a/.gitignore b/.gitignore index 55a71ae2..3dfa7d36 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ *.pyc /dist .vscode/ +.idea/ +.venv/ __pycache__/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..26ba54fb --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + files: ^kag/.*\.py$ + exclude: | + (?x)^( + kag/solver/logic/core_modules/rule_runner/rule_runner.py | + kag/solver/logic/core_modules/parser/logic_node_parser.py + )$ + + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + files: ^kag/.*\.py$ diff --git a/KAG_VERSION b/KAG_VERSION index d7cdee28..5a2a5806 100644 --- a/KAG_VERSION +++ b/KAG_VERSION @@ -1 +1 @@ -0.5.2-beta1 +0.6 diff --git a/MANIFEST.in b/MANIFEST.in index a922307c..9a655d7d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,6 @@ recursive-include kag * recursive-exclude kag/examples * +global-exclude *.pyc +global-exclude *.pyo +global-exclude *.pyd +global-exclude __pycache__ \ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100644 index 00000000..a45cb572 --- /dev/null +++ b/build.sh @@ -0,0 +1,5 @@ +rm -rf build + +rm -rf dist + +python setup.py sdist bdist_wheel diff --git a/kag/__init__.py b/kag/__init__.py index 72bfda5c..00456a7b 100644 --- a/kag/__init__.py +++ b/kag/__init__.py @@ -1,3 +1,4 @@ +# flake8: noqa # Apache License # Version 2.0, January 2004 # http://www.apache.org/licenses/ @@ -202,8 +203,27 @@ __package_name__ = "openspg-kag" -__version__ = "0.5.2-beta1" +__version__ = "0.6" -from kag.common.env import init_env +# Register Built-in Components +from kag.common.conf import init_env init_env() + +import kag.interface +import kag.interface.solver.execute +import kag.interface.solver.plan +import kag.solver.execute +import kag.solver.plan +import kag.solver.retriever +import kag.solver.tools +import kag.builder.component +import kag.builder.default_chain +import kag.builder.runner +import kag.builder.prompt +import kag.solver.prompt +import kag.common.vectorize_model +import kag.common.llm +import kag.common.checkpointer +import kag.solver +import kag.bin.commands diff --git a/kag/interface/retriever/__init__.py b/kag/bin/__init__.py similarity index 100% rename from kag/interface/retriever/__init__.py rename to kag/bin/__init__.py diff --git a/kag/bin/base.py b/kag/bin/base.py new file mode 100644 index 00000000..3a41d743 --- /dev/null +++ b/kag/bin/base.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
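The import block added to `kag/__init__.py` above is what makes name-based construction work: importing a module executes its `@Xxx.register(...)` decorators and fills the registry, so components can later be built from plain config. A minimal sketch of the pattern, mirroring how `Command.from_config(cmd)` is used in `kag/bin/base.py` below (the class and register names here are illustrative):

from kag.common.registry import Registrable

class Greeter(Registrable):
    def greet(self) -> str:
        raise NotImplementedError

@Greeter.register("hello")
class HelloGreeter(Greeter):
    def greet(self) -> str:
        return "hello"

# from_config accepts a bare registered name for classes without required
# init args, which is what kag/bin/base.py relies on.
assert Greeter.from_config("hello").greet() == "hello"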
+ +import argparse +import logging +import typing +from kag.common.registry import Registrable + +logger = logging.getLogger() + + +def add_commands( + subparsers: argparse._SubParsersAction, command_names: typing.List[str] = None +): + """add commands to subparsers""" + all_cmds = Command.list_available() + if command_names is None: + logger.warning("no command_names given, will add all available commands.") + command_names = all_cmds + for cmd in command_names: + if cmd not in all_cmds: + raise ValueError(f"command {cmd} not in available commands {all_cmds}") + # Command subclasses don't accept init args, so passing just the subclass name is enough. + cls = Command.from_config(cmd) + cls.add_to_parser(subparsers) + + +class Command(Registrable): + def get_handler(self): + """return handler of current command""" + return self.handler + + def add_to_parser(self, subparsers: argparse._SubParsersAction): + """set up accepted arguments""" + raise NotImplementedError("add_to_parser not implemented yet.") + + @staticmethod + def handler(args: argparse.Namespace): + """function to process the request.""" + raise NotImplementedError("handler not implemented yet.") diff --git a/kag/common/retriever/__init__.py b/kag/bin/commands/__init__.py similarity index 74% rename from kag/common/retriever/__init__.py rename to kag/bin/commands/__init__.py index 05156aa5..40427f11 100644 --- a/kag/common/retriever/__init__.py +++ b/kag/bin/commands/__init__.py @@ -9,10 +9,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. +from kag.bin.commands.info import ListRegisterInfo -from kag.common.retriever.kag_retriever import DefaultRetriever -from kag.common.retriever.retriever import Retriever -__all__ = [ - "DefaultRetriever", - "Retriever" -] + + +__all__ = ["ListRegisterInfo"] diff --git a/kag/bin/commands/info.py b/kag/bin/commands/info.py new file mode 100644 index 00000000..a818647a --- /dev/null +++ b/kag/bin/commands/info.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import argparse +from tabulate import tabulate +from kag.bin.base import Command +from kag.common.registry import Registrable +from kag.common.utils import reset, bold, red, green, blue + + +@Command.register("register_info") +class ListRegisterInfo(Command): + def add_to_parser(self, subparsers: argparse._SubParsersAction): + parser = subparsers.add_parser( + "interface", help="Show the interface info of the KAG components."
) + parser.add_argument("--cls", help="class name to query") + parser.add_argument( + "--list", help="list all component interfaces in KAG", action="store_true" + ) + parser.set_defaults(func=self.get_handler()) + + @staticmethod + def get_cls(cls_name): + interface_classes = Registrable.list_all_registered(with_leaf_classes=False) + for item in interface_classes: + if item.__name__ == cls_name: + return item + raise ValueError(f"class {cls_name} is not a valid kag configurable class") + + @staticmethod + def handle_list(args: argparse.Namespace): + interface_classes = Registrable.list_all_registered(with_leaf_classes=False) + data = [] + for cls in interface_classes: + data.append([cls.__name__, cls.__module__]) + headers = [f"{bold}{red}class{reset}", f"{bold}{red}module{reset}"] + msg = ( + f"{bold}{red}Below are the interfaces provided by KAG. " + f"For detailed information on each class, please use the command `kag interface --cls $class_name`{reset}" + ) + print(msg) + print(tabulate(data, headers, tablefmt="grid")) + + @staticmethod + def handle_cls(args: argparse.Namespace): + cls_obj = ListRegisterInfo.get_cls(args.cls) + if not issubclass(cls_obj, Registrable): + raise ValueError(f"class {args.cls} is not a valid kag configurable class") + availables = cls_obj.list_available_with_detail() + seg = " " * 20 + + deduped_availables = {} + for register_name, cls_info in availables.items(): + cls = cls_info["class"] + if cls not in deduped_availables: + deduped_availables[cls] = [register_name] + else: + deduped_availables[cls].append(register_name) + + print(f"{bold}{red}{seg}Documentation of {args.cls}{seg}{reset}") + import inspect + + print(inspect.getdoc(cls_obj)) + print(f"{bold}{red}{seg}Registered subclasses of {args.cls}{seg}{reset}") + visited = set() + for register_name, cls_info in availables.items(): + cls = cls_info["class"] + if cls in visited: + continue + visited.add(cls) + print(f"{bold}{blue}[{cls}]{reset}") + register_names = " / ".join([f'"{x}"' for x in deduped_availables[cls]]) + print(f"{bold}{green}Register Name:{reset} {register_names}\n") + + # print(f"Class Name: {cls_info['class']}") + print(f"{bold}{green}Documentation:{reset}\n{cls_info['doc']}\n") + print(f"{bold}{green}Initializer:{reset}\n{cls_info['constructor']}\n") + + required_arguments = [] + for item in cls_info["params"]["required_params"]: + required_arguments.append(f" {item}") + if len(required_arguments) == 0: + required_arguments = " No Required Arguments found" + else: + required_arguments = "\n".join(required_arguments) + print(f"{bold}{green}Required Arguments:{reset}\n{required_arguments}\n") + + optional_arguments = [] + for item in cls_info["params"]["optional_params"]: + optional_arguments.append(f" {item}") + if len(optional_arguments) == 0: + optional_arguments = " No Optional Arguments found" + else: + optional_arguments = "\n".join(optional_arguments) + print(f"{bold}{green}Optional Arguments:{reset}\n{optional_arguments}\n") + print(f"{bold}{green}Sample Usage:{reset}\n {cls_info['sample_useage']}") + # for k, v in cls_info.items(): + # print(f"{k}: {v}") + print("\n") + + @staticmethod + def handler(args: argparse.Namespace): + if args.list: + ListRegisterInfo.handle_list(args) + else: + ListRegisterInfo.handle_cls(args) diff --git a/kag/bin/kag_cmds.py b/kag/bin/kag_cmds.py new file mode 100644 index 00000000..39f31270 --- /dev/null +++ b/kag/bin/kag_cmds.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License,
Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import argparse +from kag.bin.base import add_commands + + +def build_parser(): + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers( + dest="subcommand_name", + title="subcommands", + help="subcommands supported by kag", + ) + # add registered commands to parser + cmds = [ + "register_info", + ] + add_commands(subparsers, cmds) + return parser + + +def main(): + """entry point of script""" + parser = build_parser() + args = parser.parse_args() + args.func(args) diff --git a/kag/solver/logic/core_modules/op_executor/__init__.py b/kag/bridge/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/__init__.py rename to kag/bridge/__init__.py diff --git a/kag/bridge/spg_server_bridge.py b/kag/bridge/spg_server_bridge.py new file mode 100644 index 00000000..7fde8f72 --- /dev/null +++ b/kag/bridge/spg_server_bridge.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import os +import json +import kag.interface as interface +from kag.common.conf import KAGConstants, init_env + + +def init_kag_config(project_id: str, host_addr: str): + + os.environ[KAGConstants.ENV_KAG_PROJECT_ID] = project_id + os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = host_addr + init_env() + + +class SPGServerBridge: + def __init__(self): + pass + + def run_reader(self, config, input_data): + if isinstance(config, str): + config = json.loads(config) + scanner_config = config["scanner"] + reader_config = config["reader"] + scanner = interface.ScannerABC.from_config(scanner_config) + reader = interface.ReaderABC.from_config(reader_config) + chunks = [] + for data in scanner.generate(input_data): + chunks += reader.invoke(data, write_ckpt=False) + return [x.to_dict() for x in chunks] + + def run_component(self, component_name, component_config, input_data): + if isinstance(component_config, str): + component_config = json.loads(component_config) + + cls = getattr(interface, component_name) + instance = cls.from_config(component_config) + if hasattr(instance.input_types, "from_dict"): + input_data = instance.input_types.from_dict(input_data) + return [x.to_dict() for x in instance.invoke(input_data, write_ckpt=False)] diff --git a/kag/builder/__init__.py b/kag/builder/__init__.py index 6f6914a4..e69de29b 100644 --- a/kag/builder/__init__.py +++ b/kag/builder/__init__.py @@ -1,10 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
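For context, a hedged sketch of how a caller might drive the `SPGServerBridge` defined in `kag/bridge/spg_server_bridge.py` above; the project id, host address and the registered scanner/reader type names are illustrative, not defined in this diff:

from kag.bridge.spg_server_bridge import SPGServerBridge, init_kag_config

init_kag_config(project_id="1", host_addr="http://127.0.0.1:8887")  # assumed values
bridge = SPGServerBridge()
chunks = bridge.run_reader(
    {"scanner": {"type": "file"}, "reader": {"type": "txt"}},  # assumed register names
    "/path/to/document.txt",
)
# run_reader returns plain dicts (Chunk.to_dict()), convenient for the Java server side.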
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. diff --git a/kag/builder/component/__init__.py b/kag/builder/component/__init__.py index 0dfd96e7..971b2826 100644 --- a/kag/builder/component/__init__.py +++ b/kag/builder/component/__init__.py @@ -10,13 +10,76 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. +from kag.builder.component.external_graph.external_graph import ( + DefaultExternalGraphLoader, +) +from kag.builder.component.extractor.schema_free_extractor import SchemaFreeExtractor +from kag.builder.component.extractor.schema_constraint_extractor import ( + SchemaConstraintExtractor, +) +from kag.builder.component.aligner.kag_aligner import KAGAligner +from kag.builder.component.aligner.spg_aligner import SPGAligner +from kag.builder.component.postprocessor.kag_postprocessor import KAGPostProcessor + from kag.builder.component.mapping.spg_type_mapping import SPGTypeMapping from kag.builder.component.mapping.relation_mapping import RelationMapping +from kag.builder.component.mapping.spo_mapping import SPOMapping +from kag.builder.component.scanner.csv_scanner import CSVScanner +from kag.builder.component.scanner.json_scanner import JSONScanner +from kag.builder.component.scanner.yuque_scanner import YuqueScanner +from kag.builder.component.scanner.dataset_scanner import ( + MusiqueCorpusScanner, + HotpotqaCorpusScanner, +) +from kag.builder.component.scanner.file_scanner import FileScanner +from kag.builder.component.scanner.directory_scanner import DirectoryScanner + + +from kag.builder.component.reader.pdf_reader import PDFReader +from kag.builder.component.reader.markdown_reader import MarkDownReader +from kag.builder.component.reader.docx_reader import DocxReader +from kag.builder.component.reader.txt_reader import TXTReader +from kag.builder.component.reader.mix_reader import MixReader + +from kag.builder.component.reader.dict_reader import DictReader + + +from kag.builder.component.splitter.length_splitter import LengthSplitter +from kag.builder.component.splitter.pattern_splitter import PatternSplitter +from kag.builder.component.splitter.outline_splitter import OutlineSplitter +from kag.builder.component.splitter.semantic_splitter import SemanticSplitter +from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer from kag.builder.component.writer.kg_writer import KGWriter __all__ = [ + "DefaultExternalGraphLoader", + "SchemaFreeExtractor", + "SchemaConstraintExtractor", + "KAGAligner", + "SPGAligner", + "KAGPostProcessor", + "KGWriter", "SPGTypeMapping", "RelationMapping", + "SPOMapping", + "TXTReader", + "PDFReader", + "MarkDownReader", + "DocxReader", + "MixReader", + "DictReader", + "JSONScanner", + "HotpotqaCorpusScanner", + "MusiqueCorpusScanner", + "FileScanner", + "DirectoryScanner", + "YuqueScanner", + "CSVScanner", + "LengthSplitter", + "PatternSplitter", + "OutlineSplitter", + "SemanticSplitter", + "BatchVectorizer", "KGWriter", ] diff --git a/kag/builder/component/aligner/__init__.py b/kag/builder/component/aligner/__init__.py index 123acd8d..e69de29b 100644 --- a/kag/builder/component/aligner/__init__.py +++ b/kag/builder/component/aligner/__init__.py @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG 
Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - diff --git a/kag/builder/component/aligner/kag_post_processor.py b/kag/builder/component/aligner/kag_aligner.py similarity index 56% rename from kag/builder/component/aligner/kag_post_processor.py rename to kag/builder/component/aligner/kag_aligner.py index 9722c0cd..9f1bd46f 100644 --- a/kag/builder/component/aligner/kag_post_processor.py +++ b/kag/builder/component/aligner/kag_aligner.py @@ -13,12 +13,25 @@ from typing import List, Sequence, Dict, Type from kag.builder.model.sub_graph import SubGraph -from kag.interface.builder import AlignerABC +from kag.interface import AlignerABC from knext.common.base.runnable import Input, Output -class KAGPostProcessorAligner(AlignerABC): +@AlignerABC.register("kag") +class KAGAligner(AlignerABC): + """ + A class that extends the AlignerABC base class. It is responsible for aligning and merging subgraphs. + + This class provides methods to handle the alignment and merging of subgraphs, as well as properties to define the input and output types. + """ + def __init__(self, **kwargs): + """ + Initializes the KAGAligner instance. + + Args: + **kwargs: Arbitrary keyword arguments passed to the parent class constructor. + """ super().__init__(**kwargs) @property @@ -30,6 +43,16 @@ def output_types(self) -> Type[Output]: return SubGraph def invoke(self, input: List[SubGraph], **kwargs) -> SubGraph: + """ + Merges a list of subgraphs into a single subgraph. + + Args: + input (List[SubGraph]): A list of subgraphs to be merged. + **kwargs: Additional keyword arguments. + + Returns: + SubGraph: The merged subgraph containing all nodes and edges from the input subgraphs. + """ merged_sub_graph = SubGraph(nodes=[], edges=[]) for sub_graph in input: for node in sub_graph.nodes: @@ -41,9 +64,15 @@ def invoke(self, input: List[SubGraph], **kwargs) -> SubGraph: return merged_sub_graph def _handle(self, input: Sequence[Dict]) -> Dict: + """ + Handles the input by converting it to the appropriate type, invoking the aligner, and converting the output back to a dictionary. + + Args: + input (Sequence[Dict]): A sequence of dictionaries representing subgraphs. + + Returns: + Dict: A dictionary representing the merged subgraph. 
+ """ _input = [self.input_types.from_dict(i) for i in input] _output = self.invoke(_input) return _output.to_dict() - - def batch(self, inputs: List[Input], **kwargs) -> List[Output]: - pass diff --git a/kag/builder/component/aligner/spg_post_processor.py b/kag/builder/component/aligner/spg_aligner.py similarity index 71% rename from kag/builder/component/aligner/spg_post_processor.py rename to kag/builder/component/aligner/spg_aligner.py index b446c15b..cca5b7c7 100644 --- a/kag/builder/component/aligner/spg_post_processor.py +++ b/kag/builder/component/aligner/spg_aligner.py @@ -12,8 +12,9 @@ from typing import List, Type, Dict -from kag.interface.builder import AlignerABC +from kag.interface import AlignerABC from knext.schema.client import BASIC_TYPES +from kag.common.conf import KAG_PROJECT_CONF from kag.builder.model.spg_record import SPGRecord from kag.builder.model.sub_graph import SubGraph from knext.common.base.runnable import Input, Output @@ -21,10 +22,17 @@ from knext.schema.model.base import ConstraintTypeEnum, BaseSpgType -class SPGPostProcessorAligner(AlignerABC): +@AlignerABC.register("spg") +class SPGAligner(AlignerABC): + """ + A class that extends the AlignerABC base class. It is responsible for aligning and merging SPG records into subgraphs. + + This class provides methods to handle the alignment and merging of SPG records, as well as properties to define the input and output types. + """ + def __init__(self, **kwargs): super().__init__(**kwargs) - self.spg_types = SchemaClient(project_id=self.project_id).load() + self.spg_types = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() @property def input_types(self) -> Type[Input]: @@ -35,6 +43,15 @@ def output_types(self) -> Type[Output]: return SubGraph def merge(self, spg_records: List[SPGRecord]): + """ + Merges a list of SPG records into a single set of records, combining properties as necessary. + + Args: + spg_records (List[SPGRecord]): A list of SPG records to be merged. + + Returns: + List[SPGRecord]: A list of merged SPG records. + """ merged_spg_records = {} for record in spg_records: key = f"{record.spg_type_name}#{record.get_property('name', '')}" @@ -75,6 +92,16 @@ def merge(self, spg_records: List[SPGRecord]): def from_spg_record( spg_types: Dict[str, BaseSpgType], spg_records: List[SPGRecord] ): + """ + Converts a list of SPG records into a subgraph. + + Args: + spg_types (Dict[str, BaseSpgType]): A dictionary mapping SPG type names to their corresponding types. + spg_records (List[SPGRecord]): A list of SPG records to be converted. + + Returns: + SubGraph: A subgraph representing the converted SPG records. + """ sub_graph = SubGraph([], []) for record in spg_records: s_id = record.id @@ -107,10 +134,30 @@ def from_spg_record( return sub_graph def invoke(self, input: Input, **kwargs) -> List[Output]: + """ + Processes a single input and returns a list of outputs. + + Args: + input (Input): The input to be processed. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list containing the processed output. + """ subgraph = SubGraph.from_spg_record(self.spg_types, [input]) return [subgraph] def batch(self, inputs: List[Input], **kwargs) -> List[Output]: + """ + Processes a batch of inputs and returns a list of outputs. + + Args: + inputs (List[Input]): A list of inputs to be processed. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of outputs corresponding to the processed inputs. 
+ """ merged_records = self.merge(inputs) subgraph = SubGraph.from_spg_record(self.spg_types, merged_records) return [subgraph] diff --git a/kag/builder/component/base.py b/kag/builder/component/base.py deleted file mode 100644 index 0117478a..00000000 --- a/kag/builder/component/base.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. -import os -from abc import ABC -from typing import List, Dict -import logging - -from knext.common.base.component import Component -from knext.common.base.runnable import Input, Output -from knext.project.client import ProjectClient -from kag.common.llm.client import LLMClient - - -class BuilderComponent(Component, ABC): - """ - Abstract base class for all builder component. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.project_id = kwargs.get("project_id",None) or os.getenv("KAG_PROJECT_ID") - self.config = ProjectClient().get_config(self.project_id) - - - def _init_llm(self) -> LLMClient: - """ - Initializes the Large Language Model (LLM) client. - - This method retrieves the LLM configuration from environment variables and the project ID. - It then fetches the project configuration using the project ID and updates the LLM configuration - with any additional settings from the project. Finally, it creates and initializes the LLM client - using the updated configuration. - - Args: - None - - Returns: - LLMClient - """ - llm_config = eval(os.getenv("KAG_LLM", "{}")) - project_id = self.project_id or os.getenv("KAG_PROJECT_ID") - if project_id: - try: - config = ProjectClient().get_config(project_id) - llm_config.update(config.get("llm", {})) - except: - logging.warning( - f"Failed to get project config for project id: {project_id}" - ) - llm = LLMClient.from_config(llm_config) - return llm - - @property - def type(self): - """ - Get the type label of the object. - - Returns: - str: The type label of the object, fixed as "BUILDER". 
- """ - return "BUILDER" - - def batch(self, inputs: List[Input], **kwargs) -> List[Output]: - results = [] - for input in inputs: - results.extend(self.invoke(input, **kwargs)) - return results - - def _handle(self, input: Dict) -> List[Dict]: - _input = self.input_types.from_dict(input) if isinstance(input, dict) else input - _output = self.invoke(_input) - return [_o.to_dict() for _o in _output if _o] diff --git a/kag/solver/logic/core_modules/op_executor/op_deduce/__init__.py b/kag/builder/component/external_graph/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_deduce/__init__.py rename to kag/builder/component/external_graph/__init__.py diff --git a/kag/builder/component/external_graph/external_graph.py b/kag/builder/component/external_graph/external_graph.py new file mode 100644 index 00000000..9728c2cf --- /dev/null +++ b/kag/builder/component/external_graph/external_graph.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import json +import numpy as np +import logging +from typing import List, Union, Dict +from kag.interface import ExternalGraphLoaderABC, MatchConfig +from kag.common.conf import KAG_PROJECT_CONF +from kag.builder.model.sub_graph import Node, Edge, SubGraph +from knext.schema.client import SchemaClient + +from knext.search.client import SearchClient + + +logger = logging.getLogger() + + +@ExternalGraphLoaderABC.register("base", constructor="from_json_file", as_default=True) +class DefaultExternalGraphLoader(ExternalGraphLoaderABC): + """ + A default implementation of the ExternalGraphLoaderABC interface. + + This class is responsible for loading external graph data based on the provided nodes, edges, and match configuration. + """ + + def __init__( + self, + nodes: List[Node], + edges: List[Edge], + match_config: MatchConfig, + ): + """ + Initializes the DefaultExternalGraphLoader with the given nodes, edges, and match configuration. + + Args: + nodes (List[Node]): A list of Node objects representing the nodes in the graph. + edges (List[Edge]): A list of Edge objects representing the edges in the graph. + match_config (MatchConfig): The configuration for matching query str to graph nodes. + """ + super().__init__() + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + for node in nodes: + if node.label not in self.schema: + raise ValueError( + f"Type of node {node.to_dict()} is beyond the schema definition." + ) + for k in node.properties.keys(): + if k not in self.schema[node.label]: + raise ValueError( + f"Property of node {node.to_dict()} is beyond the schema definition." 
+ ) + self.nodes = nodes + self.edges = edges + + self.vocabulary = {} + self.node_labels = set() + for node in self.nodes: + self.vocabulary[node.name] = node + self.node_labels.add(node.label) + + import jieba + + for word in self.vocabulary.keys(): + jieba.add_word(word) + + self.match_config = match_config + self._init_search() + + def _init_search(self): + self._search_client = SearchClient( + KAG_PROJECT_CONF.host_addr, KAG_PROJECT_CONF.project_id + ) + + def _group_by_label(self, data: Union[List[Node], List[Edge]]): + groups = {} + + for item in data: + label = item.label + if label not in groups: + groups[label] = [item] + else: + groups[label].append(item) + return list(groups.values()) + + def _group_by_cnt(self, data, n): + return [data[i : i + n] for i in range(0, len(data), n)] + + def dump(self, max_num_nodes: int = 4096, max_num_edges: int = 4096): + graphs = [] + # process nodes + for item in self._group_by_label(self.nodes): + for grouped_nodes in self._group_by_cnt(item, max_num_nodes): + graphs.append(SubGraph(nodes=grouped_nodes, edges=[])) + + # process edges + for item in self._group_by_label(self.edges): + for grouped_edges in self._group_by_cnt(item, max_num_edges): + graphs.append(SubGraph(nodes=[], edges=grouped_edges)) + + return graphs + + def ner(self, content: str): + output = [] + import jieba + + for word in jieba.cut(content): + if word in self.vocabulary: + output.append(self.vocabulary[word]) + return output + + def get_allowed_labels(self, labels: List[str] = None): + allowed_labels = [] + + namespace = KAG_PROJECT_CONF.namespace + if labels is None: + allowed_labels = [f"{namespace}.{x}" for x in self.node_labels] + else: + for label in labels: + # remove namespace + if label.startswith(KAG_PROJECT_CONF.namespace): + label = label.split(".")[1] + if label in self.node_labels: + allowed_labels.append(f"{namespace}.{label}") + return allowed_labels + + def search_result_to_node(self, search_result: Dict): + output = [] + for label in search_result["__labels__"]: + node = { + "id": search_result["id"], + "name": search_result["name"], + "label": label, + } + output.append(Node.from_dict(node)) + return output + + def text_match(self, query: str, k: int = 1, labels: List[str] = None): + allowed_labels = self.get_allowed_labels(labels) + text_matched = self._search_client.search_text(query, allowed_labels, topk=k) + return text_matched + + def vector_match( + self, + query: Union[List[float], np.ndarray], + k: int = 1, + threshold: float = 0.9, + labels: List[str] = None, + ): + allowed_labels = self.get_allowed_labels(labels) + if isinstance(query, np.ndarray): + query = query.tolist() + matched_results = [] + for label in allowed_labels: + vector_matched = self._search_client.search_vector( + label=label, property_key="name", query_vector=query, topk=k + ) + matched_results.extend(vector_matched) + + filtered_results = [] + for item in matched_results: + score = item["score"] + if score >= threshold: + filtered_results.append(item) + return filtered_results + + def match_entity(self, query: Union[str, List[float], np.ndarray]): + if isinstance(query, str): + return self.text_match( + query, k=self.match_config.k, labels=self.match_config.labels + ) + else: + return self.vector_match( + query, + k=self.match_config.k, + labels=self.match_config.labels, + threshold=self.match_config.threshold, + ) + + @classmethod + def from_json_file( + cls, + node_file_path: str, + edge_file_path: str, + match_config: MatchConfig, + ): + """ + Creates an instance of 
DefaultExternalGraphLoader from JSON files containing node and edge data. + + Args: + node_file_path (str): The path to the JSON file containing node data. + edge_file_path (str): The path to the JSON file containing edge data. + match_config (MatchConfig): The configuration for matching query str to graph nodes. + + Returns: + DefaultExternalGraphLoader: An instance of DefaultExternalGraphLoader initialized with the data from the JSON files. + """ + nodes = [] + for item in json.load(open(node_file_path, "r")): + nodes.append(Node.from_dict(item)) + edges = [] + for item in json.load(open(edge_file_path, "r")): + edges.append(Edge.from_dict(item)) + return cls(nodes=nodes, edges=edges, match_config=match_config) diff --git a/kag/builder/component/extractor/__init__.py b/kag/builder/component/extractor/__init__.py index dbde8cd2..e69de29b 100644 --- a/kag/builder/component/extractor/__init__.py +++ b/kag/builder/component/extractor/__init__.py @@ -1,23 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.builder.component.extractor.kag_extractor import KAGExtractor -from kag.builder.component.extractor.spg_extractor import SPGExtractor -from kag.builder.component.extractor.user_defined_extractor import ( - UserDefinedExtractor, -) - -__all__ = [ - "KAGExtractor", - "SPGExtractor", - "UserDefinedExtractor", -] diff --git a/kag/builder/component/extractor/schema_constraint_extractor.py b/kag/builder/component/extractor/schema_constraint_extractor.py new file mode 100644 index 00000000..4dfbb2ac --- /dev/null +++ b/kag/builder/component/extractor/schema_constraint_extractor.py @@ -0,0 +1,429 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
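Because `DefaultExternalGraphLoader` is registered as "base" with `constructor="from_json_file"`, it can be built straight from a config dict. A hedged sketch (the file paths are placeholders, and the `MatchConfig` fields are inferred from the `k`/`labels`/`threshold` usage above):

from kag.interface import ExternalGraphLoaderABC

loader = ExternalGraphLoaderABC.from_config(
    {
        "type": "base",
        "node_file_path": "./nodes.json",  # JSON list consumed by Node.from_dict
        "edge_file_path": "./edges.json",  # JSON list consumed by Edge.from_dict
        "match_config": {"k": 1, "threshold": 0.9},
    }
)
subgraphs = loader.dump()                      # SubGraphs batched by label and size
mentions = loader.ner("patient took aspirin")  # vocabulary-based entity lookup via jieba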
+import copy +import logging +from typing import Dict, Type, List + +from kag.interface import LLMClient +from tenacity import stop_after_attempt, retry + +from kag.interface import ExtractorABC, PromptABC, ExternalGraphLoaderABC + +from kag.common.conf import KAG_PROJECT_CONF +from kag.common.utils import processing_phrases, to_camel_case +from kag.builder.model.chunk import Chunk +from kag.builder.model.sub_graph import SubGraph +from kag.builder.prompt.utils import init_prompt_with_fallback +from knext.schema.client import CHUNK_TYPE, BASIC_TYPES +from knext.common.base.runnable import Input, Output +from knext.schema.client import SchemaClient + +logger = logging.getLogger(__name__) + + +@ExtractorABC.register("schema_constraint") +@ExtractorABC.register("schema_constraint_extractor") +class SchemaConstraintExtractor(ExtractorABC): + """ + Performs knowledge extraction under schema constraints, covering entities, events and their edges. + The types of entities and events, along with their respective attributes, are automatically inherited from the project's schema. + """ + + def __init__( + self, + llm: LLMClient, + ner_prompt: PromptABC = None, + std_prompt: PromptABC = None, + relation_prompt: PromptABC = None, + event_prompt: PromptABC = None, + external_graph: ExternalGraphLoaderABC = None, + ): + """ + Initializes the SchemaConstraintExtractor instance. + + Args: + llm (LLMClient): The language model client used for extraction. + ner_prompt (PromptABC, optional): The prompt for named entity recognition. Defaults to None. + std_prompt (PromptABC, optional): The prompt for named entity standardization. Defaults to None. + relation_prompt (PromptABC, optional): The prompt for relation extraction. Defaults to None. + event_prompt (PromptABC, optional): The prompt for event extraction. Defaults to None. + external_graph (ExternalGraphLoaderABC, optional): The external graph loader for additional data. Defaults to None. + """ + super().__init__() + self.llm = llm + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + self.ner_prompt = ner_prompt + self.std_prompt = std_prompt + self.relation_prompt = relation_prompt + self.event_prompt = event_prompt + + biz_scene = KAG_PROJECT_CONF.biz_scene + if self.ner_prompt is None: + self.ner_prompt = init_prompt_with_fallback("ner", biz_scene) + if self.std_prompt is None: + self.std_prompt = init_prompt_with_fallback("std", biz_scene) + self.external_graph = external_graph + + @property + def input_types(self) -> Type[Input]: + return Chunk + + @property + def output_types(self) -> Type[Output]: + return SubGraph + + @retry(stop=stop_after_attempt(3)) + def named_entity_recognition(self, passage: str): + """ + Performs named entity recognition on a given text passage. + Args: + passage (str): The text to perform named entity recognition on. + Returns: + The result of the named entity recognition operation.
+ """ + ner_result = self.llm.invoke({"input": passage}, self.ner_prompt) + if self.external_graph: + extra_ner_result = self.external_graph.ner(passage) + else: + extra_ner_result = [] + output = [] + dedup = set() + for item in extra_ner_result: + name = item.name + if name not in dedup: + dedup.add(name) + output.append( + { + "name": name, + "category": item.label, + "properties": item.properties, + } + ) + for item in ner_result: + name = item.get("name", None) + category = item.get("category", None) + if name is None or category is None: + continue + if not isinstance(name, str): + continue + if name not in dedup: + dedup.add(name) + output.append(item) + return output + + @retry(stop=stop_after_attempt(3)) + def named_entity_standardization(self, passage: str, entities: List[Dict]): + """ + Performs named entity standardization on a given text passage and entities. + + Args: + passage (str): The text passage. + entities (List[Dict]): The list of entities to standardize. + + Returns: + The result of the named entity standardization operation. + """ + return self.llm.invoke( + {"input": passage, "named_entities": entities}, self.std_prompt + ) + + @retry(stop=stop_after_attempt(3)) + def relations_extraction(self, passage: str, entities: List[Dict]): + """ + Performs relation extraction on a given text passage and entities. + + Args: + passage (str): The text passage. + entities (List[Dict]): The list of entities. + + Returns: + The result of the relation extraction operation. + """ + if self.relation_prompt is None: + logger.debug("Relation extraction prompt not configured, skip.") + + return [] + return self.llm.invoke( + {"input": passage, "entity_list": entities}, self.relation_prompt + ) + + @retry(stop=stop_after_attempt(3)) + def event_extraction(self, passage: str): + """ + Performs event extraction on a given text passage. + + Args: + passage (str): The text passage. + + Returns: + The result of the event extraction operation. + """ + if self.event_prompt is None: + logger.debug("Event extraction prompt not configured, skip.") + return [] + return self.llm.invoke({"input": passage}, self.event_prompt) + + def parse_nodes_and_edges(self, entities: List[Dict], category: str = None): + """ + Parses nodes and edges from a list of entities. + + Args: + entities (List[Dict]): The list of entities. + + Returns: + Tuple[List[Node], List[Edge]]: The parsed nodes and edges. + """ + graph = SubGraph([], []) + entities = copy.deepcopy(entities) + root_nodes = [] + for record in entities: + if record is None: + continue + if isinstance(record, str): + record = {"name": record} + s_name = record.get("name", "") + s_label = record.get("category", category) + properties = record.get("properties", {}) + # At times, the name and/or label is placed in the properties. 
+ if not s_name: + s_name = properties.pop("name", "") + if not s_label: + s_label = properties.pop("category", "") + if not s_name or not s_label: + continue + s_name = processing_phrases(s_name) + root_nodes.append((s_name, s_label)) + tmp_properties = copy.deepcopy(properties) + spg_type = self.schema.get(s_label) + for prop_name, prop_value in properties.items(): + if prop_value is None: + tmp_properties.pop(prop_name) + continue + if prop_name in spg_type.properties: + prop_schema = spg_type.properties.get(prop_name) + o_label = prop_schema.object_type_name_en + if o_label not in BASIC_TYPES: + # pop and convert property to node and edge + if not isinstance(prop_value, list): + prop_value = [prop_value] + ( + new_root_nodes, + new_nodes, + new_edges, + ) = self.parse_nodes_and_edges(prop_value, o_label) + graph.nodes.extend(new_nodes) + graph.edges.extend(new_edges) + # connect current node to property generated nodes + for node in new_root_nodes: + graph.add_edge( + s_id=s_name, + s_label=s_label, + p=prop_name, + o_id=node[0], + o_label=node[1], + ) + tmp_properties.pop(prop_name) + record["properties"] = tmp_properties + # NOTE: For property converted to nodes/edges, we keep a copy of the original property values. + # Perhaps it is not necessary? + graph.add_node(id=s_name, name=s_name, label=s_label, properties=properties) + + if "official_name" in record: + official_name = processing_phrases(record["official_name"]) + if official_name != s_name: + graph.add_node( + id=official_name, + name=official_name, + label=s_label, + properties=properties, + ) + graph.add_edge( + s_id=s_name, + s_label=s_label, + p="OfficialName", + o_id=official_name, + o_label=s_label, + ) + + return root_nodes, graph.nodes, graph.edges + + @staticmethod + def add_relations_to_graph( + sub_graph: SubGraph, entities: List[Dict], relations: List[list] + ): + """ + Add edges to the subgraph based on a list of relations and entities. + Args: + sub_graph (SubGraph): The subgraph to add edges to. + entities (List[Dict]): A list of entities, for looking up category information. + relations (List[list]): A list of relations, each representing a relationship to be added to the subgraph. + Returns: + The constructed subgraph. + + """ + + for rel in relations: + if len(rel) != 5: + continue + s_name, s_category, predicate, o_name, o_category = rel + s_name = processing_phrases(s_name) + sub_graph.add_node(s_name, s_name, s_category) + o_name = processing_phrases(o_name) + sub_graph.add_node(o_name, o_name, o_category) + edge_type = to_camel_case(predicate) + if edge_type: + sub_graph.add_edge(s_name, s_category, edge_type, o_name, o_category) + return sub_graph + + @staticmethod + def add_chunk_to_graph(sub_graph: SubGraph, chunk: Chunk): + """ + Associates a Chunk object with the subgraph, adding it as a node and connecting it with existing nodes. + Args: + sub_graph (SubGraph): The subgraph to add the chunk information to. + chunk (Chunk): The chunk object containing the text and metadata. + Returns: + The constructed subgraph. 
+ """ + for node in sub_graph.nodes: + sub_graph.add_edge(node.id, node.label, "source", chunk.id, CHUNK_TYPE) + sub_graph.add_node( + id=chunk.id, + name=chunk.name, + label=CHUNK_TYPE, + properties={ + "id": chunk.id, + "name": chunk.name, + "content": f"{chunk.name}\n{chunk.content}", + **chunk.kwargs, + }, + ) + sub_graph.id = chunk.id + return sub_graph + + def assemble_subgraph( + self, + chunk: Chunk, + entities: List[Dict], + relations: List[list], + events: List[Dict], + ): + """ + Assembles a subgraph from the given chunk, entities, events, and relations. + + Args: + chunk (Chunk): The chunk object. + entities (List[Dict]): The list of entities. + events (List[Dict]): The list of events. + + Returns: + The constructed subgraph. + """ + graph = SubGraph([], []) + _, entity_nodes, entity_edges = self.parse_nodes_and_edges(entities) + graph.nodes.extend(entity_nodes) + graph.edges.extend(entity_edges) + _, event_nodes, event_edges = self.parse_nodes_and_edges(events) + graph.nodes.extend(event_nodes) + graph.edges.extend(event_edges) + self.add_relations_to_graph(graph, entities, relations) + self.add_chunk_to_graph(graph, chunk) + return graph + + def append_official_name( + self, source_entities: List[Dict], entities_with_official_name: List[Dict] + ): + """ + Appends official names to entities. + + Args: + source_entities (List[Dict]): A list of source entities. + entities_with_official_name (List[Dict]): A list of entities with official names. + """ + tmp_dict = {} + for tmp_entity in entities_with_official_name: + name = tmp_entity["name"] + category = tmp_entity["category"] + official_name = tmp_entity["official_name"] + key = f"{category}{name}" + tmp_dict[key] = official_name + + for tmp_entity in source_entities: + name = tmp_entity["name"] + category = tmp_entity["category"] + key = f"{category}{name}" + if key in tmp_dict: + official_name = tmp_dict[key] + tmp_entity["official_name"] = official_name + + def postprocess_graph(self, graph): + """ + Postprocesses the graph by merging nodes with the same name and label. + + Args: + graph (SubGraph): The graph to postprocess. + + Returns: + The postprocessed graph. + """ + try: + all_node_properties = {} + for node in graph.nodes: + id_ = node.id + name = node.name + label = node.label + key = (id_, name, label) + if key not in all_node_properties: + all_node_properties[key] = node.properties + else: + all_node_properties[key].update(node.properties) + new_graph = SubGraph([], []) + for key, node_properties in all_node_properties.items(): + id_, name, label = key + new_graph.add_node( + id=id_, name=name, label=label, properties=node_properties + ) + new_graph.edges = graph.edges + return new_graph + except: + return graph + + def _invoke(self, input: Input, **kwargs) -> List[Output]: + """ + Invokes the extractor on the given input. + + Args: + input (Input): The input data. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: The list of output results. 
+ """ + title = input.name + passage = title + "\n" + input.content + + out = [] + entities = self.named_entity_recognition(passage) + events = self.event_extraction(passage) + named_entities = [] + for entity in entities: + named_entities.append( + {"name": entity["name"], "category": entity["category"]} + ) + relations = self.relations_extraction(passage, named_entities) + std_entities = self.named_entity_standardization(passage, named_entities) + self.append_official_name(entities, std_entities) + subgraph = self.assemble_subgraph(input, entities, relations, events) + out.append(self.postprocess_graph(subgraph)) + logger.debug(f"input passage:\n{passage}") + logger.debug(f"output graphs:\n{out}") + return out diff --git a/kag/builder/component/extractor/kag_extractor.py b/kag/builder/component/extractor/schema_free_extractor.py similarity index 64% rename from kag/builder/component/extractor/kag_extractor.py rename to kag/builder/component/extractor/schema_free_extractor.py index fd6f9913..ccf29128 100644 --- a/kag/builder/component/extractor/kag_extractor.py +++ b/kag/builder/component/extractor/schema_free_extractor.py @@ -11,64 +11,75 @@ # or implied. import copy import logging -import os from typing import Dict, Type, List +from kag.interface import LLMClient from tenacity import stop_after_attempt, retry -from kag.builder.prompt.spg_prompt import SPG_KGPrompt -from kag.interface.builder import ExtractorABC -from kag.common.base.prompt_op import PromptOp -from knext.schema.client import OTHER_TYPE, CHUNK_TYPE, BASIC_TYPES +from kag.interface import ExtractorABC, PromptABC, ExternalGraphLoaderABC + +from kag.common.conf import KAG_PROJECT_CONF from kag.common.utils import processing_phrases, to_camel_case from kag.builder.model.chunk import Chunk from kag.builder.model.sub_graph import SubGraph +from kag.builder.prompt.utils import init_prompt_with_fallback +from knext.schema.client import OTHER_TYPE, CHUNK_TYPE, BASIC_TYPES from knext.common.base.runnable import Input, Output from knext.schema.client import SchemaClient -from knext.schema.model.base import SpgTypeEnum logger = logging.getLogger(__name__) -class KAGExtractor(ExtractorABC): +@ExtractorABC.register("schema_free") +@ExtractorABC.register("schema_free_extractor") +class SchemaFreeExtractor(ExtractorABC): """ A class for extracting knowledge graph subgraphs from text using a large language model (LLM). Inherits from the Extractor base class. + + Attributes: + llm (LLMClient): The large language model client used for text processing. + schema (SchemaClient): The schema client used to load the schema for the project. + ner_prompt (PromptABC): The prompt used for named entity recognition. + std_prompt (PromptABC): The prompt used for named entity standardization. + triple_prompt (PromptABC): The prompt used for triple extraction. + external_graph (ExternalGraphLoaderABC): The external graph loader used for additional NER. 
""" - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.llm = self._init_llm() - self.prompt_config = self.config.get("prompt", {}) - self.biz_scene = self.prompt_config.get("biz_scene") or os.getenv( - "KAG_PROMPT_BIZ_SCENE", "default" - ) - self.language = self.prompt_config.get("language") or os.getenv( - "KAG_PROMPT_LANGUAGE", "en" - ) - self.schema = SchemaClient(project_id=self.project_id).load() - self.ner_prompt = PromptOp.load(self.biz_scene, "ner")( - language=self.language, project_id=self.project_id - ) - self.std_prompt = PromptOp.load(self.biz_scene, "std")(language=self.language) - self.triple_prompt = PromptOp.load(self.biz_scene, "triple")( - language=self.language - ) - self.kg_types = [] - for type_name, spg_type in self.schema.items(): - if type_name in SPG_KGPrompt.ignored_types: - continue - if spg_type.spg_type_enum == SpgTypeEnum.Concept: - continue - properties = list(spg_type.properties.keys()) - for p in properties: - if p not in SPG_KGPrompt.ignored_properties: - self.kg_types.append(type_name) - break - if self.kg_types: - self.kg_prompt = SPG_KGPrompt( - self.kg_types, language=self.language, project_id=self.project_id - ) + def __init__( + self, + llm: LLMClient, + ner_prompt: PromptABC = None, + std_prompt: PromptABC = None, + triple_prompt: PromptABC = None, + external_graph: ExternalGraphLoaderABC = None, + ): + """ + Initializes the KAGExtractor with the specified parameters. + + Args: + llm (LLMClient): The large language model client. + ner_prompt (PromptABC, optional): The prompt for named entity recognition. Defaults to None. + std_prompt (PromptABC, optional): The prompt for named entity standardization. Defaults to None. + triple_prompt (PromptABC, optional): The prompt for triple extraction. Defaults to None. + external_graph (ExternalGraphLoaderABC, optional): The external graph loader. Defaults to None. + """ + super().__init__() + self.llm = llm + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + self.ner_prompt = ner_prompt + self.std_prompt = std_prompt + self.triple_prompt = triple_prompt + + biz_scene = KAG_PROJECT_CONF.biz_scene + if self.ner_prompt is None: + self.ner_prompt = init_prompt_with_fallback("ner", biz_scene) + if self.std_prompt is None: + self.std_prompt = init_prompt_with_fallback("std", biz_scene) + if self.triple_prompt is None: + self.triple_prompt = init_prompt_with_fallback("triple", biz_scene) + + self.external_graph = external_graph @property def input_types(self) -> Type[Input]: @@ -87,12 +98,34 @@ def named_entity_recognition(self, passage: str): Returns: The result of the named entity recognition operation. 
""" - if self.kg_types: - kg_result = self.llm.invoke({"input": passage}, self.kg_prompt) - else: - kg_result = [] ner_result = self.llm.invoke({"input": passage}, self.ner_prompt) - return kg_result + ner_result + if self.external_graph: + extra_ner_result = self.external_graph.ner(passage) + else: + extra_ner_result = [] + output = [] + dedup = set() + for item in extra_ner_result: + name = item.name + label = item.label + description = item.properties.get("desc", "") + semantic_type = item.properties.get("semanticType", label) + if name not in dedup: + dedup.add(name) + output.append( + { + "name": name, + "type": semantic_type, + "category": label, + "description": description, + } + ) + for item in ner_result: + name = item.get("name", None) + if name and name not in dedup: + dedup.add(name) + output.append(item) + return output @retry(stop=stop_after_attempt(3)) def named_entity_standardization(self, passage: str, entities: List[Dict]): @@ -125,20 +158,26 @@ def triples_extraction(self, passage: str, entities: List[Dict]): ) def assemble_sub_graph_with_spg_records(self, entities: List[Dict]): + """ + Assembles a subgraph using SPG records. + + Args: + entities (List[Dict]): A list of entities to be used for subgraph assembly. + + Returns: + The assembled subgraph and the updated list of entities. + """ sub_graph = SubGraph([], []) for record in entities: - s_name = record.get("entity", "") + s_name = record.get("name", "") s_label = record.get("category", "") properties = record.get("properties", {}) tmp_properties = copy.deepcopy(properties) spg_type = self.schema.get(s_label) - if not spg_type: - continue for prop_name, prop_value in properties.items(): if prop_value == "NAN": tmp_properties.pop(prop_name) continue - if prop_name in spg_type.properties: from knext.schema.model.property import Property @@ -173,11 +212,14 @@ def assemble_sub_graph_with_triples( sub_graph (SubGraph): The subgraph to add edges to. entities (List[Dict]): A list of entities, for looking up category information. triples (List[list]): A list of triples, each representing a relationship to be added to the subgraph. + Returns: + The constructed subgraph. + """ def get_category(entities_data, entity_name): for entity in entities_data: - if entity["entity"] == entity_name: + if entity["name"] == entity_name: return entity["category"] return None @@ -194,7 +236,6 @@ def get_category(entities_data, entity_name): if o_category is None: o_category = OTHER_TYPE sub_graph.add_node(tri[2], tri[2], o_category) - edge_type = to_camel_case(tri[1]) if edge_type: sub_graph.add_edge(tri[0], s_category, edge_type, tri[2], o_category) @@ -208,6 +249,8 @@ def assemble_sub_graph_with_chunk(sub_graph: SubGraph, chunk: Chunk): Args: sub_graph (SubGraph): The subgraph to add the chunk information to. chunk (Chunk): The chunk object containing the text and metadata. + Returns: + The constructed subgraph. """ for node in sub_graph.nodes: sub_graph.add_edge(node.id, node.label, "source", chunk.id, CHUNK_TYPE) @@ -240,7 +283,7 @@ def assemble_sub_graph( entities (List[Dict]): A list of entities identified in the chunk. triples (List[list]): A list of triples representing relationships between entities. Returns: - SubGraph: The constructed subgraph. + The constructed subgraph. 
""" self.assemble_sub_graph_with_entities(sub_graph, entities) self.assemble_sub_graph_with_triples(sub_graph, entities, triples) @@ -259,7 +302,7 @@ def assemble_sub_graph_with_entities( """ for ent in entities: - name = processing_phrases(ent["entity"]) + name = processing_phrases(ent["name"]) sub_graph.add_node( name, name, @@ -302,26 +345,31 @@ def append_official_name( source_entities (List[Dict]): A list of source entities. entities_with_official_name (List[Dict]): A list of entities with official names. """ - tmp_dict = {} - for tmp_entity in entities_with_official_name: - name = tmp_entity["entity"] - category = tmp_entity["category"] - official_name = tmp_entity["official_name"] - key = f"{category}{name}" - tmp_dict[key] = official_name - - for tmp_entity in source_entities: - name = tmp_entity["entity"] - category = tmp_entity["category"] - key = f"{category}{name}" - if key in tmp_dict: - official_name = tmp_dict[key] - tmp_entity["official_name"] = official_name - - def quoteStr(self, input: str) -> str: - return f"""{input}""" - - def invoke(self, input: Input, **kwargs) -> List[Output]: + try: + tmp_dict = {} + for tmp_entity in entities_with_official_name: + if "name" in tmp_entity: + name = tmp_entity["name"] + elif "entity" in tmp_entity: + name = tmp_entity["entity"] + else: + continue + category = tmp_entity["category"] + official_name = tmp_entity["official_name"] + key = f"{category}{name}" + tmp_dict[key] = official_name + + for tmp_entity in source_entities: + name = tmp_entity["name"] + category = tmp_entity["category"] + key = f"{category}{name}" + if key in tmp_dict: + official_name = tmp_dict[key] + tmp_entity["official_name"] = official_name + except Exception as e: + logger.warn(f"failed to process official name, info: {e}") + + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ Invokes the semantic extractor to process input data. @@ -332,24 +380,19 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: Returns: List[Output]: A list of processed results, containing subgraph information. 
""" - title = input.name - passage = self.quoteStr(title + "\n" + input.content) - try: - entities = self.named_entity_recognition(passage) - sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities) - filtered_entities = [ - {k: v for k, v in ent.items() if k in ["entity", "category"]} - for ent in entities - ] - triples = self.triples_extraction(passage, filtered_entities) - std_entities = self.named_entity_standardization(passage, filtered_entities) - self.append_official_name(entities, std_entities) - self.assemble_sub_graph(sub_graph, input, entities, triples) - return [sub_graph] - except Exception as e: - import traceback - - traceback.print_exc() - logger.info(e) - return [] + title = input.name + passage = title + "\n" + input.content + out = [] + entities = self.named_entity_recognition(passage) + sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities) + filtered_entities = [ + {k: v for k, v in ent.items() if k in ["name", "category"]} + for ent in entities + ] + triples = self.triples_extraction(passage, filtered_entities) + std_entities = self.named_entity_standardization(passage, filtered_entities) + self.append_official_name(entities, std_entities) + self.assemble_sub_graph(sub_graph, input, entities, triples) + out.append(sub_graph) + return out diff --git a/kag/builder/component/extractor/spg_extractor.py b/kag/builder/component/extractor/spg_extractor.py deleted file mode 100644 index b1c63930..00000000 --- a/kag/builder/component/extractor/spg_extractor.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. -import copy -import logging -from typing import List, Dict - -from tenacity import retry, stop_after_attempt - -from kag.builder.component.extractor import KAGExtractor -from kag.builder.model.sub_graph import SubGraph -from kag.builder.prompt.spg_prompt import SPG_KGPrompt -from kag.common.base.prompt_op import PromptOp -from knext.common.base.runnable import Input, Output - -from knext.schema.client import BASIC_TYPES - -logger = logging.getLogger(__name__) - - -class SPGExtractor(KAGExtractor): - """ - A Builder Component that extracting structured data from long texts by invoking large language model. - - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.spg_ner_types, self.kag_ner_types = [], [] - for type_name, spg_type in self.schema.items(): - properties = list(spg_type.properties.keys()) - for p in properties: - if p not in SPG_KGPrompt.ignored_properties: - self.spg_ner_types.append(type_name) - continue - self.kag_ner_types.append(type_name) - self.kag_ner_prompt = PromptOp.load(self.biz_scene, "ner")(language=self.language, project_id=self.project_id) - self.spg_ner_prompt = SPG_KGPrompt(self.spg_ner_types, self.language, project_id=self.project_id) - - @retry(stop=stop_after_attempt(3)) - def named_entity_recognition(self, passage: str): - """ - Performs named entity recognition on a given text passage. - Args: - passage (str): The text to perform named entity recognition on. 
- Returns: - The result of the named entity recognition operation. - """ - spg_ner_result = self.llm.batch({"input": passage}, self.spg_ner_prompt) - kag_ner_result = self.llm.invoke({"input": passage}, self.kag_ner_prompt) - return spg_ner_result + kag_ner_result - - def assemble_sub_graph_with_spg_records(self, entities: List[Dict]): - sub_graph = SubGraph([], []) - for record in entities: - s_name = record.get("entity", "") - s_label = record.get("category", "") - properties = record.get("properties", {}) - tmp_properties = copy.deepcopy(properties) - spg_type = self.schema.get(s_label) - for prop_name, prop_value in properties.items(): - if prop_value == "NAN": - tmp_properties.pop(prop_name) - continue - if prop_name in spg_type.properties: - from knext.schema.model.property import Property - prop: Property = spg_type.properties.get(prop_name) - o_label = prop.object_type_name_en - if o_label not in BASIC_TYPES: - if isinstance(prop_value, str): - prop_value = [prop_value] - for o_name in prop_value: - sub_graph.add_node(id=o_name, name=o_name, label=o_label) - sub_graph.add_edge(s_id=s_name, s_label=s_label, p=prop_name, o_id=o_name, o_label=o_label) - tmp_properties.pop(prop_name) - record["properties"] = tmp_properties - sub_graph.add_node(id=s_name, name=s_name, label=s_label, properties=properties) - return sub_graph, entities - - def invoke(self, input: Input, **kwargs) -> List[Output]: - """ - Invokes the semantic extractor to process input data. - - Args: - input (Input): Input data containing name and content. - **kwargs: Additional keyword arguments. - - Returns: - List[Output]: A list of processed results, containing subgraph information. - """ - title = input.name - passage = title + "\n" + input.content - - try: - entities = self.named_entity_recognition(passage) - sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities) - filtered_entities = [{k: v for k, v in ent.items() if k in ["entity", "category"]} for ent in entities] - triples = self.triples_extraction(passage, filtered_entities) - std_entities = self.named_entity_standardization(passage, filtered_entities) - self.append_official_name(entities, std_entities) - self.assemble_sub_graph(sub_graph, input, entities, triples) - return [sub_graph] - except Exception as e: - import traceback - - traceback.print_exc() - logger.info(e) - return [] diff --git a/kag/builder/component/mapping/__init__.py b/kag/builder/component/mapping/__init__.py index e0744009..e69de29b 100644 --- a/kag/builder/component/mapping/__init__.py +++ b/kag/builder/component/mapping/__init__.py @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -from kag.builder.component.mapping.spg_type_mapping import SPGTypeMapping -from kag.builder.component.mapping.relation_mapping import RelationMapping -from kag.builder.component.mapping.spo_mapping import SPOMapping - -__all__ = [ - "SPGTypeMapping", - "RelationMapping", - "SPOMapping", -] diff --git a/kag/builder/component/mapping/relation_mapping.py b/kag/builder/component/mapping/relation_mapping.py index 47fa9f64..d77d2db7 100644 --- a/kag/builder/component/mapping/relation_mapping.py +++ b/kag/builder/component/mapping/relation_mapping.py @@ -10,40 +10,46 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from collections import defaultdict from typing import Dict, List from kag.builder.model.sub_graph import SubGraph from knext.common.base.runnable import Input, Output from knext.schema.client import SchemaClient - -from knext.schema.model.schema_helper import ( - SPGTypeName, - RelationName, -) -from kag.interface.builder.mapping_abc import MappingABC +from kag.common.conf import KAG_PROJECT_CONF +from kag.interface import MappingABC +@MappingABC.register("relation") class RelationMapping(MappingABC): """ - A class that handles relation mappings by assembling subgraphs based on given subject, predicate, and object names. - This class extends the Mapping class. - - Args: - subject_name (SPGTypeName): The name of the subject type. - predicate_name (RelationName): The name of the predicate. - object_name (SPGTypeName): The name of the object type. + A class that extends the MappingABC class. + It handles relation mappings by assembling subgraphs based on given subject, predicate, and object names. """ def __init__( self, - subject_name: SPGTypeName, - predicate_name: RelationName, - object_name: SPGTypeName, - **kwargs + subject_name: str, + predicate_name: str, + object_name: str, + src_id_field: str = None, + dst_id_field: str = None, + property_mapping: dict = {}, + **kwargs, ): + """ + Initializes the RelationMapping instance. + + Args: + subject_name (str): The name of the subject type. + predicate_name (str): The name of the predicate type. + object_name (str): The name of the object type. + src_id_field (str, optional): The field name for the source ID. Defaults to None. + dst_id_field (str, optional): The field name for the destination ID. Defaults to None. + property_mapping (dict, optional): A dictionary mapping properties. Defaults to {}. + **kwargs: Additional keyword arguments passed to the parent class constructor. 
+ """ super().__init__(**kwargs) - schema = SchemaClient(project_id=self.project_id).load() + schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() assert subject_name in schema, f"{subject_name} is not a valid SPG type name" assert object_name in schema, f"{object_name} is not a valid SPG type name" self.subject_type = schema.get(subject_name) @@ -54,10 +60,9 @@ def __init__( ), f"{predicate_name} is not a valid SPG property/relation name" self.predicate_name = predicate_name - self.src_id_field = None - self.dst_id_field = None - self.property_mapping: Dict = defaultdict(list) - self.linking_strategies: Dict = dict() + self.src_id_field = src_id_field + self.dst_id_field = dst_id_field + self.property_mapping = property_mapping def add_src_id_mapping(self, source_name: str): """ @@ -96,7 +101,11 @@ def add_sub_property_mapping(self, source_name: str, target_name: str): Returns: self """ - self.property_mapping[target_name].append(source_name) + + if target_name in self.property_mapping: + self.property_mapping[target_name].append(source_name) + else: + self.property_mapping[target_name] = [source_name] return self @property diff --git a/kag/builder/component/mapping/spg_type_mapping.py b/kag/builder/component/mapping/spg_type_mapping.py index 49400f70..3aa33487 100644 --- a/kag/builder/component/mapping/spg_type_mapping.py +++ b/kag/builder/component/mapping/spg_type_mapping.py @@ -15,33 +15,31 @@ import pandas from knext.schema.client import BASIC_TYPES -from kag.builder.model.sub_graph import SubGraph, Node +from kag.builder.model.sub_graph import SubGraph from knext.common.base.runnable import Input, Output from knext.schema.client import SchemaClient from knext.schema.model.base import SpgTypeEnum - from knext.schema.model.schema_helper import ( - SPGTypeName, PropertyName, ) +from kag.common.conf import KAG_PROJECT_CONF from kag.interface.builder.mapping_abc import MappingABC - -FuseFunc = Callable[[SubGraph], List[SubGraph]] -LinkFunc = Callable[[str, Node], List[Node]] +from kag.common.registry import Functor +@MappingABC.register("spg") +@MappingABC.register("spg_mapping") class SPGTypeMapping(MappingABC): """ - A class for mapping SPG (Simple Property Graph) types and handling their properties and strategies. + A class for mapping SPG(Semantic-enhanced Programmable Graph) types and handling their properties and strategies. Attributes: spg_type_name (SPGTypeName): The name of the SPG type. fuse_op (FuseOpABC, optional): The user-defined fuse operator. Defaults to None. """ - def __init__(self, spg_type_name: SPGTypeName, fuse_func: FuseFunc = None, **kwargs): - super().__init__(**kwargs) - self.schema = SchemaClient(project_id=self.project_id).load() + def __init__(self, spg_type_name: str, fuse_func: Functor = None): + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() assert ( spg_type_name in self.schema ), f"SPG type [{spg_type_name}] does not exist." @@ -55,7 +53,7 @@ def add_property_mapping( self, source_name: str, target_name: PropertyName, - link_func: LinkFunc = None, + link_func: Callable = None, ): """ Adds a property mapping from a source name to a target name within the SPG type. 
diff --git a/kag/builder/component/mapping/spo_mapping.py b/kag/builder/component/mapping/spo_mapping.py index 2ab11c93..57b7e978 100644 --- a/kag/builder/component/mapping/spo_mapping.py +++ b/kag/builder/component/mapping/spo_mapping.py @@ -10,7 +10,6 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. import json -from collections import defaultdict from typing import List, Type, Dict from kag.interface.builder.mapping_abc import MappingABC @@ -19,17 +18,44 @@ from knext.schema.client import OTHER_TYPE +@MappingABC.register("spo") +@MappingABC.register("spo_mapping") class SPOMapping(MappingABC): + """ + A class that extends the MappingABC base class. + It is responsible for mapping structured dictionaries to a list of SubGraphs. + """ + + def __init__( + self, + s_type_col: str = None, + s_id_col: str = None, + p_type_col: str = None, + o_type_col: str = None, + o_id_col: str = None, + sub_property_col: str = None, + sub_property_mapping: dict = {}, + ): + """ + Initializes the SPOMapping instance. - def __init__(self): + Args: + s_type_col (str, optional): The column name for the subject type. Defaults to None. + s_id_col (str, optional): The column name for the subject ID. Defaults to None. + p_type_col (str, optional): The column name for the predicate type. Defaults to None. + o_type_col (str, optional): The column name for the object type. Defaults to None. + o_id_col (str, optional): The column name for the object ID. Defaults to None. + sub_property_col (str, optional): The column name for sub-properties. Defaults to None. + sub_property_mapping (dict, optional): A dictionary mapping sub-properties. Defaults to {}. + """ super().__init__() - self.s_type_col = None - self.s_id_col = None - self.p_type_col = None - self.o_type_col = None - self.o_id_col = None - self.sub_property_mapping = defaultdict(list) - self.sub_property_col = None + self.s_type_col = s_type_col + self.s_id_col = s_id_col + self.p_type_col = p_type_col + self.o_type_col = o_type_col + self.o_id_col = o_id_col + self.sub_property_col = sub_property_col + self.sub_property_mapping = sub_property_mapping @property def input_types(self) -> Type[Input]: @@ -39,7 +65,27 @@ def input_types(self) -> Type[Input]: def output_types(self) -> Type[Output]: return SubGraph - def add_field_mappings(self, s_id_col: str, p_type_col: str, o_id_col: str, s_type_col: str = None, o_type_col: str = None): + def add_field_mappings( + self, + s_id_col: str, + p_type_col: str, + o_id_col: str, + s_type_col: str = None, + o_type_col: str = None, + ): + """ + Adds field mappings for the subject, predicate, and object types and IDs. + + Args: + s_id_col (str): The column name for the subject ID. + p_type_col (str): The column name for the predicate type. + o_id_col (str): The column name for the object ID. + s_type_col (str, optional): The column name for the subject type. Defaults to None. + o_type_col (str, optional): The column name for the object type. Defaults to None. 
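The column-oriented constructor above maps one flat record per call, via the `invoke` shown at the end of this file's hunk. A hypothetical row-to-subgraph run (column names invented):

```python
from kag.builder.component.mapping.spo_mapping import SPOMapping

mapping = SPOMapping(s_type_col="s_type", s_id_col="s", p_type_col="p",
                     o_type_col="o_type", o_id_col="o")
row = {"s_type": "Person", "s": "alice", "p": "worksAt",
       "o_type": "Company", "o": "acme"}
graphs = mapping.invoke(row)
# expected: one SubGraph with two nodes ("alice", "acme") and a "worksAt" edge
```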
+ + Returns: + self + """ self.s_type_col = s_type_col self.s_id_col = s_id_col self.p_type_col = p_type_col @@ -63,7 +109,10 @@ def add_sub_property_mapping(self, source_name: str, target_name: str = None): if not target_name: self.sub_property_col = source_name else: - self.sub_property_mapping[target_name].append(source_name) + if target_name in self.sub_property_mapping: + self.sub_property_mapping[target_name].append(source_name) + else: + self.sub_property_mapping[target_name] = [source_name] return self def assemble_sub_graph(self, record: Dict[str, str]): @@ -86,14 +135,21 @@ def assemble_sub_graph(self, record: Dict[str, str]): sub_graph.add_node(id=o_id, name=o_id, label=o_type) sub_properties = {} if self.sub_property_col: - sub_properties = json.loads(record.get(self.sub_property_col, '{}')) + sub_properties = json.loads(record.get(self.sub_property_col, "{}")) sub_properties = {k: str(v) for k, v in sub_properties.items()} else: for target_name, source_names in self.sub_property_mapping.items(): for source_name in source_names: value = record.get(source_name) sub_properties[target_name] = value - sub_graph.add_edge(s_id=s_id, s_label=s_type, p=p, o_id=o_id, o_label=o_type, properties=sub_properties) + sub_graph.add_edge( + s_id=s_id, + s_label=s_type, + p=p, + o_id=o_id, + o_label=o_type, + properties=sub_properties, + ) return sub_graph def invoke(self, input: Input, **kwargs) -> List[Output]: @@ -105,7 +161,7 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: **kwargs: Additional keyword arguments. Returns: - List[Output]: A list of resulting sub-graphs. + List[Output]: A list of resulting subgraphs. """ record: Dict[str, str] = input sub_graph = self.assemble_sub_graph(record) diff --git a/kag/solver/logic/core_modules/op_executor/op_deduce/module/__init__.py b/kag/builder/component/postprocessor/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_deduce/module/__init__.py rename to kag/builder/component/postprocessor/__init__.py diff --git a/kag/builder/component/postprocessor/kag_postprocessor.py b/kag/builder/component/postprocessor/kag_postprocessor.py new file mode 100644 index 00000000..8af36b06 --- /dev/null +++ b/kag/builder/component/postprocessor/kag_postprocessor.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import logging +from typing import List +from tenacity import stop_after_attempt, retry +from kag.interface import PostProcessorABC +from kag.interface import ExternalGraphLoaderABC +from kag.builder.model.sub_graph import SubGraph +from kag.common.conf import KAGConstants, KAG_PROJECT_CONF +from kag.common.utils import get_vector_field_name +from knext.search.client import SearchClient +from knext.schema.client import SchemaClient, OTHER_TYPE + + +logger = logging.getLogger() + + +@PostProcessorABC.register("base", as_default=True) +@PostProcessorABC.register("kag_post_processor") +class KAGPostProcessor(PostProcessorABC): + """ + A class that extends the PostProcessorABC base class. 
+ It provides methods to handle various post-processing tasks on subgraphs + including filtering, entity linking based on similarity, and linking based on an external graph. + """ + + def __init__( + self, + similarity_threshold: float = 0.9, + external_graph: ExternalGraphLoaderABC = None, + ): + """ + Initializes the KAGPostProcessor instance. + + Args: + similarity_threshold (float, optional): The similarity threshold for entity linking. Defaults to 0.9. + external_graph (ExternalGraphLoaderABC, optional): An instance of ExternalGraphLoaderABC for external graph-based linking. Defaults to None. + """ + super().__init__() + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() + self.similarity_threshold = similarity_threshold + self.external_graph = external_graph + self._init_search() + + def format_label(self, label: str): + """ + Formats the label by adding the project namespace if it is not already present. + + Args: + label (str): The label to be formatted. + + Returns: + str: The formatted label. + """ + namespace = KAG_PROJECT_CONF.namespace + if label.split(".")[0] == namespace: + return label + return f"{namespace}.{label}" + + def _init_search(self): + """ + Initializes the search client for entity linking. + """ + self._search_client = SearchClient( + KAG_PROJECT_CONF.host_addr, KAG_PROJECT_CONF.project_id + ) + + def filter_invalid_data(self, graph: SubGraph): + """ + Filters out invalid nodes and edges from the subgraph. + + Args: + graph (SubGraph): The subgraph to be filtered. + + Returns: + SubGraph: The filtered subgraph. + """ + valid_nodes = [] + valid_edges = [] + for node in graph.nodes: + if not node.id or not node.label: + continue + if node.label not in self.schema: + node.label = self.format_label(OTHER_TYPE) + # for k in node.properties.keys(): + # if k not in self.schema[node.label]: + # continue + valid_nodes.append(node) + for edge in graph.edges: + if edge.label: + valid_edges.append(edge) + return SubGraph(nodes=valid_nodes, edges=valid_edges) + + @retry(stop=stop_after_attempt(3)) + def _entity_link( + self, graph: SubGraph, property_key: str = "name", labels: List[str] = None + ): + """ + Performs entity linking based on the given property key and labels. + + Args: + graph (SubGraph): The subgraph to perform entity linking on. + property_key (str, optional): The property key to use for linking. Defaults to "name". + labels (List[str], optional): The labels to consider for linking. Defaults to None. + """ + vector_field_name = get_vector_field_name(property_key) + for node in graph.nodes: + if labels is None: + link_labels = [self.format_label(node.label)] + else: + link_labels = [self.format_label(x) for x in labels] + vector = node.properties.get(vector_field_name) + if vector: + all_similar_nodes = [] + for label in link_labels: + similar_nodes = self._search_client.search_vector( + label=label, + property_key=property_key, + query_vector=[float(x) for x in vector], + topk=1, + params={}, + ) + all_similar_nodes.extend(similar_nodes) + for item in all_similar_nodes: + score = item["score"] + if ( + score >= self.similarity_threshold + and node.id != item["node"]["id"] + ): + graph.add_edge( + node.id, + node.label, + KAGConstants.KAG_SIMILAR_EDGE_NAME, + item["node"]["id"], + item["node"]["__labels__"][0], + ) + + def similarity_based_link(self, graph: SubGraph, property_key: str = "name"): + """ + Performs entity linking based on similarity. + + Args: + graph (SubGraph): The subgraph to perform entity linking on. 
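Putting the pieces above together, the post processor is a drop-in pipeline stage. A sketch, assuming the component base class wraps `_invoke` in a public `invoke` and that the registry exposes `from_config` (neither is shown in this diff):

```python
from kag.interface import PostProcessorABC

pp = PostProcessorABC.from_config({
    "type": "kag_post_processor",     # name registered above
    "similarity_threshold": 0.95,     # stricter than the 0.9 default
})
# For a SubGraph `g` whose nodes already carry name vectors:
#   [processed] = pp.invoke(g)
# runs filter_invalid_data -> similarity_based_link -> external_graph_based_link.
```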
+ property_key (str, optional): The property key to use for linking. Defaults to "name". + """ + self._entity_link(graph, property_key, None) + + def external_graph_based_link(self, graph: SubGraph, property_key: str = "name"): + """ + Performs entity linking based on the user provided external graph. + + Args: + graph (SubGraph): The subgraph to perform entity linking on. + property_key (str, optional): The property key to use for linking. Defaults to "name". + """ + if not self.external_graph: + return + labels = self.external_graph.get_allowed_labels() + self._entity_link(graph, property_key, labels) + + def _invoke(self, input, **kwargs): + """ + Invokes the post-processing pipeline on the input subgraph. + + Args: + input: The input subgraph to be processed. + + Returns: + List[SubGraph]: A list containing the processed subgraph. + """ + origin_num_nodes = len(input.nodes) + origin_num_edges = len(input.edges) + new_graph = self.filter_invalid_data(input) + self.similarity_based_link(new_graph) + self.external_graph_based_link(new_graph) + new_num_nodes = len(new_graph.nodes) + new_num_edges = len(new_graph.edges) + logger.debug( + f"origin: {origin_num_nodes}/{origin_num_edges}, processed: {new_num_nodes}/{new_num_edges}" + ) + return [new_graph] diff --git a/kag/builder/component/reader/__init__.py b/kag/builder/component/reader/__init__.py index df6c45b5..e69de29b 100644 --- a/kag/builder/component/reader/__init__.py +++ b/kag/builder/component/reader/__init__.py @@ -1,33 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.builder.component.reader.csv_reader import CSVReader -from kag.builder.component.reader.pdf_reader import PDFReader -from kag.builder.component.reader.json_reader import JSONReader -from kag.builder.component.reader.markdown_reader import MarkDownReader -from kag.builder.component.reader.docx_reader import DocxReader -from kag.builder.component.reader.txt_reader import TXTReader -from kag.builder.component.reader.dataset_reader import HotpotqaCorpusReader, TwowikiCorpusReader, MusiqueCorpusReader -from kag.builder.component.reader.yuque_reader import YuqueReader - -__all__ = [ - "TXTReader", - "PDFReader", - "MarkDownReader", - "JSONReader", - "HotpotqaCorpusReader", - "MusiqueCorpusReader", - "TwowikiCorpusReader", - "YuqueReader", - "CSVReader", - "DocxReader", -] diff --git a/kag/builder/component/reader/csv_reader.py b/kag/builder/component/reader/csv_reader.py deleted file mode 100644 index 9c7c157d..00000000 --- a/kag/builder/component/reader/csv_reader.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
-import os -from typing import List, Type, Dict - -import pandas as pd - -from kag.builder.model.chunk import Chunk -from kag.interface.builder.reader_abc import SourceReaderABC -from knext.common.base.runnable import Input, Output - - -class CSVReader(SourceReaderABC): - """ - A class for reading CSV files, inheriting from `SourceReader`. - Supports converting CSV data into either a list of dictionaries or a list of Chunk objects. - - Args: - output_type (Output): Specifies the output type, which can be "Dict" or "Chunk". - **kwargs: Additional keyword arguments passed to the parent class constructor. - """ - - def __init__(self, output_type="Chunk", **kwargs): - super().__init__(**kwargs) - if output_type == "Dict": - self.output_types = Dict[str, str] - else: - self.output_types = Chunk - self.id_col = kwargs.get("id_col", "id") - self.name_col = kwargs.get("name_col", "name") - self.content_col = kwargs.get("content_col", "content") - - @property - def input_types(self) -> Type[Input]: - return str - - @property - def output_types(self) -> Type[Output]: - return self._output_types - - @output_types.setter - def output_types(self, output_types): - self._output_types = output_types - - def invoke(self, input: Input, **kwargs) -> List[Output]: - """ - Reads a CSV file and converts the data format based on the output type. - - Args: - input (Input): Input parameter, expected to be a string representing the path to the CSV file. - **kwargs: Additional keyword arguments, which may include `id_column`, `name_column`, `content_column`, etc. - - Returns: - List[Output]: - - If `output_types` is `Chunk`, returns a list of Chunk objects. - - If `output_types` is `Dict`, returns a list of dictionaries. - """ - - try: - data = pd.read_csv(input) - data = data.astype(str) - except Exception as e: - raise IOError(f"Failed to read the file: {e}") - - if self.output_types == Chunk: - chunks = [] - basename, _ = os.path.splitext(os.path.basename(input)) - for idx, row in enumerate(data.to_dict(orient="records")): - kwargs = {k: v for k, v in row.items() if k not in [self.id_col, self.name_col, self.content_col]} - chunks.append( - Chunk( - id=row.get(self.id_col) or Chunk.generate_hash_id(f"{input}#{idx}"), - name=row.get(self.name_col) or f"{basename}#{idx}", - content=row[self.content_col], - **kwargs - ) - ) - return chunks - else: - return data.to_dict(orient="records") diff --git a/kag/builder/component/reader/dataset_reader.py b/kag/builder/component/reader/dataset_reader.py deleted file mode 100644 index 850b87ef..00000000 --- a/kag/builder/component/reader/dataset_reader.py +++ /dev/null @@ -1,97 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -import json -import os -from typing import List, Type - -from kag.builder.model.chunk import Chunk -from kag.interface.builder import SourceReaderABC -from knext.common.base.runnable import Input, Output - - -class HotpotqaCorpusReader(SourceReaderABC): - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk - - def invoke(self, input: str, **kwargs) -> List[Output]: - if os.path.exists(str(input)): - with open(input, "r") as f: - corpus = json.load(f) - else: - corpus = json.loads(input) - chunks = [] - - for item_key, item_value in corpus.items(): - chunk = Chunk( - id=item_key, - name=item_key, - content="\n".join(item_value), - ) - chunks.append(chunk) - return chunks - - -class MusiqueCorpusReader(SourceReaderABC): - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk - - def get_basename(self, file_name: str): - base, ext = os.path.splitext(os.path.basename(file_name)) - return base - - def invoke(self, input: str, **kwargs) -> List[Output]: - id_column = kwargs.get("id_column", "title") - name_column = kwargs.get("name_column", "title") - content_column = kwargs.get("content_column", "text") - - if os.path.exists(str(input)): - with open(input, "r") as f: - corpusList = json.load(f) - else: - corpusList = input - chunks = [] - - for item in corpusList: - chunk = Chunk( - id=item[id_column], - name=item[name_column], - content=item[content_column], - ) - chunks.append(chunk) - return chunks - - -class TwowikiCorpusReader(MusiqueCorpusReader): - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk diff --git a/kag/builder/component/reader/dict_reader.py b/kag/builder/component/reader/dict_reader.py new file mode 100644 index 00000000..bf90b24d --- /dev/null +++ b/kag/builder/component/reader/dict_reader.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from typing import Dict, List +from kag.interface import ReaderABC +from knext.common.base.runnable import Output, Input +from kag.builder.model.chunk import Chunk + + +@ReaderABC.register("dict") +@ReaderABC.register("dict_reader") +class DictReader(ReaderABC): + """ + A class for reading dictionaries into Chunk objects. + + This class inherits from ReaderABC and provides the functionality to convert dictionary inputs + into a list of Chunk objects. 
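Anticipating the `_invoke` implementation shown just below, a hypothetical run of this reader (the key names are the defaults; `invoke` wrapping `_invoke` is assumed from the component base class):

```python
from kag.builder.component.reader.dict_reader import DictReader

reader = DictReader()
[chunk] = reader.invoke({"id": "c1", "name": "intro",
                         "content": "hello", "lang": "en"})
# id/name/content are popped into the Chunk fields; leftover keys such as
# "lang" ride along as extra Chunk kwargs.
```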
+ + Attributes: + id_col (str): The key in the input dictionary that corresponds to the chunk's ID. + name_col (str): The key in the input dictionary that corresponds to the chunk's name. + content_col (str): The key in the input dictionary that corresponds to the chunk's content. + """ + + def __init__( + self, id_col: str = "id", name_col: str = "name", content_col: str = "content" + ): + """ + Initializes the DictReader with the specified column names. + + Args: + id_col (str): The key in the input dictionary that corresponds to the chunk's ID. Defaults to "id". + name_col (str): The key in the input dictionary that corresponds to the chunk's name. Defaults to "name". + content_col (str): The key in the input dictionary that corresponds to the chunk's content. Defaults to "content". + """ + super().__init__() + self.id_col = id_col + self.name_col = name_col + self.content_col = content_col + + @property + def input_types(self) -> Input: + return Dict + + def _invoke(self, input: Input, **kwargs) -> List[Output]: + """ + Converts the input dictionary into a list of Chunk objects. + + Args: + input (Input): The input dictionary containing the data to be parsed. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. + + Returns: + List[Output]: A list containing a single Chunk object created from the input dictionary. + """ + chunk_id = input.get(self.id_col) + chunk_name = input.get(self.name_col) + chunk_content = input.get(self.content_col) + if self.id_col in input: + input.pop(self.id_col) + if self.name_col in input: + input.pop(self.name_col) + if self.content_col in input: + input.pop(self.content_col) + + return [Chunk(id=chunk_id, name=chunk_name, content=chunk_content, **input)] diff --git a/kag/builder/component/reader/docx_reader.py b/kag/builder/component/reader/docx_reader.py index d9208f62..06464301 100644 --- a/kag/builder/component/reader/docx_reader.py +++ b/kag/builder/component/reader/docx_reader.py @@ -11,17 +11,17 @@ # or implied. import os -from typing import List, Type,Union +from typing import List, Union from docx import Document - -from kag.builder.component.reader import MarkDownReader +from kag.interface import LLMClient from kag.builder.model.chunk import Chunk -from kag.interface.builder import SourceReaderABC +from kag.interface import ReaderABC +from kag.builder.prompt.outline_prompt import OutlinePrompt +from kag.common.conf import KAG_PROJECT_CONF +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output -from kag.common.llm.client import LLMClient -from kag.builder.prompt.outline_prompt import OutlinePrompt def split_txt(content): from modelscope.outputs import OutputKeys @@ -30,40 +30,49 @@ def split_txt(content): p = pipeline( task=Tasks.document_segmentation, - model='damo/nlp_bert_document-segmentation_chinese-base') + model="damo/nlp_bert_document-segmentation_chinese-base", + ) result = p(documents=content) result = result[OutputKeys.TEXT] - - res = [r for r in result.split('\n\t') if len(r) > 0] - + + res = [r for r in result.split("\n\t") if len(r) > 0] + return res - -class DocxReader(SourceReaderABC): +@ReaderABC.register("docx") +@ReaderABC.register("docx_reader") +class DocxReader(ReaderABC): """ - A class for reading Docx files, inheriting from SourceReader. - This class is specifically designed to extract text content from Docx files and generate Chunk objects based on the extracted content. + A class for reading Docx files into Chunk objects. 
+ + This class inherits from ReaderABC and provides the functionality to process Docx files, + extract their text content, and convert it into a list of Chunk objects. """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.split_level = kwargs.get("split_level", 3) - self.split_using_outline = kwargs.get("split_using_outline", True) - self.outline_flag = True - self.llm = self._init_llm() - language = os.getenv("KAG_PROMPT_LANGUAGE", "zh") - self.prompt = OutlinePrompt(language) - - @property - def input_types(self) -> Type[Input]: - return str - - @property - def output_types(self) -> Type[Output]: - return Chunk - - def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk]: + + def __init__(self, llm: LLMClient = None): + """ + Initializes the DocxReader with an optional LLMClient instance. + + Args: + llm (LLMClient): An optional LLMClient instance used for generating outlines. Defaults to None. + """ + super().__init__() + self.llm = llm + self.prompt = OutlinePrompt(KAG_PROJECT_CONF.language) + + def outline_chunk(self, chunk: Union[Chunk, List[Chunk]], basename) -> List[Chunk]: + """ + Generates outlines for the given chunk(s) and separates the content based on these outlines. + + Args: + chunk (Union[Chunk, List[Chunk]]): A single Chunk object or a list of Chunk objects. + basename: The base name used for generating chunk IDs and names. + + Returns: + List[Chunk]: A list of Chunk objects separated by the generated outlines. + """ if isinstance(chunk, Chunk): chunk = [chunk] outlines = [] @@ -71,20 +80,35 @@ def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk outline = self.llm.invoke({"input": c.content}, self.prompt) outlines.extend(outline) content = "\n".join([c.content for c in chunk]) - chunks = self.sep_by_outline(content, outlines,basename) + chunks = self.sep_by_outline(content, outlines, basename) return chunks - - def sep_by_outline(self,content,outlines,basename): + + def sep_by_outline(self, content, outlines, basename): + """ + Separates the content based on the provided outlines. + + Args: + content (str): The content to be separated. + outlines (List[str]): A list of outlines used to separate the content. + basename: The base name used for generating chunk IDs and names. + + Returns: + List[Chunk]: A list of Chunk objects separated by the provided outlines. + """ position_check = [] for outline in outlines: start = content.find(outline) - position_check.append((outline,start)) + position_check.append((outline, start)) chunks = [] - for idx,pc in enumerate(position_check): + for idx, pc in enumerate(position_check): chunk = Chunk( - id = Chunk.generate_hash_id(f"{basename}#{pc[0]}"), + id=generate_hash_id(f"{basename}#{pc[0]}"), name=f"{basename}#{pc[0]}", - content=content[pc[1]:position_check[idx+1][1] if idx+1 < len(position_check) else len(position_check)], + content=content[ + pc[1] : position_check[idx + 1][1] + if idx + 1 < len(position_check) + else len(position_check) + ], ) chunks.append(chunk) return chunks @@ -111,16 +135,25 @@ def _extract_text_from_docx(doc: Document) -> str: for para in doc.paragraphs: full_text.append(para.text) return full_text - + def _get_title_from_text(self, text: str) -> str: + """ + Extracts the title from the provided text. + + Args: + text (str): The text from which to extract the title. + + Returns: + str: The extracted title and the remaining text. 
+ """ text = text.strip() - title = text.split('\n')[0] - text = "\n".join(text.split('\n')) - return title,text + title = text.split("\n")[0] + text = "\n".join(text.split("\n")) + return title, text - def invoke(self, input: Input, **kwargs) -> List[Output]: + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ - Processes the input Docx file, extracts its text content, and generates a Chunk object. + Processes the input Docx file, extracts its text content, and generates Chunk objects. Args: input (Input): The file path of the Docx file to be processed. @@ -136,9 +169,9 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: if not input: raise ValueError("Input cannot be empty") - + chunks = [] - + try: doc = Document(input) full_text = self._extract_text_from_docx(doc) @@ -148,32 +181,12 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: basename, _ = os.path.splitext(os.path.basename(input)) - for text in full_text: - title,text = self._get_title_from_text(text) - chunk = Chunk( - id=Chunk.generate_hash_id(f"{basename}#{title}"), - name=f"{basename}#{title}", - content=text, - ) - chunks.append(chunk) - - if len(chunks) < 2: - chunks = self.outline_chunk(chunks,basename) - - if len(chunks) < 2: - semantic_res = split_txt(content) - chunks = [Chunk( - id=Chunk.generate_hash_id(input+"#"+r[:10]), - name=basename+"#"+r[:10], - content=r, - ) for r in semantic_res] + chunk = Chunk( + id=generate_hash_id(input), + name=basename, + content=content, + **{"documentId": basename, "documentName": basename}, + ) + chunks.append(chunk) return chunks - - -if __name__== "__main__": - reader = DocxReader() - print(reader.output_types) - file_path = os.path.dirname(__file__) - res = reader.invoke(os.path.join(file_path,"../../../../tests/builder/data/test_docx.docx")) - print(res) \ No newline at end of file diff --git a/kag/builder/component/reader/json_reader.py b/kag/builder/component/reader/json_reader.py deleted file mode 100644 index 9ee27f54..00000000 --- a/kag/builder/component/reader/json_reader.py +++ /dev/null @@ -1,164 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import json -import os -from typing import List, Type, Dict, Union - -from kag.builder.component.reader.markdown_reader import MarkDownReader -from kag.builder.model.chunk import Chunk -from kag.interface.builder.reader_abc import SourceReaderABC -from knext.common.base.runnable import Input, Output - -from kag.common.llm.client import LLMClient - - -class JSONReader(SourceReaderABC): - """ - A class for reading JSON files, inheriting from `SourceReader`. - Supports converting JSON data into either a list of dictionaries or a list of Chunk objects. - - Args: - output_types (Output): Specifies the output type, which can be "Dict" or "Chunk". - **kwargs: Additional keyword arguments passed to the parent class constructor. 
- """ - - def __init__(self, output_type="Chunk", **kwargs): - super().__init__(**kwargs) - if output_type == "Dict": - self.output_types = Dict[str, str] - else: - self.output_types = Chunk - self.id_col = kwargs.get("id_col", "id") - self.name_col = kwargs.get("name_col", "name") - self.content_col = kwargs.get("content_col", "content") - - @property - def input_types(self) -> Type[Input]: - return str - - @property - def output_types(self) -> Type[Output]: - return self._output_types - - @output_types.setter - def output_types(self, output_types): - self._output_types = output_types - - @staticmethod - def _read_from_file(file_path: str) -> Union[dict, list]: - """ - Safely reads JSON from a file and returns its content. - - Args: - file_path (str): The path to the JSON file. - - Returns: - Union[dict, list]: The parsed JSON content. - - Raises: - ValueError: If there is an error reading the JSON file. - """ - try: - with open(file_path, "r") as file: - return json.load(file) - except json.JSONDecodeError as e: - raise ValueError(f"Error reading JSON from file: {e}") - except FileNotFoundError as e: - raise ValueError(f"File not found: {e}") - - @staticmethod - def _parse_json_string(json_string: str) -> Union[dict, list]: - """ - Parses a JSON string and returns its content. - - Args: - json_string (str): The JSON string to parse. - - Returns: - Union[dict, list]: The parsed JSON content. - - Raises: - ValueError: If there is an error parsing the JSON string. - """ - try: - return json.loads(json_string) - except json.JSONDecodeError as e: - raise ValueError(f"Error parsing JSON string: {e}") - - def invoke(self, input: str, **kwargs) -> List[Output]: - """ - Parses the input string data and generates a list of Chunk objects or returns the original data. - - This method supports receiving JSON-formatted strings. It extracts specific fields based on provided keyword arguments. - It can read from a file or directly parse a string. If the input data is in the expected format, it generates a list of Chunk objects; - otherwise, it throws a ValueError if the input is not a JSON array or object. - - Args: - input (str): The input data, which can be a JSON string or a file path. - **kwargs: Keyword arguments used to specify the field names for ID, name, and content. - - Returns: - List[Output]: A list of Chunk objects or the original data. - - Raises: - ValueError: If the input data format is incorrect or parsing fails. 
- """ - - id_col = kwargs.get("id_col", "id") - name_col = kwargs.get("name_col", "name") - content_col = kwargs.get("content_col", "content") - self.id_col = id_col - self.name_col = name_col - self.content_col = content_col - try: - if os.path.exists(input): - corpus = self._read_from_file(input) - else: - corpus = self._parse_json_string(input) - except ValueError as e: - raise e - - if not isinstance(corpus, (list, dict)): - raise ValueError("Expected input to be a JSON array or object") - - if isinstance(corpus, dict): - corpus = [corpus] - - if self.output_types == Chunk: - chunks = [] - basename, _ = os.path.splitext(os.path.basename(input)) - for idx, item in enumerate(corpus): - if not isinstance(item, dict): - continue - - chunk = Chunk( - id=item.get(self.id_col) or Chunk.generate_hash_id(f"{input}#{idx}"), - name=item.get(self.name_col) or f"{basename}#{idx}", - content=item.get(self.content_col), - ) - chunks.append(chunk) - - return chunks - else: - return corpus - -if __name__ == "__main__": - reader = JSONReader() - json_string = '''[ - { - "title": "test_json", - "text": "Test content" - } - ]''' - chunks = reader.invoke(json_string,name_column="title",content_col = "text") - res = 1 \ No newline at end of file diff --git a/kag/builder/component/reader/markdown_reader.py b/kag/builder/component/reader/markdown_reader.py index adfcffbd..ba212c8e 100644 --- a/kag/builder/component/reader/markdown_reader.py +++ b/kag/builder/component/reader/markdown_reader.py @@ -12,24 +12,37 @@ import os -import bs4.element import markdown from bs4 import BeautifulSoup, Tag -from typing import List, Type + import logging import re import requests -import pandas as pd -from io import StringIO -from tenacity import stop_after_attempt, retry +from typing import List, Dict -from kag.interface.builder import SourceReaderABC -from kag.builder.model.chunk import Chunk, ChunkTypeEnum -from knext.common.base.runnable import Output, Input + +from kag.interface import ReaderABC +from kag.builder.model.chunk import Chunk +from kag.interface import LLMClient from kag.builder.prompt.analyze_table_prompt import AnalyzeTablePrompt +from knext.common.base.runnable import Output, Input + +logger = logging.getLogger(__name__) -class MarkDownReader(SourceReaderABC): + +class MarkdownNode: + def __init__(self, title: str, level: int, content: str = ""): + self.title = title + self.level = level + self.content = content + self.children: List[MarkdownNode] = [] + self.tables: List[Dict] = [] # 存储表格数据 + + +@ReaderABC.register("md") +@ReaderABC.register("md_reader") +class MarkDownReader(ReaderABC): """ A class for reading MarkDown files, inheriting from `SourceReader`. Supports converting MarkDown data into a list of Chunk objects. @@ -41,352 +54,344 @@ class MarkDownReader(SourceReaderABC): ALL_LEVELS = [f"h{x}" for x in range(1, 7)] TABLE_CHUCK_FLAG = "<<>>" - def __init__(self, cut_depth: int = 1, **kwargs): + def __init__(self, cut_depth: int = 3, llm: LLMClient = None, **kwargs): super().__init__(**kwargs) self.cut_depth = int(cut_depth) - self.llm_module = kwargs.get("llm_module", None) + self.llm = llm self.analyze_table_prompt = AnalyzeTablePrompt(language="zh") self.analyze_img_prompt = AnalyzeTablePrompt(language="zh") @property - def input_types(self) -> Type[Input]: + def input_types(self): return str @property - def output_types(self) -> Type[Output]: + def output_types(self): return Chunk - def to_text(self, level_tags): - """ - Converts parsed hierarchical tags into text content. 
+ def solve_content( + self, id: str, title: str, content: str, **kwargs + ) -> List[Output]: + # Convert Markdown to HTML with additional extensions for lists + html = markdown.markdown( + content, extensions=["tables", "nl2br", "sane_lists", "fenced_code"] + ) + soup = BeautifulSoup(html, "html.parser") + + def is_in_code_block(element): + """Check if an element is inside a code block""" + parent = element.parent + while parent: + if parent.name in ["pre", "code"]: + return True + parent = parent.parent + return False + + def process_text_with_links(element): + """Process text containing links, preserving original markdown format""" + result = [] + current_text = "" + + for child in element.children: + if isinstance(child, Tag): + if child.name == "a": + # If there's previous text, add it first + if current_text: + result.append(current_text.strip()) + current_text = "" + + # Rebuild markdown format link + link_text = child.get_text().strip() + href = child.get("href", "") + title = child.get("title", "") + + if title: + result.append(f'[{link_text}]({href} "{title}")') + else: + result.append(f"[{link_text}]({href})") + else: + current_text += child.get_text() + else: + current_text += str(child) + + if current_text: + result.append(current_text.strip()) + + return " ".join(result) + + # Initialize root node + root = MarkdownNode("root", 0) + stack = [root] + current_content = [] + + # Traverse all elements + all_elements = soup.find_all( + [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "table", + "ul", + "ol", + "li", + "pre", + "code", + ] + ) + for element in all_elements: + if element.name.startswith("h") and not is_in_code_block(element): + # Only process headers that are not in code blocks + # Handle title logic + if current_content and stack[-1].title != "root": + stack[-1].content = "\n".join(current_content) + current_content = [] + + level = int(element.name[1]) + title_text = process_text_with_links(element) # Process links in title + new_node = MarkdownNode(title_text, level) + + while stack and stack[-1].level >= level: + stack.pop() + + if stack: + stack[-1].children.append(new_node) + stack.append(new_node) + + elif element.name in ["code"]: + # Preserve code blocks as is + text = element.get_text() + if text: + current_content.append(text) + + elif element.name in ["ul", "ol"]: + continue + + elif element.name == "li": + text = process_text_with_links(element) # Process links in list items + if text: + if element.find_parent("ol"): + index = len(element.find_previous_siblings("li")) + 1 + current_content.append(f"{index}. {text}") + else: + current_content.append(f"* {text}") + + elif element.name == "table": + # Process table + table_data = [] + headers = [] + + if element.find("thead"): + for th in element.find("thead").find_all("th"): + headers.append(th.get_text().strip()) + + if element.find("tbody"): + for row in element.find("tbody").find_all("tr"): + row_data = {} + for i, td in enumerate(row.find_all("td")): + if i < len(headers): + row_data[headers[i]] = td.get_text().strip() + table_data.append(row_data) + + # Add table to current node + if stack[-1].title != "root": + stack[-1].tables.append({"headers": headers, "data": table_data}) + + elif element.name == "p": + text = process_text_with_links(element) # Process links in paragraphs + if text: + if not text.startswith("* ") and not re.match(r"^\d+\. 
", text): + current_content.append(text) + + # Process content of the last node + if current_content and stack[-1].title != "root": + stack[-1].content = "\n".join(current_content) + + outputs = self._convert_to_outputs(root, id) + return outputs + + def _convert_to_outputs( + self, + node: MarkdownNode, + id: str, + parent_id: str = None, + parent_titles: List[str] = None, + parent_contents: List[str] = None, + ) -> List[Output]: + def convert_table_to_markdown(headers, data): + """Convert table data to markdown format""" + if not headers or not data: + return "" + + # Build header row + header_row = " | ".join(headers) + # Build separator row + separator = " | ".join(["---"] * len(headers)) + # Build data rows + data_rows = [] + for row in data: + row_values = [str(row.get(header, "")) for header in headers] + data_rows.append(" | ".join(row_values)) + + # Combine all rows + table_md = f"\n| {header_row} |\n| {separator} |\n" + table_md += "\n".join(f"| {row} |" for row in data_rows) + return table_md + "\n" + + def collect_tables(n: MarkdownNode): + """Collect tables from node and its children""" + tables = [] + table_md = [] + if n.tables: + for table in n.tables: + tables.append(table) + table_md.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) + for child in n.children: + child_tables, child_table_md = collect_tables(child) + tables.extend(child_tables) + table_md.extend(child_table_md) + return tables, table_md + + def collect_children_content(n: MarkdownNode): + """Collect content from node and its children""" + content = [] + if n.content: + content.append(n.content) + # Add current node's table content + for table in n.tables: + content.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) + # Process child nodes recursively + for child in n.children: + content.extend(collect_children_content(child)) + return content - Args: - level_tags (list): Parsed tags organized by Markdown heading levels and other tags. + outputs = [] + if parent_titles is None: + parent_titles = [] + if parent_contents is None: + parent_contents = [] - Returns: - str: Text content derived from the parsed tags. 
- """ - content = [] - for item in level_tags: - if isinstance(item, list): - content.append(self.to_text(item)) - else: - header, tag = item - if not isinstance(tag, Tag): - continue - elif tag.name in self.ALL_LEVELS: - content.append( - f"{header}-{tag.text}" if len(header) > 0 else tag.text - ) - else: - content.append(self.tag_to_text(tag)) - return "\n".join(content) + current_titles = parent_titles + ([node.title] if node.title != "root" else []) - def tag_to_text(self, tag: bs4.element.Tag): - """ - 将html tag转换为text - 如果是table,输出markdown,添加表格标记,方便后续构建Chunk - :param tag: - :return: - """ - if tag.name == "table": - try: - html_table = str(tag) - table_df = pd.read_html(html_table)[0] - return f"{self.TABLE_CHUCK_FLAG}{table_df.to_markdown(index=False)}{self.TABLE_CHUCK_FLAG}" - except: - logging.warning("parse table tag to text error", exc_info=True) - return tag.text - - @retry(stop=stop_after_attempt(5)) - def analyze_table(self, table,analyze_mathod="human"): - if analyze_mathod == "llm": - if self.llm_module == None: - logging.INFO("llm_module is None, cannot use analyze_table") - return table - variables = { - "table": table - } - response = self.llm_module.invoke( - variables = variables, - prompt_op = self.analyze_table_prompt, - with_json_parse=False - ) - if response is None or response == "" or response == []: - raise Exception("llm_module return None") - return response - else: - from io import StringIO - import pandas as pd - try: - df = pd.read_html(StringIO(table))[0] - except Exception as e: - logging.warning(f"analyze_table error: {e}") - return table - content = "" - for index, row in df.iterrows(): - content+=f"第{index+1}行的数据如下:" - for col_name, value in row.items(): - content+=f"{col_name}的值为{value}," - content+='\n' - return content + # If current node level equals target level, create output + if node.level >= self.cut_depth: + full_title = " / ".join(current_titles) - - @retry(stop=stop_after_attempt(5)) - def analyze_img(self, img_url): - response = requests.get(img_url) - response.raise_for_status() - image_data = response.content - - pass - - def replace_table(self, content: str): - pattern = r"]*>([\s\S]*?)<\/table>" - for match in re.finditer(pattern, content): - table = match.group(0) - table = self.analyze_table(table) - content = content.replace(match.group(1), table) - return content - - def replace_img(self, content: str): - pattern = r"]*src=[\"\']([^\"\']*)[\"\']" - for match in re.finditer(pattern, content): - img_url = match.group(1) - img_msg = self.analyze_img(img_url) - content = content.replace(match.group(0), img_msg) - return content - - def extract_table(self, level_tags, header=""): - """ - Extracts tables from the parsed hierarchical tags along with their headers. + # Merge content: parent content + current content + all_content = parent_contents + ([node.content] if node.content else []) - Args: - level_tags (list): Parsed tags organized by Markdown heading levels and other tags. - header (str): Current header text being processed. + # Add current node's table content + for table in node.tables: + all_content.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) - Returns: - list: A list of tuples, each containing the table's header, context text, and the table tag. 
- """ - tables = [] - for idx, item in enumerate(level_tags): - if isinstance(item, list): - tables += self.extract_table(item, header) - else: - tag = item[1] - if not isinstance(tag, Tag): - continue - if tag.name in self.ALL_LEVELS: - header = f"{header}-{tag.text}" if len(header) > 0 else tag.text - - if tag.name == "table": - if idx - 1 >= 0: - context = level_tags[idx - 1] - if isinstance(context, tuple): - tables.append((header, context[1].text, tag)) - else: - tables.append((header, "", tag)) - return tables - - def parse_level_tags( - self, - level_tags: list, - level: str, - parent_header: str = "", - cur_header: str = "", - ): - """ - Recursively parses level tags to organize them into a structured format. + # Add all child node content (including tables) + for child in node.children: + child_content = collect_children_content(child) + all_content.extend(child_content) - Args: - level_tags (list): A list of tags to be parsed. - level (str): The current level being processed. - parent_header (str): The header of the parent tag. - cur_header (str): The header of the current tag. + current_output = Chunk( + id=f"{id}_{len(outputs)}", + parent_id=parent_id, + name=full_title, + content="\n".join(filter(None, all_content)), + ) - Returns: - list: A structured representation of the parsed tags. - """ - if len(level_tags) == 0: - return [] - output = [] - prefix_tags = [] - while len(level_tags) > 0: - tag = level_tags[0] - if tag.name in self.ALL_LEVELS: - break - else: - prefix_tags.append((parent_header, level_tags.pop(0))) - if len(prefix_tags) > 0: - output.append(prefix_tags) - - cur = [] - while len(level_tags) > 0: - tag = level_tags[0] - if tag.name not in self.ALL_LEVELS: - cur.append((parent_header, level_tags.pop(0))) - else: - - if tag.name > level: - cur += self.parse_level_tags( - level_tags, - tag.name, - f"{parent_header}-{cur_header}" - if len(parent_header) > 0 - else cur_header, - tag.name, + # Collect table data and convert to markdown format + all_tables = [] + table_contents = [] + if node.tables: + for table in node.tables: + all_tables.append(table) + table_contents.append( + convert_table_to_markdown(table["headers"], table["data"]) ) - elif tag.name == level: - if len(cur) > 0: - output.append(cur) - cur = [(parent_header, level_tags.pop(0))] - cur_header = tag.text - else: - if len(cur) > 0: - output.append(cur) - return output - if len(cur) > 0: - output.append(cur) - return output - - def cut(self, level_tags, cur_level, final_level): - """ - Cuts the provided level tags into chunks based on the specified levels. - Args: - level_tags (list): A list of tags to be cut. - cur_level (int): The current level in the hierarchy. - final_level (int): The final level to which the tags should be cut. + for child in node.children: + child_tables, child_table_md = collect_tables(child) + all_tables.extend(child_tables) + table_contents.extend(child_table_md) + + if all_tables: + current_output.metadata = {"tables": all_tables} + current_output.table = "\n".join( + table_contents + ) # Save all tables in markdown format + + outputs.append(current_output) + + # If current node level is less than target level, continue traversing + elif node.level < self.cut_depth: + # Check if any subtree contains target level nodes + has_target_level = False + current_contents = parent_contents + ( + [node.content] if node.content else [] + ) - Returns: - list: A list of cut chunks. 
- """ - output = [] - if cur_level == final_level: - cur_prefix = [] - for sublevel_tags in level_tags: - if ( - isinstance(sublevel_tags, tuple) - ): - cur_prefix.append(self.to_text([sublevel_tags,])) - else: - break - cur_prefix = "\n".join(cur_prefix) - - if len(cur_prefix) > 0: - output.append(cur_prefix) - for sublevel_tags in level_tags: - if isinstance(sublevel_tags, list): - output.append(cur_prefix + "\n" + self.to_text(sublevel_tags)) - return output - else: - cur_prefix = [] - for sublevel_tags in level_tags: - if ( - isinstance(sublevel_tags, tuple) - ): - cur_prefix.append(sublevel_tags[1].text) - else: - break - cur_prefix = "\n".join(cur_prefix) - if len(cur_prefix) > 0: - output.append(cur_prefix) + # Add current node's tables to content + for table in node.tables: + current_contents.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) - for sublevel_tags in level_tags: - if isinstance(sublevel_tags, list): - output += self.cut(sublevel_tags, cur_level + 1, final_level) - return output + for child in node.children: + child_outputs = self._convert_to_outputs( + child, id, parent_id, current_titles, current_contents + ) + if child_outputs: + has_target_level = True + outputs.extend(child_outputs) + + # If no target level nodes found and current node is not root, output current node + if not has_target_level and node.title != "root": + full_title = " / ".join(current_titles) + all_content = current_contents + + for child in node.children: + child_content = collect_children_content(child) + all_content.extend(child_content) + + current_output = Chunk( + id=f"{id}_{len(outputs)}", + parent_id=parent_id, + name=full_title, + content="\n".join(filter(None, all_content)), + ) - def solve_content(self, id: str, title: str, content: str, **kwargs) -> List[Output]: - """ - Converts Markdown content into structured chunks. + # Collect table data and convert to markdown format + all_tables = [] + table_contents = [] + if node.tables: + for table in node.tables: + all_tables.append(table) + table_contents.append( + convert_table_to_markdown(table["headers"], table["data"]) + ) - Args: - id (str): An identifier for the content. - title (str): The title of the content. - content (str): The Markdown formatted content to be processed. + for child in node.children: + child_tables, child_table_md = collect_tables(child) + all_tables.extend(child_tables) + table_contents.extend(child_table_md) - Returns: - List[Output]: A list of processed content chunks. 
-        """
-        html_content = markdown.markdown(
-            content, extensions=["markdown.extensions.tables"]
-        )
-        # html_content = self.replace_table(html_content)
-        soup = BeautifulSoup(html_content, "html.parser")
-        if soup is None:
-            raise ValueError("The MarkDown file appears to be empty or unreadable.")
-
-        top_level = None
-        for level in self.ALL_LEVELS:
-            tmp = soup.find_all(level)
-            if len(tmp) > 0:
-                top_level = level
-                break
-        if top_level is None:
-            chunk = Chunk(
-                id=Chunk.generate_hash_id(str(id)),
-                name=title,
-                content=soup.text,
-                ref=kwargs.get("ref", ""),
-            )
-            return [chunk]
-        tags = [tag for tag in soup.children if isinstance(tag, Tag)]
-
-        level_tags = self.parse_level_tags(tags, top_level)
-        cutted = self.cut(level_tags, 0, self.cut_depth)
-
-        chunks = []
-
-        for idx, content in enumerate(cutted):
-            chunk = None
-            if self.TABLE_CHUCK_FLAG in content:
-                chunk = self.get_table_chuck(content, title, id, idx)
-                chunk.ref = kwargs.get("ref", "")
-            else:
-                chunk = Chunk(
-                    id=Chunk.generate_hash_id(f"{id}#{idx}"),
-                    name=f"{title}#{idx}",
-                    content=content,
-                    ref=kwargs.get("ref", ""),
-                )
-            chunks.append(chunk)
-        return chunks
+            if all_tables:
+                current_output.metadata = {"tables": all_tables}
+                current_output.table = "\n".join(
+                    table_contents
+                )  # Save all tables in markdown format

-    def get_table_chuck(self, table_chunk_str: str, title: str, id: str, idx: int) -> Chunk:
-        """
-        convert table chunk
-        :param table_chunk_str:
-        :return:
-        """
-        table_chunk_str = table_chunk_str.replace("\\N", "")
-        pattern = f"{self.TABLE_CHUCK_FLAG}(.*){self.TABLE_CHUCK_FLAG}"
-        matches = re.findall(pattern, table_chunk_str, re.DOTALL)
-        if not matches or len(matches) <= 0:
-            # 找不到表格信息,按照Text Chunk处理
-            return Chunk(
-                id=Chunk.generate_hash_id(f"{id}#{idx}"),
-                name=f"{title}#{idx}",
-                content=table_chunk_str,
-            )
-        table_markdown_str = matches[0]
-        html_table_str = markdown.markdown(table_markdown_str, extensions=["markdown.extensions.tables"])
-        try:
-            df = pd.read_html(html_table_str)[0]
-        except Exception as e:
-            logging.warning(f"get_table_chuck error: {e}")
-            df = pd.DataFrame()
-
-        # 确认是表格Chunk,去除内容中的TABLE_CHUCK_FLAG
-        replaced_table_text = re.sub(pattern, f'\n{table_markdown_str}\n', table_chunk_str, flags=re.DOTALL)
-        return Chunk(
-            id=Chunk.generate_hash_id(f"{id}#{idx}"),
-            name=f"{title}#{idx}",
-            content=replaced_table_text,
-            type=ChunkTypeEnum.Table,
-            csv_data=df.to_csv(index=False),
-        )
+                outputs.append(current_output)
+
+        return outputs

-    def invoke(self, input: Input, **kwargs) -> List[Output]:
+    def _invoke(self, input: Input, **kwargs) -> List[Output]:
         """
         Processes a Markdown file and returns its content as structured chunks.
@@ -411,4 +416,55 @@ def invoke(self, input: Input, **kwargs) -> List[Output]:
         basename, _ = os.path.splitext(os.path.basename(file_path))
 
         chunks = self.solve_content(input, basename, content)
+        length_500_list = []
+        length_1000_list = []
+        length_5000_list = []
+        length_small_list = []
+        for chunk in chunks:
+            if chunk.content is not None:
+                if len(chunk.content) > 5000:
+                    length_5000_list.append(chunk)
+                elif len(chunk.content) > 1000:
+                    length_1000_list.append(chunk)
+                elif len(chunk.content) > 500:
+                    length_500_list.append(chunk)
+                elif len(chunk.content) <= 500:
+                    length_small_list.append(chunk)
         return chunks
+
+
+@ReaderABC.register("yuque")
+@ReaderABC.register("yuque_reader")
+class YuequeReader(MarkDownReader):
+    """
+    A class for parsing Yuque documents into Chunk objects.
+
+    This class inherits from MarkDownReader and provides the functionality to process Yuque documents,
+    extract their content, and convert it into a list of Chunk objects.
+    """
+
+    def _invoke(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Processes the input Yuque document and converts it into a list of Chunk objects.
+
+        Args:
+            input (Input): The input string containing the Yuque token and URL, joined by "@".
+            **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion.
+
+        Returns:
+            List[Output]: A list of Chunk objects representing the parsed content.
+
+        Raises:
+            HTTPError: If the request to the Yuque URL fails.
+        """
+        token, url = input.split("@", 1)
+        headers = {"X-Auth-Token": token}
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+        data = response.json()["data"]
+        id = data.get("id", "")
+        title = data.get("title", "")
+        content = data.get("body", "")
+
+        chunks = self.solve_content(id, title, content)
         return chunks
diff --git a/kag/builder/component/reader/mix_reader.py b/kag/builder/component/reader/mix_reader.py
new file mode 100644
index 00000000..6af7380a
--- /dev/null
+++ b/kag/builder/component/reader/mix_reader.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+import os
+from typing import List
+
+from kag.interface import ReaderABC
+from knext.common.base.runnable import Input, Output
+from kag.builder.component.reader.txt_reader import TXTReader
+from kag.builder.component.reader.pdf_reader import PDFReader
+from kag.builder.component.reader.docx_reader import DocxReader
+from kag.builder.component.reader.markdown_reader import MarkDownReader
+from kag.builder.component.reader.dict_reader import DictReader
+
+
+@ReaderABC.register("mix", as_default=True)
+@ReaderABC.register("mix_reader")
+class MixReader(ReaderABC):
+    """
+    A reader class that can handle multiple types of inputs by delegating to specific readers.
+
+    This class initializes with a mapping of file types to their respective readers.
+    It provides a method to invoke the appropriate reader based on the input type.
+    """
+
+    def __init__(
+        self,
+        txt_reader: TXTReader = None,
+        pdf_reader: PDFReader = None,
+        docx_reader: DocxReader = None,
+        md_reader: MarkDownReader = None,
+        dict_reader: DictReader = None,
+    ):
+        """
+        Initializes the MixReader with a mapping of file types to their respective readers.
+
+        Args:
+            txt_reader (TXTReader, optional): Reader for .txt files. Defaults to None.
+            pdf_reader (PDFReader, optional): Reader for .pdf files. Defaults to None.
+            docx_reader (DocxReader, optional): Reader for .docx files. Defaults to None.
+            md_reader (MarkDownReader, optional): Reader for .md files. Defaults to None.
+            dict_reader (DictReader, optional): Reader for dictionary inputs. Defaults to None.
+        """
+        super().__init__()
+        self.parse_map = {
+            "txt": txt_reader,
+            "pdf": pdf_reader,
+            "docx": docx_reader,
+            "md": md_reader,
+            "dict": dict_reader,
+        }
+
+    def _invoke(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Invokes the appropriate reader based on the input type.
+
+        Args:
+            input (Input): The input to be parsed. This can be a file path or a dictionary.
+            **kwargs: Additional keyword arguments to be passed to the reader.
+
+        Returns:
+            List[Output]: A list of parsed outputs.
+
+        Raises:
+            ValueError: If the input is empty.
+            FileNotFoundError: If the input file does not exist.
+            NotImplementedError: If the file suffix is not supported.
+            KeyError: If the reader for the given file type is not correctly configured.
+        """
+        if not input:
+            raise ValueError("Input cannot be empty")
+        if isinstance(input, dict):
+            reader_type = "dict"
+        else:
+            if not os.path.exists(input):
+                raise FileNotFoundError(f"File {input} not found.")
+
+            file_suffix = input.split(".")[-1]
+            if file_suffix not in self.parse_map:
+                raise NotImplementedError(
+                    f"File suffix {file_suffix} not supported yet."
+                )
+            reader_type = file_suffix
+
+        reader = self.parse_map[reader_type]
+        if reader is None:
+            raise KeyError(f"{reader_type} reader not correctly configured.")
+        return reader._invoke(input, **kwargs)
diff --git a/kag/builder/component/reader/pdf_reader.py b/kag/builder/component/reader/pdf_reader.py
index c60020d8..682a5a19 100644
--- a/kag/builder/component/reader/pdf_reader.py
+++ b/kag/builder/component/reader/pdf_reader.py
@@ -12,28 +12,26 @@
 import os
 import re
-from typing import List, Sequence, Type, Union
+from typing import List, Sequence, Union
+
+import pdfminer.layout  # noqa
 
-from langchain_community.document_loaders import PyPDFLoader
-import pdfminer.layout
 from kag.builder.model.chunk import Chunk
-from kag.interface.builder import SourceReaderABC
-from knext.common.base.runnable import Input, Output
-from kag.builder.prompt.outline_prompt import OutlinePrompt
+from kag.interface import ReaderABC
+from kag.builder.prompt.outline_prompt import OutlinePrompt
+from kag.interface import LLMClient
+from kag.common.conf import KAG_PROJECT_CONF
+from kag.common.utils import generate_hash_id
+from knext.common.base.runnable import Output
 from pdfminer.high_level import extract_text
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTPage
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
-from pdfminer.layout import LAParams,LTTextBox
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
-import pdfminer
+import pdfminer  # noqa
+import PyPDF2
 
 import logging
@@ -41,34 +39,207 @@
 logger = logging.getLogger(__name__)
 
 
-class PDFReader(SourceReaderABC):
+@ReaderABC.register("pdf")
+@ReaderABC.register("pdf_reader")
+class PDFReader(ReaderABC):
     """
-    A PDF reader class that inherits from SourceReader.
+    A class for reading PDF files into a list of text chunks, inheriting from `ReaderABC`.
 
-    Attributes:
-        if_split (bool): Whether to split the content by pages. Default is False.
-        use_pypdf (bool): Whether to use PyPDF2 for processing PDF files. Default is True.
+    This class is responsible for parsing PDF files and converting them into a list of Chunk objects.
+    It inherits from `ReaderABC` and overrides the necessary methods to handle PDF-specific operations.
     """
 
-    def __init__(self, **kwargs):
+    def __init__(
+        self,
+        cut_depth: int = 3,
+        outline_flag: bool = True,
+        is_ocr: bool = False,
+        llm: LLMClient = None,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
-        self.split_level = kwargs.get("split_level", 3)
-        self.split_using_outline = kwargs.get("split_using_outline", True)
-        self.outline_flag = True
-        self.llm = self._init_llm()
-        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
+        self.cut_depth = cut_depth
+        self.outline_flag = outline_flag
+        self.is_ocr = is_ocr
+        self.llm = llm
+        language = KAG_PROJECT_CONF.language
         self.prompt = OutlinePrompt(language)
 
-    @property
-    def input_types(self) -> Type[Input]:
+    @property
+    def input_types(self):
         return str
 
     @property
-    def output_types(self) -> Type[Output]:
+    def output_types(self):
         return Chunk
-
-    def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk]:
+
+    def _get_full_outlines(self):
+        outlines = self.pdf_reader.outline
+        level_outlines = []
+
+        def _extract_outline_page_numbers(outlines, level=0):
+            for outline in outlines:
+                if isinstance(outline, list):
+                    _extract_outline_page_numbers(outline, level + 1)
+                else:
+                    title = outline.title
+                    page_number = self.pdf_reader.get_destination_page_number(outline)
+                    level_outlines.append((title, level, page_number, 0))
+
+        _extract_outline_page_numbers(outlines)
+        for idx, outline in enumerate(level_outlines):
+            level_outlines[idx] = (
+                outline[0],
+                outline[1],
+                outline[2],
+                level_outlines[idx + 1][2] if idx + 1 < len(level_outlines) else -1,
+            )
+        return level_outlines
+
+    def extract_content_from_outline(
+        self, page_contents, level_outlines
+    ) -> List[Chunk]:
+        total_content = "".join(page_contents)
+
+        def get_content_start(outline, page_contents):
+            page_start = outline[2]
+            page_end = outline[3]
+
+            previous_pages_length = sum(
+                len(content) for content in page_contents[:page_start]
+            )
+
+            find_content = "".join(
+                page_contents[page_start : page_end + 1 if page_end != -1 else None]
+            )
+
+            # Normalize special characters in the title
+            def normalize_text(text):
+                # Convert the dash "—" to the Chinese numeral "一"
+                text = text.replace("—", "一")
+                # Map full-width punctuation to ASCII; more conversions can be added here
+                text = re.sub(r"［", "[", text)
+                text = re.sub(r"］", "]", text)
+                text = re.sub(r"（", "(", text)
+                text = re.sub(r"）", ")", text)
+                return text
+
+            outline = (normalize_text(outline[0]), outline[1], outline[2], outline[3])
+
+            def fuzzy_search(pattern, text, threshold=0.90):
+                from difflib import SequenceMatcher
+
+                pattern_len = len(pattern)
+                for i in range(len(text) - pattern_len + 1):
+                    substring = text[i : i + pattern_len]
+                    similarity = SequenceMatcher(None, pattern, substring).ratio()
+                    if similarity >= threshold:
+                        return i
+                return -1
+
+            # First, try a fuzzy match with the original title
+            title_with_spaces = outline[0].strip()
+            fuzzy_match_pos = fuzzy_search(title_with_spaces, find_content)
+            if fuzzy_match_pos != -1:
+                return previous_pages_length + fuzzy_match_pos
+
+            # If not found, try the title with all spaces removed
+            title_no_spaces = title_with_spaces.replace(" ", "")
+            find_content_no_spaces = find_content.replace(" ", "")
+            fuzzy_match_pos = fuzzy_search(title_no_spaces, find_content_no_spaces)
+
+            if fuzzy_match_pos != -1:
+                # Map the match back to its actual position in the original text
+                original_pos = 0
+                no_spaces_pos = 0
+                while no_spaces_pos < fuzzy_match_pos:
+                    if find_content[original_pos] != " ":
+                        no_spaces_pos += 1
+                    original_pos += 1
+                return previous_pages_length + original_pos
+
+            # Fuzzy match over an extended page range
+            extended_content = "".join(
+                page_contents[
+                    max(0, page_start - 1) : page_end if page_end != -1 else None
+                ]
+            )
+
+            fuzzy_match_pos = fuzzy_search(title_with_spaces, extended_content)
+            if fuzzy_match_pos != -1:
+                extended_previous_length = sum(
+                    len(content) for content in page_contents[: max(0, page_start - 1)]
+                )
+                return extended_previous_length + fuzzy_match_pos
+
+            # Finally, try the extended content with spaces removed
+            extended_content_no_spaces = extended_content.replace(" ", "")
+            fuzzy_match_pos = fuzzy_search(title_no_spaces, extended_content_no_spaces)
+            if fuzzy_match_pos != -1:
+                original_pos = 0
+                no_spaces_pos = 0
+                while no_spaces_pos < fuzzy_match_pos:
+                    if extended_content[original_pos] != " ":
+                        no_spaces_pos += 1
+                    original_pos += 1
+
+                extended_previous_length = sum(
+                    len(content) for content in page_contents[: max(0, page_start - 1)]
+                )
+                return extended_previous_length + original_pos
+
+            return -1
+
+        final_content = []
+        for idx, outline in enumerate(level_outlines):
+            start = get_content_start(outline, page_contents)
+            next_start = (
+                get_content_start(level_outlines[idx + 1], page_contents)
+                if idx + 1 < len(level_outlines)
+                else -1
+            )
+            if start >= 0 and next_start >= 0:
+                content = total_content[start:next_start]
+                final_content.append(
+                    (outline[0], outline[1], start, next_start, content)
+                )
+            elif start >= 0 and next_start < 0 and idx + 1 == len(level_outlines):
+                content = total_content[start:]
+                final_content.append((outline[0], outline[1], start, -1, content))
+        return final_content
+
+    def convert_final_content_to_chunks(self, final_content):
+        def create_chunk(title, content, basename):
+            return Chunk(
+                id=generate_hash_id(f"{basename}#{title}"),
+                name=f"{basename}#{title}",
+                content=content,
+                sub_chunks=[],
+            )
+
+        level_map = {}
+        chunks = []
+
+        for title, level, start, end, content in final_content:
+            chunk = create_chunk(
+                title, content, os.path.splitext(os.path.basename(self.fd.name))[0]
+            )
+            chunks.append(chunk)
+
+            if level == 0:
+                level_map[0] = chunk
+            else:
+                parent_level = level - 1
+                while parent_level >= 0:
+                    if parent_level in level_map:
+                        level_map[parent_level].sub_chunks.append(chunk)
+                        break
+                    parent_level -= 1
+                level_map[level] = chunk
+
+        return chunks
+
+    def outline_chunk(self, chunk: Union[Chunk, List[Chunk]], basename) -> List[Chunk]:
         if isinstance(chunk, Chunk):
             chunk = [chunk]
         outlines = []
@@ -76,26 +247,30 @@ def outline_chunk(self, chunk: Union[Chunk, List[Chunk]],basename) -> List[Chunk
             outline = self.llm.invoke({"input": c.content}, self.prompt)
             outlines.extend(outline)
         content = "\n".join([c.content for c in chunk])
-        chunks = self.sep_by_outline(content, outlines,basename)
+        chunks = self.sep_by_outline(content, outlines, basename)
         return chunks
-
-    def sep_by_outline(self,content,outlines,basename):
+
+    def sep_by_outline(self, content, outlines, basename):
         position_check = []
         for outline in outlines:
             start = content.find(outline)
-            position_check.append((outline,start))
+            position_check.append((outline, start))
         chunks = []
-        for idx,pc in enumerate(position_check):
+        for idx, pc in enumerate(position_check):
             chunk = Chunk(
-                id = Chunk.generate_hash_id(f"{basename}#{pc[0]}"),
+                id=generate_hash_id(f"{basename}#{pc[0]}"),
                 name=f"{basename}#{pc[0]}",
-                content=content[pc[1]:position_check[idx+1][1] if idx+1 < len(position_check) else len(position_check)],
+                content=content[
+                    pc[1] : (
+                        position_check[idx + 1][1]
+                        if idx + 1 < len(position_check)
+                        else len(position_check)
+                    )
+                ],
             )
             chunks.append(chunk)
         return chunks
-
-
     @staticmethod
     def _process_single_page(
         page: str,
@@ -149,7 +324,7 @@ def _extract_text_from_page(page_layout: LTPage) -> str:
                 text += element.get_text()
         return text
 
-    def invoke(self, input: str, **kwargs) -> Sequence[Output]:
+    def _invoke(self, input: str, **kwargs) -> Sequence[Output]:
         """
         Processes a PDF file, splitting or extracting content based on configuration.
@@ -170,85 +345,140 @@ def invoke(self, input: str, **kwargs) -> Sequence[Output]:
 
         if not os.path.isfile(input):
             raise FileNotFoundError(f"The file {input} does not exist.")
-
-        self.fd = open(input, "rb")
-        self.parser = PDFParser(self.fd)
-        self.document = PDFDocument(self.parser)
-        chunks = []
-        basename, _ = os.path.splitext(os.path.basename(input))
-
-
-        # get outline
+        self.fd = None
         try:
-            outlines = self.document.get_outlines()
-        except Exception as e:
-            logger.warning(f"loading PDF file: {e}")
-            self.outline_flag = False
-
-
-        if not self.outline_flag:
-
-            with open(input, "rb") as file:
-                for idx, page_layout in enumerate(extract_pages(file)):
-                    content = ""
-                    for element in page_layout:
-                        if hasattr(element, "get_text"):
-                            content = content + element.get_text()
+            self.fd = open(input, "rb")
+            self.pdf_reader = PyPDF2.PdfReader(self.fd)
+            self.level_outlines = self._get_full_outlines()
+            self.parser = PDFParser(self.fd)
+            self.document = PDFDocument(self.parser)
+            chunks = []
+            basename, _ = os.path.splitext(os.path.basename(input))
+
+            # get outline
+            try:
+                outlines = self.document.get_outlines()
+            except Exception as e:
+                logger.warning(f"loading PDF file: {e}")
+                self.outline_flag = False
+
+            if not self.outline_flag:
+
+                with open(input, "rb") as file:
+                    for idx, page_layout in enumerate(extract_pages(file)):
+                        content = ""
+                        for element in page_layout:
+                            if hasattr(element, "get_text"):
+                                content = content + element.get_text()
+                        chunk = Chunk(
+                            id=generate_hash_id(f"{basename}#{idx}"),
+                            name=f"{basename}#{idx}",
+                            content=content,
+                        )
+                        chunks.append(chunk)
+                # try:
+                #     outline_chunks = self.outline_chunk(chunks, basename)
+                # except Exception as e:
+                #     raise RuntimeError(f"Error loading PDF file: {e}")
+                # if len(outline_chunks) > 0:
+                #     chunks = outline_chunks
+
+            elif True:
+                split_words = []
+
+                page_contents = []
+
+                with open(input, "rb") as file:
+                    for idx, page_layout in enumerate(extract_pages(file)):
+                        content = ""
+                        for element in page_layout:
+                            if hasattr(element, "get_text"):
+                                content = content + element.get_text()
+                        content = content.replace("\n", "")
+                        page_contents.append(content)
+
+                # Strip all whitespace characters (spaces, tabs, newlines, etc.) with regular expressions
+                page_contents = [
+                    re.sub(r"\s+", "", content) for content in page_contents
+                ]
+                page_contents = [
+                    re.sub(r"[\s\u200b\u200c\u200d\ufeff]+", "", content)
+                    for content in page_contents
+                ]
+                page_contents = ["".join(content.split()) for content in page_contents]
+
+                final_content = self.extract_content_from_outline(
+                    page_contents, self.level_outlines
+                )
+                chunks = self.convert_final_content_to_chunks(final_content)
+
+            else:
+                for item in outlines:
+                    level, title, dest, a, se = item
+                    split_words.append(title.strip().replace(" ", ""))
+                # save the outline position in content
+                try:
+                    text = extract_text(input)
+
+                except Exception as e:
+                    raise RuntimeError(f"Error loading PDF file: {e}")
+
+                cleaned_pages = [
+                    self._process_single_page(x, "", False, False) for x in text
+                ]
+                sentences = []
+                for cleaned_page in cleaned_pages:
+                    sentences += cleaned_page
+
+                content = "".join(sentences)
+                positions = [(input, 0)]
+                for split_word in split_words:
+                    pattern = re.compile(split_word)
+                    start = 0
+                    for i, match in enumerate(re.finditer(pattern, content)):
+                        if i <= 1:
+                            start, end = match.span()
+                            if start > 0:
+                                positions.append((split_word, start))
+
+                for idx, position in enumerate(positions):
                     chunk = Chunk(
-                        id=Chunk.generate_hash_id(f"{basename}#{idx}"),
-                        name=f"{basename}#{idx}",
-                        content=content,
+                        id=generate_hash_id(f"{basename}#{position[0]}"),
+                        name=f"{basename}#{position[0]}",
+                        content=content[
+                            position[1] : (
+                                positions[idx + 1][1]
+                                if idx + 1 < len(positions)
+                                else None
+                            )
+                        ],
                    )
                     chunks.append(chunk)
-            try:
-                outline_chunks = self.outline_chunk(chunks, basename)
-            except Exception as e:
-                raise RuntimeError(f"Error loading PDF file: {e}")
-            if len(outline_chunks) > 0:
-                chunks = outline_chunks
-
-        else:
-            split_words = []
-
-            for item in outlines:
-                level, title, dest, a, se = item
-                split_words.append(title.strip().replace(" ",""))
-            # save the outline position in content
-            try:
-                text = extract_text(input)
-            except Exception as e:
-                raise RuntimeError(f"Error loading PDF file: {e}")
-
-            cleaned_pages = [
-                self._process_single_page(x, "", False, False) for x in text
-            ]
-            sentences = []
-            for cleaned_page in cleaned_pages:
-                sentences += cleaned_page
-
-            content = "".join(sentences)
-            positions = [(input,0)]
-            for split_word in split_words:
-                pattern = re.compile(split_word)
-                for i,match in enumerate(re.finditer(pattern, content)):
-                    if i == 1:
-                        start, end = match.span()
-                        positions.append((split_word,start))
-
-            for idx,position in enumerate(positions):
-                chunk = Chunk(
-                    id = Chunk.generate_hash_id(f"{basename}#{position[0]}"),
-                    name=f"{basename}#{position[0]}",
-                    content=content[position[1]:positions[idx+1][1] if idx+1 < len(positions) else None],
-                )
-                chunks.append(chunk)
+            # # Save intermediate results to a file (debug aid)
+            # import pickle
 
-            return chunks
+            # with open("debug_data.pkl", "wb") as f:
+            #     pickle.dump(
+            #         {"page_contents": page_contents, "level_outlines": self.level_outlines},
+            #         f,
+            #     )
+            return chunks
 
-if __name__ == '__main__':
-    reader = PDFReader(split_using_outline=True)
-    pdf_path = os.path.join(os.path.dirname(__file__),"../../../../tests/builder/data/aiwen.pdf")
-    chunk = reader.invoke(pdf_path)
-    print(chunk)
\ No newline at end of file
+        except Exception as e:
+            raise RuntimeError(f"Error loading PDF file: {e}")
+        finally:
+            if self.fd:
+                self.fd.close()
+
+
+if __name__ == "__main__":
+    pdf_reader = PDFReader()
+    pdf_path = os.path.join(
+        os.path.dirname(__file__), "../../../../tests/builder/data/aiwen.pdf"
+    )
+    chunks = pdf_reader.invoke(pdf_path)
+    print(chunks)
diff --git a/kag/builder/component/reader/txt_reader.py b/kag/builder/component/reader/txt_reader.py
index 6f9d7a08..dfc99000 100644
--- a/kag/builder/component/reader/txt_reader.py
+++ b/kag/builder/component/reader/txt_reader.py
@@ -11,29 +11,27 @@
 # or implied.
 
 import os
-from typing import List, Type
+from typing import List
 
 from kag.builder.model.chunk import Chunk
-from kag.interface.builder import SourceReaderABC
+from kag.interface import ReaderABC
+from kag.common.utils import generate_hash_id
 from knext.common.base.runnable import Input, Output
 
 
-class TXTReader(SourceReaderABC):
+@ReaderABC.register("txt")
+@ReaderABC.register("txt_reader")
+class TXTReader(ReaderABC):
     """
-    A PDF reader class that inherits from SourceReader.
-    """
-
-    @property
-    def input_types(self) -> Type[Input]:
-        return str
+    A class for parsing text files or text content into Chunk objects.
- @property - def output_types(self) -> Type[Output]: - return Chunk + This class inherits from ReaderABC and provides the functionality to read text content, + whether it is from a file or directly provided as a string, and convert it into a list of Chunk objects. + """ - def invoke(self, input: Input, **kwargs) -> List[Output]: + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ - The main method for processing text reading. This method reads the content of the input (which can be a file path or text content) and converts it into a Chunk object. + The main method for processing text reading. This method reads the content of the input (which can be a file path or text content) and converts it into chunks. Args: input (Input): The input string, which can be the path to a text file or direct text content. @@ -51,7 +49,7 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: try: if os.path.exists(input): - with open(input, "r", encoding='utf-8') as f: + with open(input, "r", encoding="utf-8") as f: content = f.read() else: content = input @@ -60,7 +58,7 @@ def invoke(self, input: Input, **kwargs) -> List[Output]: basename, _ = os.path.splitext(os.path.basename(input)) chunk = Chunk( - id=Chunk.generate_hash_id(input), + id=generate_hash_id(input), name=basename, content=content, ) diff --git a/kag/builder/component/reader/yuque_reader.py b/kag/builder/component/reader/yuque_reader.py deleted file mode 100644 index e585c097..00000000 --- a/kag/builder/component/reader/yuque_reader.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
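As a quick orientation for reviewers, here is a minimal usage sketch of the refactored TXTReader above. The file path is a placeholder, and it assumes `ReaderABC.invoke` dispatches to `_invoke`, as the PDF reader's `__main__` block suggests:

```python
from kag.builder.component.reader.txt_reader import TXTReader

reader = TXTReader()
# Either a path to a text file or raw text content is accepted.
chunks = reader.invoke("docs/example.txt")  # placeholder path
for chunk in chunks:
    print(chunk.name, len(chunk.content))
```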
- -import requests -from typing import Type, List - -from kag.builder.component.reader import MarkDownReader -from kag.builder.model.chunk import Chunk -from kag.interface.builder import SourceReaderABC -from knext.common.base.runnable import Input, Output - -from kag.common.llm.client import LLMClient - - -class YuqueReader(SourceReaderABC): - def __init__(self, token: str, **kwargs): - super().__init__(**kwargs) - self.token = token - self.markdown_reader = MarkDownReader(**kwargs) - - @property - def input_types(self) -> Type[Input]: - """The type of input this Runnable object accepts specified as a type annotation.""" - return str - - @property - def output_types(self) -> Type[Output]: - """The type of output this Runnable object produces specified as a type annotation.""" - return Chunk - - @staticmethod - def get_yuque_api_data(token, url): - headers = {"X-Auth-Token": token} - - try: - response = requests.get(url, headers=headers) - response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx) - return response.json()["data"] # Assuming the API returns JSON data - except requests.exceptions.HTTPError as http_err: - print(f"HTTP error occurred: {http_err}") - except requests.exceptions.RequestException as err: - print(f"Error occurred: {err}") - except Exception as err: - print(f"An error occurred: {err}") - - def invoke(self, input: str, **kwargs) -> List[Output]: - if not input: - raise ValueError("Input cannot be empty") - - url: str = input - data = self.get_yuque_api_data(self.token, url) - id = data.get("id", "") - title = data.get("title", "") - content = data.get("body", "") - - chunks = self.markdown_reader.solve_content(id, title, content) - - return chunks \ No newline at end of file diff --git a/kag/solver/logic/core_modules/op_executor/op_math/__init__.py b/kag/builder/component/scanner/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_math/__init__.py rename to kag/builder/component/scanner/__init__.py diff --git a/kag/builder/component/scanner/csv_scanner.py b/kag/builder/component/scanner/csv_scanner.py new file mode 100644 index 00000000..687395b8 --- /dev/null +++ b/kag/builder/component/scanner/csv_scanner.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
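The standalone reader deleted above is superseded by the `YuequeReader` registered in `markdown_reader.py`, which now expects the token and document URL joined by `@`. A minimal sketch of the new calling convention (token and URL are placeholders):

```python
from kag.builder.component.reader.markdown_reader import YuequeReader

reader = YuequeReader()
# Input format is "<token>@<doc_api_url>", as emitted by the Yuque scanner added later in this PR.
chunks = reader.invoke(
    "my-token@https://www.yuque.com/api/v2/repos/group/repo/docs/slug"
)
```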
+from typing import Dict, List + +import pandas as pd +from kag.interface import ScannerABC +from kag.common.utils import generate_hash_id +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("csv") +@ScannerABC.register("csv_scanner") +class CSVScanner(ScannerABC): + def __init__( + self, + header: bool = True, + col_names: List[str] = None, + col_ids: List[int] = None, + rank: int = 0, + world_size: int = 1, + ): + super().__init__(rank=rank, world_size=world_size) + self.header = header + self.col_names = col_names + self.col_ids = col_ids + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return Dict + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data from a CSV file and converts it into a list of dictionaries. + + Args: + input (Input): The input file path to the CSV file. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of dictionaries containing the processed data. + """ + input = self.download_data(input) + if self.header: + data = pd.read_csv(input, dtype=str) + else: + data = pd.read_csv(input, dtype=str, header=None) + col_keys = self.col_names if self.col_names else self.col_ids + if col_keys is None: + return data.to_dict(orient="records") + + contents = [] + for _, row in data.iterrows(): + for k, v in row.items(): + if k in col_keys: + v = str(v) + name = v[:5] + "..." + v[-5:] + contents.append( + {"id": generate_hash_id(v), "name": name, "content": v} + ) + + return contents diff --git a/kag/builder/component/scanner/dataset_scanner.py b/kag/builder/component/scanner/dataset_scanner.py new file mode 100644 index 00000000..7313ebf8 --- /dev/null +++ b/kag/builder/component/scanner/dataset_scanner.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json +import os +from typing import List, Type, Dict + + +from kag.interface import ScannerABC +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("hotpotqa") +@ScannerABC.register("hotpotqa_dataset_scanner") +class HotpotqaCorpusScanner(ScannerABC): + """ + A class for reading HotpotQA dataset and converting it into a list of dictionaries, inheriting from `ScannerABC`. + + This class is responsible for reading HotpotQA corpus and converting it into a list of dictionaries. + It inherits from `ScannerABC` and overrides the necessary methods to handle HotpotQA-specific operations. + """ + + @property + def input_types(self) -> Type[Input]: + return str + + @property + def output_types(self) -> Type[Output]: + return Dict + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data from a HotpotQA corpus file or JSON string and returns it as a list of dictionaries. + + This method reads HotpotQA corpus data from a file or parses a JSON string and returns it as a list of dictionaries. + If the input is a file path, it reads the file; if the input is a JSON string, it parses the string. 
+ + Args: + input (Input): The HotpotQA corpus file path or JSON string to load. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of dictionaries, where each dictionary represents a HotpotQA item. + """ + if os.path.exists(str(input)): + with open(input, "r") as f: + corpus = json.load(f) + else: + corpus = json.loads(input) + + data = [] + for item_key, item_value in corpus.items(): + data.append( + {"id": item_key, "name": item_key, "content": "\n".join(item_value)} + ) + return data + + +@ScannerABC.register("musique") +@ScannerABC.register("2wiki") +@ScannerABC.register("musique_dataset_scanner") +@ScannerABC.register("2wiki_dataset_scanner") +class MusiqueCorpusScanner(ScannerABC): + """ + A class for reading Musique/2Wiki dataset and converting it into a list of dictionaries, inheriting from `ScannerABC`. + + This class is responsible for reading Musique/2Wiki corpus and converting it into a list of dictionaries. + It inherits from `ScannerABC` and overrides the necessary methods to handle Musique/2Wiki-specific operations. + """ + + @property + def input_types(self) -> Type[Input]: + """The type of input this Runnable object accepts specified as a type annotation.""" + return str + + @property + def output_types(self) -> Type[Output]: + """The type of output this Runnable object produces specified as a type annotation.""" + return Dict + + def get_basename(self, file_name: str): + base, _ = os.path.splitext(os.path.basename(file_name)) + return base + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data from a Musique/2Wiki corpus file or JSON string and returns it as a list of dictionaries. + + This method reads Musique/2Wiki corpus data from a file or parses a JSON string and returns it as a list of dictionaries. + If the input is a file path, it reads the file; if the input is a JSON string, it parses the string. + + Args: + input (Input): The Musique/2Wiki corpus file path or JSON string to load. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of dictionaries, where each dictionary represents a Musique/2Wiki item. + """ + + if os.path.exists(input): + with open(input, "r") as f: + corpus = json.load(f) + else: + corpus = json.loads(input) + + data = [] + + for idx, item in enumerate(corpus): + title = item["title"] + content = item["text"] + data.append( + { + "id": f"{title}#{idx}", + "name": title, + "content": content, + } + ) + return data diff --git a/kag/builder/component/scanner/directory_scanner.py b/kag/builder/component/scanner/directory_scanner.py new file mode 100644 index 00000000..8a6cf2ca --- /dev/null +++ b/kag/builder/component/scanner/directory_scanner.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
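To make the expected corpus shape concrete, a small sketch of feeding an in-memory musique/2wiki-style corpus to the scanner above; the sample record is invented for illustration, and the default `ScannerABC` constructor is assumed:

```python
import json
from kag.builder.component.scanner.dataset_scanner import MusiqueCorpusScanner

corpus = [{"title": "Sample Title", "text": "Sample paragraph text."}]
scanner = MusiqueCorpusScanner()
items = scanner.load_data(json.dumps(corpus))
# Each item becomes {"id": "Sample Title#0", "name": "Sample Title", "content": ...}
```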
+import os +import re +from typing import List + +from kag.interface import ScannerABC + +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("dir") +@ScannerABC.register("dir_file_scanner") +class DirectoryScanner(ScannerABC): + """ + A class for reading files from a directory based on a specified file pattern or suffix, inheriting from `ScannerABC`. + It can be used in conjunction with the parsers such as PDF/MarkDown parser to convert files into Chunks. + + This class is responsible for reading files from a directory and returning a list of file paths that match the specified file pattern/suffix. + It inherits from `ScannerABC` and overrides the necessary methods to handle directory-specific operations. + + """ + + def __init__( + self, + file_pattern: str = None, + file_suffix: str = None, + rank: int = 0, + world_size: int = 1, + ): + """ + Initializes the DirectoryScanner with the specified file pattern, file suffix, rank, and world size. + + Args: + file_pattern (str, optional): The regex pattern to match file names. Defaults to None. + file_suffix (str, optional): The file suffix to match if `file_pattern` is not provided. Defaults to None. + rank (int, optional): The rank of the current worker. Defaults to 0. + world_size (int, optional): The total number of workers. Defaults to 1. + """ + super().__init__(rank=rank, world_size=world_size) + if file_pattern is None: + if file_suffix: + file_pattern = f".*{file_suffix}$" + else: + file_pattern = r".*txt$" + self.file_pattern = re.compile(file_pattern) + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return str + + def find_files_by_regex(self, directory): + """ + Finds files in the specified directory that match the file pattern. + + Args: + directory (str): The directory to search for files. + + Returns: + List[str]: A list of file paths that match the file pattern. + """ + matched_files = [] + for root, dirs, files in os.walk(directory): + for file in files: + if self.file_pattern.match(file): + file_path = os.path.join(root, file) + matched_files.append(file_path) + return matched_files + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data by finding files in the specified directory that match the file pattern. + + This method searches the directory specified by the input and returns a list of file paths that match the file pattern. + + Args: + input (Input): The directory to search for files. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list of file paths that match the file pattern. + """ + return self.find_files_by_regex(input) diff --git a/kag/builder/component/scanner/file_scanner.py b/kag/builder/component/scanner/file_scanner.py new file mode 100644 index 00000000..653fa6c4 --- /dev/null +++ b/kag/builder/component/scanner/file_scanner.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
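A brief usage sketch for the DirectoryScanner above; the directory path is a placeholder:

```python
from kag.builder.component.scanner.directory_scanner import DirectoryScanner

# file_suffix="md" is expanded to the regex ".*md$" internally.
scanner = DirectoryScanner(file_suffix="md")
for path in scanner.load_data("./docs"):  # placeholder directory
    print(path)
```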
+import os +from typing import List + +from kag.interface import ScannerABC +from kag.common.conf import KAG_PROJECT_CONF +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("file") +@ScannerABC.register("file_scanner") +class FileScanner(ScannerABC): + """ + A class for reading single file and returning the path, inheriting from `ScannerABC`. + + This class is responsible for reading SINGLE file and returning the path as a list of strings. + It inherits from `ScannerABC` and overrides the necessary methods to handle file-specific operations. + """ + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return str + + def load_data(self, input: Input, **kwargs) -> List[Output]: + """ + Loads data by returning the input file path as a list of strings. + + This method takes the input file path and returns it as a list containing the file path. + + Args: + input (Input): The file path to load. + **kwargs: Additional keyword arguments. + + Returns: + List[Output]: A list containing the input file path. + """ + if input.startswith("http://") or input.startswith("https://"): + from kag.common.utils import download_from_http + + local_file_path = os.path.join(KAG_PROJECT_CONF.ckpt_dir, "file_scanner") + if not os.path.exists(local_file_path): + os.makedirs(local_file_path) + local_file = os.path.join(local_file_path, os.path.basename(input)) + local_file = download_from_http(input, local_file) + return [local_file] + return [input] diff --git a/kag/builder/component/scanner/json_scanner.py b/kag/builder/component/scanner/json_scanner.py new file mode 100644 index 00000000..53af7696 --- /dev/null +++ b/kag/builder/component/scanner/json_scanner.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json +import os +from typing import Union, Dict, List + +from kag.interface import ScannerABC +from knext.common.base.runnable import Input, Output + + +@ScannerABC.register("json") +@ScannerABC.register("json_scanner") +class JSONScanner(ScannerABC): + """ + A class for reading JSON files or parsing JSON-formatted strings into a list of dictionaries, inheriting from `ScannerABC`. + + This class is responsible for reading JSON files or parsing JSON-formatted strings and converting them into a list of dictionaries. + It inherits from `ScannerABC` and overrides the necessary methods to handle JSON-specific operations. + + Note: The JSON data must be a list of dictionaries. + """ + + @property + def input_types(self) -> Input: + return str + + @property + def output_types(self) -> Output: + return Dict + + @staticmethod + def _read_from_file(file_path: str) -> Union[dict, list]: + """ + Reads JSON data from a file and returns it as a list of dictionaries. + + Args: + file_path (str): The path to the JSON file. + + Returns: + List[Dict]: The JSON data loaded from the file. + + Raises: + ValueError: If there is an error reading the JSON from the file or if the file is not found. 
+        """
+        try:
+            with open(file_path, "r") as file:
+                return json.load(file)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error reading JSON from file: {e}")
+        except FileNotFoundError as e:
+            raise ValueError(f"File not found: {e}")
+
+    @staticmethod
+    def _parse_json_string(json_string: str) -> Union[dict, list]:
+        """
+        Parses a JSON string and returns it as a list of dictionaries.
+
+        Args:
+            json_string (str): The JSON string to parse.
+
+        Returns:
+            List[Dict]: The parsed JSON data.
+
+        Raises:
+            ValueError: If there is an error parsing the JSON string.
+        """
+        try:
+            return json.loads(json_string)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error parsing JSON string: {e}")
+
+    def load_data(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Loads data from a JSON file or JSON string and returns it as a list of dictionaries.
+
+        This method reads JSON data from a file or parses a JSON string and returns it as a list of dictionaries.
+        If the input is a file path, it reads the file; if the input is a JSON string, it parses the string.
+
+        Args:
+            input (Input): The JSON file path or JSON string to load.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            List[Output]: A list of dictionaries, where each dictionary represents a JSON object.
+
+        Raises:
+            ValueError: If there is an error reading the JSON data or if the input is not a valid JSON array or object.
+        """
+        input = self.download_data(input)
+        try:
+            if os.path.exists(input):
+                corpus = self._read_from_file(input)
+            else:
+                corpus = self._parse_json_string(input)
+        except ValueError as e:
+            raise e
+
+        if not isinstance(corpus, (list, dict)):
+            raise ValueError("Expected input to be a JSON array or object")
+
+        if isinstance(corpus, dict):
+            corpus = [corpus]
+        return corpus
diff --git a/kag/builder/component/scanner/yuque_scanner.py b/kag/builder/component/scanner/yuque_scanner.py
new file mode 100644
index 00000000..ef4bd5c3
--- /dev/null
+++ b/kag/builder/component/scanner/yuque_scanner.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+import os
+import requests
+from typing import Type, List, Union
+
+# from kag.builder.component.reader.markdown_reader import MarkDownReader
+from kag.interface import ScannerABC
+from knext.common.base.runnable import Input, Output
+
+
+@ScannerABC.register("yuque")
+@ScannerABC.register("yuque_scanner")
+class YuqueScanner(ScannerABC):
+    """
+    A class for reading data from Yuque, a Chinese documentation platform, inheriting from `ScannerABC`.
+
+    This class is responsible for reading a Yuque knowledge base and returning the URLs of the documents it contains.
+    It can be used in conjunction with the Yuque parser to convert Yuque documents into Chunks.
+
+    It inherits from `ScannerABC` and overrides the necessary methods to handle Yuque-specific operations.
+
+    Args:
+        token (str): The authentication token for accessing the Yuque API.
+    """
+
+    def __init__(self, token: str):
+        """
+        Initializes the YuqueScanner with the specified token.
+
+        Args:
+            token (str): The authentication token for accessing the Yuque API.
+        """
+        super().__init__()
+        self.token = token
+
+    @property
+    def input_types(self) -> Type[Input]:
+        """The type of input this Runnable object accepts specified as a type annotation."""
+        return Union[str, List[str]]
+
+    @property
+    def output_types(self) -> Type[Output]:
+        """The type of output this Runnable object produces specified as a type annotation."""
+        return str
+
+    def get_yuque_api_data(self, url):
+        """
+        Fetches data from the Yuque API using the specified URL and authentication token.
+
+        Args:
+            url (str): The URL to fetch data from.
+
+        Returns:
+            dict: The JSON data returned by the Yuque API.
+
+        Raises:
+            HTTPError: If the API returns a bad response (4xx or 5xx).
+        """
+        headers = {"X-Auth-Token": self.token}
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+        return response.json()["data"]  # Assuming the API returns JSON data
+
+    def load_data(self, input: Input, **kwargs) -> List[Output]:
+        """
+        Loads data from the Yuque API and returns it as a list of document URL strings.
+
+        This method fetches data from the Yuque API using the provided URL and converts it into a list of strings.
+        If the input is a single document URL, it returns a one-element list containing the token and URL.
+        If the input is a knowledge base, it returns one string per contained document, each combining the token and that document's URL.
+
+        Args:
+            input (Input): The URL to fetch data from.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            List[Output]: A list of strings, where each string contains the token and the URL of a document.
+        """
+        url = input
+        if isinstance(url, str):
+            data = self.get_yuque_api_data(url)
+            if isinstance(data, dict):
+                # for single yuque doc
+                return [f"{self.token}@{url}"]
+            output = []
+            for item in data:
+                slug = item["slug"]
+                output.append(os.path.join(url, slug))
+            return [f"{self.token}@{url}" for url in output]
+        else:
+            return [f"{self.token}@{x}" for x in url]
diff --git a/kag/builder/component/splitter/__init__.py b/kag/builder/component/splitter/__init__.py
index c91070a8..e69de29b 100644
--- a/kag/builder/component/splitter/__init__.py
+++ b/kag/builder/component/splitter/__init__.py
@@ -1,23 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2023 OpenSPG Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.
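A hedged sketch of driving the YuqueScanner defined above; the token and knowledge-base URL are placeholders:

```python
from kag.builder.component.scanner.yuque_scanner import YuqueScanner

scanner = YuqueScanner(token="my-token")  # placeholder token
# For a knowledge-base URL, one "<token>@<doc_url>" string is returned per document;
# for a single document URL, a one-element list comes back.
doc_refs = scanner.load_data("https://www.yuque.com/api/v2/repos/group/repo/docs")
```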
- -from kag.builder.component.splitter.length_splitter import LengthSplitter -from kag.builder.component.splitter.semantic_splitter import SemanticSplitter -from kag.builder.component.splitter.pattern_splitter import PatternSplitter -from kag.builder.component.splitter.outline_splitter import OutlineSplitter - - -__all__ = [ - "LengthSplitter", - "SemanticSplitter", - "PatternSplitter", -] diff --git a/kag/builder/component/splitter/base_table_splitter.py b/kag/builder/component/splitter/base_table_splitter.py index 72a0b314..8af66c9e 100644 --- a/kag/builder/component/splitter/base_table_splitter.py +++ b/kag/builder/component/splitter/base_table_splitter.py @@ -10,28 +10,52 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from abc import ABC -from typing import Type, List, Union - from kag.builder.model.chunk import Chunk -from kag.interface.builder import SplitterABC +from kag.interface import SplitterABC class BaseTableSplitter(SplitterABC): """ - A base class for splitting table, inheriting from Splitter. + A base class for splitting table data into smaller chunks. + + This class inherits from SplitterABC and provides the functionality to split table data + represented in markdown format into smaller chunks. """ + def __init__(self): + super().__init__() + def split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n"): """ - split markdown format table into smaller markdown table + Splits a markdown format table into smaller markdown tables. + + Args: + org_chunk (Chunk): The original chunk containing the table data. + chunk_size (int): The maximum size of each smaller chunk. Defaults to 2000. + sep (str): The separator used to join the table rows. Defaults to "\n". + + Returns: + List[Chunk]: A list of smaller chunks resulting from the split operation. """ try: - return self._split_table(org_chunk=org_chunk, chunk_size=chunk_size, sep=sep) + return self._split_table( + org_chunk=org_chunk, chunk_size=chunk_size, sep=sep + ) except Exception: return None def _split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n"): + """ + Internal method to split a markdown format table into smaller markdown tables. + + Args: + org_chunk (Chunk): The original chunk containing the table data. + chunk_size (int): The maximum size of each smaller chunk. Defaults to 2000. + sep (str): The separator used to join the table rows. Defaults to "\n". + + Returns: + List[Chunk]: A list of smaller chunks resulting from the split operation. + """ output = [] content = org_chunk.content table_start = content.find("|") @@ -56,6 +80,7 @@ def _split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n" cur.append(row) cur_len += len(row) + cur.append(content[table_end:]) if len(cur) > 0: splitted.append(cur) @@ -66,7 +91,7 @@ def _split_table(self, org_chunk: Chunk, chunk_size: int = 2000, sep: str = "\n" name=f"{org_chunk.name}#{idx}", content=sep.join(sentences), type=org_chunk.type, - **org_chunk.kwargs + **org_chunk.kwargs, ) output.append(chunk) return output diff --git a/kag/builder/component/splitter/length_splitter.py b/kag/builder/component/splitter/length_splitter.py index 2e9dcfcd..e86cafbc 100644 --- a/kag/builder/component/splitter/length_splitter.py +++ b/kag/builder/component/splitter/length_splitter.py @@ -10,26 +10,41 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
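Since `Chunk` objects of type `Table` are routed through `split_table` above by the LengthSplitter changed below, here is a sketch of the expected behavior on an oversized markdown table. It assumes `Chunk` accepts these fields and that the public `invoke` dispatches to `_invoke`:

```python
from kag.builder.model.chunk import Chunk, ChunkTypeEnum
from kag.builder.component.splitter.length_splitter import LengthSplitter

rows = "\n".join(f"| row{i} | value{i} |" for i in range(500))
table_chunk = Chunk(
    id="t0",
    name="big-table",
    content="| col_a | col_b |\n| --- | --- |\n" + rows,
    type=ChunkTypeEnum.Table,
)
# Table chunks bypass the sliding window and are split into smaller markdown tables.
pieces = LengthSplitter(split_length=500).invoke(table_chunk)
print(len(pieces))
```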
-from typing import Type, List, Union - +from typing import Type, List +from kag.interface import SplitterABC from kag.builder.model.chunk import Chunk, ChunkTypeEnum +from kag.interface.builder.base import KAG_PROJECT_CONF +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output from kag.builder.component.splitter.base_table_splitter import BaseTableSplitter +@SplitterABC.register("length") +@SplitterABC.register("length_splitter") class LengthSplitter(BaseTableSplitter): """ - A class for splitting text based on length, inheriting from Splitter. + A class for splitting text based on length. + + This class inherits from BaseTableSplitter and provides the functionality to split text + into smaller chunks based on a specified length and window size. It also handles table data + by splitting it into smaller markdown tables. Attributes: - split_length (int): The maximum length of each split chunk. + split_length (int): The maximum length of each chunk. window_length (int): The length of the overlap between chunks. """ - def __init__(self, split_length: int = 500, window_length: int = 100, **kwargs): - super().__init__(**kwargs) - self.split_length = int(split_length) - self.window_length = int(window_length) + def __init__(self, split_length: int = 500, window_length: int = 100): + """ + Initializes the LengthSplitter with the specified split length and window length. + + Args: + split_length (int): The maximum length of each chunk. Defaults to 500. + window_length (int): The length of the overlap between chunks. Defaults to 100. + """ + super().__init__() + self.split_length = split_length + self.window_length = window_length @property def input_types(self) -> Type[Input]: @@ -39,37 +54,52 @@ def input_types(self) -> Type[Input]: def output_types(self) -> Type[Output]: return Chunk + def chunk_breakdown(self, chunk): + chunks = self.logic_break(chunk) + if chunks: + res_chunks = [] + for c in chunks: + res_chunks.extend(self.chunk_breakdown(c)) + else: + res_chunks = self.slide_window_chunk( + chunk, self.split_length, self.window_length + ) + return res_chunks + + def logic_break(self, chunk): + return None + def split_sentence(self, content): """ Splits the given content into sentences based on delimiters. Args: - content (str): The content to be split. + content (str): The content to be split into sentences. Returns: - list: A list of sentences. + List[str]: A list of sentences. """ - sentence_delimiters = ".。??!!" + sentence_delimiters = ".。??!!" if KAG_PROJECT_CONF.language == "en" else "。?!" output = [] start = 0 for idx, char in enumerate(content): if char in sentence_delimiters: end = idx - tmp = content[start: end + 1].strip() + tmp = content[start : end + 1].strip() if len(tmp) > 0: - output.append(tmp) + output.append(tmp.strip()) start = idx + 1 - res = content[start:] + res = content[start:].strip() if len(res) > 0: output.append(res) return output def slide_window_chunk( - self, - org_chunk: Chunk, - chunk_size: int = 2000, - window_length: int = 300, - sep: str = "\n", + self, + org_chunk: Chunk, + chunk_size: int = 2000, + window_length: int = 300, + sep: str = "\n", ) -> List[Chunk]: """ Splits the content into chunks using a sliding window approach. @@ -84,7 +114,9 @@ def slide_window_chunk( List[Chunk]: A list of Chunk objects. 
""" if org_chunk.type == ChunkTypeEnum.Table: - table_chunks = self.split_table(org_chunk=org_chunk, chunk_size=chunk_size, sep=sep) + table_chunks = self.split_table( + org_chunk=org_chunk, chunk_size=chunk_size, sep=sep + ) if table_chunks is not None: return table_chunks content = self.split_sentence(org_chunk.content) @@ -112,38 +144,36 @@ def slide_window_chunk( output = [] for idx, sentences in enumerate(splitted): chunk = Chunk( - id=f"{org_chunk.id}#{chunk_size}#{window_length}#{idx}#LEN", + id=generate_hash_id(f"{org_chunk.id}#{idx}"), name=f"{org_chunk.name}", content=sep.join(sentences), type=org_chunk.type, - **org_chunk.kwargs + chunk_size=chunk_size, + window_length=window_length, + **org_chunk.kwargs, ) output.append(chunk) return output - def invoke(self, input: Chunk, **kwargs) -> List[Output]: + def _invoke(self, input: Chunk, **kwargs) -> List[Output]: """ - Invokes the splitter on the given input chunk. + Invokes the splitting of the input chunk based on the specified length and window size. Args: - input (Chunk): The input chunk to be split. - **kwargs: Additional keyword arguments. + input (Chunk): The chunk(s) to be split. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. Returns: - List[Output]: A list of split chunks. + List[Output]: A list of Chunk objects resulting from the split operation. """ cutted = [] - if isinstance(input,list): + if isinstance(input, list): for item in input: cutted.extend( - self.slide_window_chunk( - item, self.split_length, self.window_length - ) + self.slide_window_chunk(item, self.split_length, self.window_length) ) else: cutted.extend( - self.slide_window_chunk( - input, self.split_length, self.window_length - ) + self.slide_window_chunk(input, self.split_length, self.window_length) ) return cutted diff --git a/kag/builder/component/splitter/outline_splitter.py b/kag/builder/component/splitter/outline_splitter.py index c0f6f6d7..510ad829 100644 --- a/kag/builder/component/splitter/outline_splitter.py +++ b/kag/builder/component/splitter/outline_splitter.py @@ -9,27 +9,51 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
-import logging
+import collections
+import logging
 import os
 import re
-from typing import List, Type,Union
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Type, Union, Tuple
 
-from kag.interface.builder import SplitterABC
-from kag.builder.prompt.outline_prompt import OutlinePrompt
-from kag.builder.model.chunk import Chunk
+import matplotlib.pyplot as plt
+from kag.interface.common.prompt import PromptABC
 from knext.common.base.runnable import Input, Output
-from kag.common.llm.client.llm_client import LLMClient
+from kag.common.conf import KAG_PROJECT_CONF, KAG_CONFIG
+from kag.common.utils import generate_hash_id
+from kag.builder.model.chunk import Chunk, dump_chunks
+from kag.builder.model.chunk import ChunkTypeEnum
+from kag.builder.prompt.outline_align_prompt import OutlineAlignPrompt
+from kag.interface import SplitterABC
+from kag.interface import LLMClient
 
 logger = logging.getLogger(__name__)
 
+
+@SplitterABC.register("outline")
+@SplitterABC.register("outline_splitter")
 class OutlineSplitter(SplitterABC):
-
-    def __init__(self,**kwargs):
+    def __init__(
+        self,
+        llm: LLMClient,
+        min_length: int = 100,
+        workers: int = 10,
+        chunk_size: int = 500,
+        llm_max_tokens: int = 8000,
+        align_parallel: bool = False,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
-        self.llm = self._init_llm()
-        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
-        self.prompt = OutlinePrompt(language)
-
+        self.llm = llm
+        self.prompt = PromptABC.from_config(
+            {"type": "outline", "language": KAG_PROJECT_CONF.language}
+        )
+        self.min_length = min_length
+        self.workers = workers
+        self.chunk_size = chunk_size
+        self.llm_max_tokens = llm_max_tokens
+        self.align_parallel = align_parallel
+
     @property
     def input_types(self) -> Type[Input]:
         return Chunk
@@ -37,49 +61,1096 @@ def input_types(self) -> Type[Input]:
     @property
     def output_types(self) -> Type[Output]:
         return Chunk
-    
+
+    def build_catalog_tree(self, outlines_with_content):
+        catalog_tree = []
+        stack = []  # tracks the current node hierarchy as [(title, level, node), ...]
+
+        for title, content, sd_content, level in outlines_with_content:
+            # Find the correct parent node
+            while stack and stack[-1][1] >= level:  # a parent must sit at a higher level (smaller number)
+                stack.pop()
+
+            # # Create the new node
+            # # The title should be prefixed with the titles of all parent nodes
+            # if stack:
+            #     # only add title if stack level
+            #     title = "/".join([item[0] for item in stack] + [title])
+            node = {
+                "title": title,
+                "content": content,
+                "children": [],
+                "start": sd_content[0],
+                "end": sd_content[1],
+            }
+
+            # If the stack is empty, or the current node outranks the node on top of
+            # the stack, the current node is a root or the start of a new branch
+            if not stack or stack[-1][1] >= level:
+                if stack:
+                    stack[-1][2]["children"].append(node)  # attach to the nearest parent's children
+                else:
+                    catalog_tree.append(node)  # empty stack: this is a root node
+            else:
+                # Attach the new node to the children of the parent found above
+                stack[-1][2]["children"].append(node)
+
+            # Push the new node with its level and title so later nodes can attach children
+            stack.append((title, level, node))
+
+        return catalog_tree
+
+    def simplify_catalog_tree(self, node, parent=None, parent_content_length=0):
+        # Recursively process all children first
+        for child in list(node["children"]):  # copy the list so it can be mutated while iterating
+            self.simplify_catalog_tree(child, node, len(node["content"]))
+
+        # Then check whether the current node can be merged into its parent
+        content_length = len(node["content"])
+        if content_length + parent_content_length <= self.chunk_size and parent:
+            # Merge if the combined content length stays within the threshold
+            parent["content"] += " " + node["content"]
+            # Move the current node's children up to the parent
+            parent["children"].extend(node["children"])
+            # Remove the current node from the parent's children
+            parent["children"].remove(node)
+            return  # stop further processing
+
     def outline_chunk(self, chunk: Union[Chunk, List[Chunk]]) -> List[Chunk]:
         if isinstance(chunk, Chunk):
             chunk = [chunk]
         outlines = []
         for c in chunk:
-            outline = self.llm.invoke({"input": c.content}, self.prompt)
+            outline = self.llm.invoke(
+                {"input": c.content, "current_outline": outlines}, self.prompt
+            )
+            # Filter out invalid outlines
+            outline = self.filter_outlines(outline)
             outlines.extend(outline)
         content = "\n".join([c.content for c in chunk])
-        chunks = self.sep_by_outline(content, outlines)
+        # chunks = self.sep_by_outline_ignore_duplicates(
+        #     content, outlines, org_chunk=chunk
+        # )
+        chunks = self.sep_by_outline_with_outline_tree(
+            content, outlines, org_chunk=chunk
+        )
+        return chunks
+
+    def process_batch(self, batch: List[Chunk]) -> List[Tuple[str, int]]:
+        """
+        Processes a single batch of document chunks.
+
+        Args:
+            batch: List[Chunk] the document chunks to process
+
+        Returns:
+            List[Tuple[str, int]] the extracted outlines
+        """
+        outlines = []
+        current_outlines = []
+
+        for c in batch:
+            # Pass the outlines extracted so far as context
+            outline = self.llm.invoke(
+                {"input": c.content, "current_outline": current_outlines}, self.prompt
+            )
+
+            # Filter out invalid outlines
+            # Parallel mode could use a larger outline set; no good approach yet: TODO
+            if self.align_parallel:
+                valid_outlines = self.filter_outlines_parallel(outline)
+            else:
+                valid_outlines = self.filter_outlines(outline)
+            outlines.extend(valid_outlines)
+            current_outlines.extend(valid_outlines)
+
+        return outlines
+
+    def align_outlines(self, outlines):
+        """
+        Aligns the extracted outline levels with the LLM, using the trailing 30% of
+        the previously aligned batch as the overlap between batches.
+
+        Args:
+            outlines: List[Tuple[str, int]] the raw outline list
+
+        Returns:
+            List[Tuple[str, int]] the aligned outline list
+        """
+        if not outlines:
+            return []
+
+        # Initialize the align prompt
+        align_prompt = PromptABC.from_config(
+            {"type": "outline_align", "language": KAG_PROJECT_CONF.language}
+        )
+
+        max_length = 4000
+
+        try:
+            # Process the first batch
+            current_batch = []
+            aligned_outlines = []
+
+            for outline in outlines:
+                # Compute the total stringified length after adding the current outline
+                test_batch = current_batch + [outline]
+                batch_str = str(test_batch)  # stringify the whole batch to measure its length
+
+                if len(batch_str) <= max_length:
+                    current_batch.append(outline)
+                else:
+                    break
+
+            # Align the first batch
+            if current_batch:
+                aligned_batch = self.llm.invoke(
+                    {"outlines": current_batch}, align_prompt
+                )
+                aligned_outlines.extend(aligned_batch)
+                last_aligned = aligned_batch
+
+            # Process the remaining outlines
+            remaining_outlines = outlines[len(current_batch) :]
+
+            while remaining_outlines:
+                # Use the trailing 30% of the previous batch as the overlap
+                overlap_count = max(1, len(last_aligned) * 30 // 100)
+                overlap_part = last_aligned[-overlap_count:]
+
+                # Build the new batch
+                current_batch = []
+
+                # Add new outlines until the length limit is reached
+                for outline in remaining_outlines:
+                    test_batch = overlap_part + current_batch + [outline]
+                    batch_str = str(test_batch)
+
+                    if len(batch_str) <= max_length:
+                        current_batch.append(outline)
+                    else:
+                        break
+
+                if not current_batch:
+                    # If no new outline fits, a single outline is too long and
+                    # needs special handling
+                    logger.warning(
+                        "Single outline too long, processing individually"
+                    )
+                    current_batch = [remaining_outlines[0]]
+
+                # Align the current batch (including the overlap part)
+                full_batch = overlap_part + current_batch
+                aligned_batch = self.llm.invoke(
+                    {"outlines": full_batch}, align_prompt
+                )
+
+                # Keep only the non-overlap part of the result
+                aligned_outlines.extend(aligned_batch[overlap_count:])
+                last_aligned = aligned_batch
+
+                # Update remaining_outlines
+                remaining_outlines = remaining_outlines[len(current_batch) :]
+
+            return aligned_outlines
+
+        except Exception as e:
+            logger.error(f"Error aligning outlines with LLM: {str(e)}")
+            return self._rule_based_align(outlines)
+
+    def align_outlines_parallel(self, outlines):
+        """
+        Aligns outlines in parallel; each batch shares a 30% overlap with its neighbors.
+
+        Args:
+            outlines: List[Tuple[str, int]] the raw outline list
+
+        Returns:
+            List[Tuple[str, int]] the aligned outline list
+        """
+        if not outlines:
+            return []
+
+        # Initialize the align prompt
+        language = os.getenv("KAG_PROMPT_LANGUAGE", "zh")
+        align_prompt = OutlineAlignPrompt(language)
+        max_length = 8000
+
+        try:
+            # Split the outlines into batches, each under max_length when stringified
+            batches = []
+            current_batch = []
+
+            for outline in outlines:
+                test_batch = current_batch + [outline]
+                batch_str = str(test_batch)
+
+                if len(batch_str) <= max_length:
+                    current_batch.append(outline)
+                else:
+                    if current_batch:
+                        batches.append(current_batch)
+                    current_batch = [outline]
+
+            if current_batch:
+                batches.append(current_batch)
+
+            # Process each batch in parallel
+            futures = []
+            with ThreadPoolExecutor(max_workers=self.workers) as executor:
+                for i, batch in enumerate(batches):
+                    # Overlap with the previous batch
+                    prev_overlap = []
+                    if i > 0:
+                        prev_batch = batches[i - 1]
+                        overlap_count = max(1, len(prev_batch) * 30 // 100)
+                        prev_overlap = prev_batch[-overlap_count:]
+
+                    # Overlap with the next batch
+                    next_overlap = []
+                    if i < len(batches) - 1:
+                        next_batch = batches[i + 1]
+                        overlap_count = max(1, len(next_batch) * 30 // 100)
+                        next_overlap = next_batch[:overlap_count]
+
+                    # Build the full batch (including the overlaps)
+                    full_batch = prev_overlap + batch + next_overlap
+
+                    # Submit the task to the thread pool
+                    future = executor.submit(
+                        self.llm.invoke, {"outlines": full_batch}, align_prompt
+                    )
+                    futures.append((i, future, len(prev_overlap), len(next_overlap)))
+
+            # Collect the results and merge them in the original order
+            results = [None] * len(batches)
+            for i, future, prev_len, next_len in futures:
+                try:
+                    aligned_batch = future.result()
+                    # Keep only the non-overlap part
+                    results[i] = aligned_batch[prev_len : len(aligned_batch) - next_len]
+                except Exception as e:
+                    logger.error(f"Error processing batch {i}: {str(e)}")
+                    # On failure, fall back to rule-based alignment for this batch
+                    results[i] = self._rule_based_align(batches[i])
+
+            # Merge all results
+            aligned_outlines = []
+            for batch_result in results:
+                aligned_outlines.extend(batch_result)
+
+            return aligned_outlines
+
+        except Exception as e:
+            logger.error(f"Error aligning outlines with LLM: {str(e)}")
+            return self._rule_based_align(outlines)
+
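Both alignment paths above cap each LLM batch by the stringified length of the outline list and carry roughly 30% of the previous batch forward so levels stay consistent across batch boundaries. The bookkeeping in isolation, with the LLM call stubbed out as an identity function (a sketch for checking the slicing, not the PR's API):

# Standalone sketch of the 30% overlap batching used by align_outlines above.
def align_in_batches(outlines, max_length=4000, align=lambda batch: batch):
    aligned, remaining, last = [], list(outlines), []
    while remaining:
        overlap = last[-max(1, len(last) * 30 // 100):] if last else []
        batch = []
        for outline in remaining:
            if len(str(overlap + batch + [outline])) <= max_length:
                batch.append(outline)
            else:
                break
        if not batch:  # a single outline exceeds max_length: process it alone
            batch = [remaining[0]]
        result = align(overlap + batch)
        aligned.extend(result[len(overlap):])  # keep only the non-overlap part
        last = result
        remaining = remaining[len(batch):]
    return aligned

# With the identity stub, alignment must return the input unchanged.
assert align_in_batches([("第一章", 1), ("第二章", 1)]) == [("第一章", 1), ("第二章", 1)]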
+    def _rule_based_align(self, outlines):
+        """
+        Rule-based outline alignment (fallback strategy).
+        """
+        # Keep the original rule-based alignment logic as a fallback
+        title_patterns = {
+            "chapter": r"第[一二三四五六七八九十\d]+章",
+            "section": r"第[一二三四五六七八九十\d]+节",
+            "part": r"第[一二三四五六七八九十\d]+部分",
+            "article": r"第[一二三四五六七八九十\d]+条",
+        }
+
+        pattern_levels = {"chapter": 1, "section": 2, "part": 1, "article": 3}
+
+        aligned_outlines = []
+        for title, level in outlines:
+            matched_pattern = None
+            for pattern_type, pattern in title_patterns.items():
+                if re.search(pattern, title):
+                    matched_pattern = pattern_type
+                    break
+
+            if matched_pattern:
+                aligned_level = pattern_levels[matched_pattern]
+            else:
+                aligned_level = level
+
+            aligned_outlines.append((title, aligned_level))
+
+        return aligned_outlines
+
+    def outline_chunk_batch(self, chunk: List[Chunk]) -> List[Chunk]:
+        """
+        Processes document chunks in batches and extracts their outlines.
+
+        Args:
+            chunk: List[Chunk] the input document chunks
+
+        Returns:
+            List[Chunk] the processed document chunks
+        """
+        assert isinstance(chunk, list)
+        self.batch_size = len(chunk) // self.workers if len(chunk) > self.workers else 1
+
+        outlines = []
+        # Split the chunks into batches; to keep outline extraction continuous,
+        # each batch must consist of consecutive chunks
+        batches = [
+            chunk[i : i + self.batch_size]
+            for i in range(0, len(chunk), self.batch_size)
+        ]
+
+        mapping = {}
+        futures = []
+        with ThreadPoolExecutor(max_workers=self.workers) as executor:
+            # Submit each batch to the thread pool
+            for idx, batch in enumerate(batches):
+                future = executor.submit(self.process_batch, batch)
+                mapping[future] = idx
+                futures.append(future)
+
+            results = [0] * len(batches)
+            # Wait for all batches to finish and collect the results
+            for future in as_completed(futures):
+                results[mapping[future]] = future.result()
+                # logger.info(f"outline batch{mapping[future]} done")
+
+        for result in results:
+            outlines.extend(result)
+
+        content = "\n".join([c.content for c in chunk])
+
+        if self.align_parallel:
+            aligned_outlines = self.align_outlines_parallel(outlines)
+        else:
+            aligned_outlines = self.align_outlines(outlines)
+        # Split the content using the aligned outlines
+        chunks = self.sep_by_outline_with_outline_tree(
+            content, aligned_outlines, org_chunk=chunk
+        )
+        return chunks
+
+    def filter_outlines_parallel(self, raw_outlines):
+        """
+        Filters out invalid titles, keeping only titles that carry a numeric feature.
+        Numeric features include:
+        1. Arabic numerals (0-9)
+        2. Chinese numerals (一二三...百千万亿)
+        3. Roman numerals (I, II, III, IV...)
+        4. Enumeration marks (①, ②, ③...)
+        5. Common markers that combine a number with a label (第x章, x.x, (x)), etc.
+        """
+        # Pattern matching invalid titles made of digits and punctuation only
+        invalid_pattern = r"""
+            ^                                                   # start of string
+            [0-9一二三四五六七八九十零IIVXLCDM\-.\(\)\[\]\s]*    # digits and punctuation
+            $                                                   # end of string
+        """
+
+        # Patterns matching numeric features
+        number_pattern = r"""
+            \d+ |                                          # Arabic numerals
+            [一二三四五六七八九十百千万亿]+ |                 # Chinese numerals
+            [ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+ |                             # full-width Roman numerals
+            [IVXLCDMivxlcdm]+ |                            # ASCII Roman numerals
+            [①②③④⑤⑥⑦⑧⑨⑩]+ |                                # circled numbers
+            [⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽]+ |                               # parenthesized numbers
+            第[一二三四五六七八九十百千万\d]+[章节篇部] |      # 第x章/节/篇/部
+            [第]?[0-9一二三四五六七八九十百千万]+[条] |        # (第)x条
+            \d+\.\d+ |                                     # hierarchical numbers (e.g. 1.1)
+            [(]\d+[)]                                     # bracketed numbers
+        """
+
+        valid_outlines = []
+        for title, level in raw_outlines:
+            title = title.strip()
+            # Drop titles made of digits and punctuation only
+            if re.fullmatch(invalid_pattern, title, re.VERBOSE):
+                continue
+            # Require at least one numeric feature
+            if not re.search(number_pattern, title, re.VERBOSE):
+                continue
+            valid_outlines.append((title, level))
+
+        return valid_outlines
+
+    def filter_outlines(self, raw_outlines):
+        """
+        Filters titles, keeping only those with an explicit section level.
+
+        Section levels form four tiers:
+        Level 1 (highest): 篇, 卷, 部, 编
+        Level 2: 章
+        Level 3: 节
+        Level 4: 小节, 款, 项, 目
+
+        Several common writing styles are supported:
+        - with "第": 第一章, 第1章
+        - without "第": 一, 1, (一), (1)
+        - number types: Arabic, Chinese, and Roman numerals
+        """
+        # Number patterns
+        numbers = r"""
+            (?:
+                (?:[一二三四五六七八九十百千万]+) |            # Chinese numerals
+                (?:\d+) |                                    # Arabic numerals
+                (?:[IVXLCDMivxlcdm]+) |                      # Roman numerals
+                (?:①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳) |                # circled numbers
+                (?:\(\d+\)) |                                # (1)
+                (?:\((?:[一二三四五六七八九十]+)\))           # (一)
+            )
+        """
+
+        # Section keywords
+        level1_words = r"(?:篇|卷|部|编)"
+        level2_words = r"(?:章)"
+        level3_words = r"(?:节)"
+        level4_words = r"(?:小节|款|项|目)"
+
+        # Full section-title pattern
+        section_pattern = rf"""
+            ^                                                    # start of string
+            (?:
+                (?:第\s*{numbers}\s*(?:{level1_words})) |        # 第x篇/卷/部/编
+                (?:第\s*{numbers}\s*(?:{level2_words})) |        # 第x章
+                (?:第\s*{numbers}\s*(?:{level3_words})) |        # 第x节
+                (?:第\s*{numbers}\s*(?:{level4_words})) |        # 第x小节/款/项/目
+                (?:{numbers}\s*[、.\s]\s*(?:{level1_words})) |   # x、篇/卷/部/编
+                (?:{numbers}\s*[、.\s]\s*(?:{level2_words})) |   # x、章
+                (?:{numbers}\s*[、.\s]\s*(?:{level3_words})) |   # x、节
+                (?:{numbers}\s*[、.\s]\s*(?:{level4_words}))     # x、小节/款/项/目
+            )
+            [\s\S]*                                              # rest of the title
+            $                                                    # end of string
+        """
+
+        def determine_level(title: str) -> int:
+            """Determines the level from the title content."""
+            if any(word in title for word in level1_words.strip("(?:)").split("|")):
+                return 1
+            elif any(word in title for word in level2_words.strip("(?:)").split("|")):
+                return 2
+            elif any(word in title for word in level3_words.strip("(?:)").split("|")):
+                return 3
+            elif any(word in title for word in level4_words.strip("(?:)").split("|")):
+                return 4
+            return 0  # no level matched
+
+        valid_outlines = []
+        for title, level in raw_outlines:
+            title = title.strip()
+            # Check whether this is a valid section title
+            if re.match(section_pattern, title, re.VERBOSE):
+                # Determine the actual level from the title content
+                actual_level = determine_level(title)
+                if actual_level > 0:  # keep only titles whose level could be determined
+                    valid_outlines.append((title, actual_level))
+
+        return valid_outlines
+
+    def unify_outline_levels(self, outlines):
+        """
+        Unifies the levels of titles of the same type, e.g. "第一节" and "第二节"
+        should share the same level.
+
+        Args:
+            outlines (list): The extracted titles as [(title text, level), ...].
+
+        Returns:
+            list: The adjusted titles in the same format as the input.
+        """
+        if not outlines:
+            return []
+
+        # Helper: check whether two titles belong to the same type
+        def is_same_type(title1, title2):
+            """
+            Checks whether two titles belong to the same type.
+            """
+            # Look for "章" or "节" (etc.) in both titles and treat similar numbering
+            # as the same type
+            keywords = ["章", "节", "部分", "篇"]
+            for keyword in keywords:
+                if keyword in title1 and keyword in title2:
+                    return True
+            return False
+
+        # Map each type to a level
+        type_to_level = {}
+        for title, level in outlines:
+            for keyword in ["章", "节", "部分", "篇"]:
+                if keyword in title:
+                    type_to_level.setdefault(keyword, level)
+
+        # Adjust the levels
+        unified_outlines = []
+        for title, level in outlines:
+            for keyword in ["章", "节", "部分", "篇"]:
+                if keyword in title and keyword in type_to_level:
+                    level = type_to_level[keyword]
+                    break
+            unified_outlines.append((title, level))
+
+        return unified_outlines
+
+    def sep_by_outline(self, content, outlines):
+        """
+        Splits the content into chunks by outline level, dropping invalid titles.
+        """
+        # Filter out invalid outlines
+        outlines = self.filter_outlines(outlines)
+
+        position_check = []
+        for outline in outlines:
+            start = content.find(outline[0])
+            if start != -1:
+                position_check.append((outline, start))
+
+        if not position_check:
+            return []  # no titles were found in the content
+
+        chunks = []
+        father_stack = []
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            while father_stack and father_stack[-1][1] >= level:
+                father_stack.pop()
+            full_path = "/".join([item[0] for item in father_stack] + [title])
+            chunk_content = content[start:end]
+            chunk = Chunk(
+                id=generate_hash_id(f"{full_path}#{idx}"),
+                name=full_path,
+                content=chunk_content,
+            )
+            chunks.append(chunk)
+            father_stack.append((title, level))
+
+        return chunks
+
+    def sep_by_outline_with_merge(
+        self, content, outlines, min_length=200, max_length=5000
+    ):
+        """
+        Splits the content into chunks by outline level and merges chunks that are
+        too short, keeping the merged length under control.
+
+        Args:
+        - content: str, the full content.
+        - outlines: List[Tuple[str, int]], each title with its level.
+        - min_length: int, the minimum chunk length; shorter chunks are merged.
+        - max_length: int, the maximum chunk length after merging.
+
+        Returns:
+        - List[Chunk], the resulting chunks.
+        """
+        # Filter out invalid outlines
+        outlines = self.filter_outlines(outlines)
+
+        position_check = []
+        for outline in outlines:
+            start = content.find(outline[0])
+            if start != -1:
+                position_check.append((outline, start))
+
+        if not position_check:
+            return []  # no titles were found in the content
+
+        chunks = []
+        father_stack = []
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            while father_stack and father_stack[-1][1] >= level:
+                father_stack.pop()
+            full_path = "/".join([item[0] for item in father_stack] + [title])
+            chunk_content = content[start:end]
+            chunk = Chunk(
+                id=generate_hash_id(f"{full_path}#{idx}"),
+                name=full_path,
+                content=chunk_content,
+            )
+            chunks.append(chunk)
+            father_stack.append((title, level))
+
+        # Merge chunks that are too short
+        merged_chunks = []
+        buffer = None
+
+        for chunk in chunks:
+            if buffer:
+                # Try to merge the current chunk into the buffer
+                if (
+                    chunk.name.startswith(buffer.name)  # same parent path
+                    and len(buffer.content) + len(chunk.content) <= max_length
+                ):
+                    buffer.content += chunk.content
+                    buffer.name = buffer.name  # keep the parent path as the name
+                    continue
+                else:
+                    merged_chunks.append(buffer)
+                    buffer = None
+
+            if len(chunk.content) < min_length:
+                # Buffer the short chunk
+                buffer = chunk
+            else:
+                # Long enough: add it to the result directly
+                merged_chunks.append(chunk)
+
+        # If the last chunk is still buffered, add it to the result
+        if buffer:
+            merged_chunks.append(buffer)
+
+        return merged_chunks
+
+    def split_sentence(self, content):
+        """
+        Splits the given content into sentences based on delimiters.
+
+        Args:
+            content (str): The content to be split.
+
+        Returns:
+            list: A list of sentences.
+        """
+        sentence_delimiters = ".。??!!"
+        output = []
+        start = 0
+        for idx, char in enumerate(content):
+            if char in sentence_delimiters:
+                end = idx
+                tmp = content[start : end + 1].strip()
+                if len(tmp) > 0:
+                    output.append(tmp)
+                start = idx + 1
+        res = content[start:]
+        if len(res) > 0:
+            output.append(res)
+        return output
+
+    def slide_window_chunk(
+        self,
+        org_chunk: Chunk,
+        chunk_size: int = 2000,
+        window_length: int = 300,
+        sep: str = "\n",
+    ) -> List[Chunk]:
+        """
+        Splits the content into chunks using a sliding window approach.
+
+        Args:
+            org_chunk (Chunk): The original chunk to be split.
+            chunk_size (int, optional): The maximum size of each chunk. Defaults to 2000.
+            window_length (int, optional): The length of the overlap between chunks. Defaults to 300.
+            sep (str, optional): The separator used to join sentences. Defaults to "\n".
+
+        Returns:
+            List[Chunk]: A list of Chunk objects.
+        """
+        if org_chunk.type == ChunkTypeEnum.Table:
+            table_chunks = self.split_table(
+                org_chunk=org_chunk, chunk_size=chunk_size, sep=sep
+            )
+            if table_chunks is not None:
+                return table_chunks
+        content = self.split_sentence(org_chunk.content)
+        splitted = []
+        cur = []
+        cur_len = 0
+        for sentence in content:
+            if cur_len + len(sentence) > chunk_size:
+                if cur:
+                    splitted.append(cur)
+                tmp = []
+                cur_len = 0
+                for item in cur[::-1]:
+                    if cur_len >= window_length:
+                        break
+                    tmp.append(item)
+                    cur_len += len(item)
+                cur = tmp[::-1]
+
+            cur.append(sentence)
+            cur_len += len(sentence)
+        if len(cur) > 0:
+            splitted.append(cur)
+
+        output = []
+        for idx, sentences in enumerate(splitted):
+            chunk = Chunk(
+                id=generate_hash_id(f"{org_chunk.id}#{idx}"),
+                name=f"{org_chunk.name}#{idx}",
+                content=sep.join(sentences),
+                type=org_chunk.type,
+                **org_chunk.kwargs,
+            )
+            output.append(chunk)
+        return output
+
+    def sep_by_outline_ignore_duplicates(
+        self, content, outlines, min_length=50, max_length=500, org_chunk=None
+    ):
+        """
+        Splits the content into chunks by outline level, dropping invalid titles and
+        ignoring duplicated titles.
+
+        Args:
+        - content: str, the full content.
+        - outlines: List[Tuple[str, int]], each title with its level.
+        - min_length: int, the minimum chunk length; shorter chunks are merged.
+        - max_length: int, the maximum chunk length after merging.
+
+        Returns:
+        - List[Chunk], the resulting chunks.
+        """
+
+        if not outlines or len(outlines) == 0:
+            cutted = []
+            if isinstance(org_chunk, list):
+                for item in org_chunk:
+                    cutted.extend(self.slide_window_chunk(item))
+            return cutted
+
+        position_check = []
+        seen_titles = set()
+        for outline in outlines:
+            title, level = outline
+            start = content.find(title)
+            if start != -1 and title not in seen_titles:
+                # Keep this title only if position_check is empty or its start comes
+                # after the previous title's start
+                if not position_check or start > position_check[-1][1]:
+                    position_check.append((outline, start))
+                else:
+                    # Otherwise skip this title
+                    continue
+                seen_titles.add(title)
+
+        if not position_check:
+            return []
+
+        chunks = []
+        father_stack = []
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            while father_stack and father_stack[-1][1] >= level:
+                father_stack.pop()
+            full_path = "/".join([item[0] for item in father_stack] + [title])
+            chunk_content = content[start:end]
+
+            # add origin kwargs
+            origin_properties = {}
+            for key, value in org_chunk[0].kwargs.items():
+                origin_properties[key] = value
+
+            chunk = Chunk(
+                id=generate_hash_id(f"{full_path}#{idx}"),
+                name=full_path,
+                content=chunk_content,
+                **origin_properties,
+                start=start,
+                end=end,
+            )
+            chunks.append(chunk)
+            father_stack.append((title, level))
+
+        # Dump the chunks with their start/end offsets
+        # dump_chunks_with_start_end(chunks, output_path="./start_end_chunk.json")
+
+        # Merge chunks that are too short
+        merged_chunks = []
+        buffer = None
+
+        for chunk in chunks:
+            if buffer:
+                # Try to merge the current chunk into the buffer
+                if (
+                    chunk.name.startswith(buffer.name)  # same parent path
+                    and len(buffer.content) + len(chunk.content) <= max_length
+                ):
+                    buffer.content += chunk.content
+                    continue
+                else:
+                    merged_chunks.append(buffer)
+                    buffer = None
+
+            if len(chunk.content) < min_length:
+                # Buffer the short chunk
+                buffer = chunk
+            else:
+                # Long enough: add it to the result directly
+                merged_chunks.append(chunk)
+
+        # If the last chunk is still buffered, add it to the result
+        if buffer:
+            merged_chunks.append(buffer)
+
+        for idx, chunk in enumerate(merged_chunks):
+            chunk.prev_content = merged_chunks[idx - 1].content if idx > 0 else None
+            chunk.next_content = (
+                merged_chunks[idx + 1].content if idx < len(merged_chunks) - 1 else None
+            )
+
+        return merged_chunks
+
+    def sep_by_outline_with_outline_tree(self, content, outlines, org_chunk=None):
+        """
+        Splits the content into chunks by outline level via a catalog tree, dropping
+        invalid titles and ignoring duplicated titles.
+
+        Args:
+        - content: str, the full content.
+        - outlines: List[Tuple[str, int]], each title with its level.
+
+        Returns:
+        - List[Chunk], the resulting chunks.
+        """
+
+        if not outlines or len(outlines) == 0:
+            cutted = []
+            if isinstance(org_chunk, list):
+                for item in org_chunk:
+                    cutted.extend(self.slide_window_chunk(item))
+            return cutted
+
+        position_check = []
+        seen_titles = set()
+        for outline in outlines:
+            title, level = outline
+            start = content.find(title)
+            if start != -1 and title not in seen_titles:
+                # Keep this title only if position_check is empty or its start comes
+                # after the previous title's start
+                if not position_check or start > position_check[-1][1]:
+                    position_check.append((outline, start))
+                else:
+                    # Otherwise skip this title
+                    continue
+                seen_titles.add(title)
+
+        for idx, (outline, start) in enumerate(position_check):
+            title, level = outline
+            end = (
+                position_check[idx + 1][1]
+                if idx + 1 < len(position_check)
+                else len(content)
+            )
+            position_check[idx] = (outline, start, end)
+
+        outlines_with_content = []
+        for outline, start, end in position_check:
+            title, level = outline
+            t_content = content[start:end]
+            sd_content = (start, end)
+            outlines_with_content.append((title, t_content, sd_content, level))
+
+        # Build the catalog tree
+        catalog_tree = self.build_catalog_tree(outlines_with_content)
+
+        # Simplify the catalog tree
+        # if catalog_tree:
+        #     for node in catalog_tree:
+        #         self.simplify_catalog_tree(node)
+
+        # add origin kwargs
+        origin_properties = {}
+        for key, value in org_chunk[0].kwargs.items():
+            origin_properties[key] = value
+
+        def generate_chunks(node, chunks=None, parent_title=""):
+            if chunks is None:
+                chunks = []
+
+            # Build the full title of the current node
+            full_title = (
+                "/".join([parent_title, node["title"]])
+                if parent_title
+                else node["title"]
+            )
+
+            # Generate a chunk for the current node
+            chunk_id = generate_hash_id(full_title)  # derive the ID from the full title
+            chunk = Chunk(
+                id=chunk_id,
+                name=full_title,  # use the full title as the name
+                content=node["content"],
+                # origin_properties is captured from the enclosing scope and carries
+                # the remaining chunk attributes
+                **origin_properties,
+                start=node["start"],
+                end=node["end"],
+            )
+            chunks.append(chunk)
+
+            # Recursively generate chunks for the children
+            for child in node.get("children", []):
+                generate_chunks(child, chunks, full_title)  # pass the full title down
+
+            return chunks
+
+        chunks = []
+        for node in catalog_tree:
+            chunks.extend(generate_chunks(node))
+
+        # Dump the chunks with their start/end offsets
+        # dump_chunks_with_start_end(chunks, output_path="./start_end_chunk.json")
+
+        # Merge chunks that are too short
+        merged_chunks = []
+        buffer = None
+
+        for chunk in chunks:
+            if buffer:
+                # Try to merge the current chunk into the buffer
+                if (
+                    chunk.name.startswith(buffer.name)  # same parent path
+                    and len(buffer.content) + len(chunk.content) <= self.chunk_size
+                ):
+                    buffer.content += chunk.content
+                    continue
+                else:
+                    merged_chunks.append(buffer)
+                    buffer = None
+
+            if len(chunk.content) < self.min_length:
+                # Buffer the short chunk
+                buffer = chunk
+            else:
+                # Long enough: add it to the result directly
+                merged_chunks.append(chunk)
+
+        # If the last chunk is still buffered, add it to the result
+        if buffer:
+            merged_chunks.append(buffer)
+
+        for i in range(len(merged_chunks) - 1, -1, -1):
+            chunk = merged_chunks[i]
+            if len(chunk.content) < (self.min_length * 0.5):
+                del merged_chunks[i]
+
+        for idx, chunk in enumerate(merged_chunks):
+            chunk.prev_content = merged_chunks[idx - 1].content if idx > 0 else None
+            chunk.next_content = (
+                merged_chunks[idx + 1].content if idx < len(merged_chunks) - 1 else None
+            )
+
+        return merged_chunks
+
+    def log(self, chunks, log_path="./chunk_log.txt"):
+        length_counts = collections.defaultdict(int)
+
+        for chunk in chunks:
+            length = len(chunk.content)
+            length_segment = length // 10
+            length_counts[length_segment] += 1
+
+        with open(log_path, "a") as f:
+            for length_segment, count in length_counts.items():
+                f.write(
+                    f"Length segment {length_segment*10}-{(length_segment+1)*10} chunks: {count}\n"
+                )
+
+        # Plot the length distribution
+        self.plot_length_distribution(length_counts)
+
+    def plot_length_distribution(self, length_counts):
+        segments = list(length_counts.keys())
+        counts = list(length_counts.values())
+
+        plt.figure(figsize=(10, 6))
+        plt.bar(segments, counts, color="blue")
+        plt.xlabel("Length Segment")
+        plt.ylabel("Number of Chunks")
+        plt.title("Chunk Length Distribution")
+        plt.xticks(segments)
+        plt.savefig("chunk_length_distribution.png")
+
+    def splitter_chunk(self, input: Input, **kwargs) -> List[Chunk]:
+        cutted = []
+        chunk_size = kwargs.get("chunk_size")
+        if isinstance(input, list):
+            for item in input:
+                cutted.extend(self.slide_window_chunk(item, chunk_size=chunk_size))
+        else:
+            cutted.extend(self.slide_window_chunk(input, chunk_size=chunk_size))
+        return cutted
+
+    def invoke(self, input: Input, **kwargs) -> List[Chunk]:
+        chunks = self.splitter_chunk(input, chunk_size=self.llm_max_tokens // 2)
+        chunks = self.outline_chunk_batch(chunks)
+        # chunks = self.splitter_chunk(chunks, chunk_size=self.chunk_size)
+        # self.log(chunks)
+        return chunks
+
 
 if __name__ == "__main__":
     from kag.builder.component.splitter.length_splitter import LengthSplitter
-    from kag.builder.component.splitter.outline_splitter import OutlineSplitter
     from kag.builder.component.reader.docx_reader import DocxReader
-    from kag.common.env import init_kag_config
 
-    init_kag_config(os.path.join(os.path.dirname(__file__),"../../../../tests/builder/component/test_config.cfg"))
+    from kag.builder.component.reader.txt_reader import TXTReader
+    from kag.builder.component.reader.pdf_reader import PDFReader
+
+    pdf_reader = PDFReader()
     docx_reader = DocxReader()
-    length_splitter = LengthSplitter(split_length=8000)
-    outline_splitter = OutlineSplitter()
-    docx_path = os.path.join(os.path.dirname(__file__),"../../../../tests/builder/data/test_docx.docx")
-    # chain = docx_reader >> length_splitter >> outline_splitter
-    chunk = docx_reader.invoke(docx_path)
-    chunks = length_splitter.invoke(chunk)
-    chunks = outline_splitter.invoke(chunks)
-    print(chunks)
\ No newline at end of file
+    txt_reader = TXTReader()
+    length_splitter = LengthSplitter(split_length=5000)
+
+    llm = LLMClient.from_config(KAG_CONFIG.all_config["llm"])
+    outline_splitter = OutlineSplitter(llm=llm)
+    txt_path = os.path.join(
os.path.dirname(__file__), "../../../../tests/builder/data/儿科学_short.txt" + ) + docx_path = "/Users/zhangxinhong.zxh/Downloads/waikexue_short.docx" + test_dir = "/Users/zhangxinhong.zxh/Downloads/1127_medkag_book" + pdf_path = "/Users/zhangxinhong.zxh/Downloads/toaz.info-5dsm-5-pr_56e68a629dc4fe62699960dd5afbe362.pdf" + files = [ + os.path.join(test_dir, file) + for file in os.listdir(test_dir) + if file.endswith(".docx") + ] + files = [ + files[0], + ] + + def process_file(file): + chain = docx_reader >> outline_splitter + chunks = chain.invoke(file, max_workers=10) + dump_chunks(chunks, output_path=file.replace(".docx", ".json")) + + def process_txt(txt): + chain = txt_reader >> outline_splitter + chunks = chain.invoke(txt, max_workers=10) + dump_chunks(chunks, output_path=txt.replace(".txt", ".json")) + + def process_file_without_chain(file): + chunk = docx_reader.invoke(file) + chunks = outline_splitter.invoke(chunk) + dump_chunks(chunks, output_path=file.replace(".docx", ".json")) + + def process_txt_without_chain(txt): + chunk = txt_reader.invoke(txt) + chunks = outline_splitter.invoke(chunk) + dump_chunks(chunks, output_path=txt.replace(".txt", ".json")) + + def process_pdf_without_chain(pdf): + chunk = pdf_reader.invoke(pdf) + chunks = outline_splitter.invoke(chunk) + dump_chunks(chunks, output_path=pdf.replace(".pdf", ".json")) + + # with ThreadPoolExecutor(max_workers=10) as executor: + # futures = [executor.submit(process_file, file) for file in files] + + # for future in as_completed(futures): + # print(future.result()) + + process_file_without_chain(docx_path) + a = 1 + # chunk = docx_reader.invoke(docx_path) + # chunk = txt_reader.invoke(txt_path) + # chunks = length_splitter.invoke(chunk) + # chunks = outline_splitter.invoke(chunks) + # print(chunks) diff --git a/kag/builder/component/splitter/pattern_splitter.py b/kag/builder/component/splitter/pattern_splitter.py index 0b72f265..32f4737c 100644 --- a/kag/builder/component/splitter/pattern_splitter.py +++ b/kag/builder/component/splitter/pattern_splitter.py @@ -10,27 +10,37 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from typing import Type, List, Union +# flake8: noqa import re -import os +from typing import Type, List, Union -from kag.builder.model.chunk import Chunk, ChunkTypeEnum -from kag.interface.builder.splitter_abc import SplitterABC + +from kag.builder.model.chunk import Chunk +from kag.interface import SplitterABC +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output +@SplitterABC.register("pattern") +@SplitterABC.register("pattern_splitter") class PatternSplitter(SplitterABC): - def __init__(self, pattern_dict: dict = None, chunk_cut_num=None): + """ + A class for splitting text content based on specified patterns and chunking strategies. + """ + + def __init__(self, pattern_dict: dict = None, chunk_cut_num: int = None): """ - pattern_dict: - { - "pattern": 匹配pattern, - "group": { - "header":1, - "name":2, - "content":3 - } - } + Initializes the PatternSplitter with the given pattern dictionary and chunk cut number. + + Args: + pattern_dict (dict, optional): A dictionary containing the pattern and group mappings. + Defaults to a predefined pattern if not provided. + Example: + { + "pattern": r"(\d+).([^0-9]+?)?([^0-9第版].*?)(?=\d+\.|$)", + "group": {"header": 2, "name": 2, "content": 0} + } + chunk_cut_num (int, optional): The number of characters to cut chunks into. Defaults to None. 
""" super().__init__() if pattern_dict is None: @@ -53,6 +63,15 @@ def output_types(self) -> Type[Output]: return List[Chunk] def split_sentence(self, content): + """ + Splits the given content into sentences based on delimiters. + + Args: + content (str): The content to be split into sentences. + + Returns: + List[str]: A list of sentences extracted from the content. + """ sentence_delimiters = "。??!!;;\n" output = [] start = 0 @@ -76,7 +95,19 @@ def slide_window_chunk( sep: str = "\n", prefix: str = "SlideWindow", ) -> List[Chunk]: + """ + Splits the content into chunks using a sliding window approach. + Args: + content (Union[str, List[str]]): The content to be chunked. + chunk_size (int, optional): The maximum size of each chunk. Defaults to 2000. + window_length (int, optional): The length of the sliding window. Defaults to 300. + sep (str, optional): The separator to join sentences within a chunk. Defaults to "\n". + prefix (str, optional): The prefix to use for chunk names. Defaults to "SlideWindow". + + Returns: + List[Chunk]: A list of Chunk objects representing the chunked content. + """ if isinstance(content, str): content = self.split_sentence(content) splitted = [] @@ -103,7 +134,7 @@ def slide_window_chunk( for idx, sentences in enumerate(splitted): chunk_name = f"{prefix}#{idx}" chunk = Chunk( - id=Chunk.generate_hash_id(chunk_name), + id=generate_hash_id(chunk_name), name=chunk_name, content=sep.join(sentences), ) @@ -114,6 +145,15 @@ def chunk_split( self, chunk: Chunk, ) -> List[Chunk]: + """ + Splits the given chunk into smaller chunks based on the pattern and chunk cut number. + + Args: + chunk (Chunk): The chunk to be split. + + Returns: + List[Chunk]: A list of smaller Chunk objects. + """ text = chunk.content pattern = re.compile(self.pattern, re.DOTALL) @@ -127,7 +167,7 @@ def chunk_split( chunk = Chunk( chunk_header=match.group(self.group["header"]), name=match.group(self.group["name"]), - id=Chunk.generate_hash_id(match.group(self.group["content"])), + id=generate_hash_id(match.group(self.group["content"])), content=match.group(self.group["content"]), ) chunk = [chunk] @@ -145,43 +185,16 @@ def chunk_split( return chunks - def invoke(self, input: Chunk, **kwargs) -> List[Output]: + def _invoke(self, input: Chunk, **kwargs) -> List[Output]: + """ + Invokes the chunk splitting process on the given input. + + Args: + input (Chunk): The input chunk to be processed. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. + Returns: + List[Output]: A list of output chunks. 
+ """ chunks = self.chunk_split(input) return chunks - - def to_rest(self): - pass - - @classmethod - def from_rest(cls, rest_model): - pass - - -class LayeredPatternSpliter(PatternSplitter): - pass - - -def _test(): - pattern_dict = { - "pattern": r"(\d+)\.([^0-9]+?)?([^0-9第版].*?)(?=\d+\.|$)", - "group": {"header": 2, "name": 2, "content": 0}, - } - ds = PatternSplitter(pattern_dict=pattern_dict) - from kag.builder.component.reader.pdf_reader import PDFReader - - reader = PDFReader() - file_path = os.path.dirname(__file__) - test_file_path = os.path.join(file_path, "../../../../tests/builder/data/aiwen.pdf") - pre_output = reader._handle(test_file_path) - - handle_input = pre_output[0] - handle_result = ds._handle(handle_input) - print("handle_result", handle_result) - - return handle_result - - -if __name__ == "__main__": - res = _test() - print(res) diff --git a/kag/builder/component/splitter/semantic_splitter.py b/kag/builder/component/splitter/semantic_splitter.py index 40ba22b2..d5e15391 100644 --- a/kag/builder/component/splitter/semantic_splitter.py +++ b/kag/builder/component/splitter/semantic_splitter.py @@ -10,41 +10,56 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. import logging -import os import re from typing import List, Type -from kag.interface.builder import SplitterABC +from kag.interface import SplitterABC from kag.builder.prompt.semantic_seg_prompt import SemanticSegPrompt from kag.builder.model.chunk import Chunk +from kag.interface import LLMClient +from kag.common.conf import KAG_PROJECT_CONF +from kag.common.utils import generate_hash_id from knext.common.base.runnable import Input, Output -from kag.common.llm.client.llm_client import LLMClient logger = logging.getLogger(__name__) +@SplitterABC.register("semantic") +@SplitterABC.register("semantic_splitter") class SemanticSplitter(SplitterABC): """ A class for semantically splitting text into smaller chunks based on the content's structure and meaning. - Inherits from the Splitter class. + Inherits from the SplitterABC class. - Attributes: - kept_char_pattern (re.Pattern): Regex pattern to match Chinese/ASCII characters. - split_length (int): The maximum length of each chunk after splitting. - llm_client (LLMClient): Instance of LLMClient initialized with `model` config. - semantic_seg_op (SemanticSegPrompt): Instance of SemanticSegPrompt for semantic segmentation. """ - def __init__(self, split_length: int = 1000, **kwargs): - super().__init__(**kwargs) + def __init__( + self, + llm: LLMClient, + kept_char_pattern: str = None, + split_length: int = 1000, + ): + """ + Initializes the SemanticSplitter with the given LLMClient, kept character pattern, and split length. + + Args: + llm (LLMClient): Instance of LLMClient initialized with `model` config. + kept_char_pattern (str, optional): Regex pattern to match Chinese/ASCII characters. + Defaults to a predefined pattern if not provided. + split_length (int, optional): The maximum length of each chunk after splitting. Defaults to 1000. + **kwargs: Additional keyword arguments to be passed to the superclass. 
+ """ + super().__init__() # Chinese/ASCII characters - self.kept_char_pattern = re.compile( - r"[^\u4e00-\u9fa5\u3000-\u303F\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\x00-\x7F]+" - ) - self.split_length = int(split_length) - self.llm = self._init_llm() - language = os.getenv("KAG_PROMPT_LANGUAGE", "zh") - self.semantic_seg_op = SemanticSegPrompt(language) + if kept_char_pattern is None: + self.kept_char_pattern = re.compile( + r"[^\u4e00-\u9fa5\u3000-\u303F\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\x00-\x7F]+" + ) + else: + self.kept_char_pattern = re.compile(kept_char_pattern) + self.split_length = split_length + self.llm = llm + self.semantic_seg_op = SemanticSegPrompt(KAG_PROJECT_CONF.language) @property def input_types(self) -> Type[Input]: @@ -103,6 +118,8 @@ def semantic_chunk( """ result = self.llm.invoke({"input": org_chunk.content}, self.semantic_seg_op) splitted = self.parse_llm_output(org_chunk.content, result) + if len(splitted) == 0: + return [org_chunk] logger.debug(f"splitted = {splitted}") chunks = [] for idx, item in enumerate(splitted): @@ -113,30 +130,26 @@ def semantic_chunk( name=f"{org_chunk.name}#{split_name}", content=item["content"], abstract=item["name"], - **org_chunk.kwargs + **org_chunk.kwargs, ) chunks.append(chunk) else: print("chunk over size") innerChunk = Chunk( - id=Chunk.generate_hash_id(item["content"]), + id=generate_hash_id(item["content"]), name=f"{org_chunk.name}#{split_name}", content=item["content"], ) - chunks.extend( - self.semantic_chunk( - innerChunk, chunk_size - ) - ) + chunks.extend(self.semantic_chunk(innerChunk, chunk_size)) return chunks - def invoke(self, input: Input, **kwargs) -> List[Output]: + def _invoke(self, input: Input, **kwargs) -> List[Output]: """ Invokes the splitting process on the provided input. Args: input (Input): The input to be processed. - **kwargs: Additional keyword arguments. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. Returns: List[Output]: A list of outputs generated from the input. diff --git a/kag/builder/component/vectorizer/__init__.py b/kag/builder/component/vectorizer/__init__.py index 93aa6cd4..e69de29b 100644 --- a/kag/builder/component/vectorizer/__init__.py +++ b/kag/builder/component/vectorizer/__init__.py @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. diff --git a/kag/builder/component/vectorizer/batch_vectorizer.py b/kag/builder/component/vectorizer/batch_vectorizer.py index 208f8e9f..9a2b1125 100644 --- a/kag/builder/component/vectorizer/batch_vectorizer.py +++ b/kag/builder/component/vectorizer/batch_vectorizer.py @@ -9,17 +9,18 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
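Before the vectorizer changes below, a usage note on the semantic splitter above: with _init_llm() gone, SemanticSplitter now receives its LLM client explicitly. A minimal wiring sketch, reusing the LLMClient.from_config(KAG_CONFIG.all_config["llm"]) call from the outline splitter's __main__ block; the placeholder Chunk and the public invoke entry point (inherited from the ABC, as in the other splitters) are assumptions:

from kag.common.conf import KAG_CONFIG
from kag.interface import LLMClient
from kag.builder.model.chunk import Chunk
from kag.builder.component.splitter.semantic_splitter import SemanticSplitter

# A minimal sketch, assuming the project config carries an "llm" section as in
# the outline splitter's __main__ block above.
llm = LLMClient.from_config(KAG_CONFIG.all_config["llm"])
splitter = SemanticSplitter(llm=llm, split_length=1000)

doc = Chunk(id="demo", name="demo", content="...")  # placeholder content
chunks = splitter.invoke(doc)  # falls back to [doc] if the LLM returns no splits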
-import os from collections import defaultdict from typing import List +from tenacity import stop_after_attempt, retry from kag.builder.model.sub_graph import SubGraph -from knext.common.base.runnable import Input, Output -from kag.common.vectorizer import Vectorizer -from kag.interface.builder.vectorizer_abc import VectorizerABC +from kag.common.conf import KAG_PROJECT_CONF + +from kag.common.utils import get_vector_field_name +from kag.interface import VectorizerABC, VectorizeModelABC from knext.schema.client import SchemaClient -from knext.project.client import ProjectClient from knext.schema.model.base import IndexTypeEnum +from knext.common.base.runnable import Input, Output class EmbeddingVectorPlaceholder(object): @@ -43,22 +44,15 @@ class EmbeddingVectorManager(object): def __init__(self): self._placeholders = [] - def _create_vector_field_name(self, property_key): - from kag.common.utils import to_snake_case - - name = f"{property_key}_vector" - name = to_snake_case(name) - return "_" + name - def get_placeholder(self, properties, vector_field): for property_key, property_value in properties.items(): - field_name = self._create_vector_field_name(property_key) + field_name = get_vector_field_name(property_key) if field_name != vector_field: continue if not property_value: return None if not isinstance(property_value, str): - message = f"property {property_key!r} must be string to generate embedding vector" + message = f"property {property_key!r} must be string to generate embedding vector, got {property_value} with type {type(property_value)}" raise RuntimeError(message) num = len(self._placeholders) placeholder = EmbeddingVectorPlaceholder( @@ -78,11 +72,10 @@ def _get_text_batch(self): return text_batch def _generate_vectors(self, vectorizer, text_batch, batch_size=32): - if isinstance(text_batch, str): - text_batch = [text_batch] texts = list(text_batch) if not texts: return [] + if len(texts) % batch_size == 0: n_batchs = len(texts) // batch_size else: @@ -99,9 +92,9 @@ def _fill_vectors(self, vectors, text_batch): for placeholder in placeholders: placeholder._embedding_vector = vector - def batch_generate(self, vectorizer): + def batch_generate(self, vectorizer, batch_size=32): text_batch = self._get_text_batch() - vectors = self._generate_vectors(vectorizer, text_batch) + vectors = self._generate_vectors(vectorizer, text_batch, batch_size) self._fill_vectors(vectors, text_batch) def patch(self): @@ -115,7 +108,7 @@ def __init__(self, vectorizer, vector_index_meta=None, extra_labels=("Entity",)) self._extra_labels = extra_labels self._vector_index_meta = vector_index_meta or {} - def batch_generate(self, node_batch): + def batch_generate(self, node_batch, batch_size=32): manager = EmbeddingVectorManager() vector_index_meta = self._vector_index_meta for node_item in node_batch: @@ -132,41 +125,49 @@ def batch_generate(self, node_batch): placeholder = manager.get_placeholder(properties, vector_field) if placeholder is not None: properties[vector_field] = placeholder - manager.batch_generate(self._vectorizer) + manager.batch_generate(self._vectorizer, batch_size) manager.patch() +@VectorizerABC.register("batch") +@VectorizerABC.register("batch_vectorizer") class BatchVectorizer(VectorizerABC): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.project_id = self.project_id or os.getenv("KAG_PROJECT_ID") - self._init_graph_store() - self.vec_meta = self._init_vec_meta() - self.vectorizer = Vectorizer.from_config(self.vectorizer_config) + """ + A class for 
generating embedding vectors for node attributes in a SubGraph in batches. - def _init_graph_store(self): - """ - Initializes the Graph Store client. + This class inherits from VectorizerABC and provides the functionality to generate embedding vectors + for node attributes in a SubGraph in batches. It uses a specified vectorization model and processes + the nodes of a specified batch size. - This method retrieves the graph store configuration from environment variables and the project ID. - It then fetches the project configuration using the project ID and updates the graph store configuration - with any additional settings from the project. Finally, it creates and initializes the graph store client - using the updated configuration. + Attributes: + project_id (int): The ID of the project associated with the SubGraph. + vec_meta (defaultdict): Metadata for vector fields in the SubGraph. + vectorize_model (VectorizeModelABC): The model used for generating embedding vectors. + batch_size (int): The size of the batches in which to process the nodes. + """ - Args: - project_id (str): The id of project. + def __init__(self, vectorize_model: VectorizeModelABC, batch_size: int = 32): + """ + Initializes the BatchVectorizer with the specified vectorization model and batch size. - Returns: - GraphStore + Args: + vectorize_model (VectorizeModelABC): The model used for generating embedding vectors. + batch_size (int): The size of the batches in which to process the nodes. Defaults to 32. """ - graph_store_config = eval(os.getenv("KAG_GRAPH_STORE", "{}")) - vectorizer_config = eval(os.getenv("KAG_VECTORIZER", "{}")) - config = ProjectClient().get_config(self.project_id) - graph_store_config.update(config.get("graph_store", {})) - vectorizer_config.update(config.get("vectorizer", {})) - self.vectorizer_config = vectorizer_config + super().__init__() + self.project_id = KAG_PROJECT_CONF.project_id + # self._init_graph_store() + self.vec_meta = self._init_vec_meta() + self.vectorize_model = vectorize_model + self.batch_size = batch_size def _init_vec_meta(self): + """ + Initializes the vector metadata for the SubGraph. + + Returns: + defaultdict: Metadata for vector fields in the SubGraph. + """ vec_meta = defaultdict(list) schema_client = SchemaClient(project_id=self.project_id) spg_types = schema_client.load() @@ -176,32 +177,31 @@ def _init_vec_meta(self): IndexTypeEnum.Vector, IndexTypeEnum.TextAndVector, ]: - vec_meta[type_name].append( - self._create_vector_field_name(prop_name) - ) + vec_meta[type_name].append(get_vector_field_name(prop_name)) return vec_meta - def _create_vector_field_name(self, property_key): - from kag.common.utils import to_snake_case + @retry(stop=stop_after_attempt(3)) + def _generate_embedding_vectors(self, input_subgraph: SubGraph) -> SubGraph: + """ + Generates embedding vectors for the nodes in the input SubGraph. - name = f"{property_key}_vector" - name = to_snake_case(name) - return "_" + name + Args: + input_subgraph (SubGraph): The SubGraph for which to generate embedding vectors. - def _generate_embedding_vectors( - self, vectorizer: Vectorizer, input: SubGraph - ) -> SubGraph: + Returns: + SubGraph: The modified SubGraph with generated embedding vectors. 
+ """ node_list = [] node_batch = [] - for node in input.nodes: + for node in input_subgraph.nodes: if not node.id or not node.name: continue properties = {"id": node.id, "name": node.name} properties.update(node.properties) node_list.append((node, properties)) node_batch.append((node.label, properties.copy())) - generator = EmbeddingVectorGenerator(vectorizer, self.vec_meta) - generator.batch_generate(node_batch) + generator = EmbeddingVectorGenerator(self.vectorize_model, self.vec_meta) + generator.batch_generate(node_batch, self.batch_size) for (node, properties), (_node_label, new_properties) in zip( node_list, node_batch ): @@ -209,8 +209,18 @@ def _generate_embedding_vectors( if key in new_properties and new_properties[key] == value: del new_properties[key] node.properties.update(new_properties) - return input + return input_subgraph - def invoke(self, input: Input, **kwargs) -> List[Output]: - modified_input = self._generate_embedding_vectors(self.vectorizer, input) + def _invoke(self, input_subgraph: Input, **kwargs) -> List[Output]: + """ + Invokes the generation of embedding vectors for the input SubGraph. + + Args: + input_subgraph (Input): The SubGraph for which to generate embedding vectors. + **kwargs: Additional keyword arguments, currently unused but kept for potential future expansion. + + Returns: + List[Output]: A list containing the modified SubGraph with generated embedding vectors. + """ + modified_input = self._generate_embedding_vectors(input_subgraph) return [modified_input] diff --git a/kag/builder/component/writer/__init__.py b/kag/builder/component/writer/__init__.py index 3d92f23e..e69de29b 100644 --- a/kag/builder/component/writer/__init__.py +++ b/kag/builder/component/writer/__init__.py @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.builder.component.writer.kg_writer import KGWriter - -__all__ = [ - "KGWriter", -] diff --git a/kag/builder/component/writer/kg_writer.py b/kag/builder/component/writer/kg_writer.py index 155bf1bf..8b687b0d 100644 --- a/kag/builder/component/writer/kg_writer.py +++ b/kag/builder/component/writer/kg_writer.py @@ -9,14 +9,15 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
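The _generate_vectors helper in the batch vectorizer hunk above derives the batch count by ceiling division and embeds batch_size texts per call (with the whole pass wrapped in a tenacity retry at the _generate_embedding_vectors level). A tiny self-check of that slicing arithmetic, written against the same convention:

# Self-check of the ceil-division batching used by _generate_vectors above:
# batch_size texts per call, plus a shorter final batch when needed.
def batch_slices(n_texts: int, batch_size: int = 32):
    if n_texts % batch_size == 0:
        n_batches = n_texts // batch_size
    else:
        n_batches = n_texts // batch_size + 1
    return [(i * batch_size, min((i + 1) * batch_size, n_texts)) for i in range(n_batches)]

assert batch_slices(64) == [(0, 32), (32, 64)]
assert batch_slices(65) == [(0, 32), (32, 64), (64, 65)]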
+import json import logging -import os from enum import Enum from typing import Type, Dict, List -from knext.graph_algo.client import GraphAlgoClient +from knext.graph.client import GraphClient from kag.builder.model.sub_graph import SubGraph -from kag.interface.builder.writer_abc import SinkWriterABC +from kag.interface import SinkWriterABC +from kag.common.conf import KAG_PROJECT_CONF from knext.common.base.runnable import Input, Output logger = logging.getLogger(__name__) @@ -27,19 +28,30 @@ class AlterOperationEnum(str, Enum): Delete = "DELETE" +@SinkWriterABC.register("kg", as_default=True) +@SinkWriterABC.register("kg_writer", as_default=True) class KGWriter(SinkWriterABC): """ - A class that extends `SinkWriter` to handle writing data into a Neo4j knowledge graph. + A class for writing SubGraphs to a Knowledge Graph (KG) storage. - This class is responsible for configuring the graph store based on environment variables and - an optional project ID, initializing the Neo4j client, and setting up the schema. - It also manages semantic indexing and multi-threaded operations. + This class inherits from SinkWriterABC and provides the functionality to write SubGraphs + to a Knowledge Graph storage system. It supports operations like upsert and delete. """ - def __init__(self, project_id: str = None, **kwargs): + def __init__(self, project_id: int = None, **kwargs): + """ + Initializes the KGWriter with the specified project ID. + + Args: + project_id (int): The ID of the project associated with the KG. Defaults to None. + **kwargs: Additional keyword arguments passed to the superclass. + """ super().__init__(**kwargs) - self.project_id = project_id or os.getenv("KAG_PROJECT_ID") - self.client = GraphAlgoClient(project_id=project_id) + if project_id is None: + self.project_id = KAG_PROJECT_CONF.project_id + else: + self.project_id = project_id + self.client = GraphClient(project_id=self.project_id) @property def input_types(self) -> Type[Input]: @@ -49,25 +61,84 @@ def input_types(self) -> Type[Input]: def output_types(self) -> Type[Output]: return None + def format_label(self, label: str): + """ + Formats the label by adding the project namespace if it is not already present. + + Args: + label (str): The label to be formatted. + + Returns: + str: The formatted label. + """ + namespace = KAG_PROJECT_CONF.namespace + if label.split(".")[0] == namespace: + return label + return f"{namespace}.{label}" + + def standarlize_graph(self, graph): + for node in graph.nodes: + node.label = self.format_label(node.label) + for edge in graph.edges: + edge.from_type = self.format_label(edge.from_type) + edge.to_type = self.format_label(edge.to_type) + + for node in graph.nodes: + for k, v in node.properties.items(): + if k.startswith("_"): + continue + if not isinstance(v, str): + node.properties[k] = json.dumps(v, ensure_ascii=False) + for edge in graph.edges: + for k, v in edge.properties.items(): + if k.startswith("_"): + continue + if not isinstance(v, str): + edge.properties[k] = json.dumps(v, ensure_ascii=False) + + return graph + def invoke( - self, input: Input, alter_operation: str = AlterOperationEnum.Upsert, lead_to_builder: bool = False + self, + input: Input, + alter_operation: str = AlterOperationEnum.Upsert, + lead_to_builder: bool = False, + **kwargs, ) -> List[Output]: """ Invokes the specified operation (upsert or delete) on the graph store. Args: input (Input): The input object representing the subgraph to operate on.
- alter_operation (str): The type of operation to perform (Upsert or Delete). - lead_to_builder (str): enable lead to event infer builder + alter_operation (str): The type of operation to perform (Upsert or Delete). Defaults to Upsert. + lead_to_builder (bool): Enable lead to event infer builder. Defaults to False. Returns: List[Output]: A list of output objects (currently always [None]). """ - self.client.write_graph(sub_graph=input.to_dict(), operation=alter_operation, lead_to_builder=lead_to_builder) - return [None] + + input = self.standarlize_graph(input) + logger.debug(f"final graph to write: {input}") + self.client.write_graph( + sub_graph=input.to_dict(), + operation=alter_operation, + lead_to_builder=lead_to_builder, + ) + return [input] def _handle(self, input: Dict, alter_operation: str, **kwargs): - """The calling interface provided for SPGServer.""" + """ + The calling interface provided for SPGServer. + + Args: + input (Dict): The input dictionary representing the subgraph to operate on. + alter_operation (str): The type of operation to perform (Upsert or Delete). + **kwargs: Additional keyword arguments. + + Returns: + None: This method currently returns None. + """ _input = self.input_types.from_dict(input) - _output = self.invoke(_input, alter_operation) + _output = self.invoke(_input, alter_operation) # noqa + return None diff --git a/kag/builder/default_chain.py b/kag/builder/default_chain.py index ab04aff9..1f7ea2aa 100644 --- a/kag/builder/default_chain.py +++ b/kag/builder/default_chain.py @@ -9,149 +9,182 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. - import logging -import importlib -import os - -from kag.builder.component import SPGTypeMapping, KGWriter -from kag.builder.component.extractor import KAGExtractor -from kag.builder.component.splitter import LengthSplitter -from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer -from knext.common.base.chain import Chain -from knext.builder.builder_chain_abc import BuilderChainABC +from concurrent.futures import ThreadPoolExecutor, as_completed +from kag.interface import ( + ReaderABC, + MappingABC, + ExtractorABC, + SplitterABC, + VectorizerABC, + PostProcessorABC, + SinkWriterABC, + KAGBuilderChain, +) +from kag.common.utils import generate_hash_id logger = logging.getLogger(__name__) -def get_reader(file_path: str): - file = os.path.basename(file_path) - suffix = file.split(".")[-1] - assert suffix.lower() in READER_MAPPING, f"{suffix} is not supported. 
Supported suffixes are: {list(READER_MAPPING.keys())}" - reader_path = READER_MAPPING.get(suffix.lower()) - mod_path, class_name = reader_path.rsplit('.', 1) - module = importlib.import_module(mod_path) - reader_class = getattr(module, class_name) - - return reader_class - - -READER_MAPPING = { - "csv": "kag.builder.component.reader.csv_reader.CSVReader", - "json": "kag.builder.component.reader.json_reader.JSONReader", - "txt": "kag.builder.component.reader.txt_reader.TXTReader", - "pdf": "kag.builder.component.reader.pdf_reader.PDFReader", - "docx": "kag.builder.component.reader.docx_reader.DocxReader", - "md": "kag.builder.component.reader.markdown_reader.MarkdownReader", -} - - -class DefaultStructuredBuilderChain(BuilderChainABC): +@KAGBuilderChain.register("structured") +@KAGBuilderChain.register("structured_builder_chain") +class DefaultStructuredBuilderChain(KAGBuilderChain): """ - A class representing a default SPG builder chain, used to import structured data based on schema definitions - - Steps: - 0. Initializing by a give SpgType name, which indicates the target of import. - 1. SourceReader: Reading structured dicts from a given file. - 2. SPGTypeMapping: Mapping source fields to the properties of target type, and assemble a sub graph. - By default, the same name mapping is used, which means importing the source field into a property with the same name. - 3. KGWriter: Writing sub graph into KG storage. - - Attributes: - spg_type_name (str): The name of the SPG type. + A class representing a default SPG builder chain, used to import structured data based on schema definitions. + It consists of a mapping component, a writer component, and an optional vectorizer component. """ - def __init__(self, spg_type_name: str, **kwargs): - super().__init__(**kwargs) - self.spg_type_name = spg_type_name - - def build(self, **kwargs): + def __init__( + self, + mapping: MappingABC, + writer: SinkWriterABC, + vectorizer: VectorizerABC = None, + ): """ - Builds the processing chain for the SPG. + Initializes the DefaultStructuredBuilderChain instance. Args: - **kwargs: Additional keyword arguments. - - Returns: - chain: The constructed processing chain. + mapping (MappingABC): The mapping component to be used. + writer (SinkWriterABC): The writer component to be used. + vectorizer (VectorizerABC, optional): The vectorizer component to be used. Defaults to None. """ - file_path = kwargs.get("file_path") - source = get_reader(file_path)(output_type="Dict") - mapping = SPGTypeMapping(spg_type_name=self.spg_type_name) - sink = KGWriter() - - chain = source >> mapping >> sink - return chain + self.mapping = mapping + self.writer = writer + self.vectorizer = vectorizer - def invoke(self, file_path, max_workers=10, **kwargs): - logger.info(f"begin processing file_path:{file_path}") + def build(self, **kwargs): """ - Invokes the processing chain with the given file path and optional parameters. + Construct the builder chain by connecting the mapping, vectorizer (if available), and writer components. Args: - file_path (str): The path to the input file. - max_workers (int, optional): The maximum number of workers. Defaults to 10. **kwargs: Additional keyword arguments. Returns: - The result of invoking the processing chain. + KAGBuilderChain: The constructed builder chain. 
""" - return super().invoke(file_path=file_path, max_workers=max_workers, **kwargs) + if self.vectorizer: + chain = self.mapping >> self.vectorizer >> self.writer + else: + chain = self.mapping >> self.writer + return chain -class DefaultUnstructuredBuilderChain(BuilderChainABC): - """ - A class representing a default KAG builder chain, used to extract graph from documents and import unstructured data. + # def get_component_with_ckpts(self): + # return [ + # self.mapping, + # self.vectorizer, + # self.writer, + # ] - Steps: - 0. Initializing. - 1. SourceReader: Reading chunks from a given file. - 2. LengthSplitter: Splitting chunk to smaller chunks. The chunk size can be adjusted through parameters. - 3. KAGExtractor: Extracting entities and relations from chunks, and assembling a sub graph. - By default,the extraction process includes NER and SPO Extraction. - 4. KGWriter: Writing sub graph into KG storage. + # def close_checkpointers(self): + # for node in self.get_component_with_ckpts(): + # if node and hasattr(node, "checkpointer"): + # node.checkpointer.close() - """ - def __init__(self, **kwargs): - super().__init__(**kwargs) +@KAGBuilderChain.register("unstructured") +@KAGBuilderChain.register("unstructured_builder_chain") +class DefaultUnstructuredBuilderChain(KAGBuilderChain): + """ + A class representing a default unstructured builder chain, used to build a knowledge graph from unstructured text data such as txt and pdf files. + It consists of a reader, splitter, extractor, vectorizer, optional post-processor, and writer components. + """ - def build(self, **kwargs) -> Chain: + def __init__( + self, + reader: ReaderABC, + splitter: SplitterABC, + extractor: ExtractorABC = None, + vectorizer: VectorizerABC = None, + writer: SinkWriterABC = None, + post_processor: PostProcessorABC = None, + ): """ - Builds the processing chain for the KAG. + Initializes the DefaultUnstructuredBuilderChain instance. Args: - **kwargs: Additional keyword arguments. - - Returns: - chain: The constructed processing chain. + reader (ReaderABC): The reader component to be used. + splitter (SplitterABC): The splitter component to be used. + extractor (ExtractorABC): The extractor component to be used. + vectorizer (VectorizerABC): The vectorizer component to be used. + writer (SinkWriterABC): The writer component to be used. + post_processor (PostProcessorABC, optional): The post-processor component to be used. Defaults to None. """ - file_path = kwargs.get("file_path") - split_length = kwargs.get("split_length") - window_length = kwargs.get("window_length") - source = get_reader(file_path)() - splitter = LengthSplitter(split_length, window_length) - extractor = KAGExtractor() - vectorizer = BatchVectorizer() - sink = KGWriter() - - chain = source >> splitter >> extractor >> vectorizer >> sink - return chain + self.reader = reader + self.splitter = splitter + self.extractor = extractor + self.vectorizer = vectorizer + self.post_processor = post_processor + self.writer = writer - def invoke(self, file_path: str, split_length: int = 500, window_length: int = 100, max_workers=10, **kwargs): - logger.info(f"begin processing file_path:{file_path}") + def build(self, **kwargs): + pass + + def invoke(self, input_data, max_workers=10, **kwargs): """ - Invokes the processing chain with the given file path and optional parameters. + Invokes the builder chain to process the input file. Args: - file_path (str): The path to the input file. - split_length (int, optional): The length at which the file should be split. 
Defaults to 500. - window_length (int, optional): The length of the processing window. Defaults to 100. - max_workers (int, optional): The maximum number of worker threads. Defaults to 10. - + input_data: The input data (e.g., a file path) to be processed. + max_workers (int, optional): The maximum number of threads to use. Defaults to 10. **kwargs: Additional keyword arguments. Returns: - The result of invoking the processing chain. + List: The final output from the builder chain. """ - return super().invoke(file_path=file_path, max_workers=max_workers, split_length=window_length, window_length=window_length, **kwargs) + + def execute_node(node, node_input, **kwargs): + if not isinstance(node_input, list): + node_input = [node_input] + node_output = [] + for item in node_input: + node_output.extend(node.invoke(item, **kwargs)) + return node_output + + def run_extract(chunk): + flow_data = [chunk] + input_key = chunk.hash_key + for node in [ + self.extractor, + self.vectorizer, + self.post_processor, + self.writer, + ]: + if node is None: + continue + flow_data = execute_node(node, flow_data, key=input_key) + return {input_key: flow_data[0]} + + reader_output = self.reader.invoke(input_data, key=generate_hash_id(input_data)) + splitter_output = [] + + for chunk in reader_output: + splitter_output.extend(self.splitter.invoke(chunk, key=chunk.hash_key)) + + processed_chunk_keys = kwargs.get("processed_chunk_keys", set()) + filtered_chunks = [] + processed = 0 + for chunk in splitter_output: + if chunk.hash_key not in processed_chunk_keys: + filtered_chunks.append(chunk) + else: + processed += 1 + logger.debug( + f"Total chunks: {len(splitter_output)}. Checkpointed: {processed}, Pending: {len(filtered_chunks)}." + ) + result = [] + with ThreadPoolExecutor(max_workers) as executor: + futures = [executor.submit(run_extract, chunk) for chunk in filtered_chunks] + + from tqdm import tqdm + + for inner_future in tqdm( + as_completed(futures), + total=len(futures), + desc="KAG Extraction From Chunk", + position=1, + leave=False, + ): + ret = inner_future.result() + result.append(ret) + return result diff --git a/kag/builder/model/chunk.py b/kag/builder/model/chunk.py index a5db11c3..526fffcd 100644 --- a/kag/builder/model/chunk.py +++ b/kag/builder/model/chunk.py @@ -9,9 +9,10 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -import hashlib from enum import Enum from typing import Dict, Any +from kag.common.utils import generate_hash_id +import json class ChunkTypeEnum(str, Enum): @@ -26,29 +27,27 @@ def __init__( name: str, content: str, type: ChunkTypeEnum = ChunkTypeEnum.Text, - **kwargs + **kwargs, ): self.id = id self.name = name self.type = type self.content = content self.kwargs = kwargs + for key, value in kwargs.items(): + setattr(self, key, value) - @staticmethod - def generate_hash_id(value): - if isinstance(value, str): - value = value.encode("utf-8") - hasher = hashlib.sha256() - hasher.update(value) - return hasher.hexdigest() + @property + def hash_key(self): + return generate_hash_id(f"{self.id}{self.name}{self.content}") def __str__(self): tmp = { "id": self.id, "name": self.name, - "content": self.content - if len(self.content) <= 64 - else self.content[:64] + " ...", + "content": ( + self.content if len(self.content) <= 64 else self.content[:64] + " ..."
+ ), } return f": {tmp}" @@ -59,7 +58,9 @@ def to_dict(self): "id": self.id, "name": self.name, "content": self.content, - "type": self.type.value if isinstance(self.type, ChunkTypeEnum) else self.type, + "type": ( + self.type.value if isinstance(self.type, ChunkTypeEnum) else self.type + ), "properties": self.kwargs, } @@ -72,3 +73,10 @@ def from_dict(cls, input_: Dict[str, Any]): type=input_.get("type"), **input_.get("properties", {}), ) + + +def dump_chunks(chunks, **kwargs): + if kwargs.get("output_path"): + with open(kwargs.get("output_path"), "w") as f: + for chunk in chunks: + f.write(json.dumps(chunk.to_dict(), ensure_ascii=False) + "\n") diff --git a/kag/builder/model/spg_record.py b/kag/builder/model/spg_record.py index 5c5b6825..f737dc00 100644 --- a/kag/builder/model/spg_record.py +++ b/kag/builder/model/spg_record.py @@ -23,145 +23,165 @@ class SPGRecord: """Data structure in operator, used to store entity information.""" def __init__(self, spg_type_name: SPGTypeName): + """ + Initializes a new instance of the SPGRecord class. + + Args: + spg_type_name (SPGTypeName): The type name of the SPG entity. + """ self._spg_type_name = spg_type_name self._properties = {} self._relations = {} @property def id(self) -> str: + """ + Gets the ID of the SPGRecord. + + Returns: + str: The ID of the SPGRecord. + """ return self.get_property("id", "") @property def name(self) -> str: + """ + Gets the name of the SPGRecord. + + Returns: + str: The name of the SPGRecord. + """ return self.get_property("name", self.id) @property def spg_type_name(self) -> SPGTypeName: - """Gets the spg_type_name of this SPGRecord. # noqa: E501 - + """ + Gets the SPG type name of this SPGRecord. - :return: The spg_type_name of this SPGRecord. # noqa: E501 - :rtype: str + Returns: + SPGTypeName: The SPG type name of this SPGRecord. """ return self._spg_type_name @spg_type_name.setter def spg_type_name(self, spg_type_name: SPGTypeName): - """Sets the spg_type_name of this SPGRecord. - + """ + Sets the SPG type name of this SPGRecord. - :param spg_type_name: The spg_type_name of this SPGRecord. # noqa: E501 - :type: str + Args: + spg_type_name (SPGTypeName): The SPG type name of this SPGRecord. """ self._spg_type_name = spg_type_name @property def properties(self) -> Dict[PropertyName, str]: - """Gets the properties of this SPGRecord. # noqa: E501 - + """ + Gets the properties of this SPGRecord. - :return: The properties of this SPGRecord. # noqa: E501 - :rtype: dict + Returns: + Dict[PropertyName, str]: The properties of this SPGRecord. """ return self._properties @properties.setter def properties(self, properties: Dict[PropertyName, str]): - """Sets the properties of this SPGRecord. - + """ + Sets the properties of this SPGRecord. - :param properties: The properties of this SPGRecord. # noqa: E501 - :type: dict + Args: + properties (Dict[PropertyName, str]): The properties of this SPGRecord. """ self._properties = properties @property def relations(self) -> Dict[str, str]: - """Gets the relations of this SPGRecord. # noqa: E501 - + """ + Gets the relations of this SPGRecord. - :return: The relations of this SPGRecord. # noqa: E501 - :rtype: dict + Returns: + Dict[str, str]: The relations of this SPGRecord. """ return self._relations @relations.setter def relations(self, relations: Dict[str, str]): - """Sets the properties of this SPGRecord. - + """ + Sets the relations of this SPGRecord. - :param relations: The relations of this SPGRecord. 
# noqa: E501 - :type: dict + Args: + relations (Dict[str, str]): The relations of this SPGRecord. """ self._relations = relations def get_property( self, property_name: PropertyName, default_value: str = None ) -> str: - """Gets a property of this SPGRecord by name. # noqa: E501 + """ + Gets a property of this SPGRecord by name. + Args: + property_name (PropertyName): The property name. + default_value (str, optional): If the property value is None, the default_value will be returned. Defaults to None. - :param property_name: The property name. # noqa: E501 - :param default_value: If property value is None, the default_value will be return. # noqa: E501 - :return: A property value. # noqa: E501 - :rtype: str + Returns: + str: The property value. """ return self.properties.get(property_name, default_value) def upsert_property(self, property_name: PropertyName, value: str): - """Upsert a property of this SPGRecord. # noqa: E501 - + """ + Upserts a property of this SPGRecord. - :param property_name: The updated property name. # noqa: E501 - :param value: The updated property value. # noqa: E501 - :type: str + Args: + property_name (PropertyName): The updated property name. + value (str): The updated property value. """ self.properties[property_name] = value return self def append_property(self, property_name: PropertyName, value: str): - """Append a property of this SPGRecord. # noqa: E501 - + """ + Appends a property of this SPGRecord. - :param property_name: The updated property name. # noqa: E501 - :param value: The updated property value. # noqa: E501 - :type: str + Args: + property_name (PropertyName): The updated property name. + value (str): The updated property value. """ property_value = self.get_property(property_name) if property_value: - property_value_list = property_value.split(',') + property_value_list = property_value.split(",") if value not in property_value_list: - self.properties[property_name] = property_value + ',' + value + self.properties[property_name] = property_value + "," + value else: self.properties[property_name] = value return self def upsert_properties(self, properties: Dict[PropertyName, str]): - """Upsert properties of this SPGRecord. # noqa: E501 - + """ + Upserts properties of this SPGRecord. - :param properties: The updated properties. # noqa: E501 - :type: dict + Args: + properties (Dict[PropertyName, str]): The updated properties. """ self.properties.update(properties) return self def remove_property(self, property_name: PropertyName): - """Removes a property of this SPGRecord. # noqa: E501 - + """ + Removes a property of this SPGRecord. - :param property_name: The property name. # noqa: E501 - :type: str + Args: + property_name (PropertyName): The property name. """ self.properties.pop(property_name) return self def remove_properties(self, property_names: List[PropertyName]): - """Removes properties by given names. # noqa: E501 - + """ + Removes properties by given names. - :param property_names: A list of property names. # noqa: E501 - :type: list + Args: + property_names (List[PropertyName]): A list of property names. """ for property_name in property_names: self.properties.pop(property_name) @@ -173,37 +193,39 @@ def get_relation( object_type_name: SPGTypeName, default_value: str = None, ) -> str: - """Gets a relation of this SPGRecord by name. # noqa: E501 + """ + Gets a relation of this SPGRecord by name. + Args: + relation_name (RelationName): The relation name. + object_type_name (SPGTypeName): The object SPG type name. 
+ default_value (str, optional): If the relation value is None, the default_value will be returned. Defaults to None. - :param relation_name: The relation name. # noqa: E501 - :param object_type_name: The object SPG type name. # noqa: E501 - :param default_value: If property value is None, the default_value will be return. # noqa: E501 - :return: A relation value. # noqa: E501 - :rtype: str + Returns: + str: The relation value. """ return self.relations.get(relation_name + "#" + object_type_name, default_value) def upsert_relation( self, relation_name: RelationName, object_type_name: SPGTypeName, value: str ): - """Upsert a relation of this SPGRecord. # noqa: E501 - + """ + Upserts a relation of this SPGRecord. - :param relation_name: The updated relation name. # noqa: E501 - :param object_type_name: The object SPG type name. # noqa: E501 - :param value: The updated relation value. # noqa: E501 - :type: str + Args: + relation_name (RelationName): The updated relation name. + object_type_name (SPGTypeName): The object SPG type name. + value (str): The updated relation value. """ self.relations[relation_name + "#" + object_type_name] = value return self def upsert_relations(self, relations: Dict[Tuple[RelationName, SPGTypeName], str]): - """Upsert relations of this SPGRecord. # noqa: E501 - + """ + Upserts relations of this SPGRecord. - :param relations: The updated relations. # noqa: E501 - :type: dict + Args: + relations (Dict[Tuple[RelationName, SPGTypeName], str]): The updated relations. """ for (relation_name, object_type_name), value in relations.items(): self.relations[relation_name + "#" + object_type_name] = value @@ -212,33 +234,43 @@ def upsert_relations(self, relations: Dict[Tuple[RelationName, SPGTypeName], str def remove_relation( self, relation_name: RelationName, object_type_name: SPGTypeName ): - """Removes a relation of this SPGRecord. # noqa: E501 - + """ + Removes a relation of this SPGRecord. - :param relation_name: The relation name. # noqa: E501 - :param object_type_name: The object SPG type name. # noqa: E501 - :type: str + Args: + relation_name (RelationName): The relation name. + object_type_name (SPGTypeName): The object SPG type name. """ self.relations.pop(relation_name + "#" + object_type_name) return self def remove_relations(self, relation_names: List[Tuple[RelationName, SPGTypeName]]): - """Removes relations by given names. # noqa: E501 - + """ + Removes relations by given names. - :param relation_names: A list of relation names. # noqa: E501 - :type: list + Args: + relation_names (List[Tuple[RelationName, SPGTypeName]]): A list of relation names. """ - for (relation_name, object_type_name) in relation_names: + for relation_name, object_type_name in relation_names: self.relations.pop(relation_name + "#" + object_type_name) return self def to_str(self): - """Returns the string representation of the model""" + """ + Returns the string representation of the model. + + Returns: + str: The string representation of the model. + """ return pprint.pformat(self.__dict__()) def to_dict(self): - """Returns the model properties as a dict""" + """ + Returns the model properties as a dict. + + Returns: + dict: The model properties as a dict. + """ return { "spgTypeName": self.spg_type_name, @@ -249,7 +281,12 @@ def to_dict(self): } def __dict__(self): - """Returns this SPGRecord as a dict""" + """ + Returns this SPGRecord as a dict. + + Returns: + dict: This SPGRecord as a dict. 
+ """ return { "spgTypeName": self.spg_type_name, "properties": self.properties, @@ -258,7 +295,15 @@ def __dict__(self): @classmethod def from_dict(cls, input: Dict[str, Any]): - """Returns the model from a dict""" + """ + Returns the model from a dict. + + Args: + input (Dict[str, Any]): The input dictionary. + + Returns: + SPGRecord: The model from the input dictionary. + """ spg_type_name = input.get("spgTypeName") _cls = cls(spg_type_name) properties = input.get("properties") @@ -272,5 +317,10 @@ def from_dict(cls, input: Dict[str, Any]): return _cls def __repr__(self): - """For `print` and `pprint`""" + """ + For `print` and `pprint`. + + Returns: + str: The string representation of the model. + """ return pprint.pformat(self.__dict__()) diff --git a/kag/builder/model/sub_graph.py b/kag/builder/model/sub_graph.py index ff4ebb7f..ccc4c5c6 100644 --- a/kag/builder/model/sub_graph.py +++ b/kag/builder/model/sub_graph.py @@ -10,10 +10,11 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. import pprint +import copy from typing import Dict, List, Any -from knext.schema.client import BASIC_TYPES from kag.builder.model.spg_record import SPGRecord +from knext.schema.client import BASIC_TYPES from knext.schema.model.base import BaseSpgType @@ -41,14 +42,14 @@ def from_spg_record(cls, idx, spg_record: SPGRecord): @staticmethod def unique_key(spg_record): - return spg_record.spg_type_name + '_' + spg_record.get_property("name", "") + return spg_record.spg_type_name + "_" + spg_record.get_property("name", "") def to_dict(self): return { "id": self.id, "name": self.name, "label": self.label, - "properties": self.properties, + "properties": copy.deepcopy(self.properties), } @classmethod @@ -57,11 +58,15 @@ def from_dict(cls, input: Dict): _id=input["id"], name=input["name"], label=input["label"], - properties=input["properties"], + properties=input.get("properties", {}), ) def __eq__(self, other): - return self.name == other.name and self.label == other.label and self.properties == other.properties + return ( + self.name == other.name + and self.label == other.label + and self.properties == other.properties + ) class Edge(object): @@ -74,7 +79,12 @@ class Edge(object): properties: Dict[str, str] def __init__( - self, _id: str, from_node: Node, to_node: Node, label: str, properties: Dict[str, str] + self, + _id: str, + from_node: Node, + to_node: Node, + label: str, + properties: Dict[str, str], ): self.from_id = from_node.id self.from_type = from_node.label @@ -88,12 +98,19 @@ def __init__( @classmethod def from_spg_record( - cls, s_idx, subject_record: SPGRecord, o_idx, object_record: SPGRecord, label: str + cls, + s_idx, + subject_record: SPGRecord, + o_idx, + object_record: SPGRecord, + label: str, ): from_node = Node.from_spg_record(s_idx, subject_record) to_node = Node.from_spg_record(o_idx, object_record) - return cls(_id="", from_node=from_node, to_node=to_node, label=label, properties={}) + return cls( + _id="", from_node=from_node, to_node=to_node, label=label, properties={} + ) def to_dict(self): return { @@ -103,21 +120,35 @@ def to_dict(self): "fromType": self.from_type, "toType": self.to_type, "label": self.label, - "properties": self.properties, + "properties": copy.deepcopy(self.properties), } @classmethod def from_dict(cls, input: Dict): return cls( _id=input["id"], - from_node=Node(_id=input["from"], name=input["from"],label=input["fromType"], properties={}), - to_node=Node(_id=input["to"], name=input["to"], 
label=input["toType"], properties={}), + from_node=Node( + _id=input["from"], + name=input["from"], + label=input["fromType"], + properties={}, + ), + to_node=Node( + _id=input["to"], name=input["to"], label=input["toType"], properties={} + ), label=input["label"], - properties=input["properties"], + properties=input.get("properties", {}), ) def __eq__(self, other): - return self.from_id == other.from_id and self.to_id == other.to_id and self.label == other.label and self.properties == other.properties and self.from_type == other.from_type and self.to_type == other.to_type + return ( + self.from_id == other.from_id + and self.to_id == other.to_id + and self.label == other.label + and self.properties == other.properties + and self.from_type == other.from_type + and self.to_type == other.to_type + ) class SubGraph(object): @@ -135,12 +166,18 @@ def add_node(self, id: str, name: str, label: str, properties=None): self.nodes.append(Node(_id=id, name=name, label=label, properties=properties)) return self - def add_edge(self, s_id: str, s_label: str, p: str, o_id: str, o_label: str, properties=None): + def add_edge( + self, s_id: str, s_label: str, p: str, o_id: str, o_label: str, properties=None + ): if not properties: properties = dict() s_node = Node(_id=s_id, name=s_id, label=s_label, properties={}) o_node = Node(_id=o_id, name=o_id, label=o_label, properties={}) - self.edges.append(Edge(_id="", from_node=s_node, to_node=o_node, label=p, properties=properties)) + self.edges.append( + Edge( + _id="", from_node=s_node, to_node=o_node, label=p, properties=properties + ) + ) return self def to_dict(self): @@ -152,7 +189,7 @@ def to_dict(self): def __repr__(self): return pprint.pformat(self.to_dict()) - def merge(self, sub_graph: 'SubGraph'): + def merge(self, sub_graph: "SubGraph"): self.nodes.extend(sub_graph.nodes) self.edges.extend(sub_graph.edges) @@ -164,21 +201,30 @@ def from_spg_record( for record in spg_records: s_id = record.id s_name = record.name - s_label = record.spg_type_name.split('.')[-1] + s_label = record.spg_type_name.split(".")[-1] properties = record.properties spg_type = spg_types.get(record.spg_type_name) for prop_name, prop_value in record.properties.items(): if prop_name in spg_type.properties: from knext.schema.model.property import Property + prop: Property = spg_type.properties.get(prop_name) - o_label = prop.object_type_name.split('.')[-1] + o_label = prop.object_type_name.split(".")[-1] if o_label not in BASIC_TYPES: - prop_value_list = prop_value.split(',') + prop_value_list = prop_value.split(",") for o_id in prop_value_list: - sub_graph.add_edge(s_id=s_id, s_label=s_label, p=prop_name, o_id=o_id, o_label=o_label) + sub_graph.add_edge( + s_id=s_id, + s_label=s_label, + p=prop_name, + o_id=o_id, + o_label=o_label, + ) properties.pop(prop_name) - sub_graph.add_node(id=s_id, name=s_name, label=s_label, properties=properties) + sub_graph.add_node( + id=s_id, name=s_name, label=s_label, properties=properties + ) return sub_graph diff --git a/kag/builder/operator/__init__.py b/kag/builder/operator/__init__.py index 123acd8d..93aa6cd4 100644 --- a/kag/builder/operator/__init__.py +++ b/kag/builder/operator/__init__.py @@ -9,4 +9,3 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
- diff --git a/kag/builder/prompt/__init__.py b/kag/builder/prompt/__init__.py index e69de29b..ad9e9bd2 100644 --- a/kag/builder/prompt/__init__.py +++ b/kag/builder/prompt/__init__.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +from kag.builder.prompt.default.ner import OpenIENERPrompt as DefaultOpenIENERPrompt +from kag.builder.prompt.default.std import ( + OpenIEEntitystandardizationdPrompt as DefaultOpenIEEntitystandardizationdPrompt, +) +from kag.builder.prompt.default.triple import ( + OpenIETriplePrompt as DefaultOpenIETriplePrompt, +) + +from kag.builder.prompt.medical.ner import OpenIENERPrompt as MedicalOpenIENERPrompt +from kag.builder.prompt.medical.std import ( + OpenIEEntitystandardizationdPrompt as MedicalOpenIEEntitystandardizationdPrompt, +) +from kag.builder.prompt.medical.triple import ( + OpenIETriplePrompt as MedicalOpenIETriplePrompt, +) + +from kag.builder.prompt.analyze_table_prompt import AnalyzeTablePrompt +from kag.builder.prompt.spg_prompt import SPGPrompt, SPGEntityPrompt, SPGEventPrompt +from kag.builder.prompt.semantic_seg_prompt import SemanticSegPrompt +from kag.builder.prompt.outline_prompt import OutlinePrompt + + +__all__ = [ + "DefaultOpenIENERPrompt", + "DefaultOpenIEEntitystandardizationdPrompt", + "DefaultOpenIETriplePrompt", + "MedicalOpenIENERPrompt", + "MedicalOpenIEEntitystandardizationdPrompt", + "MedicalOpenIETriplePrompt", + "AnalyzeTablePrompt", + "OutlinePrompt", + "SemanticSegPrompt", + "SPGPrompt", + "SPGEntityPrompt", + "SPGEventPrompt", +] diff --git a/kag/builder/prompt/analyze_table_prompt.py b/kag/builder/prompt/analyze_table_prompt.py index 00b9ade0..cda19aa2 100644 --- a/kag/builder/prompt/analyze_table_prompt.py +++ b/kag/builder/prompt/analyze_table_prompt.py @@ -13,34 +13,24 @@ import json import logging -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC logger = logging.getLogger(__name__) - - -class AnalyzeTablePrompt(PromptOp): +@PromptABC.register("analyze_table") +class AnalyzeTablePrompt(PromptABC): template_zh: str = """你是一个分析表格的专家, 从table中提取信息并分析,最后返回表格有效信息""" template_en: str = """You are an expert in knowledge graph extraction. Based on the schema defined by the constraint, extract all entities and their attributes from the input. Return NAN for attributes not explicitly mentioned in the input. 
Output the results in standard JSON format, as a list.""" - def __init__( - self, - language: str = "zh", - ): - super().__init__( - language=language, - ) - def build_prompt(self, variables) -> str: return json.dumps( { "instruction": self.template, - "table": variables.get("table",""), + "table": variables.get("table", ""), }, ensure_ascii=False, ) def parse_response(self, response: str, **kwargs): return response - diff --git a/kag/builder/prompt/default/ner.py b/kag/builder/prompt/default/ner.py index 1cc92310..66709a6d 100644 --- a/kag/builder/prompt/default/ner.py +++ b/kag/builder/prompt/default/ner.py @@ -12,66 +12,66 @@ import json from string import Template -from typing import List, Optional - -from kag.common.base.prompt_op import PromptOp +from typing import List +from kag.common.conf import KAG_PROJECT_CONF +from kag.interface import PromptABC from knext.schema.client import SchemaClient -class OpenIENERPrompt(PromptOp): - +@PromptABC.register("default_ner") +class OpenIENERPrompt(PromptABC): template_en = """ { "instruction": "You're a very effective entity extraction system. Please extract all the entities that are important for knowledge build and question, along with type, category and a brief description of the entity. The description of the entity is based on your OWN KNOWLEDGE AND UNDERSTANDING and does not need to be limited to the context. the entity's category belongs taxonomically to one of the items defined by schema, please also output the category. Note: Type refers to a specific, well-defined classification, such as Professor, Actor, while category is a broader group or class that may contain more than one type, such as Person, Works. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.You can refer to the example for extraction.", "schema": $schema, "example": [ { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", + "input": "The Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger. It stars Dougray Scott, Jessica De Gouw and Martin McCann. After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport. When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", "output": [ { - "entity": "The Rezort", + "name": "The Rezort", "type": "Movie", "category": "Works", "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger." }, { - "entity": "2015", + "name": "2015", "type": "Year", "category": "Date", "description": "The year the movie 'The Rezort' was released." }, { - "entity": "British", + "name": "British", "type": "Nationality", "category": "GeographicLocation", "description": "Great Britain, the island that includes England, Scotland, and Wales." }, { - "entity": "Steve Barker", + "name": "Steve Barker", "type": "Director", "category": "Person", "description": "Steve Barker is an English film director and screenwriter." 
}, { - "entity": "Paul Gerstenberger", + "name": "Paul Gerstenberger", "type": "Writer", "category": "Person", "description": "Paul is a writer and producer, known for The Rezort (2015), Primeval (2007) and House of Anubis (2011)." }, { - "entity": "Dougray Scott", + "name": "Dougray Scott", "type": "Actor", "category": "Person", "description": "Stephen Dougray Scott (born 26 November 1965) is a Scottish actor." }, { - "entity": "Jessica De Gouw", + "name": "Jessica De Gouw", "type": "Actor", "category": "Person", "description": "Jessica Elise De Gouw (born 15 February 1988) is an Australian actress. " }, { - "entity": "Martin McCann", + "name": "Martin McCann", "type": "Actor", "category": "Person", "description": "Martin McCann is an actor from Northern Ireland. In 2020, he was listed as number 48 on The Irish Times list of Ireland's greatest film actors" @@ -89,52 +89,52 @@ class OpenIENERPrompt(PromptOp): "schema": $schema, "example": [ { - "input": "《Rezort》\n《Rezort》是一部 2015 年英国僵尸恐怖片,由史蒂夫·巴克执导,保罗·格斯滕伯格编剧。\n 该片由道格瑞·斯科特、杰西卡·德·古维和马丁·麦凯恩主演。\n 在人类赢得与僵尸的毁灭性战争后,剩下的少数不死生物被关在一个安全的岛屿上,在那里他们被猎杀作为消遣。\n 当岛上的安全出现问题时,客人们必须面对新一轮疫情爆发的可能性。", + "input": "《Rezort》是一部 2015年英国僵尸恐怖片,由史蒂夫·巴克执导,保罗·格斯滕伯格编剧。该片由道格瑞·斯科特、杰西卡·德·古维和马丁·麦凯恩主演。在人类赢得与僵尸的毁灭性战争后,剩下的少数不死生物被关在一个安全的岛屿上,在那里他们被猎杀作为消遣。当岛上的安全出现问题时,客人们必须面对新一轮疫情爆发的可能性。", "output": [ { - "entity": "The Rezort", + "name": "The Rezort", "type": "Movie", "category": "Works", "description": "一部 2015 年英国僵尸恐怖片,由史蒂夫·巴克执导,保罗·格斯滕伯格编剧。" }, { - "entity": "2015", + "name": "2015", "type": "Year", "category": "Date", "description": "电影《The Rezort》上映的年份。" }, { - "entity": "英国", + "name": "英国", "type": "Nationality", "category": "GeographicLocation", "description": "大不列颠,包括英格兰、苏格兰和威尔士的岛屿。" }, { - "entity": "史蒂夫·巴克", + "name": "史蒂夫·巴克", "type": "Director", "category": "Person", "description": "史蒂夫·巴克 是一名英国电影导演和剧作家" }, { - "entity": "保罗·格斯滕伯格", + "name": "保罗·格斯滕伯格", "type": "Writer", "category": "Person", "description": "保罗·格斯滕伯格 (Paul Gerstenberger) 是一名作家和制片人,因《The Rezort》(2015 年)、《Primeval》(2007 年)和《House of Anubis》(2011 年)而闻名。" }, { - "entity": "道格雷·斯科特", + "name": "道格雷·斯科特", "type": "Actor", "category": "Person", "description": "斯蒂芬·道格雷·斯科特 (Stephen Dougray Scott,1965 年 11 月 26 日出生) 是一位苏格兰演员。" }, { - "entity": "杰西卡·德·古维", + "name": "杰西卡·德·古维", "type": "Actor", "category": "Person", "description": "杰西卡·伊莉斯·德·古维 (Jessica Elise De Gouw,1988 年 2 月 15 日出生) 是一位澳大利亚女演员。" }, { - "entity": "马丁·麦肯", + "name": "马丁·麦肯", "type": "Actor", "category": "Person", "description": "马丁·麦肯是来自北爱尔兰的演员。2020 年,他在《爱尔兰时报》爱尔兰最伟大电影演员名单中排名第 48 位" @@ -146,12 +146,14 @@ class OpenIENERPrompt(PromptOp): } """ - def __init__( - self, language: Optional[str] = "en", **kwargs - ): + def __init__(self, language: str = "", **kwargs): super().__init__(language, **kwargs) - self.schema = SchemaClient(project_id=self.project_id).extract_types() - self.template = Template(self.template).safe_substitute(schema=self.schema) + self.schema = SchemaClient( + project_id=KAG_PROJECT_CONF.project_id + ).extract_types() + self.template = Template(self.template).safe_substitute( + schema=json.dumps(self.schema) + ) @property def template_variables(self) -> List[str]: diff --git a/kag/builder/prompt/default/std.py b/kag/builder/prompt/default/std.py index d56f0090..8045a582 100644 --- a/kag/builder/prompt/default/std.py +++ b/kag/builder/prompt/default/std.py @@ -11,65 +11,66 @@ # or implied. 
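`OpenIENERPrompt` fills its template in two stages: `$schema` is baked in once at construction time via `Template.safe_substitute`, which leaves the still-unknown `$input` placeholder intact for later substitution. A self-contained sketch of that behaviour; the second substitution stands in for what `PromptABC.build_prompt` presumably does per call, so treat it as an assumption:

```python
import json
from string import Template

# Toy template with the same two placeholders as the NER prompt.
raw = '{"instruction": "...", "schema": $schema, "input": "$input"}'

# Stand-in for SchemaClient(project_id=...).extract_types().
schema = ["Person", "Works", "Date"]

# Stage 1 (at __init__): bake the schema in; safe_substitute leaves
# placeholders it was not given untouched instead of raising KeyError.
baked = Template(raw).safe_substitute(schema=json.dumps(schema))
assert "$input" in baked

# Stage 2 (per call): fill the user input.
prompt = Template(baked).safe_substitute(input="The Rezort is a 2015 film.")
print(prompt)
```

Using `safe_substitute` rather than `substitute` is what makes the two-stage filling possible: a plain `substitute` would fail at stage 1 because `$input` is not yet known.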
import json -from typing import Optional, List +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIEEntitystandardizationdPrompt(PromptOp): +@PromptABC.register("default_std") +class OpenIEEntitystandardizationdPrompt(PromptABC): template_en = """ { "instruction": "The `input` field contains a user provided context. The `named_entities` field contains extracted named entities from the context, which may be unclear abbreviations, aliases, or slang. To eliminate ambiguity, please attempt to provide the official names of these entities based on the context and your own knowledge. Note that entities with the same meaning can only have ONE official name. Please respond in the format of a single JSONArray string without any explanation, as shown in the `output` field of the provided example.", "example": { - "input": "American History\nWhen did the political party that favored harsh punishment of southern states after the Civil War, gain control of the House? Republicans regained control of the chamber they had lost in the 2006 midterm elections.", + "input": "American History.When did the political party that favored harsh punishment of southern states after the Civil War, gain control of the House? Republicans regained control of the chamber they had lost in the 2006 midterm elections.", "named_entities": [ - {"entity": "American", "category": "GeographicLocation"}, - {"entity": "political party", "category": "Organization"}, - {"entity": "southern states", "category": "GeographicLocation"}, - {"entity": "Civil War", "category": "Keyword"}, - {"entity": "House", "category": "Organization"}, - {"entity": "Republicans", "category": "Organization"}, - {"entity": "chamber", "category": "Organization"}, - {"entity": "2006 midterm elections", "category": "Date"} + {"name": "American", "category": "GeographicLocation"}, + {"name": "political party", "category": "Organization"}, + {"name": "southern states", "category": "GeographicLocation"}, + {"name": "Civil War", "category": "Keyword"}, + {"name": "House", "category": "Organization"}, + {"name": "Republicans", "category": "Organization"}, + {"name": "chamber", "category": "Organization"}, + {"name": "2006 midterm elections", "category": "Date"} ], "output": [ { - "entity": "American", + "name": "American", "category": "GeographicLocation", "official_name": "United States of America" }, { - "entity": "political party", + "name": "political party", "category": "Organization", "official_name": "Radical Republicans" }, { - "entity": "southern states", + "name": "southern states", "category": "GeographicLocation", "official_name": "Confederacy" }, { - "entity": "Civil War", + "name": "Civil War", "category": "Keyword", "official_name": "American Civil War" }, { - "entity": "House", + "name": "House", "category": "Organization", "official_name": "United States House of Representatives" }, { - "entity": "Republicans", + "name": "Republicans", "category": "Organization", "official_name": "Republican Party" }, { - "entity": "chamber", + "name": "chamber", "category": "Organization", "official_name": "United States House of Representatives" }, { - "entity": "midterm elections", + "name": "midterm elections", "category": "Date", "official_name": "United States midterm elections" } @@ -84,26 +85,26 @@ class OpenIEEntitystandardizationdPrompt(PromptOp): { "instruction": 
"input字段包含用户提供的上下文。命名实体字段包含从上下文中提取的命名实体,这些可能是含义不明的缩写、别名或俚语。为了消除歧义,请尝试根据上下文和您自己的知识提供这些实体的官方名称。请注意,具有相同含义的实体只能有一个官方名称。请按照提供的示例中的输出字段格式,以单个JSONArray字符串形式回复,无需任何解释。", "example": { - "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", + "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "named_entities": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output": [ - {"entity": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, - {"entity": "语妄", "category": "Symptom", "official_name": "谵妄"}, - {"entity": "失眠", "category": "Symptom", "official_name": "失眠症"}, - {"entity": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, - {"entity": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment", "official_name": "胸腔引流管"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} + {"name": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, + {"name": "语妄", "category": "Symptom", "official_name": "谵妄"}, + {"name": "失眠", "category": "Symptom", "official_name": "失眠症"}, + {"name": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, + {"name": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment", "official_name": "胸腔引流管"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} ] }, "input": $input, @@ -111,15 +112,11 @@ class OpenIEEntitystandardizationdPrompt(PromptOp): } """ - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["input", "named_entities"] def parse_response(self, response: str, **kwargs): - rsp = response if isinstance(rsp, str): rsp = json.loads(rsp) @@ -134,10 +131,10 @@ def parse_response(self, response: str, **kwargs): entities = kwargs.get("named_entities", []) for entity in standardized_entity: merged.append(entity) - entities_with_offical_name.add(entity["entity"]) + entities_with_offical_name.add(entity["name"]) # in case llm ignores some entities for entity in entities: - if entity["entity"] not in entities_with_offical_name: - entity["official_name"] = entity["entity"] + if entity["name"] not in entities_with_offical_name: + entity["official_name"] = entity["name"] merged.append(entity) return merged diff --git a/kag/builder/prompt/default/triple.py 
b/kag/builder/prompt/default/triple.py index c870604c..03584a6b 100644 --- a/kag/builder/prompt/default/triple.py +++ b/kag/builder/prompt/default/triple.py @@ -11,66 +11,67 @@ # or implied. import json -from typing import Optional, List +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIETriplePrompt(PromptOp): +@PromptABC.register("default_triple") +class OpenIETriplePrompt(PromptABC): template_en = """ { - "instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}\n. If there are none, do not list them.\n.\n\nPay attention to the following requirements:\n- Each triple should contain at least one, but preferably two, of the named entities in the entity_list.\n- Clearly resolve pronouns to their specific names to maintain clarity.", + "instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}. If there are none, do not list them. Pay attention to the following requirements: - Each triple should contain at least one, but preferably two, of the named entities in the entity_list. - Clearly resolve pronouns to their specific names to maintain clarity.", "entity_list": $entity_list, "input": "$input", "example": { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", + "input": "The Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger. It stars Dougray Scott, Jessica De Gouw and Martin McCann. After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.
When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", "entity_list": [ { - "entity": "The Rezort", + "name": "The Rezort", "category": "Works" }, { - "entity": "2015", + "name": "2015", "category": "Others" }, { - "entity": "British", + "name": "British", "category": "GeographicLocation" }, { - "entity": "Steve Barker", + "name": "Steve Barker", "category": "Person" }, { - "entity": "Paul Gerstenberger", + "name": "Paul Gerstenberger", "category": "Person" }, { - "entity": "Dougray Scott", + "name": "Dougray Scott", "category": "Person" }, { - "entity": "Jessica De Gouw", + "name": "Jessica De Gouw", "category": "Person" }, { - "entity": "Martin McCann", + "name": "Martin McCann", "category": "Person" }, { - "entity": "zombies", + "name": "zombies", "category": "Creature" }, { - "entity": "zombie horror film", + "name": "zombie horror film", "category": "Concept" }, { - "entity": "humanity", + "name": "humanity", "category": "Concept" }, { - "entity": "secure island", + "name": "secure island", "category": "GeographicLocation" } ], @@ -151,16 +152,16 @@ class OpenIETriplePrompt(PromptOp): "entity_list": $entity_list, "input": "$input", "example": { - "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", + "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "entity_list": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output":[ ["烦躁不安", "酌用", "镇静药"], @@ -178,9 +179,6 @@ class OpenIETriplePrompt(PromptOp): } """ - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["entity_list", "input"] diff --git a/kag/builder/prompt/medical/ner.py b/kag/builder/prompt/medical/ner.py index 07c6298a..1e2ce65e 100644 --- a/kag/builder/prompt/medical/ner.py +++ b/kag/builder/prompt/medical/ner.py @@ -12,14 +12,14 @@ import json from string import Template -from typing import List, Optional - -from kag.common.base.prompt_op import PromptOp +from typing import List +from kag.common.conf import KAG_PROJECT_CONF +from kag.interface import PromptABC from knext.schema.client import SchemaClient -class OpenIENERPrompt(PromptOp): - +@PromptABC.register("medical_ner") +class OpenIENERPrompt(PromptABC): template_zh = """ { "instruction": "你是命名实体识别的专家。请从输入中提取与模式定义匹配的实体。如果不存在该类型的实体,请返回一个空列表。请以JSON字符串格式回应。你可以参照example进行抽取。", @@ -28,14 +28,14 @@ class OpenIENERPrompt(PromptOp): { "input": 
"烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染。\n治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "output": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ] } ], @@ -45,11 +45,11 @@ class OpenIENERPrompt(PromptOp): template_en = template_zh - def __init__( - self, language: Optional[str] = "en", **kwargs - ): + def __init__(self, language: str = "", **kwargs): super().__init__(language, **kwargs) - self.schema = SchemaClient(project_id=self.project_id).extract_types() + self.schema = SchemaClient( + project_id=KAG_PROJECT_CONF.project_id + ).extract_types() self.template = Template(self.template).safe_substitute(schema=self.schema) @property diff --git a/kag/builder/prompt/medical/std.py b/kag/builder/prompt/medical/std.py index 88ec1283..19f2232c 100644 --- a/kag/builder/prompt/medical/std.py +++ b/kag/builder/prompt/medical/std.py @@ -11,37 +11,37 @@ # or implied. import json -from typing import Optional, List +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIEEntitystandardizationdPrompt(PromptOp): - +@PromptABC.register("medical_std") +class OpenIEEntitystandardizationdPrompt(PromptABC): template_zh = """ { "instruction": "input字段包含用户提供的上下文。命名实体字段包含从上下文中提取的命名实体,这些可能是含义不明的缩写、别名或俚语。为了消除歧义,请尝试根据上下文和您自己的知识提供这些实体的官方名称。请注意,具有相同含义的实体只能有一个官方名称。请按照提供的示例中的输出字段格式,以单个JSONArray字符串形式回复,无需任何解释。", "example": { "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "named_entities": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output": [ - {"entity": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, - {"entity": "语妄", "category": "Symptom", "official_name": "谵妄"}, - {"entity": "失眠", "category": "Symptom", "official_name": "失眠症"}, - {"entity": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, - {"entity": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, - {"entity": "胸腔压力调节管", "category": 
"MedicalEquipment", "official_name": "胸腔引流管"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} + {"name": "烦躁不安", "category": "Symptom", "official_name": "焦虑不安"}, + {"name": "语妄", "category": "Symptom", "official_name": "谵妄"}, + {"name": "失眠", "category": "Symptom", "official_name": "失眠症"}, + {"name": "镇静药", "category": "Medicine", "official_name": "镇静剂"}, + {"name": "肺外感染", "category": "Disease", "official_name": "肺外感染"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment", "official_name": "胸腔引流管"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment", "official_name": "负压吸引装置"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation", "official_name": "闭式负压引流"} ] }, "input": $input, @@ -51,15 +51,11 @@ class OpenIEEntitystandardizationdPrompt(PromptOp): template_en = template_zh - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["input", "named_entities"] def parse_response(self, response: str, **kwargs): - rsp = response if isinstance(rsp, str): rsp = json.loads(rsp) @@ -74,10 +70,10 @@ def parse_response(self, response: str, **kwargs): entities = kwargs.get("named_entities", []) for entity in standardized_entity: merged.append(entity) - entities_with_offical_name.add(entity["entity"]) + entities_with_offical_name.add(entity["name"]) # in case llm ignores some entities for entity in entities: - if entity["entity"] not in entities_with_offical_name: - entity["official_name"] = entity["entity"] + if entity["name"] not in entities_with_offical_name: + entity["official_name"] = entity["name"] merged.append(entity) return merged diff --git a/kag/builder/prompt/medical/triple.py b/kag/builder/prompt/medical/triple.py index 2b5aaff8..1c573fac 100644 --- a/kag/builder/prompt/medical/triple.py +++ b/kag/builder/prompt/medical/triple.py @@ -11,13 +11,13 @@ # or implied. import json -from typing import Optional, List, Dict, Any +from typing import List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC -class OpenIETriplePrompt(PromptOp): - +@PromptABC.register("medical_triple") +class OpenIETriplePrompt(PromptABC): template_zh = """ { "instruction": "您是一位专门从事开放信息提取(OpenIE)的专家。请从input字段的文本中提取任何可能的关系(包括主语、谓语、宾语),并按照JSON格式列出它们,须遵循example字段的示例格式。请注意以下要求:1. 每个三元组应至少包含entity_list实体列表中的一个,但最好是两个命名实体。2. 
明确地将代词解析为特定名称,以保持清晰度。", @@ -26,14 +26,14 @@ class OpenIETriplePrompt(PromptOp): "example": { "input": "烦躁不安、语妄、失眠酌用镇静药,禁用抑制呼吸的镇静药。\n3.并发症的处理经抗菌药物治疗后,高热常在24小时内消退,或数日内逐渐下降。\n若体温降而复升或3天后仍不降者,应考虑SP的肺外感染,如腋胸、心包炎或关节炎等。治疗:接胸腔压力调节管+吸引机负压吸引水瓶装置闭式负压吸引宜连续,如经12小时后肺仍未复张,应查找原因。", "entity_list": [ - {"entity": "烦躁不安", "category": "Symptom"}, - {"entity": "语妄", "category": "Symptom"}, - {"entity": "失眠", "category": "Symptom"}, - {"entity": "镇静药", "category": "Medicine"}, - {"entity": "肺外感染", "category": "Disease"}, - {"entity": "胸腔压力调节管", "category": "MedicalEquipment"}, - {"entity": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, - {"entity": "闭式负压吸引", "category": "SurgicalOperation"} + {"name": "烦躁不安", "category": "Symptom"}, + {"name": "语妄", "category": "Symptom"}, + {"name": "失眠", "category": "Symptom"}, + {"name": "镇静药", "category": "Medicine"}, + {"name": "肺外感染", "category": "Disease"}, + {"name": "胸腔压力调节管", "category": "MedicalEquipment"}, + {"name": "吸引机负压吸引水瓶装置", "category": "MedicalEquipment"}, + {"name": "闭式负压吸引", "category": "SurgicalOperation"} ], "output":[ ["烦躁不安", "酌用", "镇静药"], @@ -53,9 +53,6 @@ class OpenIETriplePrompt(PromptOp): template_en = template_zh - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - @property def template_variables(self) -> List[str]: return ["entity_list", "input"] diff --git a/kag/builder/prompt/oneke_prompt.py b/kag/builder/prompt/oneke_prompt.py deleted file mode 100644 index 25c3dd69..00000000 --- a/kag/builder/prompt/oneke_prompt.py +++ /dev/null @@ -1,518 +0,0 @@ -# -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -import json -import logging -import re -from abc import ABC -from typing import List, Dict, Any -from collections import defaultdict - -from knext.schema.model.schema_helper import SPGTypeName -from kag.builder.model.spg_record import SPGRecord -from kag.builder.prompt.spg_prompt import SPGPrompt -import uuid - -logger = logging.getLogger(__name__) - - -class OneKEPrompt(SPGPrompt, ABC): - template_zh: str = "" - template_en: str = "" - - def __init__(self, **kwargs): - types_list = kwargs.get("types_list", []) - language = kwargs.get("language", "zh") - with_description = kwargs.get("with_description", False) - split_num = kwargs.get("split_num", 4) - super().__init__(types_list, **kwargs) - self.language = language - if language == "zh": - self.template = self.template_zh - else: - self.template = self.template_en - self.with_description = with_description - self.split_num = split_num - - self._init_render_variables() - self._render() - - self.params = kwargs - - def build_prompt(self, variables: Dict[str, str]) -> List[str]: - instructions = [] - for schema in self.schema_list: - instructions.append( - json.dumps( - { - "instruction": self.template, - "schema": schema, - "input": variables.get("input"), - }, - ensure_ascii=False, - ) - ) - return instructions - - def parse_response(self, response: str) -> List[SPGRecord]: - raise NotImplementedError - - def _render(self): - raise NotImplementedError - - def multischema_split_by_num(self, split_num, schemas: List[Any]): - negative_length = max(len(schemas) // split_num, 1) * split_num - total_schemas = [] - for i in range(0, negative_length, split_num): - total_schemas.append(schemas[i : i + split_num]) - - remain_len = max(1, split_num // 2) - tmp_schemas = schemas[negative_length:] - if len(schemas) - negative_length >= remain_len and len(tmp_schemas) > 0: - total_schemas.append(tmp_schemas) - elif len(tmp_schemas) > 0: - total_schemas[-1].extend(tmp_schemas) - return total_schemas - - -class OneKE_NERPrompt(OneKEPrompt): - template_zh: str = ( - "你是专门进行实体抽取的专家。请从input中抽取出符合schema定义的实体,不存在的实体类型返回空列表。请按照JSON字符串的格式回答。" - ) - template_en: str = "You are an expert in named entity recognition. Please extract entities that match the schema definition from the input. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string." 
- - def __init__( - self, - entity_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=entity_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - ent_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_NERPrompt response JSONDecodeError error.") - return [] - if type(ent_obj) != dict: - logger.error("OneKE_NERPrompt response type error.") - return [] - - spg_records = [] - for type_zh, values in ent_obj.items(): - if type_zh not in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized entity_type: {type_zh}") - continue - type_en, _ = self.spg_type_schema_info_zh[type_zh] - for value in values: - spg_record = SPGRecord(type_en) - spg_record.upsert_properties({"id": value, "name": value}) - spg_records.append(spg_record) - return spg_records - - def _render(self): - entity_list = [] - for spg_type in self.spg_types: - entity_list.append(spg_type.name_zh) - self.schema_list = self.multischema_split_by_num(self.split_num, entity_list) - - -class OneKE_SPOPrompt(OneKEPrompt): - template_zh: str = ( - "你是专门进行SPO三元组抽取的专家。请从input中抽取出符合schema定义的spo关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。" - ) - template_en: str = "You are an expert in spo(subject, predicate, object) triples extraction. Please extract SPO relationship triples that match the schema definition from the input. Return an empty list for relationships that do not exist. Please respond in the format of a JSON string." - - def __init__( - self, - spo_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=spo_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - self.properties_mapper = {} - self.relations_mapper = {} - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - re_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_REPrompt response JSONDecodeError error.") - return [] - if type(re_obj) != dict: - logger.error("OneKE_REPrompt response type error.") - return [] - - relation_dcir = defaultdict(list) - for relation_zh, values in re_obj.items(): - if relation_zh not in self.property_info_zh[relation_zh]: - logger.warning(f"Unrecognized relation: {relation_zh}") - continue - if values and isinstance(values, list): - for value in values: - if ( - type(value) != dict - or "subject" not in value - or "object" not in value - ): - logger.warning("OneKE_REPrompt response type error.") - continue - s_zh, o_zh = value.get("subject", ""), value.get("object", "") - relation_dcir[relation_zh].append((s_zh, o_zh)) - - spg_records = [] - for relation_zh, sub_obj_list in relation_dcir.items(): - sub_dict = defaultdict(list) - for s_zh, o_zh in sub_obj_list: - sub_dict[s_zh].append(o_zh) - for s_zh, o_list in sub_dict.items(): - if s_zh in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized subject_type: {s_zh}") - continue - object_value = ",".join(o_list) - s_type_zh = self.properties_mapper.get(relation_zh, None) - if s_type_zh is not None: - s_type_en, _ = self.spg_type_schema_info_zh[s_type_zh] - relation_en, _ = 
self.property_info_zh[relation_zh] - spg_record = SPGRecord(s_type_en).upsert_properties( - {"id": s_zh, "name": s_zh} - ) - spg_record.upsert_property(relation_en, object_value) - else: - s_type_zh, o_type_zh = self.relations_mapper.get( - relation_zh, [None, None] - ) - if s_type_zh is None or o_type_zh is None: - logger.warning(f"Unrecognized relation: {relation_zh}") - continue - s_type_en, _ = self.spg_type_schema_info_zh[s_type_zh] - spg_record = SPGRecord(s_type_en).upsert_properties( - {"id": s_zh, "name": s_zh} - ) - relation_en, _, object_type = self.relation_info_zh[s_type_zh][ - relation_zh - ] - spg_record.upsert_relation(relation_en, object_type, object_value) - spg_records.append(spg_record) - return spg_records - - def _render(self): - spo_list = [] - for spg_type in self.spg_types: - type_en, _ = self.spg_type_schema_info_zh[spg_type] - for v in spg_type.properties.values(): - spo_list.append( - { - "subject_type": spg_type.name_zh, - "predicate": v.name_zh, - "object_type": "文本", - } - ) - self.properties_mapper[v.name_zh] = spg_type - for v in spg_type.relations.values(): - _, _, object_type = self.relation_info_en[type_en][v.name] - spo_list.append( - { - "subject_type": spg_type.name_zh, - "predicate": v.name_zh, - "object_type": object_type, - } - ) - self.relations_mapper[v.name_zh] = [spg_type, object_type] - self.schema_list = self.multischema_split_by_num(self.split_num, spo_list) - - -class OneKE_REPrompt(OneKE_SPOPrompt): - template_zh: str = ( - "你是专门进行关系抽取的专家。请从input中抽取出符合schema定义的关系三元组,不存在的关系返回空列表。请按照JSON字符串的格式回答。" - ) - template_en: str = "You are an expert in relationship extraction. Please extract relationship triples that match the schema definition from the input. Return an empty list for relationships that do not exist. Please respond in the format of a JSON string." - - def __init__( - self, - relation_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - relation_types, language, with_description, split_num, **kwargs - ) - - def _render(self): - re_list = [] - for spg_type in self.spg_types: - type_en, _ = self.spg_type_schema_info_zh[spg_type] - for v in spg_type.properties.values(): - re_list.append(v.name_zh) - self.properties_mapper[v.name_zh] = spg_type - for v in spg_type.relations.values(): - v_zh, _, object_type = self.relation_info_en[type_en][v.name] - re_list.append(v.name_zh) - self.relations_mapper[v.name_zh] = [spg_type, object_type] - self.schema_list = self.multischema_split_by_num(self.split_num, re_list) - - -class OneKE_KGPrompt(OneKEPrompt): - template_zh: str = "你是一个图谱实体知识结构化专家。根据输入实体类型(entity type)的schema描述,从文本中抽取出相应的实体实例和其属性信息,不存在的属性不输出, 属性存在多值就返回列表,并输出为可解析的json格式。" - template_en: str = "You are an expert in structured knowledge systems for graph entities. Based on the schema description of the input entity type, you extract the corresponding entity instances and their attribute information from the text. Attributes that do not exist should not be output. If an attribute has multiple values, a list should be returned. The results should be output in a parsable JSON format." 
- - def __init__( - self, - entity_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=entity_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - re_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_KGPrompt response JSONDecodeError error.") - return [] - if type(re_obj) != dict: - logger.error("OneKE_KGPrompt response type error.") - return [] - - spg_records = [] - for type_zh, type_value in re_obj.items(): - if type_zh not in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized entity_type: {type_zh}") - continue - type_en, _ = self.spg_type_schema_info_zh[type_zh] - if type_value and isinstance(type_value, dict): - for name, attrs in type_value.items(): - spg_record = SPGRecord(type_en).upsert_properties( - {"id": name, "name": name} - ) - for attr_zh, attr_value in attrs.items(): - if isinstance(attr_value, list): - attr_value = ",".join(attr_value) - if attr_zh in self.property_info_zh[type_zh]: - attr_en, _, object_type = self.property_info_zh[type_zh][ - attr_zh - ] - spg_record.upsert_property(attr_en, attr_value) - elif attr_zh in self.relation_info_zh[type_zh]: - attr_en, _, object_type = self.relation_info_zh[type_zh][ - attr_zh - ] - spg_record.upsert_relation(attr_en, object_type, attr_value) - else: - logger.warning(f"Unrecognized attribute: {attr_zh}") - continue - if object_type == "Integer": - matches = re.findall(r"\d+", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - elif object_type == "Float": - matches = re.findall(r"\d+(?:\.\d+)?", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - spg_records.append(spg_record) - return spg_records - - def _render(self): - spo_list = [] - for spg_type in self.spg_types: - if not self.with_description: - attributes = [] - attributes.extend( - [ - v.name_zh - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - ] - ) - attributes.extend( - [ - v.name_zh - for k, v in spg_type.relations.items() - if v.name_zh not in attributes - and k not in self.ignored_relations - ] - ) - else: - attributes = {} - attributes.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - } - ) - attributes.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.relations.items() - if v.name_zh not in attributes - and k not in self.ignored_relations - } - ) - entity_type = spg_type.name_zh - spo_list.append({"entity_type": entity_type, "attributes": attributes}) - - self.schema_list = self.multischema_split_by_num(self.split_num, spo_list) - - -class OneKE_EEPrompt(OneKEPrompt): - template_zh: str = "你是专门进行事件提取的专家。请从input中抽取出符合schema定义的事件,不存在的事件返回空列表,不存在的论元返回NAN,如果论元存在多值请返回列表。请按照JSON字符串的格式回答。" - template_en: str = "You are an expert in event extraction. Please extract events from the input that conform to the schema definition. Return an empty list for events that do not exist, and return NAN for arguments that do not exist. If an argument has multiple values, please return a list. Respond in the format of a JSON string." 
- - def __init__( - self, - event_types: List[SPGTypeName], - language: str = "zh", - with_description: bool = False, - split_num: int = 4, - **kwargs, - ): - super().__init__( - types_list=event_types, - language=language, - with_description=with_description, - split_num=split_num, - **kwargs, - ) - - def parse_response(self, response: str) -> List[SPGRecord]: - if isinstance(response, list) and len(response) > 0: - response = response[0] - try: - ee_obj = json.loads(response) - except json.decoder.JSONDecodeError: - logger.error("OneKE_EEPrompt response JSONDecodeError error.") - return [] - if type(ee_obj) != dict: - logger.error("OneKE_EEPrompt response type error.") - return [] - - spg_records = [] - for type_zh, type_values in ee_obj.items(): - if type_zh not in self.spg_type_schema_info_zh: - logger.warning(f"Unrecognized event_type: {type_zh}") - continue - type_en, _ = self.spg_type_schema_info_zh[type_zh] - if type_values and isinstance(type_values, list): - for type_value in type_values: - uuid_4 = uuid.uuid4() - spg_record = ( - SPGRecord(type_en) - .upsert_property("id", str(uuid_4)) - .upsert_property("name", type_zh) - ) - arguments = type_value.get("arguments") - if arguments and isinstance(arguments, dict): - for attr_zh, attr_value in arguments.items(): - if isinstance(attr_value, list): - attr_value = ",".join(attr_value) - if attr_zh in self.property_info_zh[type_zh]: - attr_en, _, object_type = self.property_info_zh[ - type_zh - ][attr_zh] - spg_record.upsert_property(attr_en, attr_value) - elif attr_zh in self.relation_info_zh[type_zh]: - attr_en, _, object_type = self.relation_info_zh[ - type_zh - ][attr_zh] - spg_record.upsert_relation( - attr_en, object_type, attr_value - ) - else: - logger.warning(f"Unrecognized attribute: {attr_zh}") - continue - if object_type == "Integer": - matches = re.findall(r"\d+", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - elif object_type == "Float": - matches = re.findall(r"\d+(?:\.\d+)?", attr_value) - if matches: - spg_record.upsert_property(attr_en, matches[0]) - spg_records.append(spg_record) - return spg_records - - def _render(self): - event_list = [] - for spg_type in self.spg_types: - if not self.with_description: - arguments = [] - arguments.extend( - [ - v.name_zh - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - ] - ) - arguments.extend( - [ - v.name_zh - for k, v in spg_type.relations.items() - if v.name_zh not in arguments - and k not in self.ignored_relations - ] - ) - else: - arguments = {} - arguments.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - } - ) - arguments.update( - { - v.name_zh: v.desc or "" - for k, v in spg_type.relations.items() - if v.name_zh not in arguments - and k not in self.ignored_relations - } - ) - event_type = spg_type.name_zh - event_list.append( - {"event_type": event_type, "trigger": True, "arguments": arguments} - ) - self.schema_list = self.multischema_split_by_num(self.split_num, event_list) diff --git a/kag/builder/prompt/outline_align_prompt.py b/kag/builder/prompt/outline_align_prompt.py new file mode 100644 index 00000000..0fb0aa30 --- /dev/null +++ b/kag/builder/prompt/outline_align_prompt.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from typing import Optional, List +from kag.interface import PromptABC +import ast + + +@PromptABC.register("outline_align") +class OutlineAlignPrompt(PromptABC): + template_zh = """ +{ + "instruction": "请分析以下大纲列表,统一调整标题的层级。遵循以下规则: +1. 相同类型的标题应该有相同的层级,例如所有'第X章'都应该是同一层级 +2. 层级关系应该符合逻辑,例如: + - 章(1级) > 节(2级) > 条(3级) + - 部分(1级) > 章(2级) > 节(3级) +3. 考虑标题的上下文关系,确保层级的连贯性 +4. 如果标题不含明确的层级标识,根据其内容和上下文推断合适的层级 + +请务必按照以下格式返回,不要返回其他任何内容,请返回调整后的大纲列表,格式为: +[(标题1, 层级1), (标题2, 层级2), ...] + +输入的大纲列表为: +$outlines", + "example": [ + { + "input": [ + ("第一章 绪论", 2), + ("第一节 研究背景", 1), + ("第二章 文献综述", 1), + ("第二节 研究方法", 2) + ], + "output": [ + ("第一章 绪论", 1), + ("第一节 研究背景", 2), + ("第二章 文献综述", 1), + ("第二节 研究方法", 2) + ] + } + ] +} +""" + + template_en = """ +{ + "instruction": "Please analyze the following outline list and unify the levels of titles according to these rules: +1. Similar types of titles should have the same level (e.g., all 'Chapter X' should be at the same level) +2. Level relationships should follow logic, e.g.: + - Chapter(1) > Section(2) > Article(3) + - Part(1) > Chapter(2) > Section(3) +3. Consider context relationships between titles to ensure level continuity +4. For titles without clear level indicators, infer appropriate levels based on content and context + +Please return the adjusted outline list in the format: +[(title1, level1), (title2, level2), ...] + +Input outline list: +$outlines", + "example": [ + { + "input": [ + ("Chapter 1 Introduction", 2), + ("Section 1.1 Background", 1), + ("Chapter 2 Literature Review", 1), + ("Section 2.1 Methods", 2) + ], + "output": [ + ("Chapter 1 Introduction", 1), + ("Section 1.1 Background", 2), + ("Chapter 2 Literature Review", 1), + ("Section 2.1 Methods", 2) + ] + } + ] +} +""" + + def __init__(self, language: Optional[str] = "zh"): + super().__init__(language) + + @property + def template_variables(self) -> List[str]: + return ["outlines"] + + def parse_response(self, response: str, **kwargs): + if isinstance(response, str): + cleaned_data = response.strip("`python\n[] \n") + cleaned_data = "[" + cleaned_data + "]" + return ast.literal_eval(cleaned_data) + if isinstance(response, dict) and "output" in response: + return response["output"] + return response diff --git a/kag/builder/prompt/outline_prompt.py b/kag/builder/prompt/outline_prompt.py index f7911a69..01cc299f 100644 --- a/kag/builder/prompt/outline_prompt.py +++ b/kag/builder/prompt/outline_prompt.py @@ -10,74 +10,43 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -import json from typing import Optional, List -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC +import ast -class OutlinePrompt(PromptOp): +@PromptABC.register("outline") +class OutlinePrompt(PromptABC): template_zh = """ { - "instruction": "\n请理解input字段中的文本内容,识别文本的结构和组成部分,并帮我提取出以下内容的标题,可能有多个标题分散在文本的各个地方,仅返属于原文的回标题文本即可,不要返回其他任何内容,须按照python list的格式回答,具体形式请遵从example字段中给出的若干例子。", + "instruction": "\n给定一段纯文本内容,请提取其中的标题,并返回一个列表。每个标题应包含以下信息:\n- 标题文本\n- 标题级别(例如 1 表示一级标题,2 表示二级标题等)\n\n假设标题遵循以下规则:\n1. 标题通常带有数字,我们的文本可能是从一些图片OCR生成的,所以标题可能隐藏在段落中,尽可能找出这些隐藏在段落中带有数字的标题\n2. 
标题的级别可以通过以下方式推断:\n - 一级标题:通常是篇章级别的内容。\n - 二级标题:通常是章节级别的内容,具有简洁的文字描述,有时以 \"第X部分\"、\"第X章\"、\"Part X\" 等类似形式开头。\n - 三级标题及以下:通常是段落或细节级别的标题,可能包含数字编号(如\"1.\"或\"1.1\"),或者较长且具体的描述(如\"1.1 子标题\"或\"第1节 概述\")。\n3. 标题的级别也可以通过上下文判断:\n - 如果两个标题之间的文本内容非常短(例如少于一定字数),后面的标题可能是更高或相同级别的标题。\n - 连续编号的标题(如“第1条”“第2条”)通常属于同一级别。\n - 标题层级通常由其数字层次决定,例如“1”“1.1”“1.1.1”依次为 1 级、2 级、3 级。\n - 如果一个标题包含关键词如“部分”“章”“节”“条”,且其长度适中(例如 5 至 20 个字符),该标题的级别往往比更长或更短的标题要高。\n4. 以下标题可以直接忽略:\n - 含有纯数字或仅由数字和标点组成的标题(例如“1.”、“2.1”等)。\n - 重复出现的标题(例如页眉或页脚被误识别为标题的情况)。\n5. 如果某些内容无法明确判断为标题,或者不符合上述规则,请忽略。\n\n请根据上述规则,返回一个包含标题和对应级别的列表,格式如下:\n[\n (\"标题文本1\", 1),\n (\"标题文本2\", 2),\n (\"标题文本3\", 3),\n ...\n],我还会给你提供之前内容抽取出的目录current_outlines,你需要根据当前已经抽取的目录,自行判断抽取标题的粒度以及对应的等级", "input": "$input", + "current_outline:": "$current_outline", "example": [ { - "input": "第8条 原 则 - -1.各成员方在制订或修正其法律和规章时,可采取必要措施以保护公众健康和营养,并促进对其社会经济和技术发展至关重要部门的公众利益,只要该措施符合本协议规定。 - -2.可能需要采取与本协议的规定相一致的适当的措施,以防止知识产权所有者滥用知识产权或藉以对贸易进行不合理限制或实行对国际间的技术转让产生不利影响的作法。 - -第二部分 关于知识产权的效力、范围及使用的标准 - -第1节 版权及相关权利 - -第9条 与《伯尔尼公约》的关系", + "input": "第8条 原 则\n\n1.各成员方在制订或修正其法律和规章时,可采取必要措施以保护公众健康和营养,并促进对其社会经济和技术发展至关重要部门的公众利益,只要该措施符合本协议规定。\n\n2.可能需要采取与本协议的规定相一致的适当的措施,以防止知识产权所有者滥用知识产权或藉以对贸易进行不合理限制或实行对国际间的技术转让产生不利影响的作法。\n\n第二部分 关于知识产权的效力、范围及使用的标准\n\n第1节 版权及相关权利\n\n第9条 与《伯尔尼公约》的关系", "output": [ - "第8条 原 则", - "第二部分 关于知识产权的效力、范围及使用的标准", - "第1节 版权及相关权利", - "第9条 与《伯尔尼公约》的关系" - ], + ("第8条 原 则",3), + ("第二部分 关于知识产权的效力、范围及使用的标准",1), + ("第1节 版权及相关权利",2), + ("第9条 与《伯尔尼公约》的关系",3) + ] }, { - "input": "第16条 授予权利 - -1.已注册商标所有者应拥有阻止所有未经其同意的第三方在贸易中使用与已注册商标相同或相似的商品或服务的,其使用有可能招致混淆的相同或相似的标志。在对相同商品或服务使用相同标志的情况下,应推定存在混淆之可能。上述权利不应妨碍任何现行的优先权,也不应影响各成员方以使用为条件获得注册权的可能性。 - -2.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于服务。在确定一个商标是否为知名商标时,各成员方应考虑到有关部分的公众对该商标的了解,包括由于该商标的推行而在有关成员方得到的了解。 - -3.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于与已注册商标的商品和服务不相似的商品或服务,条件是该商标与该商品和服务有关的使用会表明该商品或服务与已注册商标所有者之间的联系,而且已注册商标所有者的利益有可能为此种使用所破坏。 - -第17条 例 外\n ", + "input": "第16条 授予权利\n\n1.已注册商标所有者应拥有阻止所有未经其同意的第三方在贸易中使用与已注册商标相同或相似的商品或服务的,其使用有可能招致混淆的相同或相似的标志。在对相同商品或服务使用相同标志的情况下,应推定存在混淆之可能。上述权利不应妨碍任何现行的优先权,也不应影响各成员方以使用为条件获得注册权的可能性。\n\n2.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于服务。在确定一个商标是否为知名商标时,各成员方应考虑到有关部分的公众对该商标的了解,包括由于该商标的推行而在有关成员方得到的了解。\n\n3.1967《巴黎公约》第6条副则经对细节作必要修改后应适用于与已注册商标的商品和服务不相似的商品或服务,条件是该商标与该商品和服务有关的使用会表明该商品或服务与已注册商标所有者之间的联系,而且已注册商标所有者的利益有可能为此种使用所破坏。\n\n第17条 例 外\n ", "output": [ - "第16条 授予权利", - "第17条 例 外" - ], + ("第16条 授予权利",3), + ("第17条 例 外",3) + ] }, { - "input":"的做法。 - -(4)此类使用应是非独占性的。 - -(5)此类使用应是不可转让的,除非是同享有此类使用的那部分企业或信誉一道转让。 - -(6)任何此类使用之授权,均应主要是为授权此类使用的成员方国内市场供应之目的。 - -(7)在被授权人的合法利益受到充分保护的条件下,当导致此类使用授权的情况下不复存在和可能不再产生时,有义务将其终止;应有动机的请求,主管当局应有权对上述情况的继续存在进行检查。 - -(8)考虑到授权的经济价值,应视具体情况向权利人支付充分的补偿金。 - -(9)任何与此类使用之授权有关的决定,其法律效力应接受该成员方境内更高当局的司法审查或其他独立审查。 - -(10)任何与为此类使用而提供的补偿金有关的决定,应接受成员方境内更高当局的司法审查或其他独立审查。 -", - "output": [], - }, + "input": "的做法。\n\n(4)此类使用应是非独占性的。\n\n(5)此类使用应是不可转让的,除非是同享有此类使用的那部分企业或信誉一道转让。\n\n(6)任何此类使用之授权,均应主要是为授权此类使用的成员方国内市场供应之目的。\n\n(7)在被授权人的合法利益受到充分保护的条件下,当导致此类使用授权的情况下不复存在和可能不再产生时,有义务将其终止;应有动机的请求,主管当局应有权对上述情况的继续存在进行检查。\n\n(8)考虑到授权的经济价值,应视具体情况向权利人支付充分的补偿金。\n\n(9)任何与此类使用之授权有关的决定,其法律效力应接受该成员方境内更高当局的司法审查或其他独立审查。\n\n(10)任何与为此类使用而提供的补偿金有关的决定,应接受成员方境内更高当局的司法审查或其他独立审查。\n", + "output": [] + } ] -} - """ +} +""" template_en = """ { @@ -147,11 +116,16 @@ def __init__(self, language: Optional[str] = "zh"): @property def template_variables(self) -> List[str]: - return ["input"] + return ["input", "current_outline"] def parse_response(self, response: str, **kwargs): if isinstance(response, str): - response = json.loads(response) + cleaned_data = response.strip("`python\n[] \n") # 
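strip Markdown code fences and extra whitespace
+        # Illustrative of what the cleanup expects: an LLM reply such as
+        #   ```python
+        #   ("第16条 授予权利", 3),
+        #   ("第17条 例 外", 3)
+        #   ```
+        # is reduced to the bare tuples, re-wrapped in brackets below, and
+        # parsed into [("第16条 授予权利", 3), ("第17条 例 外", 3)].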
+        cleaned_data = "[" + cleaned_data + "]"  # restore the list format
+
+        # use ast.literal_eval to convert the string into an actual list object
+        list_data = ast.literal_eval(cleaned_data)
+        return list_data
 
         if isinstance(response, dict) and "output" in response:
             response = response["output"]
 
diff --git a/kag/builder/prompt/semantic_seg_prompt.py b/kag/builder/prompt/semantic_seg_prompt.py
index fb09b564..9399cda6 100644
--- a/kag/builder/prompt/semantic_seg_prompt.py
+++ b/kag/builder/prompt/semantic_seg_prompt.py
@@ -11,12 +11,13 @@
 # or implied.
 
 import json
-from typing import Optional, List
+from typing import List
 
-from kag.common.base.prompt_op import PromptOp
+from kag.interface import PromptABC
 
 
-class SemanticSegPrompt(PromptOp):
+@PromptABC.register("semantic_seg")
+class SemanticSegPrompt(PromptABC):
     template_zh = """
 {
     "instruction": "\n请理解input字段中的文本内容,识别文本的结构和组成部分,并按照语义主题确定分割点,将其切分成互不重叠的若干小节。如果文章有章节等可识别的结构信息,请直接按照顶层结构进行切分。\n请按照schema定义的字段返回,包含小节摘要和小节起始点。须按照JSON字符串的格式回答。具体形式请遵从example字段中给出的若干例子。",
@@ -111,9 +112,6 @@ class SemanticSegPrompt(PromptOp):
 }
     """
 
-    def __init__(self, language: Optional[str] = "zh"):
-        super().__init__(language)
-
     @property
     def template_variables(self) -> List[str]:
         return ["input"]
diff --git a/kag/builder/prompt/spg_prompt.py b/kag/builder/prompt/spg_prompt.py
index f14678de..a094f1ea 100644
--- a/kag/builder/prompt/spg_prompt.py
+++ b/kag/builder/prompt/spg_prompt.py
@@ -12,244 +12,589 @@
 
 import json
 import logging
-from abc import ABC
+import copy
 from typing import List, Dict
 
-from kag.common.base.prompt_op import PromptOp
+from kag.interface import PromptABC
 from knext.schema.client import SchemaClient
-from knext.schema.model.base import BaseSpgType, SpgTypeEnum
+from knext.schema.model.base import SpgTypeEnum, ConstraintTypeEnum
 from knext.schema.model.schema_helper import SPGTypeName
 from kag.builder.model.spg_record import SPGRecord
+from kag.common.conf import KAG_PROJECT_CONF
+from knext.schema.client import OTHER_TYPE
 
 logger = logging.getLogger(__name__)
 
 
-class SPGPrompt(PromptOp, ABC):
-    spg_types: Dict[str, BaseSpgType]
+class SPGPrompt(PromptABC):
+    """
+    Base class for generating SPG schema-based entity/event extraction prompts.
+
+    Attributes:
+        ignored_types (List[str]): List of SPG types to be ignored.
+        ignored_properties (List[str]): List of properties to be ignored.
+        default_properties (Dict[str, str]): Default properties for SPG types.
+        ignored_relations (List[str]): List of relations to be ignored.
+    """
+
     ignored_types: List[str] = ["Chunk"]
-    ignored_properties: List[str] = ["id", "name", "description", "stdId", "eventTime", "desc", "semanticType"]
+    ignored_properties: List[str] = [
+        "id",
+        "stdId",
+        "desc",
+        "description",
+        "eventTime",
+    ]
+    default_properties: Dict[str, str] = {
+        "name": "Text",
+    }
+
     ignored_relations: List[str] = ["isA"]
-    basic_types = {"Text": "文本", "Integer": "整型", "Float": "浮点型"}
 
     def __init__(
         self,
-        spg_type_names: List[SPGTypeName],
-        language: str = "zh",
+        spg_type_names: List[SPGTypeName] = [],
+        language: str = "",
         **kwargs,
     ):
+        """
+        Initializes the SPGPrompt instance.
+
+        Args:
+            spg_type_names (List[SPGTypeName], optional): List of SPG type names. Defaults to [].
+            language (str, optional): Language for the prompt. Defaults to "".
+            **kwargs: Additional keyword arguments. 
+ """ super().__init__(language=language, **kwargs) - self.all_schema_types = SchemaClient(project_id=self.project_id).load() + self.schema = SchemaClient(project_id=KAG_PROJECT_CONF.project_id).load() self.spg_type_names = spg_type_names if not spg_type_names: - self.spg_types = self.all_schema_types + self.spg_types = self.schema else: - self.spg_types = {k: v for k, v in self.all_schema_types.items() if k in spg_type_names} - self.schema_list = [] - - self._init_render_variables() + self.spg_types = { + k: v for k, v in self.schema.items() if k in spg_type_names + } + self.create_prompt_schema() + # self._init_render_variables() @property def template_variables(self) -> List[str]: + """ + Returns the list of template variables used in the prompt. + + Returns: + List[str]: List of template variables. + """ return ["schema", "input"] - def _init_render_variables(self): - self.type_en_to_zh = {"Text": "文本", "Integer": "整型", "Float": "浮点型"} - self.type_zh_to_en = { - "文本": "Text", - "整型": "Integer", - "浮点型": "Float", - } - self.prop_en_to_zh = {} - self.prop_zh_to_en = {} - for type_name, spg_type in self.all_schema_types.items(): - self.type_en_to_zh[type_name] = spg_type.name_zh - self.type_en_to_zh[spg_type.name_zh] = type_name - self.prop_zh_to_en[type_name] = {} - self.prop_en_to_zh[type_name] = {} - for _prop in spg_type.properties.values(): - if _prop.name in self.ignored_properties: + def get_accept_types(self): + """ + Returns the list of accepted SPG types. + + Returns: + List[SpgTypeEnum]: List of accepted SPG types. + """ + return [ + SpgTypeEnum.Entity, + SpgTypeEnum.Concept, + SpgTypeEnum.Event, + ] + + def build_prompt(self, variables: Dict[str, str]) -> str: + """ + Builds the prompt using the provided variables. + + Args: + variables (Dict[str, str]): Dictionary of variables to be used in the prompt. + + Returns: + str: The built prompt. + """ + return super().build_prompt( + { + "schema": copy.deepcopy(self.prompt_schema), + "input": variables.get("input"), + } + ) + + def process_property_name(self, name: str): + """ + Process property name by removing descriptions enclosed in parentheses. + Args: + name (dict): property names (possibly containing descriptions in parentheses) + + Returns: + str: A new string having the descriptions in parentheses removed. + + Example: + >>> name = 'authors(authors of work, such as director, actor, lyricist, composer and singer)' + >>> process_property_name(input_properties) + 'authors' + """ + + return name.split("(")[0] + + def process_property_names(self, properties: Dict): + """ + Process property names by removing descriptions enclosed in parentheses. + + This method iterates through the given dictionary of properties, removes any + descriptions enclosed in parentheses from the property names, and returns a new + dictionary with the processed names. If a property value is itself a dictionary, + this method will recursively process it. + + Args: + properties (dict): A dictionary where keys are property names (possibly containing + descriptions in parentheses) and values are either property values + or nested dictionaries. + + Returns: + dict: A new dictionary with the same structure as the input, but with all property + names having their descriptions in parentheses removed. + Example: + >>> input_properties = { + ... "authors(authors of work, such as director, actor, lyricist, composer and singer)": "John Doe" + ... 
} + >>> process_property_names(input_properties) + {'authors': 'John Doe'} + """ + output = {} + for k, v in properties.items(): + k = self.process_property_name(k) + if isinstance(v, dict): + output[k] = self.process_property_names(v) + else: + output[k] = v + return output + + def parse_response(self, response: str, **kwargs) -> List[SPGRecord]: + """ + Parses the response string into a list of SPG records. + + Args: + response (str): The response string to be parsed. + **kwargs: Additional keyword arguments. + + Returns: + List[SPGRecord]: List of parsed SPG records. + """ + rsp = response + if isinstance(rsp, str): + rsp = json.loads(rsp) + if isinstance(rsp, dict) and "output" in rsp: + rsp = rsp["output"] + outputs = [] + for item in rsp: + if "category" not in item or item["category"] not in self.schema: + continue + properties = item.get("properties", {}) + if "name" not in properties: + continue + output = {} + output["category"] = item["category"] + output["name"] = properties.pop("name") + output["properties"] = self.process_property_names(properties) + outputs.append(output) + return outputs + + def create_prompt_schema(self): + """ + Creates the schema for extraction prompt based on the project schema. + """ + prompt_schema = [] + accept_types = self.get_accept_types() + for type_name, spg_type in self.spg_types.items(): + if type_name in self.ignored_types: + continue + if spg_type.spg_type_enum not in accept_types: + continue + type_desc = spg_type.desc + properties = copy.deepcopy(self.default_properties) + for k, v in spg_type.properties.items(): + if k in self.ignored_properties or k in self.default_properties: continue - self.prop_en_to_zh[type_name][_prop.name] = _prop.name_zh - self.prop_zh_to_en[type_name][_prop.name_zh] = _prop.name - for _rel in spg_type.relations.values(): - if _rel.is_dynamic: + multi_value = ConstraintTypeEnum.MultiValue.value in v.constraint + obj_type_name = v.object_type_name.split(".")[-1] + if multi_value: + obj_type_name = f"List[{obj_type_name}]" + if v.desc: + v_name = f"{v.name}({v.desc})" + else: + v_name = v.name + properties[v_name] = obj_type_name + + for k, v in spg_type.relations.items(): + if k in self.ignored_relations or k in self.default_properties: + continue + if v.name in properties: continue - self.prop_en_to_zh[type_name][_rel.name] = _rel.name_zh - self.prop_zh_to_en[type_name][_rel.name_zh] = _rel.name + obj_type_name = v.object_type_name.split(".")[-1] + if v.desc: + v_name = f"{v.name}({v.desc})" + else: + v_name = v.name + properties[v_name] = obj_type_name + + if type_desc: + prompt_schema.append( + {f"{type_name}({type_desc})": {"properties": properties}} + ) + else: + prompt_schema.append({type_name: {"properties": properties}}) - def _render(self): - raise NotImplementedError + self.prompt_schema = prompt_schema -class SPG_KGPrompt(SPGPrompt): - template_zh: str = """ - { - "instruction": "你是一个图谱知识抽取的专家, 基于constraint 定义的schema,从input 中抽取出所有的实体及其属性,input中未明确提及的属性返回NAN,以标准json 格式输出,结果返回list", - "schema": $schema, +@PromptABC.register("spg_entity") +class SPGEntityPrompt(SPGPrompt): + template_zh: dict = { + "instruction": "作为一个图谱知识抽取的专家, 你需要基于定义了实体类型及对应属性的schema,从input字段的文本中抽取出所有的实体及其属性,schema中标记为List的属性返回list,未能提取的属性返回null。以标准json list格式输出,list中每个元素形如{category: properties},你可以参考example字段中给出的示例格式。注意实体属性的SemanticType指的是一个相比实体类型更具体且明确定义的类型,例如Person类型的SemanticType可以是Professor或Actor。", "example": [ - { - "input": 
"甲状腺结节是指在甲状腺内的肿块,可随吞咽动作随甲状腺而上下移动,是临床常见的病症,可由多种病因引起。临床上有多种甲状腺疾病,如甲状腺退行性变、炎症、自身免疫以及新生物等都可以表现为结节。甲状腺结节可以单发,也可以多发,多发结节比单发结节的发病率高,但单发结节甲状腺癌的发生率较高。患者通常可以选择在普外科,甲状腺外科,内分泌科,头颈外科挂号就诊。有些患者可以触摸到自己颈部前方的结节。在大多情况下,甲状腺结节没有任何症状,甲状腺功能也是正常的。甲状腺结节进展为其它甲状腺疾病的概率只有1%。有些人会感觉到颈部疼痛、咽喉部异物感,或者存在压迫感。当甲状腺结节发生囊内自发性出血时,疼痛感会更加强烈。治疗方面,一般情况下可以用放射性碘治疗,复方碘口服液(Lugol液)等,或者服用抗甲状腺药物来抑制甲状腺激素的分泌。目前常用的抗甲状腺药物是硫脲类化合物,包括硫氧嘧啶类的丙基硫氧嘧啶(PTU)和甲基硫氧嘧啶(MTU)及咪唑类的甲硫咪唑和卡比马唑。", - "schema": { - "Disease": { - "properties": { - "complication": "并发症", - "commonSymptom": "常见症状", - "applicableMedicine": "适用药品", - "department": "就诊科室", - "diseaseSite": "发病部位", - } - },"Medicine": { - "properties": { - } - } + { + "input": "周杰伦(Jay Chou),1979年1月18日出生于台湾省新北市,祖籍福建省永春县,华语流行乐男歌手、音乐人、演员、导演、编剧,毕业于淡江中学。2000年,发行个人首张音乐专辑《Jay》 [26]。2023年凭借《最伟大的作品》获得第一届浪潮音乐大赏年度制作、最佳作曲、最佳音乐录影带三项大奖。", + "output": [ + { + "category": "Person", + "properties": { + "name": "周杰伦", + "semanticType": "Musician", + "description": "华语流行乐男歌手、音乐人、演员、导演、编剧", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "台湾省新北市", + "semanticType": "City", + "description": "周杰伦的出生地", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "福建省永春县", + "semanticType": "County", + "description": "周杰伦的祖籍", + }, + }, + { + "category": "Organization", + "properties": { + "name": "淡江中学", + "semanticType": "School", + "description": "周杰伦的毕业学校", + }, + }, + { + "category": "Works", + "properties": { + "name": "Jay", + "semanticType": "Album", + "description": "周杰伦的个人首张音乐专辑", + }, + }, + { + "category": "Works", + "properties": { + "name": "最伟大的作品", + "semanticType": "MusicVideo", + "description": "周杰伦凭借此作品获得多项音乐大奖", + }, + }, + ], } - "output": [ - { - "entity": "甲状腺结节", - "category":"Disease" - "properties": { - "complication": "甲状腺癌", - "commonSymptom": ["颈部疼痛", "咽喉部异物感", "压迫感"], - "applicableMedicine": ["复方碘口服液(Lugol液)", "丙基硫氧嘧啶(PTU)", "甲基硫氧嘧啶(MTU)", "甲硫咪唑", "卡比马唑"], - "department": ["普外科", "甲状腺外科", "内分泌科", "头颈外科"], - "diseaseSite": "甲状腺", - } - },{ - "entity":"复方碘口服液(Lugol液)", - "category":"Medicine" - },{ - "entity":"丙基硫氧嘧啶(PTU)", - "category":"Medicine" - },{ - "entity":"甲基硫氧嘧啶(MTU)", - "category":"Medicine" - },{ - "entity":"甲硫咪唑", - "category":"Medicine" - },{ - "entity":"卡比马唑", - "category":"Medicine" - } - ], - "input": "$input" + ], } - """ - template_en: str = """ - { - "instruction": "You are an expert in knowledge graph extraction. Based on the schema defined by constraints, extract all entities and their attributes from the input. For attributes not explicitly mentioned in the input, return NAN. Output the results in standard JSON format as a list.", - "schema": $schema, + template_en: dict = { + "instruction": "As an expert in graph knowledge extraction, you need to extract all entities and their properties from the text in the input field based on a schema that defines entity types and their corresponding attributes. Attributes marked as List in the schema should return a list, and attributes not extracted should return null. Output the results in a standard JSON list format, where each element in the list is in the form of {category: properties}. You can refer to the example format provided in the example field. 
Note that the SemanticType of an entity attribute refers to a more specific and clearly defined type compared to the entity type itself, such as Professor or Actor for the Person type.",
+        "example": [
+            {
+                "input": "Jay Chou, born on January 18, 1979, in New Taipei City, Taiwan Province, with ancestral roots in Yongchun County, Fujian Province, is a renowned male singer, musician, actor, director, and screenwriter in the realm of Chinese pop music. He graduated from Tamkang University. In 2000, he released his debut solo album, Jay [26]. 
In 2023, he was honored with three major awards at the inaugural Wave Music Awards for Best Production, Best Composition, and Best Music Video for his album The Greatest Work.", + "output": [ + { + "category": "Person", + "properties": { + "name": "Jay Chou", + "semanticType": "Musician", + "description": "renowned male singer, musician, actor, director, and screenwriter in the realm of Chinese pop music", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "New Taipei City, Taiwan Province", + "semanticType": "City", + "description": "Jay Chou's birthplace", + }, + }, + { + "category": "GeographicLocation", + "properties": { + "name": "Yongchun County, Fujian Province", + "semanticType": "County", + "description": "Jay Chou's ancestral roots", + }, + }, + { + "category": "Organization", + "properties": { + "name": "Tamkang University", + "semanticType": "University", + "description": "Jay Chou's alma mater", + }, + }, + { + "category": "Works", + "properties": { + "name": "Jay", + "semanticType": "Album", + "description": "Jay Chou's debut solo album", + }, + }, + { + "category": "Works", + "properties": { + "name": "The Greatest Work", + "semanticType": "Album", + "description": "Jay Chou's album for which he won multiple awards", + }, + }, + ], } + ], + } + + def get_accept_types(self): + return [ + SpgTypeEnum.Entity, + SpgTypeEnum.Concept, + ] + + +@PromptABC.register("spg_event") +class SPGEventPrompt(SPGPrompt): + template_zh: dict = { + "instruction": "作为一个知识图谱图谱事件抽取的专家, 你需要基于定义的事件类型及对应属性的schema,从input字段的文本中抽取出所有的事件及其属性,schema中标记为List的属性返回list,未能提取的属性返回null。以标准json list格式输出,list中每个元素形如{category: properties},你可以参考example字段中给出的示例格式。", + "example": { + "input": "1986年,周星驰被调入无线电视台戏剧组;同年,他在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角;之后,他还在温兆伦、郭晋安等人主演的电视剧中跑龙套。", "output": [ { - "entity": "Thyroid Nodule", - "category": "Disease", + "category": "Event", + "properties": { + "name": "周星驰被调入无线电视台戏剧组", + "abstract": "1986年,周星驰被调入无线电视台戏剧组。", + "subject": "周星驰", + "time": "1986年", + "location": "无线电视台", + "participants": [], + "semanticType": "调动", + }, + }, + { + "category": "Event", "properties": { - "complication": "Thyroid Cancer", - "commonSymptom": ["Neck Pain", "Foreign Body Sensation in the Throat", "Feeling of Pressure"], - "applicableMedicine": ["Lugol's Solution (Compound Iodine Oral Solution)", "Propylthiouracil (PTU)", "Methylthiouracil (MTU)", "Methimazole", "Carbimazole"],\n "department": ["General Surgery", "Thyroid Surgery", "Endocrinology", "Head and Neck Surgery"],\n "diseaseSite": "Thyroid"\n }\n },\n {\n "entity": "Lugol's Solution (Compound Iodine Oral Solution)", - "category": "Medicine" + "name": "周星驰在《哥哥的女友》中饰演潘家伟", + "abstract": "1986年,周星驰在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角。", + "subject": "周星驰", + "time": "1986年", + "location": None, + "participants": [], + "semanticType": "演出", + }, }, { - "entity": "Propylthiouracil (PTU)", - "category": "Medicine" + "category": "Event", + "properties": { + "name": "周星驰跑龙套", + "abstract": "1986年,周星驰在温兆伦、郭晋安等人主演的电视剧中跑龙套。", + "subject": "周星驰", + "time": "1986年", + "location": None, + "participants": ["温兆伦", "郭晋安"], + "semanticType": "演出", + }, }, + ], + }, + } + + template_en: dict = { + "instruction": "As an expert in knowledge graph event extraction, you need to extract all events and their attributes from the text in the input field based on the defined event types and corresponding attribute schema. 
For attributes marked as List in the schema, return them as a list, and for attributes that cannot be extracted, return null. Output in the standard JSON list format, with each element in the list having the form {category: properties}. You can refer to the example format provided in the example field.",
+        "example": {
+            "input": "In 1986, Stephen Chow was transferred to the drama department of Television Broadcasts Limited (TVB). In the same year, he played the role of Pan Jiawei, a lovable, lively, and slightly shy character, in the episodic situational comedy My Brother's Girlfriend. This was his first time taking on a lead role in a sitcom. Later, he also had minor roles in TV series starring actors such as Deric Wan and Roger Kwok.",
+            "output": [
+                {
+                    "category": "Event",
+                    "properties": {
+                        "name": "Stephen Chow was transferred to the drama department of TVB",
+                        "abstract": "In 1986, Stephen Chow was transferred to the drama department of Television Broadcasts Limited (TVB).",
+                        "subject": "Stephen Chow",
+                        "time": "1986",
+                        "location": "Television Broadcasts Limited (TVB)",
+                        "participants": [],
+                        "semanticType": "调动",
+                    },
+                },
+                {
+                    "category": "Event",
+                    "properties": {
+                        "name": "Stephen Chow played Pan Jiawei in My Brother's Girlfriend",
+                        "abstract": "In 1986, Stephen Chow played the role of Pan Jiawei, a lovable, lively, and slightly shy character, in the episodic situational comedy My Brother's Girlfriend. This was his first time taking on a lead role in a sitcom.",
+                        "subject": "Stephen Chow",
+                        "time": "1986",
+                        "location": None,
+                        "participants": [],
+                        "semanticType": "演出",
+                    },
+                },
+                {
+                    "category": "Event",
+                    "properties": {
+                        "name": "Stephen Chow had minor roles in TV series",
+                        "abstract": "Later, Stephen Chow also had minor roles in TV series starring actors such as Deric Wan and Roger Kwok.",
+                        "subject": "Stephen Chow",
+                        "time": None,
+                        "location": None,
+                        "participants": ["Deric Wan", "Roger Kwok"],
+                        "semanticType": "演出",
+                    },
+                },
+            ],
+        },
+    }
+
+    def get_accept_types(self):
+        return [
+            SpgTypeEnum.Event,
+        ]
+
+
+@PromptABC.register("spg_relation")
+class SPGRelationPrompt(SPGPrompt):
+    template_zh: dict = {
+        "instruction": "您是一位专门从事开放信息提取(OpenIE)的专家。schema定义了你需要关注的实体类型以及可选的用括号包围的类型解释,entity_list是一组实体列表。请从input字段的文本中提取任何可能的[主语实体,主语实体类型,谓语,宾语实体,宾语实体类型]五元组,并按照JSON列表格式列出它们。请严格遵循以下要求:\n1. 主语实体和宾语实体应至少有一个包含在entity_list实体列表,但不要求都包含\n2. 主语和宾语实体类型必须是schema定义的类型,否则无效,\n3. 
明确地将代词解析为对应名称,以保持清晰度。", + "example": { + "input": "1986年,周星驰被调入无线电视台戏剧组;同年,他在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角;之后,他还在温兆伦、郭晋安等人主演的电视剧中跑龙套。", + "entity_list": [ + {"name": "周星驰", "category": "Person"}, + {"name": "无线电视台", "category": "Organization"}, + {"name": "哥哥的女友", "category": "Works"}, + {"name": "潘家伟", "category": "Person"}, + {"name": "温兆伦", "category": "Person"}, + {"name": "郭晋安", "category": "Person"}, + ], + "output": [ + ["周星驰", "Person", "被调入", "无线电视台", "Organization"], + ["周星驰", "Person", "出演", "哥哥的女朋友", "Works"], + ["周星驰", "Person", "饰演", "潘家伟", "Person"], + ["周星驰", "Person", "共演", "温兆伦", "Person"], + ["周星驰", "Person", "共演", "郭晋安", "Person"], + [ + "周星驰", + "Person", + "跑龙套", + "温兆伦、郭晋安等人主演的电视剧", + "Works", + ], + ], + }, + } + + template_en: dict = { + "instruction": "You are an expert in Open Information Extraction (OpenIE). The schema defines the entity types you need to focus on, along with optional type explanations enclosed in parentheses. The entity_list is a set of entity lists. Please extract any possible [subject entity, subject entity class type, predicate, object entity, object entity type] quintuples from the text in the input field and list them in JSON list format. Please adhere strictly to the following requirements:1. At least one of the subject entity and object entity must appear in the entity_list.\n2. The subject and object entity types must be defined in the schema; otherwise, they are considered invalid.\n3.Resolve pronouns to their corresponding names explicitly to maintain clarity.", + "example": { + "input": "In 1986, Stephen Chow was transferred to the drama division of TVB; that same year, he played the cute, lively, and slightly shy Pan Jiawei in the situational drama 'My Brother's Girlfriend,' which was also his first time as the male lead in a situational drama; later, he also appeared as an extra in TV dramas starring Deric Wan, Roger Kwok, and others.", + "entity_list": [ + {"name": "Stephen Chow", "category": "Person"}, + {"name": "TVB", "category": "Organization"}, + {"name": "My Brother's Girlfriend", "category": "Works"}, + {"name": "Pan Jiawei", "category": "Person"}, + {"name": "Deric Wan", "category": "Person"}, + {"name": "Roger Kwok", "category": "Person"}, + ], + "output": [ + ["Stephen Chow", "Person", "was transferred to", "TVB", "Organization"], + [ + "Stephen Chow", + "Person", + "starred in", + "My Brother's Girlfriend", + "Works", + ], + ["Stephen Chow", "Person", "played", "Pan Jiawei", "Person"], + ["Stephen Chow", "Person", "co-starred with", "Deric Wan", "Person"], + ["Stephen Chow", "Person", "co-starred with", "Roger Kwok", "Person"], + [ + "Stephen Chow", + "Person", + "appeared as an extra in", + "TV dramas starring Deric Wan, Roger Kwok, and others", + "Works", + ], + ], + }, + } + + def get_accept_types(self): + """ + Returns the list of accepted SPG types. + + Returns: + List[SpgTypeEnum]: List of accepted SPG types. + """ + return [ + SpgTypeEnum.Entity, + SpgTypeEnum.Concept, + ] def build_prompt(self, variables: Dict[str, str]) -> str: - schema = {} - for tmpSchema in self.schema_list: - schema.update(tmpSchema) + """ + Builds the prompt using the provided variables. + + Args: + variables (Dict[str, str]): Dictionary of variables to be used in the prompt. - return super().build_prompt({"schema": schema, "input": variables.get("input")}) + Returns: + str: The built prompt. 
+ """ + schema = [] + for item in self.prompt_schema: + schema.extend(item.keys()) + return super().build_prompt( + { + "schema": schema, + "input": variables.get("input"), + } + ) def parse_response(self, response: str, **kwargs) -> List[SPGRecord]: + """ + Parses the response string into a list of SPG records. + + Args: + response (str): The response string to be parsed. + **kwargs: Additional keyword arguments. + + Returns: + List[SPGRecord]: List of parsed SPG records. + """ rsp = response if isinstance(rsp, str): rsp = json.loads(rsp) if isinstance(rsp, dict) and "output" in rsp: rsp = rsp["output"] - if isinstance(rsp, dict) and "named_entities" in rsp: - entities = rsp["named_entities"] - else: - entities = rsp - - return entities - - def _render(self): - spo_list = [] - for type_name, spg_type in self.spg_types.items(): - if spg_type.spg_type_enum not in [SpgTypeEnum.Entity, SpgTypeEnum.Concept, SpgTypeEnum.Event]: + outputs = [] + for item in rsp: + if len(item) != 5: continue - constraint = {} - properties = {} - properties.update( - { - v.name: (f"{v.name_zh}" if not v.desc else f"{v.name_zh},{v.desc}") if self.language == "zh" else (f"{v.name}" if not v.desc else f"{v.name}, {v.desc}") - for k, v in spg_type.properties.items() - if k not in self.ignored_properties - } - ) - properties.update( - { - f"{v.name}#{v.object_type_name_en}": ( - f"{v.name_zh},类型是{v.object_type_name_zh}" - if not v.desc - else f"{v.name_zh},{v.desc},类型是{v.object_type_name_zh}" - ) if self.language == "zh" else ( - f"{v.name}, the type is {v.object_type_name_en}" - if not v.desc - else f"{v.name},{v.desc}, the type is {v.object_type_name_en}" - ) - for k, v in spg_type.relations.items() - if not v.is_dynamic and k not in self.ignored_relations - } - ) - constraint.update({"properties": properties}) - spo_list.append({type_name: constraint}) - - self.schema_list = spo_list + s_name, s_label, predicate, o_name, o_label = item + s_label = self.process_property_name(s_label) + o_label = self.process_property_name(o_label) + # force convert to OTHER_TYPE or just drop it? + if s_label not in self.schema: + s_label = OTHER_TYPE + if o_label not in self.schema: + o_label = OTHER_TYPE + outputs.append([s_name, s_label, predicate, o_name, o_label]) + return outputs diff --git a/kag/builder/component/extractor/user_defined_extractor.py b/kag/builder/prompt/utils.py similarity index 54% rename from kag/builder/component/extractor/user_defined_extractor.py rename to kag/builder/prompt/utils.py index ada267c2..79984001 100644 --- a/kag/builder/component/extractor/user_defined_extractor.py +++ b/kag/builder/prompt/utils.py @@ -10,20 +10,15 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
-from typing import Dict, List
+from kag.interface import PromptABC
 
-from knext.common.base.runnable import Input, Output
-from kag.interface.builder import ExtractorABC
 
+def init_prompt_with_fallback(prompt_name, biz_scene):
+    try:
+        return PromptABC.from_config({"type": f"{biz_scene}_{prompt_name}"})
+    except Exception as e:
+        print(
+            f"failed to initialize prompt with biz scene {biz_scene}, falling back to the default biz scene, info: {e}"
+        )
 
-class UserDefinedExtractor(ExtractorABC):
-    @property
-    def input_types(self) -> Input:
-        return Dict[str, str]
-
-    @property
-    def output_types(self) -> Output:
-        return Dict[str, str]
-
-    def invoke(self, input: Input, **kwargs) -> List[Output]:
-        return input
+    return PromptABC.from_config({"type": f"default_{prompt_name}"})
diff --git a/kag/builder/runner.py b/kag/builder/runner.py
new file mode 100644
index 00000000..c1420d85
--- /dev/null
+++ b/kag/builder/runner.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+
+import os
+import traceback
+import logging
+import threading
+from typing import Dict
+from tqdm import tqdm
+
+from kag.common.conf import KAG_PROJECT_CONF
+from kag.common.registry import Registrable
+from kag.common.utils import reset, bold, red, generate_hash_id
+from kag.common.checkpointer import CheckpointerManager
+from kag.interface import KAGBuilderChain, ScannerABC
+
+from kag.builder.model.sub_graph import SubGraph
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+logger = logging.getLogger()
+
+
+def str_abstract(value: str):
+    """
+    Abstracts a string value by returning the base name if it is a file path, or the first 10 characters otherwise.
+
+    Args:
+        value (str): The string value to be abstracted.
+
+    Returns:
+        str: The abstracted string value.
+    """
+    if os.path.exists(value):
+        return os.path.basename(value)
+    return value[:10]
+
+
+def dict_abstract(value: Dict):
+    """
+    Abstracts each value in a dictionary by converting it to a string and then abstracting the string.
+
+    Args:
+        value (Dict): The dictionary to be abstracted.
+
+    Returns:
+        Dict: The abstracted dictionary.
+    """
+    output = {}
+    for k, v in value.items():
+        output[k] = str_abstract(str(v))
+    return output
+
+
+def generate_hash_id_and_abstract(value):
+    hash_id = generate_hash_id(value)
+    if isinstance(value, dict):
+        abstract = dict_abstract(value)
+    else:
+        abstract = str_abstract(value)
+    return hash_id, abstract
+
+
+class BuilderChainRunner(Registrable):
+    """
+    A class that manages the execution of a KAGBuilderChain with parallel processing and checkpointing.
+
+    This class provides methods to initialize the runner, process input data, and manage checkpoints for tracking processed data.
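+
+    Example (illustrative; the component configs are placeholders that must
+    match scanner/chain implementations registered in your project):
+
+        runner = BuilderChainRunner.from_config(
+            {
+                "scanner": {"type": "dir"},
+                "chain": {...},  # a registered KAGBuilderChain config
+            }
+        )
+        runner.invoke("./builder/data")
+    """
+
+    def __init__(
+        self,
+        scanner: ScannerABC,
+        chain: KAGBuilderChain,
+        num_chains: int = 2,
+        num_threads_per_chain: int = 8,
+    ):
+        """
+        Initializes the BuilderChainRunner instance.
+
+        Args:
+            scanner (ScannerABC): The source scanner to generate input data.
+            chain (KAGBuilderChain): The builder chain to process the input data. 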
+            num_chains (int, optional): The number of parallel threads to use, with each thread launching a builder chain instance. Defaults to 2.
+            num_threads_per_chain (int, optional): The number of parallel workers within a builder chain. Defaults to 8.
+
+        Note: the checkpoint directory is not a constructor argument; it is
+        read from KAG_PROJECT_CONF.ckpt_dir.
+        """
+        self.scanner = scanner
+        self.chain = chain
+        self.num_chains = num_chains
+        self.num_threads_per_chain = num_threads_per_chain
+        self.ckpt_dir = KAG_PROJECT_CONF.ckpt_dir
+
+        self.checkpointer = CheckpointerManager.get_checkpointer(
+            {
+                "type": "txt",
+                "ckpt_dir": self.ckpt_dir,
+                "rank": self.scanner.sharding_info.get_rank(),
+                "world_size": self.scanner.sharding_info.get_world_size(),
+            }
+        )
+        self.processed_chunks = CheckpointerManager.get_checkpointer(
+            {
+                "type": "zodb",
+                "ckpt_dir": os.path.join(self.ckpt_dir, "chain"),
+                "rank": self.scanner.sharding_info.get_rank(),
+                "world_size": self.scanner.sharding_info.get_world_size(),
+            }
+        )
+        self._local = threading.local()
+
+    def invoke(self, input):
+        """
+        Processes the input data using the builder chain in parallel and manages checkpoints.
+
+        Args:
+            input: The input data to be processed.
+        """
+
+        # def process(thread_local, chain_conf, data, data_id, data_abstract):
+        #     try:
+        #         if not hasattr(thread_local, "chain"):
+        #             if chain_conf:
+        #                 thread_local.chain = KAGBuilderChain.from_config(chain_conf)
+        #             else:
+        #                 thread_local.chain = self.chain
+        #         result = thread_local.chain.invoke(
+        #             data, max_workers=self.num_threads_per_chain
+        #         )
+        #         return data, data_id, data_abstract, result
+        #     except Exception:
+        #         traceback.print_exc()
+        #         return None
+
+        def process(data, data_id, data_abstract):
+            try:
+                result = self.chain.invoke(
+                    data,
+                    max_workers=self.num_threads_per_chain,
+                    processed_chunk_keys=self.processed_chunks.keys(),
+                )
+                return data, data_id, data_abstract, result
+            except Exception:
+                traceback.print_exc()
+                return None
+
+        futures = []
+        print(f"Processing {input}")
+        success = 0
+        try:
+            with ThreadPoolExecutor(self.num_chains) as executor:
+                for item in self.scanner.generate(input):
+                    item_id, item_abstract = generate_hash_id_and_abstract(item)
+                    if self.checkpointer.exists(item_id):
+                        continue
+                    fut = executor.submit(
+                        process,
+                        item,
+                        item_id,
+                        item_abstract,
+                    )
+                    futures.append(fut)
+
+                for future in tqdm(
+                    as_completed(futures),
+                    total=len(futures),
+                    desc="Progress",
+                    position=0,
+                ):
+                    result = future.result()
+                    if result is not None:
+                        item, item_id, item_abstract, chain_output = result
+                        num_nodes = 0
+                        num_edges = 0
+                        num_subgraphs = 0
+                        # chain output may mix SubGraphs and dicts mapping
+                        # processed chunk keys to SubGraphs.
+                        for output_item in chain_output:
+                            if isinstance(output_item, SubGraph):
+                                num_nodes += len(output_item.nodes)
+                                num_edges += len(output_item.edges)
+                                num_subgraphs += 1
+                            elif isinstance(output_item, dict):
+                                for k, v in output_item.items():
+                                    self.processed_chunks.write_to_ckpt(k, k)
+                                    if isinstance(v, SubGraph):
+                                        num_nodes += len(v.nodes)
+                                        num_edges += len(v.edges)
+                                        num_subgraphs += 1
+
+                        info = {
+                            "num_nodes": num_nodes,
+                            "num_edges": num_edges,
+                            "num_subgraphs": num_subgraphs,
+                        }
+                        self.checkpointer.write_to_ckpt(
+                            item_id, {"abstract": item_abstract, "graph_stat": info}
+                        )
+                        success += 1
+        except Exception:
+            traceback.print_exc()
+        CheckpointerManager.close()
+        msg = (
+            f"{bold}{red}Done processing {len(futures)} records, with {success} successfully processed and {len(futures)-success} failures encountered.\n"
+            f"The log file is located at {self.checkpointer._ckpt_file_path}. 
" + f"Please access this file to obtain detailed task statistics.{reset}" + ) + print(msg) + + +BuilderChainRunner.register("base", as_default=True)(BuilderChainRunner) diff --git a/kag/common/__init__.py b/kag/common/__init__.py index 123acd8d..93aa6cd4 100644 --- a/kag/common/__init__.py +++ b/kag/common/__init__.py @@ -9,4 +9,3 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. - diff --git a/kag/common/arks_pb2.py b/kag/common/arks_pb2.py index 0a693f00..01462624 100644 --- a/kag/common/arks_pb2.py +++ b/kag/common/arks_pb2.py @@ -6,191 +6,166 @@ # 参考文档: https://yuque.antfin-inc.com/ai-infra/ndhopc/smk38dcs9zqr1ssh#Kb7e0 import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) + +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) from google.protobuf.internal import enum_type_wrapper from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - DESCRIPTOR = _descriptor.FileDescriptor( - name='arks.proto', - package='arks', - syntax='proto2', - serialized_options=_b('\n\025com.alipay.arks.proto'), - serialized_pb=_b('\n\narks.proto\x12\x04\x61rks\"\xfc\x01\n\x13InferTensorContents\x12\x14\n\x0cstring_value\x18\x01 \x03(\t\x12\x12\n\nbool_value\x18\x02 \x03(\x08\x12\x11\n\tint_value\x18\x03 \x03(\x05\x12\x13\n\x0bint64_value\x18\x04 \x03(\x03\x12\x12\n\nuint_value\x18\x05 \x03(\r\x12\x14\n\x0cuint64_value\x18\x06 \x03(\x04\x12\x12\n\nfp32_value\x18\x07 \x03(\x02\x12\x12\n\nfp64_value\x18\x08 \x03(\x01\x12\x12\n\nbyte_value\x18\t \x03(\x0c\x12-\n\x04type\x18\n \x01(\x0e\x32\x11.arks.ContentType:\x0cTYPE_INVALID\"q\n\x04Pair\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\x12+\n\x08\x63ontents\x18\x03 \x01(\x0b\x32\x19.arks.InferTensorContents\x12\x10\n\x08pb_value\x18\x04 \x03(\x0c\x12\x0e\n\x06shapes\x18\x05 \x03(\x05\"\x97\x01\n\x06RowKey\x12\x0f\n\x07row_key\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\x03\x12\x1a\n\x12\x61nt_fea_track_info\x18\x03 \x01(\t\x12\'\n\npartitions\x18\x04 \x03(\x0b\x32\x13.arks.PartitionInfo\x12%\n\x11realtime_features\x18\x05 \x03(\x0b\x32\n.arks.Pair\",\n\rPartitionInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\xb9\x01\n\x04Item\x12\x0f\n\x07item_id\x18\x01 \x02(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\x1e\n\nattributes\x18\x03 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x04 \x01(\x02\x12 \n\tsub_items\x18\x05 \x03(\x0b\x32\r.arks.SubItem\x12\x1d\n\x11is_features_valid\x18\x06 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x07 \x03(\x02\x42\x02\x10\x01\"\x9a\x01\n\x07SubItem\x12\x0f\n\x07item_id\x18\x01 \x01(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x03 \x01(\x02\x12\x1d\n\x11is_features_valid\x18\x04 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x1e\n\nattributes\x18\x06 \x03(\x0b\x32\n.arks.Pair\"\xc7\x03\n\x08SeekPlan\x12\x14\n\x0cstorage_type\x18\x01 \x01(\t\x12\r\n\x05table\x18\x02 \x01(\t\x12\x15\n\rcolumn_family\x18\x03 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x04 \x03(\t\x12\x12\n\nkvpair_sep\x18\x05 
\x01(\t\x12\x0e\n\x06kv_sep\x18\x06 \x01(\t\x12\x0f\n\x07\x63luster\x18\x07 \x01(\t\x12\x1e\n\x08row_keys\x18\x08 \x03(\x0b\x32\x0c.arks.RowKey\x12\x12\n\ntimeout_ms\x18\t \x01(\x05\x12\x1b\n\x13\x63\x61\x63he_expire_second\x18\n \x01(\x05\x12\x10\n\x08url_user\x18\x0b \x01(\t\x12\x10\n\x08url_item\x18\x0c \x01(\t\x12\x17\n\x0f\x61nt_feature_req\x18\r \x01(\x0c\x12\n\n\x02id\x18\x0e \x01(\t\x12\x16\n\x0ekb_feature_req\x18\x0f \x01(\x0c\x12\x11\n\tdebuginfo\x18\x10 \x01(\t\x12\x11\n\tseparator\x18\x11 \x01(\t\x12=\n\x12item_sequence_type\x18\x12 \x01(\x0e\x32\x16.arks.ItemSequenceType:\tTYPE_NONE\x12\"\n\x0emissing_values\x18\x13 \x03(\x0b\x32\n.arks.Pair\"\x8f\x01\n\x0b\x44umpReqInfo\x12\x0e\n\x06time_s\x18\x01 \x01(\x05\x12\x0e\n\x06oss_id\x18\x02 \x01(\t\x12\x0f\n\x07oss_key\x18\x03 \x01(\t\x12\x13\n\x0btarget_addr\x18\x04 \x01(\t\x12\x10\n\x08query_id\x18\x05 \x01(\x03\x12\r\n\x05token\x18\x06 \x01(\t\x12\x0b\n\x03\x61pp\x18\x07 \x01(\t\x12\x0c\n\x04host\x18\x08 \x01(\t\"\xb3\x04\n\x0b\x41rksRequest\x12\x12\n\x07version\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05\x64\x65\x62ug\x18\x02 \x01(\x05\x12\x0f\n\x07is_ping\x18\x03 \x01(\x08\x12\x12\n\nsession_id\x18\x04 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x05 \x01(\t\x12\x0b\n\x03uid\x18\x06 \x01(\t\x12 \n\x0cuser_profile\x18\x07 \x03(\x0b\x32\n.arks.Pair\x12\"\n\x0escene_features\x18\x08 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\t \x03(\x0b\x32\n.arks.Item\x12\x15\n\x07is_sort\x18\n \x01(\x08:\x04true\x12\x11\n\x05\x63ount\x18\x0b \x01(\x05:\x02\x31\x30\x12.\n\nout_format\x18\x0c \x01(\x0e\x32\x16.arks.OutputFormatType:\x02PB\x12\x12\n\nchain_name\x18\r \x01(\t\x12\x0b\n\x03scm\x18\x0e \x01(\t\x12\x12\n\nscene_name\x18\x0f \x01(\t\x12\x14\n\x0citem_schemas\x18\x10 \x03(\t\x12\x18\n\x10sub_item_schemas\x18\x11 \x03(\t\x12\"\n\nseek_plans\x18\x12 \x03(\x0b\x32\x0e.arks.SeekPlan\x12(\n\rdump_req_info\x18\x13 \x01(\x0b\x32\x11.arks.DumpReqInfo\x12\x10\n\x08\x61pp_name\x18\x14 \x01(\t\x12\x16\n\x0ereq_timeout_ms\x18\x15 \x01(\x04\x12\x16\n\x0e\x63lient_version\x18\x16 \x01(\t\x12\n\n\x02ip\x18\x17 \x01(\t\"\xba\x02\n\x0c\x41rksResponse\x12,\n\nerror_code\x18\x01 \x01(\x0e\x32\x0f.arks.ErrorCode:\x07SUCCESS\x12\x12\n\nsession_id\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12 \n\x0cuser_profile\x18\x04 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\x05 \x03(\x0b\x32\n.arks.Item\x12\x11\n\tdebug_msg\x18\x06 \x01(\t\x12\x0b\n\x03scm\x18\x07 \x01(\t\x12\"\n\nseek_plans\x18\x08 \x03(\x0b\x32\x0e.arks.SeekPlan\x12\x0f\n\x07\x65rr_msg\x18\t \x01(\t\x12\x10\n\x08\x61lgo_ret\x18\n \x01(\x05\x12\x10\n\x08\x61lgo_msg\x18\x0b \x01(\t\x12\x11\n\ttrace_msg\x18\x0c \x01(\t\x12\n\n\x02rt\x18\r 
\x01(\x05*T\n\x10OutputFormatType\x12\x06\n\x02PB\x10\x01\x12\x08\n\x04JSON\x10\x02\x12\x08\n\x04TEXT\x10\x03\x12\r\n\tSNAPPY_PB\x10\x04\x12\x06\n\x02\x46\x42\x10\x05\x12\r\n\tSNAPPY_FB\x10\x06*\x86\x01\n\tErrorCode\x12\x0b\n\x07SUCCESS\x10\x00\x12\x0b\n\x07TIMEOUT\x10\x01\x12\r\n\tSCENE_ERR\x10\x02\x12\r\n\tPARAM_ERR\x10\x03\x12\x0e\n\nSYSTEM_ERR\x10\x04\x12\x0f\n\x0bSERVICE_ERR\x10\x05\x12\x10\n\x0c\x46LOW_CONTROL\x10\x06\x12\x0e\n\nOTHERS_ERR\x10\x07*\xae\x01\n\x0b\x43ontentType\x12\x10\n\x0cTYPE_INVALID\x10\x00\x12\r\n\tTYPE_BOOL\x10\x01\x12\x0e\n\nTYPE_INT32\x10\x02\x12\x0e\n\nTYPE_INT64\x10\x03\x12\x0f\n\x0bTYPE_UINT32\x10\x04\x12\x0f\n\x0bTYPE_UINT64\x10\x05\x12\r\n\tTYPE_FP32\x10\x06\x12\r\n\tTYPE_FP64\x10\x07\x12\x0f\n\x0bTYPE_STRING\x10\x08\x12\r\n\tTYPE_BYTE\x10\t*A\n\x10ItemSequenceType\x12\r\n\tTYPE_NONE\x10\x00\x12\x0f\n\x0bTYPE_CONCAT\x10\x01\x12\r\n\tTYPE_FLAT\x10\x02\x42\x17\n\x15\x63om.alipay.arks.proto') + name="arks.proto", + package="arks", + syntax="proto2", + serialized_options=_b("\n\025com.alipay.arks.proto"), + serialized_pb=_b( + '\n\narks.proto\x12\x04\x61rks"\xfc\x01\n\x13InferTensorContents\x12\x14\n\x0cstring_value\x18\x01 \x03(\t\x12\x12\n\nbool_value\x18\x02 \x03(\x08\x12\x11\n\tint_value\x18\x03 \x03(\x05\x12\x13\n\x0bint64_value\x18\x04 \x03(\x03\x12\x12\n\nuint_value\x18\x05 \x03(\r\x12\x14\n\x0cuint64_value\x18\x06 \x03(\x04\x12\x12\n\nfp32_value\x18\x07 \x03(\x02\x12\x12\n\nfp64_value\x18\x08 \x03(\x01\x12\x12\n\nbyte_value\x18\t \x03(\x0c\x12-\n\x04type\x18\n \x01(\x0e\x32\x11.arks.ContentType:\x0cTYPE_INVALID"q\n\x04Pair\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\x12+\n\x08\x63ontents\x18\x03 \x01(\x0b\x32\x19.arks.InferTensorContents\x12\x10\n\x08pb_value\x18\x04 \x03(\x0c\x12\x0e\n\x06shapes\x18\x05 \x03(\x05"\x97\x01\n\x06RowKey\x12\x0f\n\x07row_key\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\x03\x12\x1a\n\x12\x61nt_fea_track_info\x18\x03 \x01(\t\x12\'\n\npartitions\x18\x04 \x03(\x0b\x32\x13.arks.PartitionInfo\x12%\n\x11realtime_features\x18\x05 \x03(\x0b\x32\n.arks.Pair",\n\rPartitionInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t"\xb9\x01\n\x04Item\x12\x0f\n\x07item_id\x18\x01 \x02(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\x1e\n\nattributes\x18\x03 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x04 \x01(\x02\x12 \n\tsub_items\x18\x05 \x03(\x0b\x32\r.arks.SubItem\x12\x1d\n\x11is_features_valid\x18\x06 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x07 \x03(\x02\x42\x02\x10\x01"\x9a\x01\n\x07SubItem\x12\x0f\n\x07item_id\x18\x01 \x01(\t\x12\x1c\n\x08\x66\x65\x61tures\x18\x02 \x03(\x0b\x32\n.arks.Pair\x12\r\n\x05score\x18\x03 \x01(\x02\x12\x1d\n\x11is_features_valid\x18\x04 \x03(\x08\x42\x02\x10\x01\x12\x12\n\x06scores\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x1e\n\nattributes\x18\x06 \x03(\x0b\x32\n.arks.Pair"\xc7\x03\n\x08SeekPlan\x12\x14\n\x0cstorage_type\x18\x01 \x01(\t\x12\r\n\x05table\x18\x02 \x01(\t\x12\x15\n\rcolumn_family\x18\x03 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x04 \x03(\t\x12\x12\n\nkvpair_sep\x18\x05 \x01(\t\x12\x0e\n\x06kv_sep\x18\x06 \x01(\t\x12\x0f\n\x07\x63luster\x18\x07 \x01(\t\x12\x1e\n\x08row_keys\x18\x08 \x03(\x0b\x32\x0c.arks.RowKey\x12\x12\n\ntimeout_ms\x18\t \x01(\x05\x12\x1b\n\x13\x63\x61\x63he_expire_second\x18\n \x01(\x05\x12\x10\n\x08url_user\x18\x0b \x01(\t\x12\x10\n\x08url_item\x18\x0c \x01(\t\x12\x17\n\x0f\x61nt_feature_req\x18\r \x01(\x0c\x12\n\n\x02id\x18\x0e \x01(\t\x12\x16\n\x0ekb_feature_req\x18\x0f 
\x01(\x0c\x12\x11\n\tdebuginfo\x18\x10 \x01(\t\x12\x11\n\tseparator\x18\x11 \x01(\t\x12=\n\x12item_sequence_type\x18\x12 \x01(\x0e\x32\x16.arks.ItemSequenceType:\tTYPE_NONE\x12"\n\x0emissing_values\x18\x13 \x03(\x0b\x32\n.arks.Pair"\x8f\x01\n\x0b\x44umpReqInfo\x12\x0e\n\x06time_s\x18\x01 \x01(\x05\x12\x0e\n\x06oss_id\x18\x02 \x01(\t\x12\x0f\n\x07oss_key\x18\x03 \x01(\t\x12\x13\n\x0btarget_addr\x18\x04 \x01(\t\x12\x10\n\x08query_id\x18\x05 \x01(\x03\x12\r\n\x05token\x18\x06 \x01(\t\x12\x0b\n\x03\x61pp\x18\x07 \x01(\t\x12\x0c\n\x04host\x18\x08 \x01(\t"\xb3\x04\n\x0b\x41rksRequest\x12\x12\n\x07version\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05\x64\x65\x62ug\x18\x02 \x01(\x05\x12\x0f\n\x07is_ping\x18\x03 \x01(\x08\x12\x12\n\nsession_id\x18\x04 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x05 \x01(\t\x12\x0b\n\x03uid\x18\x06 \x01(\t\x12 \n\x0cuser_profile\x18\x07 \x03(\x0b\x32\n.arks.Pair\x12"\n\x0escene_features\x18\x08 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\t \x03(\x0b\x32\n.arks.Item\x12\x15\n\x07is_sort\x18\n \x01(\x08:\x04true\x12\x11\n\x05\x63ount\x18\x0b \x01(\x05:\x02\x31\x30\x12.\n\nout_format\x18\x0c \x01(\x0e\x32\x16.arks.OutputFormatType:\x02PB\x12\x12\n\nchain_name\x18\r \x01(\t\x12\x0b\n\x03scm\x18\x0e \x01(\t\x12\x12\n\nscene_name\x18\x0f \x01(\t\x12\x14\n\x0citem_schemas\x18\x10 \x03(\t\x12\x18\n\x10sub_item_schemas\x18\x11 \x03(\t\x12"\n\nseek_plans\x18\x12 \x03(\x0b\x32\x0e.arks.SeekPlan\x12(\n\rdump_req_info\x18\x13 \x01(\x0b\x32\x11.arks.DumpReqInfo\x12\x10\n\x08\x61pp_name\x18\x14 \x01(\t\x12\x16\n\x0ereq_timeout_ms\x18\x15 \x01(\x04\x12\x16\n\x0e\x63lient_version\x18\x16 \x01(\t\x12\n\n\x02ip\x18\x17 \x01(\t"\xba\x02\n\x0c\x41rksResponse\x12,\n\nerror_code\x18\x01 \x01(\x0e\x32\x0f.arks.ErrorCode:\x07SUCCESS\x12\x12\n\nsession_id\x18\x02 \x01(\t\x12\x13\n\x0b\x62ucket_name\x18\x03 \x01(\t\x12 \n\x0cuser_profile\x18\x04 \x03(\x0b\x32\n.arks.Pair\x12\x19\n\x05items\x18\x05 \x03(\x0b\x32\n.arks.Item\x12\x11\n\tdebug_msg\x18\x06 \x01(\t\x12\x0b\n\x03scm\x18\x07 \x01(\t\x12"\n\nseek_plans\x18\x08 \x03(\x0b\x32\x0e.arks.SeekPlan\x12\x0f\n\x07\x65rr_msg\x18\t \x01(\t\x12\x10\n\x08\x61lgo_ret\x18\n \x01(\x05\x12\x10\n\x08\x61lgo_msg\x18\x0b \x01(\t\x12\x11\n\ttrace_msg\x18\x0c \x01(\t\x12\n\n\x02rt\x18\r \x01(\x05*T\n\x10OutputFormatType\x12\x06\n\x02PB\x10\x01\x12\x08\n\x04JSON\x10\x02\x12\x08\n\x04TEXT\x10\x03\x12\r\n\tSNAPPY_PB\x10\x04\x12\x06\n\x02\x46\x42\x10\x05\x12\r\n\tSNAPPY_FB\x10\x06*\x86\x01\n\tErrorCode\x12\x0b\n\x07SUCCESS\x10\x00\x12\x0b\n\x07TIMEOUT\x10\x01\x12\r\n\tSCENE_ERR\x10\x02\x12\r\n\tPARAM_ERR\x10\x03\x12\x0e\n\nSYSTEM_ERR\x10\x04\x12\x0f\n\x0bSERVICE_ERR\x10\x05\x12\x10\n\x0c\x46LOW_CONTROL\x10\x06\x12\x0e\n\nOTHERS_ERR\x10\x07*\xae\x01\n\x0b\x43ontentType\x12\x10\n\x0cTYPE_INVALID\x10\x00\x12\r\n\tTYPE_BOOL\x10\x01\x12\x0e\n\nTYPE_INT32\x10\x02\x12\x0e\n\nTYPE_INT64\x10\x03\x12\x0f\n\x0bTYPE_UINT32\x10\x04\x12\x0f\n\x0bTYPE_UINT64\x10\x05\x12\r\n\tTYPE_FP32\x10\x06\x12\r\n\tTYPE_FP64\x10\x07\x12\x0f\n\x0bTYPE_STRING\x10\x08\x12\r\n\tTYPE_BYTE\x10\t*A\n\x10ItemSequenceType\x12\r\n\tTYPE_NONE\x10\x00\x12\x0f\n\x0bTYPE_CONCAT\x10\x01\x12\r\n\tTYPE_FLAT\x10\x02\x42\x17\n\x15\x63om.alipay.arks.proto' + ), ) _OUTPUTFORMATTYPE = _descriptor.EnumDescriptor( - name='OutputFormatType', - full_name='arks.OutputFormatType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='PB', index=0, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='JSON', index=1, number=2, - 
serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TEXT', index=2, number=3, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SNAPPY_PB', index=3, number=4, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FB', index=4, number=5, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SNAPPY_FB', index=5, number=6, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2422, - serialized_end=2506, + name="OutputFormatType", + full_name="arks.OutputFormatType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="PB", index=0, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="JSON", index=1, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TEXT", index=2, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SNAPPY_PB", index=3, number=4, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="FB", index=4, number=5, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SNAPPY_FB", index=5, number=6, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2422, + serialized_end=2506, ) _sym_db.RegisterEnumDescriptor(_OUTPUTFORMATTYPE) OutputFormatType = enum_type_wrapper.EnumTypeWrapper(_OUTPUTFORMATTYPE) _ERRORCODE = _descriptor.EnumDescriptor( - name='ErrorCode', - full_name='arks.ErrorCode', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='SUCCESS', index=0, number=0, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TIMEOUT', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SCENE_ERR', index=2, number=2, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='PARAM_ERR', index=3, number=3, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SYSTEM_ERR', index=4, number=4, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='SERVICE_ERR', index=5, number=5, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='FLOW_CONTROL', index=6, number=6, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='OTHERS_ERR', index=7, number=7, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2509, - serialized_end=2643, + name="ErrorCode", + full_name="arks.ErrorCode", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="SUCCESS", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TIMEOUT", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SCENE_ERR", index=2, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="PARAM_ERR", index=3, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SYSTEM_ERR", index=4, number=4, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="SERVICE_ERR", index=5, number=5, serialized_options=None, type=None + ), + 
_descriptor.EnumValueDescriptor( + name="FLOW_CONTROL", index=6, number=6, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="OTHERS_ERR", index=7, number=7, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2509, + serialized_end=2643, ) _sym_db.RegisterEnumDescriptor(_ERRORCODE) ErrorCode = enum_type_wrapper.EnumTypeWrapper(_ERRORCODE) _CONTENTTYPE = _descriptor.EnumDescriptor( - name='ContentType', - full_name='arks.ContentType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='TYPE_INVALID', index=0, number=0, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_BOOL', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_INT32', index=2, number=2, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_INT64', index=3, number=3, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_UINT32', index=4, number=4, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_UINT64', index=5, number=5, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_FP32', index=6, number=6, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_FP64', index=7, number=7, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_STRING', index=8, number=8, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_BYTE', index=9, number=9, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2646, - serialized_end=2820, + name="ContentType", + full_name="arks.ContentType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="TYPE_INVALID", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_BOOL", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_INT32", index=2, number=2, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_INT64", index=3, number=3, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_UINT32", index=4, number=4, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_UINT64", index=5, number=5, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_FP32", index=6, number=6, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_FP64", index=7, number=7, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_STRING", index=8, number=8, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_BYTE", index=9, number=9, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2646, + serialized_end=2820, ) _sym_db.RegisterEnumDescriptor(_CONTENTTYPE) ContentType = enum_type_wrapper.EnumTypeWrapper(_CONTENTTYPE) _ITEMSEQUENCETYPE = _descriptor.EnumDescriptor( - name='ItemSequenceType', - full_name='arks.ItemSequenceType', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='TYPE_NONE', index=0, number=0, - 
serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_CONCAT', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='TYPE_FLAT', index=2, number=2, - serialized_options=None, - type=None), - ], - containing_type=None, - serialized_options=None, - serialized_start=2822, - serialized_end=2887, + name="ItemSequenceType", + full_name="arks.ItemSequenceType", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="TYPE_NONE", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_CONCAT", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="TYPE_FLAT", index=2, number=2, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2822, + serialized_end=2887, ) _sym_db.RegisterEnumDescriptor(_ITEMSEQUENCETYPE) @@ -224,1044 +199,2131 @@ TYPE_FLAT = 2 - _INFERTENSORCONTENTS = _descriptor.Descriptor( - name='InferTensorContents', - full_name='arks.InferTensorContents', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='string_value', full_name='arks.InferTensorContents.string_value', index=0, - number=1, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='bool_value', full_name='arks.InferTensorContents.bool_value', index=1, - number=2, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='int_value', full_name='arks.InferTensorContents.int_value', index=2, - number=3, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='int64_value', full_name='arks.InferTensorContents.int64_value', index=3, - number=4, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='uint_value', full_name='arks.InferTensorContents.uint_value', index=4, - number=5, type=13, cpp_type=3, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='uint64_value', full_name='arks.InferTensorContents.uint64_value', index=5, - number=6, type=4, cpp_type=4, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='fp32_value', full_name='arks.InferTensorContents.fp32_value', index=6, - number=7, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='fp64_value', full_name='arks.InferTensorContents.fp64_value', index=7, - number=8, type=1, cpp_type=5, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='byte_value', full_name='arks.InferTensorContents.byte_value', index=8, - number=9, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='type', full_name='arks.InferTensorContents.type', index=9, - number=10, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=21, - serialized_end=273, + name="InferTensorContents", + full_name="arks.InferTensorContents", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="string_value", + full_name="arks.InferTensorContents.string_value", + index=0, + number=1, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bool_value", + full_name="arks.InferTensorContents.bool_value", + index=1, + number=2, + type=8, + cpp_type=7, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="int_value", + full_name="arks.InferTensorContents.int_value", + index=2, + number=3, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="int64_value", + full_name="arks.InferTensorContents.int64_value", + index=3, + number=4, + type=3, + cpp_type=2, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="uint_value", + full_name="arks.InferTensorContents.uint_value", + index=4, + number=5, + type=13, + cpp_type=3, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="uint64_value", + full_name="arks.InferTensorContents.uint64_value", + index=5, + number=6, + type=4, + cpp_type=4, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="fp32_value", + full_name="arks.InferTensorContents.fp32_value", + index=6, + number=7, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="fp64_value", + full_name="arks.InferTensorContents.fp64_value", + index=7, + number=8, + type=1, + cpp_type=5, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="byte_value", + full_name="arks.InferTensorContents.byte_value", + index=8, + number=9, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="type", + full_name="arks.InferTensorContents.type", + index=9, + number=10, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=21, + serialized_end=273, ) _PAIR = _descriptor.Descriptor( - name='Pair', - full_name='arks.Pair', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='arks.Pair.key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='arks.Pair.value', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='contents', full_name='arks.Pair.contents', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='pb_value', full_name='arks.Pair.pb_value', index=3, - number=4, type=12, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='shapes', full_name='arks.Pair.shapes', index=4, - number=5, type=5, cpp_type=1, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=275, - serialized_end=388, + name="Pair", + full_name="arks.Pair", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="key", + full_name="arks.Pair.key", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="arks.Pair.value", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="contents", + full_name="arks.Pair.contents", + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="pb_value", + full_name="arks.Pair.pb_value", + index=3, + number=4, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="shapes", + full_name="arks.Pair.shapes", + index=4, + number=5, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=275, + serialized_end=388, ) _ROWKEY = _descriptor.Descriptor( - name='RowKey', - full_name='arks.RowKey', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='row_key', full_name='arks.RowKey.row_key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='versions', full_name='arks.RowKey.versions', index=1, - number=2, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='ant_fea_track_info', full_name='arks.RowKey.ant_fea_track_info', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, 
file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='partitions', full_name='arks.RowKey.partitions', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='realtime_features', full_name='arks.RowKey.realtime_features', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=391, - serialized_end=542, + name="RowKey", + full_name="arks.RowKey", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="row_key", + full_name="arks.RowKey.row_key", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="versions", + full_name="arks.RowKey.versions", + index=1, + number=2, + type=3, + cpp_type=2, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="ant_fea_track_info", + full_name="arks.RowKey.ant_fea_track_info", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="partitions", + full_name="arks.RowKey.partitions", + index=3, + number=4, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="realtime_features", + full_name="arks.RowKey.realtime_features", + index=4, + number=5, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=391, + serialized_end=542, ) _PARTITIONINFO = _descriptor.Descriptor( - name='PartitionInfo', - full_name='arks.PartitionInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='arks.PartitionInfo.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - 
serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='arks.PartitionInfo.value', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=544, - serialized_end=588, + name="PartitionInfo", + full_name="arks.PartitionInfo", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="name", + full_name="arks.PartitionInfo.name", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="arks.PartitionInfo.value", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=544, + serialized_end=588, ) _ITEM = _descriptor.Descriptor( - name='Item', - full_name='arks.Item', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='item_id', full_name='arks.Item.item_id', index=0, - number=1, type=9, cpp_type=9, label=2, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='features', full_name='arks.Item.features', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='attributes', full_name='arks.Item.attributes', index=2, - number=3, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='score', full_name='arks.Item.score', index=3, - number=4, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='sub_items', full_name='arks.Item.sub_items', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_features_valid', 
full_name='arks.Item.is_features_valid', index=5, - number=6, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scores', full_name='arks.Item.scores', index=6, - number=7, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=591, - serialized_end=776, + name="Item", + full_name="arks.Item", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="item_id", + full_name="arks.Item.item_id", + index=0, + number=1, + type=9, + cpp_type=9, + label=2, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="features", + full_name="arks.Item.features", + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="attributes", + full_name="arks.Item.attributes", + index=2, + number=3, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="score", + full_name="arks.Item.score", + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="sub_items", + full_name="arks.Item.sub_items", + index=4, + number=5, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_features_valid", + full_name="arks.Item.is_features_valid", + index=5, + number=6, + type=8, + cpp_type=7, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scores", + full_name="arks.Item.scores", + index=6, + number=7, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + 
is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=591, + serialized_end=776, ) _SUBITEM = _descriptor.Descriptor( - name='SubItem', - full_name='arks.SubItem', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='item_id', full_name='arks.SubItem.item_id', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='features', full_name='arks.SubItem.features', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='score', full_name='arks.SubItem.score', index=2, - number=3, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_features_valid', full_name='arks.SubItem.is_features_valid', index=3, - number=4, type=8, cpp_type=7, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scores', full_name='arks.SubItem.scores', index=4, - number=5, type=2, cpp_type=6, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=_b('\020\001'), file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='attributes', full_name='arks.SubItem.attributes', index=5, - number=6, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=779, - serialized_end=933, + name="SubItem", + full_name="arks.SubItem", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="item_id", + full_name="arks.SubItem.item_id", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="features", + full_name="arks.SubItem.features", + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="score", + full_name="arks.SubItem.score", + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), 
+ message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_features_valid", + full_name="arks.SubItem.is_features_valid", + index=3, + number=4, + type=8, + cpp_type=7, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scores", + full_name="arks.SubItem.scores", + index=4, + number=5, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=_b("\020\001"), + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="attributes", + full_name="arks.SubItem.attributes", + index=5, + number=6, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=779, + serialized_end=933, ) _SEEKPLAN = _descriptor.Descriptor( - name='SeekPlan', - full_name='arks.SeekPlan', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='storage_type', full_name='arks.SeekPlan.storage_type', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='table', full_name='arks.SeekPlan.table', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='column_family', full_name='arks.SeekPlan.column_family', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='columns', full_name='arks.SeekPlan.columns', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='kvpair_sep', full_name='arks.SeekPlan.kvpair_sep', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='kv_sep', full_name='arks.SeekPlan.kv_sep', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, 
default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='cluster', full_name='arks.SeekPlan.cluster', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='row_keys', full_name='arks.SeekPlan.row_keys', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='timeout_ms', full_name='arks.SeekPlan.timeout_ms', index=8, - number=9, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='cache_expire_second', full_name='arks.SeekPlan.cache_expire_second', index=9, - number=10, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='url_user', full_name='arks.SeekPlan.url_user', index=10, - number=11, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='url_item', full_name='arks.SeekPlan.url_item', index=11, - number=12, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='ant_feature_req', full_name='arks.SeekPlan.ant_feature_req', index=12, - number=13, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='id', full_name='arks.SeekPlan.id', index=13, - number=14, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='kb_feature_req', full_name='arks.SeekPlan.kb_feature_req', index=14, - number=15, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=_b(""), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='debuginfo', full_name='arks.SeekPlan.debuginfo', index=15, - number=16, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='separator', full_name='arks.SeekPlan.separator', index=16, - number=17, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='item_sequence_type', full_name='arks.SeekPlan.item_sequence_type', index=17, - number=18, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='missing_values', full_name='arks.SeekPlan.missing_values', index=18, - number=19, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=936, - serialized_end=1391, + name="SeekPlan", + full_name="arks.SeekPlan", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="storage_type", + full_name="arks.SeekPlan.storage_type", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="table", + full_name="arks.SeekPlan.table", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="column_family", + full_name="arks.SeekPlan.column_family", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="columns", + full_name="arks.SeekPlan.columns", + index=3, + number=4, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="kvpair_sep", + full_name="arks.SeekPlan.kvpair_sep", + index=4, + number=5, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="kv_sep", + full_name="arks.SeekPlan.kv_sep", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + 
message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="cluster", + full_name="arks.SeekPlan.cluster", + index=6, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="row_keys", + full_name="arks.SeekPlan.row_keys", + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="timeout_ms", + full_name="arks.SeekPlan.timeout_ms", + index=8, + number=9, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="cache_expire_second", + full_name="arks.SeekPlan.cache_expire_second", + index=9, + number=10, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="url_user", + full_name="arks.SeekPlan.url_user", + index=10, + number=11, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="url_item", + full_name="arks.SeekPlan.url_item", + index=11, + number=12, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="ant_feature_req", + full_name="arks.SeekPlan.ant_feature_req", + index=12, + number=13, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="id", + full_name="arks.SeekPlan.id", + index=13, + number=14, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="kb_feature_req", + full_name="arks.SeekPlan.kb_feature_req", + index=14, + number=15, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="debuginfo", + 
full_name="arks.SeekPlan.debuginfo", + index=15, + number=16, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="separator", + full_name="arks.SeekPlan.separator", + index=16, + number=17, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="item_sequence_type", + full_name="arks.SeekPlan.item_sequence_type", + index=17, + number=18, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="missing_values", + full_name="arks.SeekPlan.missing_values", + index=18, + number=19, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=936, + serialized_end=1391, ) _DUMPREQINFO = _descriptor.Descriptor( - name='DumpReqInfo', - full_name='arks.DumpReqInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='time_s', full_name='arks.DumpReqInfo.time_s', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='oss_id', full_name='arks.DumpReqInfo.oss_id', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='oss_key', full_name='arks.DumpReqInfo.oss_key', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='target_addr', full_name='arks.DumpReqInfo.target_addr', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='query_id', full_name='arks.DumpReqInfo.query_id', index=4, - number=5, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - 
_descriptor.FieldDescriptor( - name='token', full_name='arks.DumpReqInfo.token', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='app', full_name='arks.DumpReqInfo.app', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='host', full_name='arks.DumpReqInfo.host', index=7, - number=8, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1394, - serialized_end=1537, + name="DumpReqInfo", + full_name="arks.DumpReqInfo", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="time_s", + full_name="arks.DumpReqInfo.time_s", + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="oss_id", + full_name="arks.DumpReqInfo.oss_id", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="oss_key", + full_name="arks.DumpReqInfo.oss_key", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="target_addr", + full_name="arks.DumpReqInfo.target_addr", + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="query_id", + full_name="arks.DumpReqInfo.query_id", + index=4, + number=5, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="token", + full_name="arks.DumpReqInfo.token", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, 
+ file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="app", + full_name="arks.DumpReqInfo.app", + index=6, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="host", + full_name="arks.DumpReqInfo.host", + index=7, + number=8, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=1394, + serialized_end=1537, ) _ARKSREQUEST = _descriptor.Descriptor( - name='ArksRequest', - full_name='arks.ArksRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='version', full_name='arks.ArksRequest.version', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='debug', full_name='arks.ArksRequest.debug', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_ping', full_name='arks.ArksRequest.is_ping', index=2, - number=3, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='session_id', full_name='arks.ArksRequest.session_id', index=3, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='bucket_name', full_name='arks.ArksRequest.bucket_name', index=4, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='uid', full_name='arks.ArksRequest.uid', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='user_profile', full_name='arks.ArksRequest.user_profile', index=6, - number=7, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - 
name='scene_features', full_name='arks.ArksRequest.scene_features', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='items', full_name='arks.ArksRequest.items', index=8, - number=9, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='is_sort', full_name='arks.ArksRequest.is_sort', index=9, - number=10, type=8, cpp_type=7, label=1, - has_default_value=True, default_value=True, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='count', full_name='arks.ArksRequest.count', index=10, - number=11, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=10, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='out_format', full_name='arks.ArksRequest.out_format', index=11, - number=12, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=1, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='chain_name', full_name='arks.ArksRequest.chain_name', index=12, - number=13, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scm', full_name='arks.ArksRequest.scm', index=13, - number=14, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scene_name', full_name='arks.ArksRequest.scene_name', index=14, - number=15, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='item_schemas', full_name='arks.ArksRequest.item_schemas', index=15, - number=16, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='sub_item_schemas', full_name='arks.ArksRequest.sub_item_schemas', index=16, - number=17, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='seek_plans', full_name='arks.ArksRequest.seek_plans', index=17, - number=18, type=11, cpp_type=10, label=3, - 
has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='dump_req_info', full_name='arks.ArksRequest.dump_req_info', index=18, - number=19, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='app_name', full_name='arks.ArksRequest.app_name', index=19, - number=20, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='req_timeout_ms', full_name='arks.ArksRequest.req_timeout_ms', index=20, - number=21, type=4, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='client_version', full_name='arks.ArksRequest.client_version', index=21, - number=22, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='ip', full_name='arks.ArksRequest.ip', index=22, - number=23, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1540, - serialized_end=2103, + name="ArksRequest", + full_name="arks.ArksRequest", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="version", + full_name="arks.ArksRequest.version", + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="debug", + full_name="arks.ArksRequest.debug", + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_ping", + full_name="arks.ArksRequest.is_ping", + index=2, + number=3, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="session_id", + full_name="arks.ArksRequest.session_id", + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + 
default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bucket_name", + full_name="arks.ArksRequest.bucket_name", + index=4, + number=5, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="uid", + full_name="arks.ArksRequest.uid", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="user_profile", + full_name="arks.ArksRequest.user_profile", + index=6, + number=7, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scene_features", + full_name="arks.ArksRequest.scene_features", + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="items", + full_name="arks.ArksRequest.items", + index=8, + number=9, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="is_sort", + full_name="arks.ArksRequest.is_sort", + index=9, + number=10, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="count", + full_name="arks.ArksRequest.count", + index=10, + number=11, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=10, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="out_format", + full_name="arks.ArksRequest.out_format", + index=11, + number=12, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="chain_name", + full_name="arks.ArksRequest.chain_name", + index=12, + number=13, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scm", + 
full_name="arks.ArksRequest.scm", + index=13, + number=14, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scene_name", + full_name="arks.ArksRequest.scene_name", + index=14, + number=15, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="item_schemas", + full_name="arks.ArksRequest.item_schemas", + index=15, + number=16, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="sub_item_schemas", + full_name="arks.ArksRequest.sub_item_schemas", + index=16, + number=17, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="seek_plans", + full_name="arks.ArksRequest.seek_plans", + index=17, + number=18, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="dump_req_info", + full_name="arks.ArksRequest.dump_req_info", + index=18, + number=19, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="app_name", + full_name="arks.ArksRequest.app_name", + index=19, + number=20, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="req_timeout_ms", + full_name="arks.ArksRequest.req_timeout_ms", + index=20, + number=21, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="client_version", + full_name="arks.ArksRequest.client_version", + index=21, + number=22, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="ip", + full_name="arks.ArksRequest.ip", + index=22, + number=23, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=1540, + serialized_end=2103, ) _ARKSRESPONSE = _descriptor.Descriptor( - name='ArksResponse', - full_name='arks.ArksResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='error_code', full_name='arks.ArksResponse.error_code', index=0, - number=1, type=14, cpp_type=8, label=1, - has_default_value=True, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='session_id', full_name='arks.ArksResponse.session_id', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='bucket_name', full_name='arks.ArksResponse.bucket_name', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='user_profile', full_name='arks.ArksResponse.user_profile', index=3, - number=4, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='items', full_name='arks.ArksResponse.items', index=4, - number=5, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='debug_msg', full_name='arks.ArksResponse.debug_msg', index=5, - number=6, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='scm', full_name='arks.ArksResponse.scm', index=6, - number=7, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='seek_plans', full_name='arks.ArksResponse.seek_plans', index=7, - number=8, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='err_msg', full_name='arks.ArksResponse.err_msg', index=8, - number=9, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - 
is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='algo_ret', full_name='arks.ArksResponse.algo_ret', index=9, - number=10, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='algo_msg', full_name='arks.ArksResponse.algo_msg', index=10, - number=11, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='trace_msg', full_name='arks.ArksResponse.trace_msg', index=11, - number=12, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='rt', full_name='arks.ArksResponse.rt', index=12, - number=13, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=2106, - serialized_end=2420, + name="ArksResponse", + full_name="arks.ArksResponse", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="error_code", + full_name="arks.ArksResponse.error_code", + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="session_id", + full_name="arks.ArksResponse.session_id", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="bucket_name", + full_name="arks.ArksResponse.bucket_name", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="user_profile", + full_name="arks.ArksResponse.user_profile", + index=3, + number=4, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="items", + full_name="arks.ArksResponse.items", + index=4, + number=5, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + 
is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="debug_msg", + full_name="arks.ArksResponse.debug_msg", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="scm", + full_name="arks.ArksResponse.scm", + index=6, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="seek_plans", + full_name="arks.ArksResponse.seek_plans", + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="err_msg", + full_name="arks.ArksResponse.err_msg", + index=8, + number=9, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="algo_ret", + full_name="arks.ArksResponse.algo_ret", + index=9, + number=10, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="algo_msg", + full_name="arks.ArksResponse.algo_msg", + index=10, + number=11, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="trace_msg", + full_name="arks.ArksResponse.trace_msg", + index=11, + number=12, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="rt", + full_name="arks.ArksResponse.rt", + index=12, + number=13, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=2106, + serialized_end=2420, ) -_INFERTENSORCONTENTS.fields_by_name['type'].enum_type = _CONTENTTYPE -_PAIR.fields_by_name['contents'].message_type = _INFERTENSORCONTENTS -_ROWKEY.fields_by_name['partitions'].message_type = _PARTITIONINFO -_ROWKEY.fields_by_name['realtime_features'].message_type = _PAIR 
-_ITEM.fields_by_name['features'].message_type = _PAIR -_ITEM.fields_by_name['attributes'].message_type = _PAIR -_ITEM.fields_by_name['sub_items'].message_type = _SUBITEM -_SUBITEM.fields_by_name['features'].message_type = _PAIR -_SUBITEM.fields_by_name['attributes'].message_type = _PAIR -_SEEKPLAN.fields_by_name['row_keys'].message_type = _ROWKEY -_SEEKPLAN.fields_by_name['item_sequence_type'].enum_type = _ITEMSEQUENCETYPE -_SEEKPLAN.fields_by_name['missing_values'].message_type = _PAIR -_ARKSREQUEST.fields_by_name['user_profile'].message_type = _PAIR -_ARKSREQUEST.fields_by_name['scene_features'].message_type = _PAIR -_ARKSREQUEST.fields_by_name['items'].message_type = _ITEM -_ARKSREQUEST.fields_by_name['out_format'].enum_type = _OUTPUTFORMATTYPE -_ARKSREQUEST.fields_by_name['seek_plans'].message_type = _SEEKPLAN -_ARKSREQUEST.fields_by_name['dump_req_info'].message_type = _DUMPREQINFO -_ARKSRESPONSE.fields_by_name['error_code'].enum_type = _ERRORCODE -_ARKSRESPONSE.fields_by_name['user_profile'].message_type = _PAIR -_ARKSRESPONSE.fields_by_name['items'].message_type = _ITEM -_ARKSRESPONSE.fields_by_name['seek_plans'].message_type = _SEEKPLAN -DESCRIPTOR.message_types_by_name['InferTensorContents'] = _INFERTENSORCONTENTS -DESCRIPTOR.message_types_by_name['Pair'] = _PAIR -DESCRIPTOR.message_types_by_name['RowKey'] = _ROWKEY -DESCRIPTOR.message_types_by_name['PartitionInfo'] = _PARTITIONINFO -DESCRIPTOR.message_types_by_name['Item'] = _ITEM -DESCRIPTOR.message_types_by_name['SubItem'] = _SUBITEM -DESCRIPTOR.message_types_by_name['SeekPlan'] = _SEEKPLAN -DESCRIPTOR.message_types_by_name['DumpReqInfo'] = _DUMPREQINFO -DESCRIPTOR.message_types_by_name['ArksRequest'] = _ARKSREQUEST -DESCRIPTOR.message_types_by_name['ArksResponse'] = _ARKSRESPONSE -DESCRIPTOR.enum_types_by_name['OutputFormatType'] = _OUTPUTFORMATTYPE -DESCRIPTOR.enum_types_by_name['ErrorCode'] = _ERRORCODE -DESCRIPTOR.enum_types_by_name['ContentType'] = _CONTENTTYPE -DESCRIPTOR.enum_types_by_name['ItemSequenceType'] = _ITEMSEQUENCETYPE +_INFERTENSORCONTENTS.fields_by_name["type"].enum_type = _CONTENTTYPE +_PAIR.fields_by_name["contents"].message_type = _INFERTENSORCONTENTS +_ROWKEY.fields_by_name["partitions"].message_type = _PARTITIONINFO +_ROWKEY.fields_by_name["realtime_features"].message_type = _PAIR +_ITEM.fields_by_name["features"].message_type = _PAIR +_ITEM.fields_by_name["attributes"].message_type = _PAIR +_ITEM.fields_by_name["sub_items"].message_type = _SUBITEM +_SUBITEM.fields_by_name["features"].message_type = _PAIR +_SUBITEM.fields_by_name["attributes"].message_type = _PAIR +_SEEKPLAN.fields_by_name["row_keys"].message_type = _ROWKEY +_SEEKPLAN.fields_by_name["item_sequence_type"].enum_type = _ITEMSEQUENCETYPE +_SEEKPLAN.fields_by_name["missing_values"].message_type = _PAIR +_ARKSREQUEST.fields_by_name["user_profile"].message_type = _PAIR +_ARKSREQUEST.fields_by_name["scene_features"].message_type = _PAIR +_ARKSREQUEST.fields_by_name["items"].message_type = _ITEM +_ARKSREQUEST.fields_by_name["out_format"].enum_type = _OUTPUTFORMATTYPE +_ARKSREQUEST.fields_by_name["seek_plans"].message_type = _SEEKPLAN +_ARKSREQUEST.fields_by_name["dump_req_info"].message_type = _DUMPREQINFO +_ARKSRESPONSE.fields_by_name["error_code"].enum_type = _ERRORCODE +_ARKSRESPONSE.fields_by_name["user_profile"].message_type = _PAIR +_ARKSRESPONSE.fields_by_name["items"].message_type = _ITEM +_ARKSRESPONSE.fields_by_name["seek_plans"].message_type = _SEEKPLAN +DESCRIPTOR.message_types_by_name["InferTensorContents"] = _INFERTENSORCONTENTS 
+DESCRIPTOR.message_types_by_name["Pair"] = _PAIR +DESCRIPTOR.message_types_by_name["RowKey"] = _ROWKEY +DESCRIPTOR.message_types_by_name["PartitionInfo"] = _PARTITIONINFO +DESCRIPTOR.message_types_by_name["Item"] = _ITEM +DESCRIPTOR.message_types_by_name["SubItem"] = _SUBITEM +DESCRIPTOR.message_types_by_name["SeekPlan"] = _SEEKPLAN +DESCRIPTOR.message_types_by_name["DumpReqInfo"] = _DUMPREQINFO +DESCRIPTOR.message_types_by_name["ArksRequest"] = _ARKSREQUEST +DESCRIPTOR.message_types_by_name["ArksResponse"] = _ARKSRESPONSE +DESCRIPTOR.enum_types_by_name["OutputFormatType"] = _OUTPUTFORMATTYPE +DESCRIPTOR.enum_types_by_name["ErrorCode"] = _ERRORCODE +DESCRIPTOR.enum_types_by_name["ContentType"] = _CONTENTTYPE +DESCRIPTOR.enum_types_by_name["ItemSequenceType"] = _ITEMSEQUENCETYPE _sym_db.RegisterFileDescriptor(DESCRIPTOR) -InferTensorContents = _reflection.GeneratedProtocolMessageType('InferTensorContents', (_message.Message,), dict( - DESCRIPTOR = _INFERTENSORCONTENTS, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.InferTensorContents) - )) +InferTensorContents = _reflection.GeneratedProtocolMessageType( + "InferTensorContents", + (_message.Message,), + dict( + DESCRIPTOR=_INFERTENSORCONTENTS, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.InferTensorContents) + ), +) _sym_db.RegisterMessage(InferTensorContents) -Pair = _reflection.GeneratedProtocolMessageType('Pair', (_message.Message,), dict( - DESCRIPTOR = _PAIR, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.Pair) - )) +Pair = _reflection.GeneratedProtocolMessageType( + "Pair", + (_message.Message,), + dict( + DESCRIPTOR=_PAIR, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.Pair) + ), +) _sym_db.RegisterMessage(Pair) -RowKey = _reflection.GeneratedProtocolMessageType('RowKey', (_message.Message,), dict( - DESCRIPTOR = _ROWKEY, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.RowKey) - )) +RowKey = _reflection.GeneratedProtocolMessageType( + "RowKey", + (_message.Message,), + dict( + DESCRIPTOR=_ROWKEY, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.RowKey) + ), +) _sym_db.RegisterMessage(RowKey) -PartitionInfo = _reflection.GeneratedProtocolMessageType('PartitionInfo', (_message.Message,), dict( - DESCRIPTOR = _PARTITIONINFO, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.PartitionInfo) - )) +PartitionInfo = _reflection.GeneratedProtocolMessageType( + "PartitionInfo", + (_message.Message,), + dict( + DESCRIPTOR=_PARTITIONINFO, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.PartitionInfo) + ), +) _sym_db.RegisterMessage(PartitionInfo) -Item = _reflection.GeneratedProtocolMessageType('Item', (_message.Message,), dict( - DESCRIPTOR = _ITEM, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.Item) - )) +Item = _reflection.GeneratedProtocolMessageType( + "Item", + (_message.Message,), + dict( + DESCRIPTOR=_ITEM, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.Item) + ), +) _sym_db.RegisterMessage(Item) -SubItem = _reflection.GeneratedProtocolMessageType('SubItem', (_message.Message,), dict( - DESCRIPTOR = _SUBITEM, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.SubItem) - )) +SubItem = _reflection.GeneratedProtocolMessageType( + "SubItem", + (_message.Message,), + dict( + DESCRIPTOR=_SUBITEM, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.SubItem) + ), +) 
_sym_db.RegisterMessage(SubItem) -SeekPlan = _reflection.GeneratedProtocolMessageType('SeekPlan', (_message.Message,), dict( - DESCRIPTOR = _SEEKPLAN, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.SeekPlan) - )) +SeekPlan = _reflection.GeneratedProtocolMessageType( + "SeekPlan", + (_message.Message,), + dict( + DESCRIPTOR=_SEEKPLAN, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.SeekPlan) + ), +) _sym_db.RegisterMessage(SeekPlan) -DumpReqInfo = _reflection.GeneratedProtocolMessageType('DumpReqInfo', (_message.Message,), dict( - DESCRIPTOR = _DUMPREQINFO, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.DumpReqInfo) - )) +DumpReqInfo = _reflection.GeneratedProtocolMessageType( + "DumpReqInfo", + (_message.Message,), + dict( + DESCRIPTOR=_DUMPREQINFO, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.DumpReqInfo) + ), +) _sym_db.RegisterMessage(DumpReqInfo) -ArksRequest = _reflection.GeneratedProtocolMessageType('ArksRequest', (_message.Message,), dict( - DESCRIPTOR = _ARKSREQUEST, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.ArksRequest) - )) +ArksRequest = _reflection.GeneratedProtocolMessageType( + "ArksRequest", + (_message.Message,), + dict( + DESCRIPTOR=_ARKSREQUEST, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.ArksRequest) + ), +) _sym_db.RegisterMessage(ArksRequest) -ArksResponse = _reflection.GeneratedProtocolMessageType('ArksResponse', (_message.Message,), dict( - DESCRIPTOR = _ARKSRESPONSE, - __module__ = 'arks_pb2' - # @@protoc_insertion_point(class_scope:arks.ArksResponse) - )) +ArksResponse = _reflection.GeneratedProtocolMessageType( + "ArksResponse", + (_message.Message,), + dict( + DESCRIPTOR=_ARKSRESPONSE, + __module__="arks_pb2" + # @@protoc_insertion_point(class_scope:arks.ArksResponse) + ), +) _sym_db.RegisterMessage(ArksResponse) DESCRIPTOR._options = None -_ITEM.fields_by_name['is_features_valid']._options = None -_ITEM.fields_by_name['scores']._options = None -_SUBITEM.fields_by_name['is_features_valid']._options = None -_SUBITEM.fields_by_name['scores']._options = None +_ITEM.fields_by_name["is_features_valid"]._options = None +_ITEM.fields_by_name["scores"]._options = None +_SUBITEM.fields_by_name["is_features_valid"]._options = None +_SUBITEM.fields_by_name["scores"]._options = None # @@protoc_insertion_point(module_scope) diff --git a/kag/common/base/prompt_op.py b/kag/common/base/prompt_op.py deleted file mode 100644 index 057e35bf..00000000 --- a/kag/common/base/prompt_op.py +++ /dev/null @@ -1,184 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import importlib -import inspect -import os -import sys -from abc import ABC -from string import Template -from typing import List - - -BUILDER_PROMPT_PATH = "kag.builder.prompt" -SOLVER_PROMPT_PATH = "kag.solver.prompt" - - -class PromptOp(ABC): - """ - Provides a template for generating and parsing prompts related to specific business scenes. 
- - Subclasses must implement the template strings for specific languages (English or Chinese) - and override the `template_variables` and `parse_response` methods. - """ - - """English template string""" - template_en: str = "" - """Chinese template string""" - template_zh: str = "" - - def __init__(self, language: str, **kwargs): - """ - Initializes the PromptOp instance with the selected language. - - Args: - language (str): The language for the prompt, should be either "en" or "zh". - - Raises: - AssertionError: If the provided language is not supported. - """ - - assert language in ["en", "zh"], f"language[{language}] is not supported." - self.template = self.template_en if language == "en" else self.template_zh - self.language = language - self.template_variables_value = {} - if "project_id" in kwargs: - self.project_id = kwargs["project_id"] - - @property - def template_variables(self) -> List[str]: - """ - Gets the list of template variables. - - Must be implemented by subclasses. - - Returns: - - List[str]: A list of template variable names. - - Raises: - - NotImplementedError: If the subclass does not implement this method. - """ - - raise NotImplementedError( - f"{self.__class__.__name__} need to implement `template_variables` method." - ) - - def process_template_string_to_avoid_dollar_problem(self, template_string): - new_template_str = template_string.replace('$', '$$') - for var in self.template_variables: - new_template_str = new_template_str.replace(f'$${var}', f'${var}') - return new_template_str - - def build_prompt(self, variables) -> str: - """ - Build a prompt based on the template and provided variables. - - This method replaces placeholders in the template with actual variable values. - If a variable is not provided, it defaults to an empty string. - - Parameters: - - variables: A dictionary containing variable names and their corresponding values. - - Returns: - - A string or list of strings, depending on the template content. - """ - - self.template_variables_value = variables - template_string = self.process_template_string_to_avoid_dollar_problem(self.template) - template = Template(template_string) - return template.substitute(**variables) - - def parse_response(self, response: str, **kwargs): - """ - Parses the response string. - - Must be implemented by subclasses. - - Parameters: - - response (str): The response string to be parsed. - - Raises: - - NotImplementedError: If the subclass does not implement this method. - """ - - raise NotImplementedError( - f"{self.__class__.__name__} need to implement `parse_response` method." - ) - - @classmethod - def load(cls, biz_scene: str, type: str): - """ - Dynamically loads the corresponding PromptOp subclass object based on the business scene and type. - - Parameters: - - biz_scene (str): The name of the business scene. - - type (str): The type of prompt. - - Returns: - - subclass of PromptOp: The loaded PromptOp subclass object. - - Raises: - - ImportError: If the specified module or class does not exist. 
- """ - dir_paths = [ - os.path.join(os.getenv("KAG_PROJECT_ROOT_PATH", ""), "builder", "prompt"), - os.path.join(os.getenv("KAG_PROJECT_ROOT_PATH", ""), "solver", "prompt"), - ] - module_paths = [ - '.'.join([BUILDER_PROMPT_PATH, biz_scene, type]), - '.'.join([SOLVER_PROMPT_PATH, biz_scene, type]), - '.'.join([BUILDER_PROMPT_PATH, 'default', type]), - '.'.join([SOLVER_PROMPT_PATH, 'default', type]), - ] - - def find_class_from_dir(dir, type): - sys.path.append(dir) - - for root, dirs, files in os.walk(dir): - for file in files: - if file.endswith(".py") and file.startswith(f"{type}."): - module_name = file[:-3] - try: - module = importlib.import_module(module_name) - except ImportError: - continue - cls_found = find_class_from_module(module) - if cls_found: - return cls_found - return None - - def find_class_from_module(module): - classes = inspect.getmembers(module, inspect.isclass) - for class_name, class_obj in classes: - import kag - if issubclass(class_obj, kag.common.base.prompt_op.PromptOp) and inspect.getmodule(class_obj) == module: - return class_obj - return None - - for dir_path in dir_paths: - try: - cls_found = find_class_from_dir(dir_path, type) - if cls_found: - return cls_found - except ImportError: - continue - - for module_path in module_paths: - try: - module = importlib.import_module(module_path) - cls_found = find_class_from_module(module) - if cls_found: - return cls_found - except ModuleNotFoundError: - continue - - raise ValueError(f'Not support prompt with biz_scene[{biz_scene}] and type[{type}]') diff --git a/kag/common/benchmarks/evaUtils.py b/kag/common/benchmarks/evaUtils.py index f443e8a0..3543f74f 100644 --- a/kag/common/benchmarks/evaUtils.py +++ b/kag/common/benchmarks/evaUtils.py @@ -1,5 +1,7 @@ import re +import json import string +import traceback from collections import Counter @@ -17,15 +19,16 @@ def normalize_answer(s): Returns: str: The standardized answer string. """ + def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) + return re.sub(r"\b(a|an|the)\b", " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return str(text).lower() @@ -52,10 +55,16 @@ def f1_score(prediction, ground_truth): ZERO_METRIC = (0, 0, 0) - if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: + if ( + normalized_prediction in ["yes", "no", "noanswer"] + and normalized_prediction != normalized_ground_truth + ): return ZERO_METRIC - if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: + if ( + normalized_ground_truth in ["yes", "no", "noanswer"] + and normalized_prediction != normalized_ground_truth + ): return ZERO_METRIC prediction_tokens = normalized_prediction.split() @@ -78,35 +87,156 @@ def f1_score(prediction, ground_truth): def exact_match_score(prediction, ground_truth): """ Calculates the exact match score between a predicted answer and the ground truth answer. - + This function normalizes both the predicted answer and the ground truth answer before comparing them. Normalization is performed to ensure that non-essential differences such as spaces and case are ignored. - + Parameters: prediction (str): The predicted answer string. ground_truth (str): The ground truth answer string. 
-    
+
     Returns:
     int: 1 if the predicted answer exactly matches the ground truth answer, otherwise 0.
     """
     return 1 if normalize_answer(prediction) == normalize_answer(ground_truth) else 0
 
+
 def get_em_f1(prediction, gold):
     """
     Calculates the Exact Match (EM) score and F1 score between the prediction and the gold standard.
-    
+
     This function evaluates the performance of a model in text similarity tasks
     by calculating the EM score and F1 score to measure the accuracy of the predictions.
-    
+
     Parameters:
     prediction (str): The output predicted by the model.
     gold (str): The gold standard output (i.e., the correct output).
-    
+
     Returns:
     tuple: A tuple containing two floats, the EM score and the F1 score.
            The EM score represents the exact match accuracy, while the F1 score is a combination of precision and recall.
     """
     em = exact_match_score(prediction, gold)
     f1, precision, recall = f1_score(prediction, gold)
-    
-    return float(em), f1
\ No newline at end of file
+
+    return float(em), f1
+
+
+def compare_summarization_answers(
+    query,
+    answer1,
+    answer2,
+    *,
+    api_key="EMPTY",
+    base_url="http://127.0.0.1:38080/v1",
+    model="gpt-4o-mini",
+    language="English",
+    retries=3,
+):
+    """
+    Given a query and two answers, compare the answers with an LLM for Comprehensiveness, Diversity and Empowerment.
+
+    This function is adapted from LightRAG for evaluating GraphRAG and LightRAG in QFS (query-focused summarization)
+    tasks:
+
+      https://github.com/HKUDS/LightRAG/blob/45cea6e/examples/batch_eval.py
+
+    Parameters:
+    query (str): The query input to the LLMs.
+    answer1 (str): Answer generated by an LLM.
+    answer2 (str): Answer generated by another LLM.
+    api_key (str): API key to use when invoking the evaluating LLM.
+    base_url (str): Base URL to use when invoking the evaluating LLM.
+    model (str): Model name to use when invoking the evaluating LLM.
+    language (str): Language of the explanations.
+    retries (int): Number of retries.
+
+    Returns:
+    dict: The parsed evaluation metrics generated by the evaluating LLM, or None if all retries fail.
+    """
+    from openai import OpenAI
+
+    sys_prompt = """
+    ---Role---
+    You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
+    """
+    prompt = f"""
+    You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
+
+    - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
+    - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
+    - **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?
+
+    For each criterion, give each answer a score between 0 and 10, choose the better answer (either Answer 1 or Answer 2) and explain why.
+    Then, give each answer an overall score between 0 and 10, and select an overall winner based on these three categories.
+
+    Here is the question:
+    {query}
+
+    Here are the two answers:
+
+    **Answer 1:**
+    {answer1}
+
+    **Answer 2:**
+    {answer2}
+
+    Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.
+ + Output your evaluation in the following JSON format: + + {{ + "Comprehensiveness": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation in {language} here]" + }}, + "Diversity": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation in {language} here]" + }}, + "Empowerment": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Provide explanation in {language} here]" + }}, + "Overall": {{ + "Score 1": [Score of Answer 1 - an integer between 0 and 10], + "Score 2": [Score of Answer 2 - an integer between 0 and 10], + "Winner": "[Answer 1 or Answer 2]", + "Explanation": "[Summarize why this answer is the overall winner based on the three criteria in {language}]" + }} + }} + """ + for index in range(retries): + content = None + try: + client = OpenAI(api_key=api_key, base_url=base_url) + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": sys_prompt}, + {"role": "user", "content": prompt}, + ], + ) + content = response.choices[0].message.content + if content.startswith("```json") and content.endswith("```"): + content = content[7:-3] + metrics = json.loads(content) + return metrics + except Exception: + if index == retries - 1: + message = ( + f"Comparing summarization answers failed.\n" + f"query: {query}\n" + f"answer1: {answer1}\n" + f"answer2: {answer2}\n" + f"content: {content}\n" + f"exception:\n{traceback.format_exc()}" + ) + print(message) + return None diff --git a/kag/common/benchmarks/evaluate.py b/kag/common/benchmarks/evaluate.py index 4b920f93..1a574627 100644 --- a/kag/common/benchmarks/evaluate.py +++ b/kag/common/benchmarks/evaluate.py @@ -1,22 +1,25 @@ - from typing import List +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor, as_completed from .evaUtils import get_em_f1 +from .evaUtils import compare_summarization_answers -class Evaluate(): +class Evaluate: """ provide evaluation for benchmarks, such as em、f1、answer_similarity, answer_correctness """ - def __init__(self, embedding_factory = "text-embedding-ada-002"): + + def __init__(self, embedding_factory="text-embedding-ada-002"): self.embedding_factory = embedding_factory def evaForSimilarity(self, predictionlist: List[str], goldlist: List[str]): """ evaluate the similarity between prediction and gold #TODO """ - # data_samples = { + # data_samples = { # 'question': [], # 'answer': predictionlist, # 'ground_truth': goldlist @@ -29,7 +32,6 @@ def evaForSimilarity(self, predictionlist: List[str], goldlist: List[str]): # return np.average(score.to_pandas()[['answer_similarity']]) return 0.0 - def getBenchMark(self, predictionlist: List[str], goldlist: List[str]): """ Calculates and returns evaluation metrics between predictions and ground truths. @@ -45,21 +47,113 @@ def getBenchMark(self, predictionlist: List[str], goldlist: List[str]): dict: Dictionary containing EM, F1 score, and answer similarity. 
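For orientation, a minimal sketch of how these benchmark metrics are consumed; the answer strings are invented placeholders:

from kag.common.benchmarks.evaluate import Evaluate

evaluator = Evaluate()
metrics = evaluator.getBenchMark(
    predictionlist=["Barack Obama", "yes"],
    goldlist=["Barack Obama", "no"],
)
# EM/F1 are averaged over the list; answer_similarity is currently stubbed to 0.0.
print(metrics)  # -> {'em': 0.5, 'f1': 0.5, 'answer_similarity': 0.0}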
""" # Initialize total metrics - total_metrics = {'em': 0.0, 'f1': 0.0, 'answer_similarity': 0.0} - + total_metrics = {"em": 0.0, "f1": 0.0, "answer_similarity": 0.0} + # Iterate over prediction and gold lists to calculate EM and F1 scores for prediction, gold in zip(predictionlist, goldlist): - em, f1 = get_em_f1(prediction, gold) # Call external function to calculate EM and F1 - total_metrics['em'] += em # Accumulate EM score - total_metrics['f1'] += f1 # Accumulate F1 score - + em, f1 = get_em_f1( + prediction, gold + ) # Call external function to calculate EM and F1 + total_metrics["em"] += em # Accumulate EM score + total_metrics["f1"] += f1 # Accumulate F1 score + # Calculate average EM and F1 scores - total_metrics['em'] /= len(predictionlist) - total_metrics['f1'] /= len(predictionlist) - + total_metrics["em"] /= len(predictionlist) + total_metrics["f1"] /= len(predictionlist) + # Call method to calculate answer similarity - total_metrics['answer_similarity'] = self.evaForSimilarity(predictionlist, goldlist) + total_metrics["answer_similarity"] = self.evaForSimilarity( + predictionlist, goldlist + ) # Return evaluation metrics dictionary return total_metrics + def getSummarizationMetrics( + self, + queries: List[str], + answers1: List[str], + answers2: List[str], + *, + api_key="EMPTY", + base_url="http://127.0.0.1:38080/v1", + model="gpt-4o-mini", + language="English", + retries=3, + max_workers=50, + ): + """ + Calculates and returns QFS (query-focused summarization) evaluation metrics + for the given queries, answers1 and answers2. + + This function evaluates the triple (query, answer1, answer2) by feeding it + into an evaluating LLM specified as `api_key`, `base_url` and `model`. + + Parameters: + queries (List[str]): List of queries. + answers1 (List[str]): List of answers generated by an LLM (LLM-1). + answers2 (List[str]): List of answers generated by another LLM (LLM-2). + api_key (str): API key to use when invoke the evaluating LLM. + base_url (str): base url to use when invoke the evaluating LLM. + model (str): model name to use when invoke the evaluating LLM. + language (str): language of the explanation + retries (int): number of retries + max_workers (int): number of workers + + Returns: + dict: Dictionary containing the average metrics and the responses + generated by the evaluating LLM. 
+ """ + responses = [None] * len(queries) + all_keys = "Comprehensiveness", "Diversity", "Empowerment", "Overall" + all_items = "Score 1", "Score 2" + average_metrics = {key: {item: 0.0 for item in all_items} for key in all_keys} + success_count = 0 + + def process_sample(index, query, answer1, answer2): + metrics = compare_summarization_answers( + query, + answer1, + answer2, + api_key=api_key, + base_url=base_url, + model=model, + language=language, + retries=retries, + ) + if metrics is None: + print( + f"fail to compare answers of query {index + 1}.\n" + f" query: {query}\n" + f" answer1: {answer1}\n" + f" answer2: {answer2}\n" + ) + else: + responses[index] = metrics + return metrics + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [ + executor.submit(process_sample, index, query, answer1, answer2) + for index, (query, answer1, answer2) in enumerate( + zip(queries, answers1, answers2) + ) + ] + for future in tqdm( + as_completed(futures), total=len(futures), desc="Evaluating: " + ): + metrics = future.result() + if metrics is not None: + for key in all_keys: + for item in all_items: + average_metrics[key][item] += metrics[key][item] + success_count += 1 + if success_count > 0: + for key in all_keys: + for item in all_items: + average_metrics[key][item] /= success_count + result = { + "average_metrics": average_metrics, + "responses": responses, + } + return result diff --git a/kag/common/llm/config/__init__.py b/kag/common/checkpointer/__init__.py similarity index 62% rename from kag/common/llm/config/__init__.py rename to kag/common/checkpointer/__init__.py index 9a3a13aa..d2deddb2 100644 --- a/kag/common/llm/config/__init__.py +++ b/kag/common/checkpointer/__init__.py @@ -9,15 +9,9 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. +from kag.common.checkpointer.base import CheckPointer, CheckpointerManager +from kag.common.checkpointer.txt_checkpointer import TxtCheckPointer +from kag.common.checkpointer.bin_checkpointer import BinCheckPointer -from kag.common.llm.config.openai import OpenAIConfig -from kag.common.llm.config.base import LLMConfig -from kag.common.llm.config.vllm import VLLMConfig -from kag.common.llm.config.ollama import OllamaConfig -__all__ = [ - "OpenAIConfig", - "LLMConfig", - "VLLMConfig", - "OllamaConfig" -] +__all__ = ["CheckPointer", "CheckpointerManager", "TxtCheckPointer", "BinCheckPointer"] diff --git a/kag/common/checkpointer/base.py b/kag/common/checkpointer/base.py new file mode 100644 index 00000000..17c2f6fd --- /dev/null +++ b/kag/common/checkpointer/base.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import os +import threading +from kag.common.registry import Registrable +from kag.common.utils import reset, bold, red, generate_hash_id + + +class CheckPointer(Registrable): + """ + A class for managing checkpoints in a distributed environment. 
+
+    This class provides methods to open, read, write, and close checkpoint files.
+    It is designed to handle checkpoints in a distributed setting, where multiple
+    processes may be writing checkpoints in parallel.
+
+    Attributes:
+        ckpt_file_name (str): The format string for checkpoint file names.
+    """
+
+    ckpt_file_name = "kag_checkpoint_{}_{}.ckpt"
+
+    def __init__(self, ckpt_dir: str, rank: int = 0, world_size: int = 1):
+        """
+        Initializes the CheckPointer with the given checkpoint directory, rank, and world size.
+
+        Args:
+            ckpt_dir (str): The directory where checkpoint files are stored.
+            rank (int): The rank of the current process (default is 0).
+            world_size (int): The total number of processes in the distributed environment (default is 1).
+        """
+        self._ckpt_dir = ckpt_dir
+        if not os.path.exists(ckpt_dir):
+            os.makedirs(ckpt_dir, exist_ok=True)
+        self.rank = rank
+        self.world_size = world_size
+        self._ckpt_file_path = os.path.join(
+            self._ckpt_dir, CheckPointer.ckpt_file_name.format(rank, world_size)
+        )
+        self._ckpt = self.open()
+        self._closed = False
+        if self.size() > 0:
+            print(
+                f"{bold}{red}Existing checkpoint found in {self._ckpt_dir}, with {self.size()} records.{reset}"
+            )
+
+    def open(self):
+        """
+        Opens the checkpoint file and returns the checkpoint object.
+
+        Returns:
+            Any: The checkpoint object, which can be used for reading and writing.
+        """
+        raise NotImplementedError("open not implemented yet.")
+
+    def read_from_ckpt(self, key):
+        """
+        Reads a value from the checkpoint file using the specified key.
+
+        Args:
+            key (str): The key to retrieve the value from the checkpoint.
+
+        Returns:
+            Any: The value associated with the key in the checkpoint.
+        """
+        raise NotImplementedError("read_from_ckpt not implemented yet.")
+
+    def write_to_ckpt(self, key, value):
+        """
+        Writes a value to the checkpoint file using the specified key.
+
+        Args:
+            key (str): The key to store the value in the checkpoint.
+            value (Any): The value to be stored in the checkpoint.
+        """
+        raise NotImplementedError("write_to_ckpt not implemented yet.")
+
+    def _close(self):
+        """
+        Closes the checkpoint file.
+        """
+        raise NotImplementedError("close not implemented yet.")
+
+    def close(self):
+        """
+        Closes the checkpoint file.
+        """
+        if not self._closed:
+            self._close()
+            self._closed = True
+
+    def exists(self, key):
+        """
+        Checks if a key exists in the checkpoint file.
+
+        Args:
+            key (str): The key to check for existence in the checkpoint.
+
+        Returns:
+            bool: True if the key exists in the checkpoint, False otherwise.
+        """
+        raise NotImplementedError("exists not implemented yet.")
+
+    def keys(self):
+        """
+        Returns the key set contained in the checkpoint file.
+
+        Returns:
+            set: The key set contained in the checkpoint.
+        """
+
+        raise NotImplementedError("keys not implemented yet.")
+
+    def size(self):
+        """
+        Returns the number of records in the checkpoint file.
+
+        Returns:
+            int: The number of records in the checkpoint file.
+        """
+
+        raise NotImplementedError("size not implemented yet.")
+
+    def __contains__(self, key):
+        """
+        Defines the behavior of the `in` operator for the object.
+
+        Args:
+            key (str): The key to check for existence in the checkpoint.
+
+        Returns:
+            bool: True if the key exists in the checkpoint, False otherwise.
+        """
+
+        return self.exists(key)
+
+
+class CheckpointerManager:
+    """
+    Manages the lifecycle of CheckPointer objects.
+
+    This class provides a thread-safe mechanism to retrieve and close CheckPointer
+    instances based on a configuration.
It uses a global dictionary to cache + CheckPointer objects, ensuring that each configuration corresponds to a unique + instance. + """ + + _CKPT_OBJS = {} + _LOCK = threading.Lock() + + @staticmethod + def get_checkpointer(config): + """ + Retrieves or creates a CheckPointer instance based on the provided configuration. + + Args: + config (dict): The configuration used to initialize the CheckPointer. + + Returns: + CheckPointer: A CheckPointer instance corresponding to the configuration. + """ + with CheckpointerManager._LOCK: + key = generate_hash_id(config) + if key not in CheckpointerManager._CKPT_OBJS: + ckpter = CheckPointer.from_config(config) + CheckpointerManager._CKPT_OBJS[key] = ckpter + return CheckpointerManager._CKPT_OBJS[key] + + @staticmethod + def close(): + """ + Closes all cached CheckPointer instances. + + This method iterates through all cached CheckPointer objects and calls their + `close` method to release resources. After calling this method, the cache + will be cleared. + """ + with CheckpointerManager._LOCK: + for v in CheckpointerManager._CKPT_OBJS.values(): + v.close() + CheckpointerManager._CKPT_OBJS.clear() diff --git a/kag/common/checkpointer/bin_checkpointer.py b/kag/common/checkpointer/bin_checkpointer.py new file mode 100644 index 00000000..e247972a --- /dev/null +++ b/kag/common/checkpointer/bin_checkpointer.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import shelve +import logging +import transaction +import threading +import pickle +import BTrees.OOBTree +from ZODB import DB +from ZODB.FileStorage import FileStorage +from kag.common.checkpointer.base import CheckPointer + +logger = logging.getLogger() + + +@CheckPointer.register("bin") +class BinCheckPointer(CheckPointer): + """ + A subclass of CheckPointer that uses shelve for binary checkpoint management. + + This class extends the CheckPointer class to provide binary checkpoint + management using the shelve module. It supports opening, reading, writing, + and closing checkpoint files in a binary format. + """ + + def open(self): + """ + Opens the checkpoint file using shelve in writeback mode. + + Returns: + Any: The shelve object representing the checkpoint file. + """ + return shelve.open(self._ckpt_file_path, "c", writeback=True) + + def exists(self, key): + """ + Checks if a key exists in the checkpoint file. + + Args: + key (str): The key to check for existence in the checkpoint. + + Returns: + bool: True if the key exists in the checkpoint, False otherwise. + """ + return key in self._ckpt + + def read_from_ckpt(self, key): + """ + Reads a value from the checkpoint file using the specified key. + + Args: + key (str): The key to retrieve the value from the checkpoint. + + Returns: + Any: The value associated with the key in the checkpoint. + """ + return self._ckpt[key] + + def write_to_ckpt(self, key, value): + """ + Writes a value to the checkpoint file using the specified key. + + Args: + key (str): The key to store the value in the checkpoint. + value (Any): The value to be stored in the checkpoint. 
+        """
+        self._ckpt[key] = value
+        self._ckpt.sync()
+
+    def _close(self):
+        """
+        Closes the checkpoint file and ensures data is written to disk.
+        """
+        self._ckpt.sync()
+        self._ckpt.close()
+
+    def size(self):
+        """
+        Returns the number of entries in the checkpoint.
+
+        Returns:
+            int: The number of entries in the checkpoint.
+        """
+
+        return len(self._ckpt)
+
+    def keys(self):
+        return set(self._ckpt.keys())
+
+
+@CheckPointer.register("zodb")
+class ZODBCheckPointer(CheckPointer):
+    """
+    A CheckPointer implementation that uses ZODB as the underlying storage.
+
+    This class provides methods to open, read, write, and close checkpoints using ZODB.
+    """
+
+    def __init__(self, ckpt_dir: str, rank: int = 0, world_size: int = 1):
+        """
+        Initializes the ZODBCheckPointer with the given checkpoint directory, rank, and world size.
+
+        Args:
+            ckpt_dir (str): The directory where checkpoint files are stored.
+            rank (int): The rank of the current process (default is 0).
+            world_size (int): The total number of processes in the distributed environment (default is 1).
+        """
+        self._lock = threading.Lock()
+        super().__init__(ckpt_dir, rank, world_size)
+
+    def open(self):
+        """
+        Opens the ZODB database and ensures the checkpoint data tree exists.
+
+        Returns:
+            DB: The ZODB database object used for checkpoint storage.
+        """
+        with self._lock:
+            storage = FileStorage(self._ckpt_file_path)
+            db = DB(storage)
+            with db.transaction() as conn:
+                if not hasattr(conn.root, "data"):
+                    conn.root.data = BTrees.OOBTree.BTree()
+            return db
+
+    def read_from_ckpt(self, key):
+        """
+        Reads a value from the checkpoint using the specified key.
+
+        Args:
+            key (str): The key to retrieve the value from the checkpoint.
+
+        Returns:
+            Any: The value associated with the key in the checkpoint.
+        """
+        with self._lock:
+            with self._ckpt.transaction() as conn:
+                obj = conn.root.data.get(key, None)
+                if obj:
+                    return pickle.loads(obj)
+                else:
+                    return None
+
+    def write_to_ckpt(self, key, value):
+        """
+        Writes a value to the checkpoint using the specified key.
+        By default, ZODB tracks modifications to the written object (value) and
+        continuously synchronizes these changes to the storage. For example, if
+        the value is a `SubGraph` object, subsequent modifications to its
+        attributes will be synchronized, which is not what we expect.
+        Therefore, we use `pickle` to serialize the value object before writing it,
+        ensuring that the object behaves as an immutable object.
+
+        Args:
+            key (str): The key to store the value in the checkpoint.
+            value (Any): The value to be stored in the checkpoint.
+        """
+        with self._lock:
+            try:
+                with self._ckpt.transaction() as conn:
+                    conn.root.data[key] = pickle.dumps(value)
+            except Exception as e:
+                logger.warning(f"failed to write checkpoint {key} to db, info: {e}")
+
+    def _close(self):
+        """
+        Closes the ZODB database connection.
+        """
+        with self._lock:
+            try:
+                transaction.commit()
+            except Exception:
+                transaction.abort()
+            if self._ckpt is not None:
+                self._ckpt.close()
+
+    def exists(self, key):
+        """
+        Checks if a key exists in the checkpoint.
+
+        Args:
+            key (str): The key to check for existence in the checkpoint.
+
+        Returns:
+            bool: True if the key exists in the checkpoint, False otherwise.
+        """
+        with self._lock:
+            with self._ckpt.transaction() as conn:
+                return key in conn.root.data
+
+    def size(self):
+        """
+        Returns the number of entries in the checkpoint.
+ + This method calculates the size of the checkpoint by counting the number + of keys stored in the checkpoint's data dictionary. It ensures thread-safe + access to the checkpoint by using a lock. + + Returns: + int: The number of entries in the checkpoint. + """ + with self._lock: + with self._ckpt.transaction() as conn: + return len(conn.root.data) + + def keys(self): + with self._lock: + with self._ckpt.transaction() as conn: + return set(conn.root.data.keys()) diff --git a/kag/common/checkpointer/ckpt/kag_checkpoint_0_1.ckpt.db b/kag/common/checkpointer/ckpt/kag_checkpoint_0_1.ckpt.db new file mode 100644 index 00000000..71e41cd7 Binary files /dev/null and b/kag/common/checkpointer/ckpt/kag_checkpoint_0_1.ckpt.db differ diff --git a/kag/common/checkpointer/txt_checkpointer.py b/kag/common/checkpointer/txt_checkpointer.py new file mode 100644 index 00000000..5f58afde --- /dev/null +++ b/kag/common/checkpointer/txt_checkpointer.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import os +import json +from kag.common.checkpointer.base import CheckPointer + + +@CheckPointer.register("txt") +class TxtCheckPointer(CheckPointer): + """ + A subclass of CheckPointer that uses a text file for checkpoint management. + + This class extends the CheckPointer class to provide checkpoint management + using a text file. It supports opening, reading, writing, and closing + checkpoint files in a text format. Each checkpoint entry is stored as a + JSON object in the file. + """ + + def open(self): + """ + Opens the checkpoint file and loads existing data into a dictionary. + + Returns: + dict: A dictionary containing the checkpoint data. + """ + ckpt = {} + if os.path.exists(self._ckpt_file_path): + with open(self._ckpt_file_path, "r") as reader: + for line in reader: + data = json.loads(line) + ckpt[data["id"]] = data["value"] + self._writer = open(self._ckpt_file_path, "a") + return ckpt + + def exists(self, key): + """ + Checks if a key exists in the checkpoint file. + + Args: + key (str): The key to check for existence in the checkpoint. + + Returns: + bool: True if the key exists in the checkpoint, False otherwise. + """ + return key in self._ckpt + + def read_from_ckpt(self, key): + """ + Reads a value from the checkpoint file using the specified key. + + Args: + key (str): The key to retrieve the value from the checkpoint. + + Returns: + Any: The value associated with the key in the checkpoint. + """ + return self._ckpt[key] + + def write_to_ckpt(self, key, value): + """ + Writes a value to the checkpoint file using the specified key. + + Args: + key (str): The key to store the value in the checkpoint. + value (Any): The value to be stored in the checkpoint. + """ + self._ckpt[key] = value + self._writer.write(json.dumps({"id": key, "value": value}, ensure_ascii=False)) + self._writer.write("\n") + self._writer.flush() + + def _close(self): + """ + Closes the checkpoint file and ensures data is written to disk. 
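Taken together, a typical round trip through the checkpointer registry might look like the sketch below. The config dict follows the Registrable convention used elsewhere in KAG, where "type" selects the registered subclass ("txt", "bin" or "zodb"); that key name is an assumption to verify against your Registrable setup.

from kag.common.checkpointer import CheckpointerManager

ckpt = CheckpointerManager.get_checkpointer({"type": "txt", "ckpt_dir": "ckpt"})
if "chunk-42" not in ckpt:          # __contains__ delegates to exists()
    ckpt.write_to_ckpt("chunk-42", {"status": "done"})
print(ckpt.read_from_ckpt("chunk-42"))  # -> {'status': 'done'}
CheckpointerManager.close()             # closes and clears all cached instances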
+        """
+        self._writer.flush()
+        self._writer.close()
+
+    def size(self):
+        return len(self._ckpt)
+
+    def keys(self):
+        return set(self._ckpt.keys())
diff --git a/kag/common/conf.py b/kag/common/conf.py
new file mode 100644
index 00000000..044b8faa
--- /dev/null
+++ b/kag/common/conf.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+import copy
+import os
+import logging
+import yaml
+import json
+import pprint
+from pathlib import Path
+from typing import Union, Optional
+
+from knext.project.client import ProjectClient
+
+
+class KAGConstants(object):
+    LOCAL_SCHEMA_URL = "http://localhost:8887"
+    DEFAULT_KAG_CONFIG_FILE_NAME = "default_config.yaml"
+    KAG_CONFIG_FILE_NAME = "kag_config.yaml"
+    DEFAULT_KAG_CONFIG_PATH = os.path.join(
+        os.path.dirname(__file__), DEFAULT_KAG_CONFIG_FILE_NAME
+    )
+    KAG_CFG_PREFIX = "KAG"
+    GLOBAL_CONFIG_KEY = "global"
+    PROJECT_CONFIG_KEY = "project"
+    KAG_NAMESPACE_KEY = "namespace"
+    KAG_PROJECT_ID_KEY = "id"
+    KAG_PROJECT_HOST_ADDR_KEY = "host_addr"
+    KAG_LANGUAGE_KEY = "language"
+    KAG_CKPT_DIR_KEY = "checkpoint_path"
+    KAG_BIZ_SCENE_KEY = "biz_scene"
+    ENV_KAG_PROJECT_ID = "KAG_PROJECT_ID"
+    ENV_KAG_PROJECT_HOST_ADDR = "KAG_PROJECT_HOST_ADDR"
+    ENV_KAG_DEBUG_DUMP_CONFIG = "KAG_DEBUG_DUMP_CONFIG"
+    KAG_SIMILAR_EDGE_NAME = "similar"
+
+    KS8_ENV_TF_CONFIG = "TF_CONFIG"
+    K8S_ENV_MASTER_ADDR = "MASTER_ADDR"
+    K8S_ENV_MASTER_PORT = "MASTER_PORT"
+    K8S_ENV_WORLD_SIZE = "WORLD_SIZE"
+    K8S_ENV_RANK = "RANK"
+    K8S_ENV_POD_NAME = "POD_NAME"
+
+
+class KAGGlobalConf:
+    def __init__(self):
+        self._extra = {}
+
+    def initialize(self, **kwargs):
+        self.project_id = kwargs.pop(
+            KAGConstants.KAG_PROJECT_ID_KEY,
+            os.getenv(KAGConstants.ENV_KAG_PROJECT_ID, "1"),
+        )
+        self.host_addr = kwargs.pop(
+            KAGConstants.KAG_PROJECT_HOST_ADDR_KEY,
+            os.getenv(KAGConstants.ENV_KAG_PROJECT_HOST_ADDR, "http://127.0.0.1:8887"),
+        )
+        self.biz_scene = kwargs.pop(KAGConstants.KAG_BIZ_SCENE_KEY, "default")
+        self.language = kwargs.pop(KAGConstants.KAG_LANGUAGE_KEY, "en")
+        self.namespace = kwargs.pop(KAGConstants.KAG_NAMESPACE_KEY, None)
+        self.ckpt_dir = kwargs.pop(KAGConstants.KAG_CKPT_DIR_KEY, "ckpt")
+
+        # drop the previously set extra attributes, then set the remaining
+        # configs as instance attributes directly
+        for k in self._extra.keys():
+            if hasattr(self, k):
+                delattr(self, k)
+
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        self._extra = kwargs
+
+        print(
+            f"Done initializing project config with host_addr {self.host_addr} and project_id {self.project_id}"
+        )
+
+
+def _closest_cfg(
+    path: Union[str, os.PathLike] = ".",
+    prev_path: Optional[Union[str, os.PathLike]] = None,
+) -> str:
+    """
+    Return the path to the closest kag_config.yaml file by traversing the current
+    directory and its parents
+    """
+    if prev_path is not None and str(path) == str(prev_path):
+        return ""
+    path = Path(path).resolve()
+    cfg_file = path / KAGConstants.KAG_CONFIG_FILE_NAME
+    if cfg_file.exists():
+        return str(cfg_file)
+    return _closest_cfg(path.parent, path)
+
+
+def load_config(prod: bool = False):
+    """
+    Load the KAG config as a dict, either from the KAG server (in production mode) or from the closest kag_config.yaml file.
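For reference, a hypothetical minimal kag_config.yaml covering the keys that `KAGGlobalConf.initialize` and `init_log_config` consume (all values are placeholders); it is parsed here with `yaml.safe_load` just to show the resulting dict shape:

import yaml

config = yaml.safe_load(
    """
project:
  id: "1"
  host_addr: "http://127.0.0.1:8887"
  namespace: "MyKB"
  language: "en"
  biz_scene: "default"
  checkpoint_path: "ckpt"
log:
  level: "INFO"
"""
)
print(config["project"]["host_addr"])  # -> http://127.0.0.1:8887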
+ """ + if prod: + project_id = os.getenv(KAGConstants.ENV_KAG_PROJECT_ID) + host_addr = os.getenv(KAGConstants.ENV_KAG_PROJECT_HOST_ADDR) + project_client = ProjectClient(host_addr=host_addr) + project = project_client.get_by_id(project_id) + config = json.loads(project.config) + if "project" not in config: + config["project"] = { + KAGConstants.KAG_PROJECT_ID_KEY: project_id, + KAGConstants.KAG_PROJECT_HOST_ADDR_KEY: host_addr, + KAGConstants.KAG_NAMESPACE_KEY: project.namespace, + } + prompt_config = config.pop("prompt", {}) + for key in [KAGConstants.KAG_LANGUAGE_KEY, KAGConstants.KAG_BIZ_SCENE_KEY]: + if key in prompt_config: + config["project"][key] = prompt_config[key] + if "vectorizer" in config and "vectorize_model" not in config: + config["vectorize_model"] = config["vectorizer"] + return config + else: + config_file = _closest_cfg() + if os.path.exists(config_file) and os.path.isfile(config_file): + print(f"found config file: {config_file}") + with open(config_file, "r") as reader: + config = reader.read() + return yaml.safe_load(config) + else: + return {} + + +class KAGConfigMgr: + def __init__(self): + self.config = {} + self.global_config = KAGGlobalConf() + self._is_initialized = False + + def init_log_config(self, config): + log_conf = config.get("log", {}) + if log_conf: + log_level = log_conf.get("level", "INFO") + else: + log_level = "INFO" + logging.basicConfig(level=logging.getLevelName(log_level)) + logging.getLogger("neo4j.notifications").setLevel(logging.ERROR) + logging.getLogger("neo4j.io").setLevel(logging.INFO) + logging.getLogger("neo4j.pool").setLevel(logging.INFO) + + def initialize(self, prod: bool = True): + config = load_config(prod) + if self._is_initialized: + print( + "Reinitialize the KAG configuration, an operation that should exclusively be triggered within the Java invocation context." 
+ ) + print(f"original config: {self.config}") + print(f"new config: {config}") + self.prod = prod + self.config = config + global_config = self.config.get(KAGConstants.PROJECT_CONFIG_KEY, {}) + self.global_config.initialize(**global_config) + self.init_log_config(self.config) + self._is_initialized = True + + @property + def all_config(self): + return copy.deepcopy(self.config) + + +KAG_CONFIG = KAGConfigMgr() + +KAG_PROJECT_CONF = KAG_CONFIG.global_config + + +def init_env(): + project_id = os.getenv(KAGConstants.ENV_KAG_PROJECT_ID) + host_addr = os.getenv(KAGConstants.ENV_KAG_PROJECT_HOST_ADDR) + if project_id and host_addr: + prod = True + else: + prod = False + global KAG_CONFIG + KAG_CONFIG.initialize(prod) + + if prod: + msg = "Done init config from server" + else: + msg = "Done init config from local file" + os.environ[KAGConstants.ENV_KAG_PROJECT_ID] = str(KAG_PROJECT_CONF.project_id) + os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = str(KAG_PROJECT_CONF.host_addr) + if len(KAG_CONFIG.all_config) > 0: + dump_flag = os.getenv(KAGConstants.ENV_KAG_DEBUG_DUMP_CONFIG) + if dump_flag is not None and dump_flag.strip() == "1": + print(f"{msg}:") + pprint.pprint(KAG_CONFIG.all_config, indent=2) + else: + print( + f"{msg}: set {KAGConstants.ENV_KAG_DEBUG_DUMP_CONFIG}=1 to dump config" + ) + else: + print("No config found.") diff --git a/kag/common/default_config.cfg b/kag/common/default_config.cfg deleted file mode 100644 index 04de60db..00000000 --- a/kag/common/default_config.cfg +++ /dev/null @@ -1,33 +0,0 @@ - -[project] -with_server = True -host_addr = http://127.0.0.1:8887 - -[vectorizer] -vectorizer = kag.common.vectorizer.OpenAIVectorizer -model = bge-m3 -api_key = EMPTY -base_url = http://127.0.0.1:11434/v1 -vector_dimensions = 1024 - -[llm] -client_type = ollama -base_url = http://localhost:11434/api/generate -model = llama3.1 - - -[indexer] -with_semantic = False -similarity_threshold = 0.8 - -[retriever] -with_semantic = False -pagerank_threshold = 0.9 -match_threshold = 0.8 -top_k = 10 - -[schedule] -interval_minutes = -1 - -[log] -level = INFO \ No newline at end of file diff --git a/kag/common/env.py b/kag/common/env.py index 60e9907f..916726de 100644 --- a/kag/common/env.py +++ b/kag/common/env.py @@ -1,117 +1,145 @@ # -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. -import logging import os -import sys -from configparser import ConfigParser as CP -from pathlib import Path -from typing import Union, Optional - -import kag.common as common - -class ConfigParser(CP): - def __init__(self,defaults=None): - CP.__init__(self,defaults=defaults) - def optionxform(self, optionstr): - return optionstr - - -LOCAL_SCHEMA_URL = "http://localhost:8887" -DEFAULT_KAG_CONFIG_FILE_NAME = "default_config.cfg" -DEFAULT_KAG_CONFIG_PATH = os.path.join(common.__path__[0], DEFAULT_KAG_CONFIG_FILE_NAME) -KAG_CFG_PREFIX = "KAG" - - -def init_env(): - """Initialize environment to use command-line tool from inside a project - dir. This sets the Scrapy settings module and modifies the Python path to - be able to locate the project module. 
- """ - project_cfg, root_path = get_config() - - init_kag_config(Path(root_path) / "kag_config.cfg") - - -def get_config(): - """ - Get kag config file as a ConfigParser. - """ - local_cfg_path = _closest_cfg() - local_cfg = ConfigParser() - local_cfg.read(local_cfg_path) - - projdir = "" - if local_cfg_path: - projdir = str(Path(local_cfg_path).parent) - if projdir not in sys.path: - sys.path.append(projdir) - - return local_cfg, projdir - - -def _closest_cfg( - path: Union[str, os.PathLike] = ".", - prev_path: Optional[Union[str, os.PathLike]] = None, -) -> str: - """ - Return the path to the closest .kag.cfg file by traversing the current - directory and its parents - """ - if prev_path is not None and str(path) == str(prev_path): - return "" - path = Path(path).resolve() - cfg_file = path / "kag_config.cfg" - if cfg_file.exists(): - return str(cfg_file) - return _closest_cfg(path.parent, path) - - -def get_cfg_files(): - """ - Get global and local kag config files and paths. - """ - local_cfg_path = _closest_cfg() - local_cfg = ConfigParser() - local_cfg.read(local_cfg_path) - - if local_cfg_path: - projdir = str(Path(local_cfg_path).parent) - if projdir not in sys.path: - sys.path.append(projdir) - - return local_cfg, local_cfg_path - - - -def init_kag_config(config_path: Union[str, Path] = None): - if not config_path or isinstance(config_path, Path) and not config_path.exists(): - config_path = DEFAULT_KAG_CONFIG_PATH - kag_cfg = ConfigParser() - kag_cfg.read(config_path) - os.environ["KAG_PROJECT_ROOT_PATH"] = os.path.abspath(os.path.dirname(config_path)) - - for section in kag_cfg.sections(): - sec_cfg = {} - for key, value in kag_cfg.items(section): - item_cfg_key = f"{KAG_CFG_PREFIX}_{section}_{key}".upper() - os.environ[item_cfg_key] = value - sec_cfg[key] = value - sec_cfg_key = f"{KAG_CFG_PREFIX}_{section}".upper() - os.environ[sec_cfg_key] = str(sec_cfg) - if section == "log": - for key, value in kag_cfg.items(section): - if key == "level": - logging.basicConfig(level=logging.getLevelName(value)) - # neo4j log level set to be default error - logging.getLogger("neo4j.notifications").setLevel(logging.ERROR) - logging.getLogger("neo4j.io").setLevel(logging.INFO) - logging.getLogger("neo4j.pool").setLevel(logging.INFO) +import json +import time +import datetime +import socket +import traceback +from kag.common.conf import KAGConstants + + +def parse_tf_config(): + tf_config_str = os.environ.get(KAGConstants.KS8_ENV_TF_CONFIG, None) + if tf_config_str is None: + return None + else: + return json.loads(tf_config_str) + + +def get_role_number(config, role_name): + role_info = config["cluster"].get(role_name, None) + if role_info is None: + return 0 + else: + return len(role_info) + + +def get_rank(default=None): + if KAGConstants.K8S_ENV_RANK in os.environ: + return int(os.environ[KAGConstants.K8S_ENV_RANK]) + + tf_config = parse_tf_config() + if tf_config is None: + return default + + num_master = get_role_number(tf_config, "master") + task_type = tf_config["task"]["type"] + task_index = tf_config["task"]["index"] + if task_type == "master": + rank = task_index + elif task_type == "worker": + rank = num_master + task_index + else: + rank = default + + return rank + + +def get_world_size(default=None): + if KAGConstants.K8S_ENV_WORLD_SIZE in os.environ: + return os.environ[KAGConstants.K8S_ENV_WORLD_SIZE] + + tf_config = parse_tf_config() + if tf_config is None: + return default + + num_master = get_role_number(tf_config, "master") + num_worker = get_role_number(tf_config, 
"worker") + + return num_master + num_worker + + +def get_master_port(default=None): + return os.environ.get(KAGConstants.K8S_ENV_MASTER_PORT, default) + + +def get_master_addr(default=None): + if KAGConstants.K8S_ENV_MASTER_ADDR in os.environ: + return os.environ[KAGConstants.K8S_ENV_MASTER_ADDR] + + tf_config = parse_tf_config() + if tf_config is None: + return default + + return tf_config["cluster"]["worker"][0] + + +def host2tensor(master_port): + import torch + + host_str = socket.gethostbyname(socket.gethostname()) + host = [int(x) for x in host_str.split(".")] + host.append(int(master_port)) + host_tensor = torch.tensor(host) + return host_tensor + + +def tensor2host(host_tensor): + host_tensor = host_tensor.tolist() + host = ".".join([str(x) for x in host_tensor[0:4]]) + port = host_tensor[4] + return f"{host}:{port}" + + +def sync_hosts(): + import torch + import torch.distributed as dist + + rank = get_rank() + if rank is None: + raise ValueError("can't get rank of container") + rank = int(rank) + + world_size = get_world_size() + if world_size is None: + raise ValueError("can't get world_size of container") + world_size = int(world_size) + + master_port = get_master_port() + if master_port is None: + raise ValueError("can't get master_port of container") + master_port = int(master_port) + + while True: + try: + dist.init_process_group( + backend="gloo", + rank=rank, + world_size=world_size, + timeout=datetime.timedelta(days=1), + ) + break + except Exception as e: + error_traceback = traceback.format_exc() + print(f"failed to init process group, info: {e}\n\n\n{error_traceback}") + time.sleep(60) + print("Done init process group, get all hosts...") + host_tensors = [torch.tensor([0, 0, 0, 0, 0]) for x in range(world_size)] + dist.all_gather(host_tensors, host2tensor(master_port)) + # we need to destory torch process group to release MASTER_PORT, otherwise the server + # can't serving on it . + print("Done get all hosts, destory process group...") + dist.destroy_process_group() + time.sleep(10) + return [tensor2host(x) for x in host_tensors] + + +def extract_job_name_from_pod_name(pod_name): + if "-ptjob" in pod_name: + return pod_name.rsplit("-ptjob", maxsplit=1)[0] + elif "-tfjob" in pod_name: + return pod_name.rsplit("-tfjob", maxsplit=1)[0] + elif "-mpijob" in pod_name: + return pod_name.rsplit("-mpijob", maxsplit=1)[0] + else: + return None diff --git a/kag/common/graphstore/graph_store.py b/kag/common/graphstore/graph_store.py index 8877ad2b..1cc65f83 100644 --- a/kag/common/graphstore/graph_store.py +++ b/kag/common/graphstore/graph_store.py @@ -49,7 +49,9 @@ def upsert_node(self, label, properties, id_key="id", extra_labels=("Entity",)): pass @abstractmethod - def upsert_nodes(self, label, properties_list, id_key="id", extra_labels=("Entity",)): + def upsert_nodes( + self, label, properties_list, id_key="id", extra_labels=("Entity",) + ): """ Insert or update multiple nodes. @@ -112,10 +114,18 @@ def delete_nodes(self, label, id_values, id_key="id"): pass @abstractmethod - def upsert_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, - rel_type, properties, upsert_nodes=True, - start_node_id_key="id", end_node_id_key="id"): + def upsert_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + properties, + upsert_nodes=True, + start_node_id_key="id", + end_node_id_key="id", + ): """ Insert or update a relationship. 
@@ -133,9 +143,16 @@ def upsert_relationship(self, start_node_label, start_node_id_value, pass @abstractmethod - def upsert_relationships(self, start_node_label, end_node_label, rel_type, - relationships, upsert_nodes=True, start_node_id_key="id", - end_node_id_key="id"): + def upsert_relationships( + self, + start_node_label, + end_node_label, + rel_type, + relationships, + upsert_nodes=True, + start_node_id_key="id", + end_node_id_key="id", + ): """ Insert or update multiple relationships. @@ -151,9 +168,16 @@ def upsert_relationships(self, start_node_label, end_node_label, rel_type, pass @abstractmethod - def delete_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, - rel_type, start_node_id_key="id", end_node_id_key="id"): + def delete_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): """ Delete a specified relationship. @@ -169,9 +193,16 @@ def delete_relationship(self, start_node_label, start_node_id_value, pass @abstractmethod - def delete_relationships(self, start_node_label, start_node_id_values, - end_node_label, end_node_id_values, rel_type, - start_node_id_key="id", end_node_id_key="id"): + def delete_relationships( + self, + start_node_label, + start_node_id_values, + end_node_label, + end_node_id_values, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): """ Delete multiple relationships. @@ -211,9 +242,16 @@ def create_text_index(self, labels, property_keys, index_name=None): pass @abstractmethod - def create_vector_index(self, label, property_key, index_name=None, - vector_dimensions=768, metric_type="cosine", - hnsw_m=None, hnsw_ef_construction=None): + def create_vector_index( + self, + label, + property_key, + index_name=None, + vector_dimensions=768, + metric_type="cosine", + hnsw_m=None, + hnsw_ef_construction=None, + ): """ Create a vector index. @@ -239,7 +277,9 @@ def delete_index(self, index_name): pass @abstractmethod - def text_search(self, query_string, label_constraints=None, topk=10, index_name=None): + def text_search( + self, query_string, label_constraints=None, topk=10, index_name=None + ): """ Perform a text search. @@ -255,7 +295,15 @@ def text_search(self, query_string, label_constraints=None, topk=10, index_name= pass @abstractmethod - def vector_search(self, label, property_key, query_text_or_vector, topk=10, index_name=None, ef_search=None): + def vector_search( + self, + label, + property_key, + query_text_or_vector, + topk=10, + index_name=None, + ef_search=None, + ): """ Perform a vector search. diff --git a/kag/common/graphstore/neo4j_graph_store.py b/kag/common/graphstore/neo4j_graph_store.py index 33b46d9d..97bd5c47 100644 --- a/kag/common/graphstore/neo4j_graph_store.py +++ b/kag/common/graphstore/neo4j_graph_store.py @@ -10,7 +10,6 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. 
import logging -import os import re import threading import time @@ -25,18 +24,20 @@ logger = logging.getLogger(__name__) + class SingletonMeta(ABCMeta): """ Thread-safe Singleton metaclass """ + _instances = {} _lock = threading.Lock() def __call__(cls, *args, **kwargs): - uri = kwargs.get('uri') - user = kwargs.get('user') - password = kwargs.get('password') - database = kwargs.get('database', 'neo4j') + uri = kwargs.get("uri") + user = kwargs.get("user") + password = kwargs.get("password") + database = kwargs.get("database", "neo4j") key = (cls, uri, user, password, database) with cls._lock: @@ -46,12 +47,19 @@ def __call__(cls, *args, **kwargs): class Neo4jClient(GraphStore, metaclass=SingletonMeta): - - def __init__(self, uri, user, password, database="neo4j", init_type="write", interval_minutes=10): + def __init__( + self, + uri, + user, + password, + database="neo4j", + init_type="write", + interval_minutes=10, + ): self._driver = GraphDatabase.driver(uri, auth=(user, password)) logger.info(f"init Neo4jClient uri: {uri} database: {database}") self._database = database - self._lucene_special_chars = "\\+-!():^[]\"{}~*?|&/" + self._lucene_special_chars = '\\+-!():^[]"{}~*?|&/' self._lucene_pattern = self._get_lucene_pattern() self._simple_ident = "[A-Za-z_][A-Za-z0-9_]*" self._simple_ident_pattern = re.compile(self._simple_ident) @@ -71,14 +79,16 @@ def close(self): self._driver.close() def schedule_constraint(self, interval_minutes): - def job(): try: self._labels = self._create_unique_constraint() self._update_pagerank_graph() except Exception as e: import traceback - logger.error(f"Error run scheduled job: {traceback.format_exc()}") + + logger.error( + f"Error run scheduled job, info: {e},\ntraceback:\n {traceback.format_exc()}" + ) def run_scheduled_tasks(): while True: @@ -116,7 +126,9 @@ def _create_unique_index_constraint(self, label, session): try: result = session.run(create_constraint_query) result.consume() - logger.debug(f"Unique constraint created for constraint_name: {constraint_name}") + logger.debug( + f"Unique constraint created for constraint_name: {constraint_name}" + ) except Exception as e: logger.debug(f"warn creating constraint for {constraint_name}: {e}") self._create_index_constraint(self, label, session) @@ -186,7 +198,12 @@ def _collect_text_index_info(self, schema_types): label_property_keys = {} for property_key in properties: index_type = properties[property_key].index_type - if property_key == "name" or index_type and index_type in (IndexTypeEnum.Text, IndexTypeEnum.TextAndVector): + if ( + property_key == "name" + or index_type + and index_type + in (IndexTypeEnum.Text, IndexTypeEnum.TextAndVector) + ): label_property_keys[property_key] = True if label_property_keys: labels[label] = True @@ -199,9 +216,13 @@ def upsert_node(self, label, properties, id_key="id", extra_labels=("Entity",)): if label not in self._labels: self._create_unique_index_constraint(self, label, session) try: - return session.execute_write(self._upsert_node, self, label, id_key, properties, extra_labels) + return session.execute_write( + self._upsert_node, self, label, id_key, properties, extra_labels + ) except Exception as e: - logger.error(f"upsert_node label:{label} properties:{properties} Exception: {e}") + logger.error( + f"upsert_node label:{label} properties:{properties} Exception: {e}" + ) return None @staticmethod @@ -209,23 +230,36 @@ def _upsert_node(tx, self, label, id_key, properties, extra_labels): if not label: logger.warning("label cannot be None or empty strings") 
return None - query = (f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: $properties.{self._escape_neo4j(id_key)}}}) " - "SET n += $properties ") + query = ( + f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: $properties.{self._escape_neo4j(id_key)}}}) " + "SET n += $properties " + ) if extra_labels: query += f", n:{':'.join(self._escape_neo4j(extra_label) for extra_label in extra_labels)} " query += "RETURN n" result = tx.run(query, properties=properties) return result.single()[0] - def upsert_nodes(self, label, properties_list, id_key="id", extra_labels=("Entity",)): + def upsert_nodes( + self, label, properties_list, id_key="id", extra_labels=("Entity",) + ): self._preprocess_node_properties_list(label, properties_list, extra_labels) with self._driver.session(database=self._database) as session: if label not in self._labels: self._create_unique_index_constraint(self, label, session) try: - return session.execute_write(self._upsert_nodes, self, label, properties_list, id_key, extra_labels) + return session.execute_write( + self._upsert_nodes, + self, + label, + properties_list, + id_key, + extra_labels, + ) except Exception as e: - logger.error(f"upsert_nodes label:{label} properties:{properties_list} Exception: {e}") + logger.error( + f"upsert_nodes label:{label} properties:{properties_list} Exception: {e}" + ) return None @staticmethod @@ -233,14 +267,16 @@ def _upsert_nodes(tx, self, label, properties_list, id_key, extra_labels): if not label: logger.warning("label cannot be None or empty strings") return None - query = ("UNWIND $properties_list AS properties " - f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: properties.{self._escape_neo4j(id_key)}}}) " - "SET n += properties ") + query = ( + "UNWIND $properties_list AS properties " + f"MERGE (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: properties.{self._escape_neo4j(id_key)}}}) " + "SET n += properties " + ) if extra_labels: query += f", n:{':'.join(self._escape_neo4j(extra_label) for extra_label in extra_labels)} " query += "RETURN n" result = tx.run(query, properties_list=properties_list) - return [record['n'] for record in result] + return [record["n"] for record in result] def _get_embedding_vector(self, properties, vector_field): for property_key, property_value in properties.items(): @@ -256,7 +292,9 @@ def _get_embedding_vector(self, properties, vector_field): vector = self.vectorizer.vectorize(property_value) return vector except Exception as e: - logger.info(f"An error occurred while vectorizing property {property_key!r}: {e}") + logger.info( + f"An error occurred while vectorizing property {property_key!r}: {e}" + ) return None return None @@ -287,7 +325,9 @@ def batch_preprocess_node_properties(self, node_batch, extra_labels=("Entity",)) return class EmbeddingVectorPlaceholder(object): - def __init__(self, number, properties, vector_field, property_key, property_value): + def __init__( + self, number, properties, vector_field, property_key, property_value + ): self._number = number self._properties = properties self._vector_field = vector_field @@ -317,7 +357,9 @@ def get_placeholder(self, graph_store, properties, vector_field): message = f"property {property_key!r} must be string to generate embedding vector" raise RuntimeError(message) num = len(self._placeholders) - placeholder = EmbeddingVectorPlaceholder(num, properties, vector_field, property_key, property_value) + placeholder = EmbeddingVectorPlaceholder( + num, properties, 
vector_field, property_key, property_value + ) self._placeholders.append(placeholder) return placeholder return None @@ -364,7 +406,9 @@ def patch(self): for vector_field in vec_meta[label]: if vector_field in properties: continue - placeholder = manager.get_placeholder(self, properties, vector_field) + placeholder = manager.get_placeholder( + self, properties, vector_field + ) if placeholder is not None: properties[vector_field] = placeholder manager.batch_vectorize(self._vectorizer) @@ -406,25 +450,58 @@ def _delete_nodes(tx, self, label, id_key, id_values): query = f"UNWIND $id_values AS id_value MATCH (n:{self._escape_neo4j(label)} {{{self._escape_neo4j(id_key)}: id_value}}) DETACH DELETE n" tx.run(query, id_values=id_values) - def upsert_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, rel_type, - properties, upsert_nodes=True, start_node_id_key="id", end_node_id_key="id"): + def upsert_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + properties, + upsert_nodes=True, + start_node_id_key="id", + end_node_id_key="id", + ): rel_type = self._escape_neo4j(rel_type) with self._driver.session(database=self._database) as session: try: - return session.execute_write(self._upsert_relationship, self, start_node_label, start_node_id_key, - start_node_id_value, end_node_label, end_node_id_key, - end_node_id_value, rel_type, properties, upsert_nodes) + return session.execute_write( + self._upsert_relationship, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + properties, + upsert_nodes, + ) except Exception as e: - logger.error(f"upsert_relationship rel_type:{rel_type} properties:{properties} Exception: {e}") + logger.error( + f"upsert_relationship rel_type:{rel_type} properties:{properties} Exception: {e}" + ) return None @staticmethod - def _upsert_relationship(tx, self, start_node_label, start_node_id_key, start_node_id_value, - end_node_label, end_node_id_key, end_node_id_value, - rel_type, properties, upsert_nodes): + def _upsert_relationship( + tx, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + properties, + upsert_nodes, + ): if not start_node_label or not end_node_label or not rel_type: - logger.warning("start_node_label, end_node_label, and rel_type cannot be None or empty strings") + logger.warning( + "start_node_label, end_node_label, and rel_type cannot be None or empty strings" + ) return None if upsert_nodes: query = ( @@ -438,25 +515,59 @@ def _upsert_relationship(tx, self, start_node_label, start_node_id_key, start_no f"(b:{self._escape_neo4j(end_node_label)} {{{self._escape_neo4j(end_node_id_key)}: $end_node_id_value}}) " f"MERGE (a)-[r:{self._escape_neo4j(rel_type)}]->(b) SET r += $properties RETURN r" ) - result = tx.run(query, start_node_id_value=start_node_id_value, - end_node_id_value=end_node_id_value, properties=properties) + result = tx.run( + query, + start_node_id_value=start_node_id_value, + end_node_id_value=end_node_id_value, + properties=properties, + ) return result.single() - def upsert_relationships(self, start_node_label, end_node_label, rel_type, relations, - upsert_nodes=True, start_node_id_key="id", end_node_id_key="id"): + def upsert_relationships( + self, + start_node_label, + end_node_label, + rel_type, + relations, + upsert_nodes=True, + 
start_node_id_key="id", + end_node_id_key="id", + ): with self._driver.session(database=self._database) as session: try: - return session.execute_write(self._upsert_relationships, self, relations, start_node_label, - start_node_id_key, end_node_label, end_node_id_key, rel_type, upsert_nodes) + return session.execute_write( + self._upsert_relationships, + self, + relations, + start_node_label, + start_node_id_key, + end_node_label, + end_node_id_key, + rel_type, + upsert_nodes, + ) except Exception as e: - logger.error(f"upsert_relationships rel_type:{rel_type} relations:{relations} Exception: {e}") + logger.error( + f"upsert_relationships rel_type:{rel_type} relations:{relations} Exception: {e}" + ) return None @staticmethod - def _upsert_relationships(tx, self, relations, start_node_label, start_node_id_key, - end_node_label, end_node_id_key, rel_type, upsert_nodes): + def _upsert_relationships( + tx, + self, + relations, + start_node_label, + start_node_id_key, + end_node_label, + end_node_id_key, + rel_type, + upsert_nodes, + ): if not start_node_label or not end_node_label or not rel_type: - logger.warning("start_node_label, end_node_label, and rel_type cannot be None or empty strings") + logger.warning( + "start_node_label, end_node_label, and rel_type cannot be None or empty strings" + ) return None if upsert_nodes: query = ( @@ -473,51 +584,111 @@ def _upsert_relationships(tx, self, relations, start_node_label, start_node_id_k f"MERGE (a)-[r:{self._escape_neo4j(rel_type)}]->(b) SET r += relationship.properties RETURN r" ) - result = tx.run(query, relations=relations, - start_node_label=start_node_label, start_node_id_key=start_node_id_key, - end_node_label=end_node_label, end_node_id_key=end_node_id_key, - rel_type=rel_type) - return [record['r'] for record in result] - - def delete_relationship(self, start_node_label, start_node_id_value, - end_node_label, end_node_id_value, rel_type, - start_node_id_key="id", end_node_id_key="id"): + result = tx.run( + query, + relations=relations, + start_node_label=start_node_label, + start_node_id_key=start_node_id_key, + end_node_label=end_node_label, + end_node_id_key=end_node_id_key, + rel_type=rel_type, + ) + return [record["r"] for record in result] + + def delete_relationship( + self, + start_node_label, + start_node_id_value, + end_node_label, + end_node_id_value, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): with self._driver.session(database=self._database) as session: try: - session.execute_write(self._delete_relationship, self, start_node_label, start_node_id_key, - start_node_id_value, end_node_label, end_node_id_key, - end_node_id_value, rel_type) + session.execute_write( + self._delete_relationship, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + ) except Exception as e: logger.error(f"delete_relationship rel_type:{rel_type} Exception: {e}") - @staticmethod - def _delete_relationship(tx, self, start_node_label, start_node_id_key, start_node_id_value, - end_node_label, end_node_id_key, end_node_id_value, rel_type): + def _delete_relationship( + tx, + self, + start_node_label, + start_node_id_key, + start_node_id_value, + end_node_label, + end_node_id_key, + end_node_id_value, + rel_type, + ): query = ( f"MATCH (a:{self._escape_neo4j(start_node_label)} {{{self._escape_neo4j(start_node_id_key)}: $start_node_id_value}})-[r:{self._escape_neo4j(rel_type)}]->" f"(b:{self._escape_neo4j(end_node_label)} 
{{{self._escape_neo4j(end_node_id_key)}: $end_node_id_value}}) DELETE r" ) - tx.run(query, start_node_id_value=start_node_id_value, end_node_id_value=end_node_id_value) + tx.run( + query, + start_node_id_value=start_node_id_value, + end_node_id_value=end_node_id_value, + ) - def delete_relationships(self, start_node_label, start_node_id_values, - end_node_label, end_node_id_values, rel_type, - start_node_id_key="id", end_node_id_key="id"): + def delete_relationships( + self, + start_node_label, + start_node_id_values, + end_node_label, + end_node_id_values, + rel_type, + start_node_id_key="id", + end_node_id_key="id", + ): with self._driver.session(database=self._database) as session: - session.execute_write(self._delete_relationships, self, - start_node_label, start_node_id_key, start_node_id_values, - end_node_label, end_node_id_key, end_node_id_values, rel_type) + session.execute_write( + self._delete_relationships, + self, + start_node_label, + start_node_id_key, + start_node_id_values, + end_node_label, + end_node_id_key, + end_node_id_values, + rel_type, + ) @staticmethod - def _delete_relationships(tx, self, start_node_label, start_node_id_key, start_node_id_values, - end_node_label, end_node_id_key, end_node_id_values, rel_type): + def _delete_relationships( + tx, + self, + start_node_label, + start_node_id_key, + start_node_id_values, + end_node_label, + end_node_id_key, + end_node_id_values, + rel_type, + ): query = ( "UNWIND $start_node_id_values AS start_node_id_value " "UNWIND $end_node_id_values AS end_node_id_value " f"MATCH (a:{self._escape_neo4j(start_node_label)} {{{self._escape_neo4j(start_node_id_key)}: start_node_id_value}})-[r:{self._escape_neo4j(rel_type)}]->" f"(b:{self._escape_neo4j(end_node_label)} {{{self._escape_neo4j(end_node_id_key)}: end_node_id_value}}) DELETE r" ) - tx.run(query, start_node_id_values=start_node_id_values, end_node_id_values=end_node_id_values) + tx.run( + query, + start_node_id_values=start_node_id_values, + end_node_id_values=end_node_id_values, + ) def _get_lucene_pattern(self): string = re.escape(self._lucene_special_chars) @@ -539,7 +710,7 @@ def _get_utf16_codepoints(self, string): for ch in string: data = ch.encode("utf-16-le") for i in range(0, len(data), 2): - value = int.from_bytes(data[i:i+2], "little") + value = int.from_bytes(data[i : i + 2], "little") result.append(value) return tuple(result) @@ -562,6 +733,7 @@ def _escape_neo4j(self, name): def _to_snake_case(self, name): import re + words = re.findall("[A-Za-z][a-z0-9]*", name) result = "_".join(words).lower() return result @@ -578,7 +750,9 @@ def _create_vector_field_name(self, property_key): def create_index(self, label, property_key, index_name=None): with self._driver.session(database=self._database) as session: - session.execute_write(self._create_index, self, label, property_key, index_name) + session.execute_write( + self._create_index, self, label, property_key, index_name + ) @staticmethod def _create_index(tx, self, label, property_key, index_name): @@ -596,50 +770,87 @@ def create_text_index(self, labels, property_keys, index_name=None): if index_name is None: index_name = "_default_text_index" label_spec = "|".join(self._escape_neo4j(label) for label in labels) - property_spec = ", ".join(f"n.{self._escape_neo4j(key)}" for key in property_keys) + property_spec = ", ".join( + f"n.{self._escape_neo4j(key)}" for key in property_keys + ) query = ( f"CREATE FULLTEXT INDEX {self._escape_neo4j(index_name)} IF NOT EXISTS " f"FOR (n:{label_spec}) ON EACH 
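One of the smaller helpers touched here, `_get_utf16_codepoints`, is easy to verify in isolation: each character is encoded as UTF-16LE and split into 16-bit code units, which is the granularity the Lucene escaping logic works at. A standalone copy:

```python
def get_utf16_codepoints(string: str) -> tuple:
    result = []
    for ch in string:
        data = ch.encode("utf-16-le")
        for i in range(0, len(data), 2):
            # Each pair of little-endian bytes is one UTF-16 code unit.
            result.append(int.from_bytes(data[i : i + 2], "little"))
    return tuple(result)

assert get_utf16_codepoints("A") == (65,)
# Non-BMP characters decompose into a surrogate pair:
assert get_utf16_codepoints("\U0001d11e") == (0xD834, 0xDD1E)
```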
[{property_spec}]" ) + def do_create_text_index(tx): tx.run(query) + with self._driver.session(database=self._database) as session: session.execute_write(do_create_text_index) return index_name - def create_vector_index(self, label, property_key, index_name=None, - vector_dimensions=768, metric_type="cosine", - hnsw_m=None, hnsw_ef_construction=None): + def create_vector_index( + self, + label, + property_key, + index_name=None, + vector_dimensions=768, + metric_type="cosine", + hnsw_m=None, + hnsw_ef_construction=None, + ): if index_name is None: index_name = self._create_vector_index_name(label, property_key) if not property_key.lower().endswith("vector"): property_key = self._create_vector_field_name(property_key) with self._driver.session(database=self._database) as session: - session.execute_write(self._create_vector_index, self, label, property_key, index_name, - vector_dimensions, metric_type, hnsw_m, hnsw_ef_construction) + session.execute_write( + self._create_vector_index, + self, + label, + property_key, + index_name, + vector_dimensions, + metric_type, + hnsw_m, + hnsw_ef_construction, + ) self.refresh_vector_index_meta(force=True) return index_name @staticmethod - def _create_vector_index(tx, self, label, property_key, index_name, vector_dimensions, metric_type, hnsw_m, hnsw_ef_construction): + def _create_vector_index( + tx, + self, + label, + property_key, + index_name, + vector_dimensions, + metric_type, + hnsw_m, + hnsw_ef_construction, + ): query = ( f"CREATE VECTOR INDEX {self._escape_neo4j(index_name)} IF NOT EXISTS FOR (n:{self._escape_neo4j(label)}) ON (n.{self._escape_neo4j(property_key)}) " - "OPTIONS { indexConfig: {" - " `vector.dimensions`: $vector_dimensions," - " `vector.similarity_function`: $metric_type" + "OPTIONS { indexConfig: {" + " `vector.dimensions`: $vector_dimensions," + " `vector.similarity_function`: $metric_type" ) if hnsw_m is not None: query += ", `vector.hnsw.m`: $hnsw_m" if hnsw_ef_construction is not None: query += ", `vector.hnsw.ef_construction`: $hnsw_ef_construction" query += "}}" - tx.run(query, vector_dimensions=vector_dimensions, metric_type=metric_type, - hnsw_m=hnsw_m, hnsw_ef_construction=hnsw_ef_construction) + tx.run( + query, + vector_dimensions=vector_dimensions, + metric_type=metric_type, + hnsw_m=hnsw_m, + hnsw_ef_construction=hnsw_ef_construction, + ) def refresh_vector_index_meta(self, force=False): import time + if not force and time.time() - self._vec_meta_ts < self._vec_meta_timeout: return + def do_refresh_vector_index_meta(tx): query = "SHOW VECTOR INDEX" res = tx.run(query) @@ -647,14 +858,17 @@ def do_refresh_vector_index_meta(tx): meta = dict() for record in data: if record["entityType"] == "NODE": - label, = record["labelsOrTypes"] - vector_field, = record["properties"] - if vector_field.startswith("_") and vector_field.endswith("_vector"): + (label,) = record["labelsOrTypes"] + (vector_field,) = record["properties"] + if vector_field.startswith("_") and vector_field.endswith( + "_vector" + ): if label not in meta: meta[label] = [] meta[label].append(vector_field) self._vec_meta = meta self._vec_meta_ts = time.time() + with self._driver.session(database=self._database) as session: session.execute_read(do_refresh_vector_index_meta) @@ -678,7 +892,9 @@ def vectorizer(self): def vectorizer(self, value): self._vectorizer = value - def text_search(self, query_string, label_constraints=None, topk=10, index_name=None): + def text_search( + self, query_string, label_constraints=None, topk=10, index_name=None + ): if 
index_name is None: index_name = "_default_text_index" if label_constraints is None: @@ -686,31 +902,48 @@ def text_search(self, query_string, label_constraints=None, topk=10, index_name= elif isinstance(label_constraints, str): label_constraints = self._escape_neo4j(label_constraints) elif isinstance(label_constraints, (list, tuple)): - label_constraints = "|".join(self._escape_neo4j(label_constraint) for label_constraint in label_constraints) + label_constraints = "|".join( + self._escape_neo4j(label_constraint) + for label_constraint in label_constraints + ) else: message = f"invalid label_constraints: {label_constraints!r}" raise RuntimeError(message) if label_constraints is None: - query = ("CALL db.index.fulltext.queryNodes($index_name, $query_string) " - "YIELD node AS node, score " - "RETURN node, score") + query = ( + "CALL db.index.fulltext.queryNodes($index_name, $query_string) " + "YIELD node AS node, score " + "RETURN node, score" + ) else: - query = ("CALL db.index.fulltext.queryNodes($index_name, $query_string) " - "YIELD node AS node, score " - f"WHERE (node:{label_constraints}) " - "RETURN node, score") + query = ( + "CALL db.index.fulltext.queryNodes($index_name, $query_string) " + "YIELD node AS node, score " + f"WHERE (node:{label_constraints}) " + "RETURN node, score" + ) query += " LIMIT $topk" query_string = self._make_lucene_query(query_string) def do_text_search(tx): - res = tx.run(query, query_string=query_string, topk=topk, index_name=index_name) + res = tx.run( + query, query_string=query_string, topk=topk, index_name=index_name + ) data = res.data() return data with self._driver.session(database=self._database) as session: return session.execute_read(do_text_search) - def vector_search(self, label, property_key, query_text_or_vector, topk=10, index_name=None, ef_search=None): + def vector_search( + self, + label, + property_key, + query_text_or_vector, + topk=10, + index_name=None, + ef_search=None, + ): if ef_search is not None: if ef_search < topk: message = f"ef_search must be greater than or equal to topk; {ef_search!r} is invalid" @@ -719,13 +952,17 @@ def vector_search(self, label, property_key, query_text_or_vector, topk=10, inde if index_name is None: vec_meta = self._vec_meta if label not in vec_meta: - logger.warning(f"vector index not defined for label, return empty. label: {label}, " - f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}.") + logger.warning( + f"vector index not defined for label, return empty. label: {label}, " + f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}." + ) return [] vector_field = self._create_vector_field_name(property_key) if vector_field not in vec_meta[label]: - logger.warning(f"vector index not defined for field, return empty. label: {label}, " - f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}.") + logger.warning( + f"vector index not defined for field, return empty. label: {label}, " + f"property_key: {property_key}, query_text_or_vector: {query_text_or_vector}." 
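`text_search` funnels everything through `db.index.fulltext.queryNodes`, optionally post-filtering by label. A hedged read-transaction sketch; the index name `_default_text_index` is the hunk's default, the search term is invented:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

def do_text_search(tx, query_string, topk=10):
    # Fulltext lookup, scored by Lucene relevance; LIMIT takes a parameter.
    query = (
        "CALL db.index.fulltext.queryNodes($index_name, $query_string) "
        "YIELD node AS node, score "
        "RETURN node, score LIMIT $topk"
    )
    res = tx.run(
        query,
        index_name="_default_text_index",
        query_string=query_string,
        topk=topk,
    )
    return res.data()

with driver.session(database="neo4j") as session:
    hits = session.execute_read(do_text_search, "alice")
```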
+ ) return [] if index_name is None: index_name = self._create_vector_index_name(label, property_key) @@ -736,16 +973,27 @@ def vector_search(self, label, property_key, query_text_or_vector, topk=10, inde def do_vector_search(tx): if ef_search is not None: - query = ("CALL db.index.vector.queryNodes($index_name, $ef_search, $query_vector) " - "YIELD node, score " - "RETURN node, score, labels(node) as __labels__" - f"LIMIT {topk}") - res = tx.run(query, query_vector=query_vector, ef_search=ef_search, index_name=index_name) + query = ( + "CALL db.index.vector.queryNodes($index_name, $ef_search, $query_vector) " + "YIELD node, score " + "RETURN node, score, labels(node) as __labels__" + f"LIMIT {topk}" + ) + res = tx.run( + query, + query_vector=query_vector, + ef_search=ef_search, + index_name=index_name, + ) else: - query = ("CALL db.index.vector.queryNodes($index_name, $topk, $query_vector) " - "YIELD node, score " - "RETURN node, score, labels(node) as __labels__") - res = tx.run(query, query_vector=query_vector, topk=topk, index_name=index_name) + query = ( + "CALL db.index.vector.queryNodes($index_name, $topk, $query_vector) " + "YIELD node, score " + "RETURN node, score, labels(node) as __labels__" + ) + res = tx.run( + query, query_vector=query_vector, topk=topk, index_name=index_name + ) data = res.data() for record in data: record["node"]["__labels__"] = record["__labels__"] @@ -757,41 +1005,59 @@ def do_vector_search(tx): def _create_all_graph(self, graph_name): with self._driver.session(database=self._database) as session: - logger.debug(f"create pagerank graph graph_name:{graph_name} database:{self._database}") - result = session.run(f""" + logger.debug( + f"create pagerank graph graph_name:{graph_name} database:{self._database}" + ) + result = session.run( + f""" CALL gds.graph.exists('{graph_name}') YIELD exists WHERE exists CALL gds.graph.drop('{graph_name}') YIELD graphName RETURN graphName - """) + """ + ) summary = result.consume() - logger.debug(f"create pagerank graph exists graph_name:{graph_name} database:{self._database} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.debug( + f"create pagerank graph exists graph_name:{graph_name} database:{self._database} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) - result = session.run(f""" + result = session.run( + f""" CALL gds.graph.project('{graph_name}','*','*') YIELD graphName, nodeCount AS nodes, relationshipCount AS rels RETURN graphName, nodes, rels - """) + """ + ) summary = result.consume() - logger.debug(f"create pagerank graph graph_name:{graph_name} database:{self._database} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.debug( + f"create pagerank graph graph_name:{graph_name} database:{self._database} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) def _drop_all_graph(self, graph_name): with self._driver.session(database=self._database) as session: - logger.debug(f"drop pagerank graph graph_name:{graph_name} database:{self._database}") - result = session.run(f""" + logger.debug( + f"drop pagerank graph graph_name:{graph_name} database:{self._database}" + ) + result = session.run( + f""" CALL gds.graph.exists('{graph_name}') YIELD exists WHERE exists CALL gds.graph.drop('{graph_name}') YIELD graphName RETURN graphName - """) + """ + ) result.consume() - logger.debug(f"drop pagerank graph 
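One detail worth noticing in `do_vector_search`: in the `ef_search` branch the adjacent literals `"... as __labels__"` and `f"LIMIT {topk}"` appear to concatenate with no separating space, yielding `__labels__LIMIT`, and the reformat preserves that. A standalone sketch of the KNN call with the space restored; index name and query vector are assumptions:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

def do_vector_search(tx, query_vector, topk=10, ef_search=100):
    query = (
        "CALL db.index.vector.queryNodes($index_name, $ef_search, $query_vector) "
        "YIELD node, score "
        "RETURN node, score, labels(node) AS __labels__ "  # trailing space matters
        f"LIMIT {topk}"
    )
    res = tx.run(
        query,
        index_name="_entity_name_vector_index",  # hypothetical index name
        ef_search=ef_search,
        query_vector=query_vector,
    )
    return res.data()

with driver.session(database="neo4j") as session:
    data = session.execute_read(do_vector_search, [0.1] * 768)
```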
graph_name:{graph_name} database:{self._database} succeed") + logger.debug( + f"drop pagerank graph graph_name:{graph_name} database:{self._database} succeed" + ) def execute_pagerank(self, iterations=20, damping_factor=0.85): with self._driver.session(database=self._database) as session: - return session.execute_write(self._execute_pagerank, iterations, damping_factor) + return session.execute_write( + self._execute_pagerank, iterations, damping_factor + ) @staticmethod def _execute_pagerank(tx, iterations, damping_factor): @@ -809,7 +1075,9 @@ def get_pagerank_scores(self, start_nodes, target_type): with self._driver.session(database=self._database) as session: all_graph = self._allGraph self._exists_all_graph(session, all_graph) - data = session.execute_write(self._get_pagerank_scores, self, all_graph, start_nodes, target_type) + data = session.execute_write( + self._get_pagerank_scores, self, all_graph, start_nodes, target_type + ) return data @staticmethod @@ -817,13 +1085,15 @@ def _get_pagerank_scores(tx, self, graph_name, start_nodes, return_type): match_clauses = [] match_identify = [] for index, node in enumerate(start_nodes): - node_type, node_name = node['type'], node['name'] + node_type, node_name = node["type"], node["name"] node_identify = f"node_{index}" - match_clauses.append(f"MATCH ({node_identify}:{self._escape_neo4j(node_type)} {{name: '{escape_single_quotes(node_name)}'}})") + match_clauses.append( + f"MATCH ({node_identify}:{self._escape_neo4j(node_type)} {{name: '{escape_single_quotes(node_name)}'}})" + ) match_identify.append(node_identify) - match_query = ' '.join(match_clauses) - match_identify_str = ', '.join(match_identify) + match_query = " ".join(match_clauses) + match_identify_str = ", ".join(match_identify) pagerank_query = f""" {match_query} @@ -845,16 +1115,20 @@ def _get_pagerank_scores(tx, self, graph_name, start_nodes, return_type): def _exists_all_graph(session, graph_name): try: logger.debug(f"exists pagerank graph graph_name:{graph_name}") - result = session.run(f""" + result = session.run( + f""" CALL gds.graph.exists('{graph_name}') YIELD exists WHERE NOT exists CALL gds.graph.project('{graph_name}','*','*') YIELD graphName, nodeCount AS nodes, relationshipCount AS rels RETURN graphName, nodes, rels - """) + """ + ) summary = result.consume() - logger.debug(f"exists pagerank graph graph_name:{graph_name} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.debug( + f"exists pagerank graph graph_name:{graph_name} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) except Exception as e: logger.debug(f"Error exists pagerank graph {graph_name}: {e}") @@ -873,18 +1147,26 @@ def _count(tx, self, label): def create_database(self, database): with self._driver.session(database=self._database) as session: database = database.lower() - result = session.run(f"CREATE DATABASE {self._escape_neo4j(database)} IF NOT EXISTS") + result = session.run( + f"CREATE DATABASE {self._escape_neo4j(database)} IF NOT EXISTS" + ) summary = result.consume() - logger.info(f"create_database {database} succeed " - f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}") + logger.info( + f"create_database {database} succeed " + f"executed:{summary.result_available_after} consumed:{summary.result_consumed_after}" + ) def delete_all_data(self, database): if self._database != database: - raise ValueError(f"Error: Current database ({self._database}) 
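`delete_all_data` clears the graph in bounded batches rather than one giant transaction, which keeps transaction memory flat on large stores. The loop, reduced to its essentials:

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

with driver.session(database="neo4j") as session:
    while True:
        # Delete at most 100k nodes per transaction, reporting the batch size.
        result = session.run(
            "MATCH (n) WITH n LIMIT 100000 DETACH DELETE n RETURN count(*)"
        )
        if result.single()[0] == 0:  # nothing left to delete
            break
```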
is not the same as the target database ({database}).") + raise ValueError( + f"Error: Current database ({self._database}) is not the same as the target database ({database})." + ) with self._driver.session(database=database) as session: while True: - result = session.run("MATCH (n) WITH n LIMIT 100000 DETACH DELETE n RETURN count(*)") + result = session.run( + "MATCH (n) WITH n LIMIT 100000 DETACH DELETE n RETURN count(*)" + ) count = result.single()[0] logger.info(f"Deleted {count} nodes in this batch.") if count == 0: @@ -893,7 +1175,9 @@ def delete_all_data(self, database): def run_cypher_query(self, database, query, parameters=None): if database and self._database != database: - raise ValueError(f"Current database ({self._database}) is not the same as the target database ({database}).") + raise ValueError( + f"Current database ({self._database}) is not the same as the target database ({database})." + ) with self._driver.session(database=database) as session: result = session.run(query, parameters) diff --git a/kag/common/graphstore/rest/__init__.py b/kag/common/graphstore/rest/__init__.py index 923147a3..2cce4606 100644 --- a/kag/common/graphstore/rest/__init__.py +++ b/kag/common/graphstore/rest/__init__.py @@ -35,4 +35,6 @@ from kag.common.graphstore.rest.models.edge_record_instance import EdgeRecordInstance from kag.common.graphstore.rest.models.upsert_edge_request import UpsertEdgeRequest from kag.common.graphstore.rest.models.upsert_vertex_request import UpsertVertexRequest -from kag.common.graphstore.rest.models.vertex_record_instance import VertexRecordInstance +from kag.common.graphstore.rest.models.vertex_record_instance import ( + VertexRecordInstance, +) diff --git a/kag/common/graphstore/rest/graph_api.py b/kag/common/graphstore/rest/graph_api.py index e2875966..13dcd5ea 100644 --- a/kag/common/graphstore/rest/graph_api.py +++ b/kag/common/graphstore/rest/graph_api.py @@ -18,10 +18,7 @@ import six from kag.common.rest.api_client import ApiClient -from kag.common.rest.exceptions import ( # noqa: F401 - ApiTypeError, - ApiValueError -) +from kag.common.rest.exceptions import ApiTypeError, ApiValueError # noqa: F401 class GraphApi(object): @@ -57,7 +54,7 @@ def graph_delete_edge_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_delete_edge_post_with_http_info(**kwargs) # noqa: E501 def graph_delete_edge_post_with_http_info(self, **kwargs): # noqa: E501 @@ -86,26 +83,24 @@ def graph_delete_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'delete_edge_request' - ] + all_params = ["delete_edge_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_delete_edge_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -119,34 +114,42 @@ def graph_delete_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'delete_edge_request' in local_var_params: - body_params = local_var_params['delete_edge_request'] + if "delete_edge_request" in local_var_params: + body_params = local_var_params["delete_edge_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/deleteEdge', 'POST', + "/graph/deleteEdge", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def graph_delete_vertex_post(self, **kwargs): # noqa: E501 """delete_vertex # noqa: E501 @@ -169,7 +172,7 @@ def graph_delete_vertex_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_delete_vertex_post_with_http_info(**kwargs) # noqa: E501 def graph_delete_vertex_post_with_http_info(self, **kwargs): # noqa: E501 @@ -198,26 +201,24 @@ def graph_delete_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'delete_vertex_request' - ] + all_params = ["delete_vertex_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_delete_vertex_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -231,34 +232,42 @@ def graph_delete_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'delete_vertex_request' in local_var_params: - body_params = local_var_params['delete_vertex_request'] + if "delete_vertex_request" in local_var_params: + body_params = local_var_params["delete_vertex_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/deleteVertex', 'POST', + "/graph/deleteVertex", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def graph_upsert_edge_post(self, **kwargs): # noqa: E501 """upsert_edge # noqa: E501 @@ -281,7 +290,7 @@ def graph_upsert_edge_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_upsert_edge_post_with_http_info(**kwargs) # noqa: E501 def graph_upsert_edge_post_with_http_info(self, **kwargs): # noqa: E501 @@ -310,26 +319,24 @@ def graph_upsert_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'upsert_edge_request' - ] + all_params = ["upsert_edge_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_upsert_edge_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -343,34 +350,42 @@ def graph_upsert_edge_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'upsert_edge_request' in local_var_params: - body_params = local_var_params['upsert_edge_request'] + if "upsert_edge_request" in local_var_params: + body_params = local_var_params["upsert_edge_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/upsertEdge', 'POST', + "/graph/upsertEdge", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) def graph_upsert_vertex_post(self, **kwargs): # noqa: E501 """upsert_vertex # noqa: E501 @@ -393,7 +408,7 @@ def graph_upsert_vertex_post(self, **kwargs): # noqa: E501 If the method is called asynchronously, returns the request thread. 
""" - kwargs['_return_http_data_only'] = True + kwargs["_return_http_data_only"] = True return self.graph_upsert_vertex_post_with_http_info(**kwargs) # noqa: E501 def graph_upsert_vertex_post_with_http_info(self, **kwargs): # noqa: E501 @@ -422,26 +437,24 @@ def graph_upsert_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_params = locals() - all_params = [ - 'upsert_vertex_request' - ] + all_params = ["upsert_vertex_request"] all_params.extend( [ - 'async_req', - '_return_http_data_only', - '_preload_content', - '_request_timeout' + "async_req", + "_return_http_data_only", + "_preload_content", + "_request_timeout", ] ) - for key, val in six.iteritems(local_var_params['kwargs']): + for key, val in six.iteritems(local_var_params["kwargs"]): if key not in all_params: raise ApiTypeError( "Got an unexpected keyword argument '%s'" " to method graph_upsert_vertex_post" % key ) local_var_params[key] = val - del local_var_params['kwargs'] + del local_var_params["kwargs"] collection_formats = {} @@ -455,31 +468,39 @@ def graph_upsert_vertex_post_with_http_info(self, **kwargs): # noqa: E501 local_var_files = {} body_params = None - if 'upsert_vertex_request' in local_var_params: - body_params = local_var_params['upsert_vertex_request'] + if "upsert_vertex_request" in local_var_params: + body_params = local_var_params["upsert_vertex_request"] # HTTP header `Accept` - header_params['Accept'] = self.api_client.select_header_accept( - ['application/json']) # noqa: E501 + header_params["Accept"] = self.api_client.select_header_accept( + ["application/json"] + ) # noqa: E501 # HTTP header `Content-Type` - header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 - ['application/json']) # noqa: E501 + header_params[ + "Content-Type" + ] = self.api_client.select_header_content_type( # noqa: E501 + ["application/json"] + ) # noqa: E501 # Authentication setting auth_settings = [] # noqa: E501 return self.api_client.call_api( - '/graph/upsertVertex', 'POST', + "/graph/upsertVertex", + "POST", path_params, query_params, header_params, body=body_params, post_params=form_params, files=local_var_files, - response_type='object', # noqa: E501 + response_type="object", # noqa: E501 auth_settings=auth_settings, - async_req=local_var_params.get('async_req'), - _return_http_data_only=local_var_params.get('_return_http_data_only'), # noqa: E501 - _preload_content=local_var_params.get('_preload_content', True), - _request_timeout=local_var_params.get('_request_timeout'), - collection_formats=collection_formats) + async_req=local_var_params.get("async_req"), + _return_http_data_only=local_var_params.get( + "_return_http_data_only" + ), # noqa: E501 + _preload_content=local_var_params.get("_preload_content", True), + _request_timeout=local_var_params.get("_request_timeout"), + collection_formats=collection_formats, + ) diff --git a/kag/common/graphstore/rest/models/__init__.py b/kag/common/graphstore/rest/models/__init__.py index 9660757a..ef11492f 100644 --- a/kag/common/graphstore/rest/models/__init__.py +++ b/kag/common/graphstore/rest/models/__init__.py @@ -16,4 +16,6 @@ from kag.common.graphstore.rest.models.edge_record_instance import EdgeRecordInstance from kag.common.graphstore.rest.models.upsert_edge_request import UpsertEdgeRequest from kag.common.graphstore.rest.models.upsert_vertex_request import UpsertVertexRequest -from kag.common.graphstore.rest.models.vertex_record_instance import VertexRecordInstance +from 
kag.common.graphstore.rest.models.vertex_record_instance import ( + VertexRecordInstance, +) diff --git a/kag/common/graphstore/rest/models/delete_edge_request.py b/kag/common/graphstore/rest/models/delete_edge_request.py index 4dc2984f..6d0a03ed 100644 --- a/kag/common/graphstore/rest/models/delete_edge_request.py +++ b/kag/common/graphstore/rest/models/delete_edge_request.py @@ -32,17 +32,13 @@ class DeleteEdgeRequest(object): attribute_map (dict): The key is attribute name and the value is json key in definition. """ - openapi_types = { - 'project_id': 'int', - 'edges': 'list[EdgeRecordInstance]' - } + openapi_types = {"project_id": "int", "edges": "list[EdgeRecordInstance]"} - attribute_map = { - 'project_id': 'projectId', - 'edges': 'edges' - } + attribute_map = {"project_id": "projectId", "edges": "edges"} - def __init__(self, project_id=None, edges=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, project_id=None, edges=None, local_vars_configuration=None + ): # noqa: E501 """DeleteEdgeRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -73,8 +69,12 @@ def project_id(self, project_id): :param project_id: The project_id of this DeleteEdgeRequest. # noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -96,8 +96,12 @@ def edges(self, edges): :param edges: The edges of this DeleteEdgeRequest. # noqa: E501 :type: list[EdgeRecordInstance] """ - if self.local_vars_configuration.client_side_validation and edges is None: # noqa: E501 - raise ValueError("Invalid value for `edges`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and edges is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `edges`, must not be `None`" + ) # noqa: E501 self._edges = edges @@ -108,18 +112,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/delete_vertex_request.py b/kag/common/graphstore/rest/models/delete_vertex_request.py index 1e9b980a..f6384a20 100644 --- a/kag/common/graphstore/rest/models/delete_vertex_request.py +++ b/kag/common/graphstore/rest/models/delete_vertex_request.py @@ -32,17 +32,13 @@ class DeleteVertexRequest(object): attribute_map (dict): The key is attribute name and the value is json key in definition. 
""" - openapi_types = { - 'project_id': 'int', - 'vertices': 'list[VertexRecordInstance]' - } + openapi_types = {"project_id": "int", "vertices": "list[VertexRecordInstance]"} - attribute_map = { - 'project_id': 'projectId', - 'vertices': 'vertices' - } + attribute_map = {"project_id": "projectId", "vertices": "vertices"} - def __init__(self, project_id=None, vertices=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, project_id=None, vertices=None, local_vars_configuration=None + ): # noqa: E501 """DeleteVertexRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -73,8 +69,12 @@ def project_id(self, project_id): :param project_id: The project_id of this DeleteVertexRequest. # noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -96,8 +96,12 @@ def vertices(self, vertices): :param vertices: The vertices of this DeleteVertexRequest. # noqa: E501 :type: list[VertexRecordInstance] """ - if self.local_vars_configuration.client_side_validation and vertices is None: # noqa: E501 - raise ValueError("Invalid value for `vertices`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and vertices is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `vertices`, must not be `None`" + ) # noqa: E501 self._vertices = vertices @@ -108,18 +112,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/edge_record_instance.py b/kag/common/graphstore/rest/models/edge_record_instance.py index 77873ddd..e901fdde 100644 --- a/kag/common/graphstore/rest/models/edge_record_instance.py +++ b/kag/common/graphstore/rest/models/edge_record_instance.py @@ -33,24 +33,33 @@ class EdgeRecordInstance(object): and the value is json key in definition. 
""" openapi_types = { - 'src_type': 'str', - 'src_id': 'str', - 'dst_type': 'str', - 'dst_id': 'str', - 'label': 'str', - 'properties': 'object' + "src_type": "str", + "src_id": "str", + "dst_type": "str", + "dst_id": "str", + "label": "str", + "properties": "object", } attribute_map = { - 'src_type': 'srcType', - 'src_id': 'srcId', - 'dst_type': 'dstType', - 'dst_id': 'dstId', - 'label': 'label', - 'properties': 'properties' + "src_type": "srcType", + "src_id": "srcId", + "dst_type": "dstType", + "dst_id": "dstId", + "label": "label", + "properties": "properties", } - def __init__(self, src_type=None, src_id=None, dst_type=None, dst_id=None, label=None, properties=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + src_type=None, + src_id=None, + dst_type=None, + dst_id=None, + label=None, + properties=None, + local_vars_configuration=None, + ): # noqa: E501 """EdgeRecordInstance - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -89,8 +98,12 @@ def src_type(self, src_type): :param src_type: The src_type of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and src_type is None: # noqa: E501 - raise ValueError("Invalid value for `src_type`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and src_type is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `src_type`, must not be `None`" + ) # noqa: E501 self._src_type = src_type @@ -112,8 +125,12 @@ def src_id(self, src_id): :param src_id: The src_id of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and src_id is None: # noqa: E501 - raise ValueError("Invalid value for `src_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and src_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `src_id`, must not be `None`" + ) # noqa: E501 self._src_id = src_id @@ -135,8 +152,12 @@ def dst_type(self, dst_type): :param dst_type: The dst_type of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and dst_type is None: # noqa: E501 - raise ValueError("Invalid value for `dst_type`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and dst_type is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `dst_type`, must not be `None`" + ) # noqa: E501 self._dst_type = dst_type @@ -158,8 +179,12 @@ def dst_id(self, dst_id): :param dst_id: The dst_id of this EdgeRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and dst_id is None: # noqa: E501 - raise ValueError("Invalid value for `dst_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and dst_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `dst_id`, must not be `None`" + ) # noqa: E501 self._dst_id = dst_id @@ -181,8 +206,12 @@ def label(self, label): :param label: The label of this EdgeRecordInstance. 
# noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and label is None: # noqa: E501 - raise ValueError("Invalid value for `label`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and label is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `label`, must not be `None`" + ) # noqa: E501 self._label = label @@ -204,8 +233,12 @@ def properties(self, properties): :param properties: The properties of this EdgeRecordInstance. # noqa: E501 :type: object """ - if self.local_vars_configuration.client_side_validation and properties is None: # noqa: E501 - raise ValueError("Invalid value for `properties`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and properties is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `properties`, must not be `None`" + ) # noqa: E501 self._properties = properties @@ -216,18 +249,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/upsert_edge_request.py b/kag/common/graphstore/rest/models/upsert_edge_request.py index 7dd1c89a..5cd69ed1 100644 --- a/kag/common/graphstore/rest/models/upsert_edge_request.py +++ b/kag/common/graphstore/rest/models/upsert_edge_request.py @@ -33,18 +33,24 @@ class UpsertEdgeRequest(object): and the value is json key in definition. """ openapi_types = { - 'project_id': 'int', - 'upsert_adjacent_vertices': 'bool', - 'edges': 'list[EdgeRecordInstance]' + "project_id": "int", + "upsert_adjacent_vertices": "bool", + "edges": "list[EdgeRecordInstance]", } attribute_map = { - 'project_id': 'projectId', - 'upsert_adjacent_vertices': 'upsertAdjacentVertices', - 'edges': 'edges' + "project_id": "projectId", + "upsert_adjacent_vertices": "upsertAdjacentVertices", + "edges": "edges", } - def __init__(self, project_id=None, upsert_adjacent_vertices=None, edges=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + project_id=None, + upsert_adjacent_vertices=None, + edges=None, + local_vars_configuration=None, + ): # noqa: E501 """UpsertEdgeRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -77,8 +83,12 @@ def project_id(self, project_id): :param project_id: The project_id of this UpsertEdgeRequest. 
# noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -100,8 +110,13 @@ def upsert_adjacent_vertices(self, upsert_adjacent_vertices): :param upsert_adjacent_vertices: The upsert_adjacent_vertices of this UpsertEdgeRequest. # noqa: E501 :type: bool """ - if self.local_vars_configuration.client_side_validation and upsert_adjacent_vertices is None: # noqa: E501 - raise ValueError("Invalid value for `upsert_adjacent_vertices`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation + and upsert_adjacent_vertices is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `upsert_adjacent_vertices`, must not be `None`" + ) # noqa: E501 self._upsert_adjacent_vertices = upsert_adjacent_vertices @@ -123,8 +138,12 @@ def edges(self, edges): :param edges: The edges of this UpsertEdgeRequest. # noqa: E501 :type: list[EdgeRecordInstance] """ - if self.local_vars_configuration.client_side_validation and edges is None: # noqa: E501 - raise ValueError("Invalid value for `edges`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and edges is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `edges`, must not be `None`" + ) # noqa: E501 self._edges = edges @@ -135,18 +154,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/upsert_vertex_request.py b/kag/common/graphstore/rest/models/upsert_vertex_request.py index 6ed6cec1..682968b8 100644 --- a/kag/common/graphstore/rest/models/upsert_vertex_request.py +++ b/kag/common/graphstore/rest/models/upsert_vertex_request.py @@ -32,17 +32,13 @@ class UpsertVertexRequest(object): attribute_map (dict): The key is attribute name and the value is json key in definition. 
""" - openapi_types = { - 'project_id': 'int', - 'vertices': 'list[VertexRecordInstance]' - } + openapi_types = {"project_id": "int", "vertices": "list[VertexRecordInstance]"} - attribute_map = { - 'project_id': 'projectId', - 'vertices': 'vertices' - } + attribute_map = {"project_id": "projectId", "vertices": "vertices"} - def __init__(self, project_id=None, vertices=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, project_id=None, vertices=None, local_vars_configuration=None + ): # noqa: E501 """UpsertVertexRequest - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -73,8 +69,12 @@ def project_id(self, project_id): :param project_id: The project_id of this UpsertVertexRequest. # noqa: E501 :type: int """ - if self.local_vars_configuration.client_side_validation and project_id is None: # noqa: E501 - raise ValueError("Invalid value for `project_id`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and project_id is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `project_id`, must not be `None`" + ) # noqa: E501 self._project_id = project_id @@ -96,8 +96,12 @@ def vertices(self, vertices): :param vertices: The vertices of this UpsertVertexRequest. # noqa: E501 :type: list[VertexRecordInstance] """ - if self.local_vars_configuration.client_side_validation and vertices is None: # noqa: E501 - raise ValueError("Invalid value for `vertices`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and vertices is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `vertices`, must not be `None`" + ) # noqa: E501 self._vertices = vertices @@ -108,18 +112,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/graphstore/rest/models/vertex_record_instance.py b/kag/common/graphstore/rest/models/vertex_record_instance.py index 8fe12ca2..710891c1 100644 --- a/kag/common/graphstore/rest/models/vertex_record_instance.py +++ b/kag/common/graphstore/rest/models/vertex_record_instance.py @@ -33,20 +33,27 @@ class VertexRecordInstance(object): and the value is json key in definition. 
""" openapi_types = { - 'type': 'str', - 'id': 'str', - 'properties': 'object', - 'vectors': 'object' + "type": "str", + "id": "str", + "properties": "object", + "vectors": "object", } attribute_map = { - 'type': 'type', - 'id': 'id', - 'properties': 'properties', - 'vectors': 'vectors' + "type": "type", + "id": "id", + "properties": "properties", + "vectors": "vectors", } - def __init__(self, type=None, id=None, properties=None, vectors=None, local_vars_configuration=None): # noqa: E501 + def __init__( + self, + type=None, + id=None, + properties=None, + vectors=None, + local_vars_configuration=None, + ): # noqa: E501 """VertexRecordInstance - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -81,8 +88,12 @@ def type(self, type): :param type: The type of this VertexRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and type is None: # noqa: E501 - raise ValueError("Invalid value for `type`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and type is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `type`, must not be `None`" + ) # noqa: E501 self._type = type @@ -104,7 +115,9 @@ def id(self, id): :param id: The id of this VertexRecordInstance. # noqa: E501 :type: str """ - if self.local_vars_configuration.client_side_validation and id is None: # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and id is None + ): # noqa: E501 raise ValueError("Invalid value for `id`, must not be `None`") # noqa: E501 self._id = id @@ -127,8 +140,12 @@ def properties(self, properties): :param properties: The properties of this VertexRecordInstance. # noqa: E501 :type: object """ - if self.local_vars_configuration.client_side_validation and properties is None: # noqa: E501 - raise ValueError("Invalid value for `properties`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and properties is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `properties`, must not be `None`" + ) # noqa: E501 self._properties = properties @@ -150,8 +167,12 @@ def vectors(self, vectors): :param vectors: The vectors of this VertexRecordInstance. 
# noqa: E501 :type: object """ - if self.local_vars_configuration.client_side_validation and vectors is None: # noqa: E501 - raise ValueError("Invalid value for `vectors`, must not be `None`") # noqa: E501 + if ( + self.local_vars_configuration.client_side_validation and vectors is None + ): # noqa: E501 + raise ValueError( + "Invalid value for `vectors`, must not be `None`" + ) # noqa: E501 self._vectors = vectors @@ -162,18 +183,20 @@ def to_dict(self): for attr, _ in six.iteritems(self.openapi_types): value = getattr(self, attr) if isinstance(value, list): - result[attr] = list(map( - lambda x: x.to_dict() if hasattr(x, "to_dict") else x, - value - )) + result[attr] = list( + map(lambda x: x.to_dict() if hasattr(x, "to_dict") else x, value) + ) elif hasattr(value, "to_dict"): result[attr] = value.to_dict() elif isinstance(value, dict): - result[attr] = dict(map( - lambda item: (item[0], item[1].to_dict()) - if hasattr(item[1], "to_dict") else item, - value.items() - )) + result[attr] = dict( + map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") + else item, + value.items(), + ) + ) else: result[attr] = value diff --git a/kag/common/llm/__init__.py b/kag/common/llm/__init__.py index cee64ad5..5d3bfc1e 100644 --- a/kag/common/llm/__init__.py +++ b/kag/common/llm/__init__.py @@ -10,14 +10,18 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. + +from kag.common.llm.openai_client import OpenAIClient +from kag.common.llm.vllm_client import VLLMClient +from kag.common.llm.ollama_client import OllamaClient from kag.common.llm.llm_config_checker import LLMConfigChecker -from kag.common.llm.client.vllm_client import VLLMClient -from kag.common.llm.client.ollama_client import OllamaClient -from kag.common.llm.client.openai_client import OpenAIClient +from kag.common.llm.mock_llm import MockLLMClient __all__ = [ - "LLMConfigChecker", - "VLLMClient", + "LLMClient", "OpenAIClient", - "OllamaClient" + "VLLMClient", + "OllamaClient", + "MockLLMClient", + "LLMConfigChecker", ] diff --git a/kag/common/llm/client/llm_client.py b/kag/common/llm/client/llm_client.py deleted file mode 100644 index 3720516d..00000000 --- a/kag/common/llm/client/llm_client.py +++ /dev/null @@ -1,178 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
- -import os -import json -from pathlib import Path -from typing import Union, Dict, List, Any -import logging -import traceback -import yaml - -from kag.common.base.prompt_op import PromptOp -from kag.common.llm.config import * - - -logger = logging.getLogger(__name__) - - -class LLMClient: - # Define the model type - model: str - - config_cls_map = { - "maas": OpenAIConfig, - "vllm": VLLMConfig, - "ollama": OllamaConfig, - } - - def __init__(self, **kwargs): - self.model = kwargs.get("model", None) - - @classmethod - def get_config_cls(self,config:dict): - client_type = config.get("client_type", None) - return LLMClient.config_cls_map.get(client_type, None) - - @classmethod - def get_llm_cls(self,config: LLMConfig): - from kag.common.llm.client import VLLMClient,OpenAIClient,OllamaClient - return { - VLLMConfig: VLLMClient, - OpenAIConfig: OpenAIClient, - OllamaConfig: OllamaClient, - }[config.__class__] - - @classmethod - def from_config(cls, config: Union[str, dict]): - """ - Initialize an LLMClient instance from a configuration file or dictionary. - - :param config: Path to a configuration file or a configuration dictionary - :return: Initialized LLMClient instance - :raises FileNotFoundError: If the configuration file is not found - :raises ValueError: If the model type is unsupported - """ - if isinstance(config, str): - config_path = Path(config) - if config_path.is_file(): - try: - with open(config_path, "r") as f: - nn_config = yaml.safe_load(f) - except: - logger.error(f"Failed to parse config file") - raise - else: - logger.error(f"Config file not found: {config}") - raise FileNotFoundError(f"Config file not found: {config}") - else: - # If config is already a dictionary, use it directly - nn_config = config - - config_cls = LLMClient.get_config_cls(nn_config) - if config_cls is None: - logger.error(f"Unsupported model type: {nn_config.get('client_type', None)}") - raise ValueError(f"Unsupported model type") - llm_config = config_cls(**nn_config) - llm_cls = LLMClient.get_llm_cls(llm_config) - return llm_cls(llm_config) - - - def __call__(self, prompt: Union[str, dict, list]) -> str: - """ - Perform inference on the given prompt and return the result. - - :param prompt: Input prompt for inference - :return: Inference result - :raises NotImplementedError: If the subclass has not implemented this method - """ - raise NotImplementedError - - def call_with_json_parse(self, prompt: Union[str, dict, list]): - """ - Perform inference on the given prompt and attempt to parse the result as JSON. - - :param prompt: Input prompt for inference - :return: Parsed result - :raises NotImplementedError: If the subclass has not implemented this method - """ - res = self(prompt) - _end = res.rfind("```") - _start = res.find("```json") - if _end != -1 and _start != -1: - json_str = res[_start + len("```json"): _end].strip() - else: - json_str = res - try: - json_result = json.loads(json_str) - except: - return res - return json_result - - def invoke(self, variables: Dict[str, Any], prompt_op: PromptOp, with_json_parse: bool = True, - with_except: bool = False): - """ - Call the model and process the result. 
- - :param variables: Variables used to build the prompt - :param prompt_op: Prompt operation object for building and parsing prompts - :param with_json_parse: Whether to attempt parsing the response as JSON - :param with_except: Whether to raise exception - :return: Processed result list - """ - result = [] - prompt = prompt_op.build_prompt(variables) - logger.debug(f"Prompt: {prompt}") - if not prompt: - return result - response = "" - try: - response = self.call_with_json_parse(prompt=prompt) if with_json_parse else self(prompt) - logger.debug(f"Response: {response}") - result = prompt_op.parse_response(response, model=self.model, **variables) - logger.debug(f"Result: {result}") - except Exception as e: - import traceback - logger.debug(f"Error {e} during invocation: {traceback.format_exc()}") - if with_except: - raise RuntimeError(f"call llm exception! llm output = {response} , llm input={prompt}, err={e}") - return result - - def batch(self, variables: Dict[str, Any], prompt_op: PromptOp, with_json_parse: bool = True) -> List: - """ - Batch process prompts. - - :param variables: Variables used to build the prompts - :param prompt_op: Prompt operation object for building and parsing prompts - :param with_json_parse: Whether to attempt parsing the response as JSON - :return: List of all processed results - """ - results = [] - prompts = prompt_op.build_prompt(variables) - # If there is only one prompt, call the `invoke` method directly - if isinstance(prompts, str): - return self.invoke(variables, prompt_op, with_json_parse=with_json_parse) - - for idx, prompt in enumerate(prompts, start=0): - logger.debug(f"Prompt_{idx}: {prompt}") - try: - response = self.call_with_json_parse(prompt=prompt) if with_json_parse else self(prompt) - logger.debug(f"Response_{idx}: {response}") - result = prompt_op.parse_response(response, idx=idx, model=self.model, **variables) - logger.debug(f"Result_{idx}: {result}") - results.extend(result) - except Exception as e: - logger.error(f"Error processing prompt {idx}: {e}") - logger.debug(traceback.format_exc()) - continue - return results - \ No newline at end of file diff --git a/kag/common/llm/client/ollama_client.py b/kag/common/llm/client/ollama_client.py deleted file mode 100644 index a4f04408..00000000 --- a/kag/common/llm/client/ollama_client.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
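The monolithic client above is deleted, but its fenced-JSON extraction logic survives verbatim in each new client's `call_with_json_parse`. For reference, the same parsing as a standalone helper (the fence markers are assembled with string math so they do not terminate this example block):

```python
import json

FENCE = "`" * 3               # three backticks, the fence marker
JSON_FENCE = FENCE + "json"   # opening marker of a fenced JSON payload


def extract_json(rsp: str):
    """Prefer the fenced JSON payload when present; on parse failure,
    fall back to the raw response, exactly as the clients do."""
    _end = rsp.rfind(FENCE)
    _start = rsp.find(JSON_FENCE)
    if _end != -1 and _start != -1:
        json_str = rsp[_start + len(JSON_FENCE) : _end].strip()
    else:
        json_str = rsp
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        return rsp
```

One consequence worth noting: because parse failures return the raw string instead of raising, downstream callers must type-check the result.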
- -import os -import ast -import re -import json -import time -import uuid -import html -from binascii import b2a_hex -from datetime import datetime -from pathlib import Path -from typing import Union, Dict, List, Any -from urllib import request -from collections import defaultdict - -from openai import OpenAI -import logging -from ollama import Client - -import requests -import traceback -from Crypto.Cipher import AES -from requests import RequestException - -from kag.common import arks_pb2 -from kag.common.base.prompt_op import PromptOp -from kag.common.llm.config import OllamaConfig - -from kag.common.llm.client.llm_client import LLMClient - - -# logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger(__name__) - -class OllamaClient(LLMClient): - def __init__(self, llm_config: OllamaConfig): - self.model = llm_config.model - self.base_url = llm_config.base_url - self.param = {} - self.client = Client(host=self.base_url) - - def sync_request(self, prompt,image=None): - # import pdb; pdb.set_trace() - response = self.client.generate(model=self.model, prompt=prompt, stream=False) - content = response["response"] - content = content.replace("”", "”").replace("“", "“") - content = content.replace("·", "") - - return content - - def __call__(self, prompt,image=None): - return self.sync_request(prompt,image) - - def call_with_json_parse(self, prompt): - rsp = self.sync_request(prompt) - _end = rsp.rfind("```") - _start = rsp.find("```json") - if _end != -1 and _start != -1: - json_str = rsp[_start + len("```json"): _end].strip() - else: - json_str = rsp - try: - json_result = json.loads(json_str) - except: - return rsp - return json_result diff --git a/kag/common/llm/config/base.py b/kag/common/llm/config/base.py deleted file mode 100644 index 40f4442a..00000000 --- a/kag/common/llm/config/base.py +++ /dev/null @@ -1,9 +0,0 @@ -"""LLM Parameters model.""" - -from pydantic import BaseModel, Field - - - -class LLMConfig(BaseModel): - """LLM Config model.""" - diff --git a/kag/common/llm/config/enums.py b/kag/common/llm/config/enums.py deleted file mode 100644 index 8741cf74..00000000 --- a/kag/common/llm/config/enums.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing 'PipelineCacheConfig', 'PipelineFileCacheConfig' and 'PipelineMemoryCacheConfig' models.""" - -from __future__ import annotations - -from enum import Enum - - -class CacheType(str, Enum): - """The cache configuration type for the pipeline.""" - - file = "file" - """The file cache configuration type.""" - memory = "memory" - """The memory cache configuration type.""" - none = "none" - """The none cache configuration type.""" - blob = "blob" - """The blob cache configuration type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class InputFileType(str, Enum): - """The input file type for the pipeline.""" - - csv = "csv" - """The CSV input type.""" - text = "text" - """The text input type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class InputType(str, Enum): - """The input type for the pipeline.""" - - file = "file" - """The file storage type.""" - blob = "blob" - """The blob storage type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class StorageType(str, Enum): - """The storage type for the pipeline.""" - - file = "file" - """The file storage type.""" - memory = "memory" - """The memory storage type.""" - blob = "blob" - """The blob storage type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class ReportingType(str, Enum): - """The reporting configuration type for the pipeline.""" - - file = "file" - """The file reporting configuration type.""" - console = "console" - """The console reporting configuration type.""" - blob = "blob" - """The blob reporting configuration type.""" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class TextEmbeddingTarget(str, Enum): - """The target to use for text embeddings.""" - - all = "all" - required = "required" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -class LLMType(str, Enum): - """LLMType enum class definition.""" - - # Embeddings - OpenAIEmbedding = "openai_embedding" - AzureOpenAIEmbedding = "azure_openai_embedding" - - # Raw Completion - OpenAI = "openai" - AzureOpenAI = "azure_openai" - - # Chat Completion - OpenAIChat = "openai_chat" - AzureOpenAIChat = "azure_openai_chat" - - # Debug - StaticResponse = "static_response" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' diff --git a/kag/common/llm/config/llm_config.cfg b/kag/common/llm/config/llm_config.cfg deleted file mode 100644 index a780c3fa..00000000 --- a/kag/common/llm/config/llm_config.cfg +++ /dev/null @@ -1,67 +0,0 @@ - -#-----------------------------------------------------------------------------------# -# openai SDK maas. 
client_type = maas # - # -# TongYi # -[llm] # -client_type = maas # -base_url = https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions # -api_key = "put your tongyi api key here" # -model = qwen-turbo # # - # -# Deepseek # -[llm] # -client_type = maas # -base_url = https://api.deepseek.com/beta # -api_key = "put your deepseek api key here" # -model = deepseek-chat # - # -# OpenAI # -[llm] # -client_type = maas # -base_url = https://api.openai.com/v1/chat/completions # -api_key = "put your openai api key here" # -model = gpt-3.5-turbo # - # -#-----------------------------------------------------------------------------------# - - - - -#-----------------------------------------------------------------------------------# -# local llm service. client_type = vllm # - # -# vllm # -[llm] # -client_type = vllm # -base_url = http://localhost:8000/v1/chat/completions # -model = qwen-7b-chat # - # -#-----------------------------------------------------------------------------------# - - - - -#-----------------------------------------------------------------------------------# -# maya llm service. client_type = maya # - # -[llm] # -client_type = maya # -scene_name = Qwen2_7B_Instruct_Knowledge # -chain_name = v1 # -lora_name = humming-v25 # - # -#-----------------------------------------------------------------------------------# - - - - -#-----------------------------------------------------------------------------------# - # -# ollama # -[llm] -client_type = ollama -base_url = http://localhost:11434/api/generate -model = llama3.1 # - # -#-----------------------------------------------------------------------------------# diff --git a/kag/common/llm/config/ollama.py b/kag/common/llm/config/ollama.py deleted file mode 100644 index 595ad8c5..00000000 --- a/kag/common/llm/config/ollama.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import Field -from kag.common.llm.config.base import LLMConfig - - -class OllamaConfig(LLMConfig): - model: str = Field( - description="model name." - ) - base_url: str = Field( - description="post url." - ) \ No newline at end of file diff --git a/kag/common/llm/config/openai.py b/kag/common/llm/config/openai.py deleted file mode 100644 index dc54bd44..00000000 --- a/kag/common/llm/config/openai.py +++ /dev/null @@ -1,20 +0,0 @@ -from pydantic import Field -from kag.common.llm.config.base import LLMConfig - - -class OpenAIConfig(LLMConfig): - api_key: str = Field( - description="api key." - ) - stream: bool = Field( - description="if use stream mode",default=False - ) - model: str = Field( - description="model name." - ) - temperature: float = Field( - description="temperature.",default=0.7 - ) - base_url: str = Field( - description="post url." - ) \ No newline at end of file diff --git a/kag/common/llm/config/proxy.py b/kag/common/llm/config/proxy.py deleted file mode 100644 index 62c43b65..00000000 --- a/kag/common/llm/config/proxy.py +++ /dev/null @@ -1,9 +0,0 @@ -from kag.common.llm.config.base import ProxyLLMConfig - - -class GPTProxyLLMConfig(ProxyLLMConfig): - pass - - -class DeepSeekProxyLLMConfig(ProxyLLMConfig): - pass diff --git a/kag/common/llm/config/vllm.py b/kag/common/llm/config/vllm.py deleted file mode 100644 index 6a018eb4..00000000 --- a/kag/common/llm/config/vllm.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import Field -from kag.common.llm.config.base import LLMConfig - - -class VLLMConfig(LLMConfig): - model: str = Field( - description="model name." - ) - base_url: str = Field( - description="post url." 
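All of the pydantic `LLMConfig` classes and the sample `llm_config.cfg` are removed along with the old client package; what used to be an INI-style `[llm]` section keyed by `client_type` becomes a plain dict whose `type` key drives the registry. A rough, hand-drawn mapping with placeholder values carried over from the deleted sample file:

```python
# Old llm_config.cfg section (INI-style, selected by client_type):
#   [llm]
#   client_type = maas
#   base_url = https://api.openai.com/v1/chat/completions
#   api_key = "put your openai api key here"
#   model = gpt-3.5-turbo
#
# New registry-style equivalent; "maas" is an alias OpenAIClient registers.
# base_url is shown as the API root on the assumption that the new client
# goes through the openai SDK rather than posting to chat/completions itself.
llm_conf = {
    "type": "maas",
    "base_url": "https://api.openai.com/v1",
    "api_key": "put your openai api key here",
    "model": "gpt-3.5-turbo",
}
```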
- ) \ No newline at end of file diff --git a/kag/common/llm/llm_config_checker.py b/kag/common/llm/llm_config_checker.py index c2ea3d84..7e9dd844 100644 --- a/kag/common/llm/llm_config_checker.py +++ b/kag/common/llm/llm_config_checker.py @@ -31,7 +31,8 @@ def check(self, config: str) -> str: :rtype: str :raises RuntimeError: if the config is invalid """ - from kag.common.llm.client import LLMClient + from kag.interface import LLMClient + config = json.loads(config) llm_client = LLMClient.from_config(config) try: @@ -39,12 +40,13 @@ def check(self, config: str) -> str: return res except Exception as ex: raise RuntimeError(f"invalid llm config: {config}, for details: {ex}") - + + if __name__ == "__main__": - config = ''' + config = """ {"client_type" :"ollama", "base_url" : "http://localhost:11434/", "model" : "llama3.1" } - ''' + """ config_checker = LLMConfigChecker() res = config_checker.check(config) diff --git a/kag/common/llm/mock_llm.py b/kag/common/llm/mock_llm.py new file mode 100644 index 00000000..dc685d28 --- /dev/null +++ b/kag/common/llm/mock_llm.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import time +import json +from kag.interface import LLMClient + + +@LLMClient.register("mock") +class MockLLMClient(LLMClient): + """ + MockLLMClient is a mock implementation of the LLMClient class, used for testing purposes. + + This class provides a method to simulate the behavior of a language model client by matching input prompts. + """ + + def __init__(self): + """ + Initializes the MockLLMClient instance. + """ + pass + + def match_input(self, prompt): + """ + Simulates the behavior of a language model call by matching the input prompt. + + Args: + prompt: The input prompt to be matched. 
+ """ + time.sleep(0.3) # mimic llm call + if "You're a very effective entity extraction system" in prompt: + return [ + { + "entity": "The Rezort", + "type": "Movie", + "category": "Works", + "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.", + }, + { + "entity": "2015", + "type": "Year", + "category": "Date", + "description": "The year the movie 'The Rezort' was released.", + }, + ] + if "please attempt to provide the official names of" in prompt: + return [ + { + "entity": "The Rezort", + "type": "Movie", + "category": "Works", + "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.", + }, + { + "entity": "2015", + "type": "Year", + "category": "Date", + "description": "The year the movie 'The Rezort' was released.", + }, + ] + if ( + "You are an expert specializing in carrying out open information extraction" + in prompt + ): + return [ + ["The Rezort", "is", "zombie horror film"], + ["The Rezort", "publish at", "2015"], + ] + return "I am an intelligent assistant" + + def __call__(self, prompt): + return json.dumps(self.match_input(prompt)) + + def call_with_json_parse(self, prompt): + return self.match_input(prompt) diff --git a/kag/common/llm/ollama_client.py b/kag/common/llm/ollama_client.py new file mode 100644 index 00000000..82868978 --- /dev/null +++ b/kag/common/llm/ollama_client.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json + +import logging +from ollama import Client + +from kag.interface import LLMClient +from tenacity import retry, stop_after_attempt + + +# logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + + +@LLMClient.register("ollama") +class OllamaClient(LLMClient): + """ + A client class for interacting with the Ollama API. + + This class provides methods to make synchronous requests to the Ollama API, handle model calls, and parse responses. + """ + + def __init__(self, model: str, base_url: str): + """ + Initializes the OllamaClient instance. + + Args: + model (str): The model to use for requests. + base_url (str): The base URL for the Ollama API. + """ + self.model = model + self.base_url = base_url + self.param = {} + self.client = Client(host=self.base_url) + self.check() + + def sync_request(self, prompt, image=None): + """ + Makes a synchronous request to the Ollama API with the given prompt. + + Args: + prompt: The prompt to send to the Ollama API. + image: Optional image data to include in the request. + + Returns: + str: The content of the response from the Ollama API. + """ + response = self.client.generate(model=self.model, prompt=prompt, stream=False) + content = response["response"] + content = content.replace("”", "”").replace("“", "“") + content = content.replace("·", "") + + return content + + def __call__(self, prompt, image=None): + """ + Executes a model request when the object is called and returns the result. + + Parameters: + prompt (str): The prompt provided to the model. 
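`MockLLMClient` keys canned fixtures off substrings of the builder prompts, which makes pipeline tests runnable without a live model. A small usage sketch based on the matching rules above:

```python
from kag.common.llm.mock_llm import MockLLMClient

mock = MockLLMClient()

# Unrecognized prompts fall through to the canned assistant string...
assert mock.call_with_json_parse("anything else") == "I am an intelligent assistant"

# ...while recognized extraction prompts return the fixture entities.
rows = mock.call_with_json_parse(
    "You're a very effective entity extraction system. <chunk text here>"
)
assert rows[0]["entity"] == "The Rezort"
```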
+ + Returns: + str: The response content generated by the model. + """ + + return self.sync_request(prompt, image) + + @retry(stop=stop_after_attempt(3)) + def call_with_json_parse(self, prompt): + """ + Calls the model and attempts to parse the response into JSON format. + + Parameters: + prompt (str): The prompt provided to the model. + + Returns: + Union[dict, str]: If the response is valid JSON, returns the parsed dictionary; otherwise, returns the original response. + """ + + rsp = self.sync_request(prompt) + _end = rsp.rfind("```") + _start = rsp.find("```json") + if _end != -1 and _start != -1: + json_str = rsp[_start + len("```json") : _end].strip() + else: + json_str = rsp + try: + json_result = json.loads(json_str) + except: + return rsp + return json_result diff --git a/kag/common/llm/client/openai_client.py b/kag/common/llm/openai_client.py similarity index 62% rename from kag/common/llm/client/openai_client.py rename to kag/common/llm/openai_client.py index 6a96e687..47f6dbeb 100644 --- a/kag/common/llm/client/openai_client.py +++ b/kag/common/llm/openai_client.py @@ -12,52 +12,55 @@ import json -from typing import Union from openai import OpenAI import logging -from kag.common.llm.client.llm_client import LLMClient -from kag.common.llm.config import OpenAIConfig +from kag.interface import LLMClient +from tenacity import retry, stop_after_attempt -# logging.basicConfig(level=logging.DEBUG) +logging.getLogger("openai").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) logger = logging.getLogger(__name__) +@LLMClient.register("maas") +@LLMClient.register("openai") class OpenAIClient(LLMClient): """ A client class for interacting with the OpenAI API. Initializes the client with an API key, base URL, streaming option, temperature parameter, and default model. - Parameters: - api_key (str): The OpenAI API key. - base_url (str): The base URL of the API. - stream (bool, optional): Whether to process responses in a streaming manner. Default is False. - temperature (int, optional): Sampling temperature to control the randomness of the model's output. Default is 0.7. - model (str, optional): The default model to use. - - Attributes: - api_key (str): The OpenAI API key. - base_url (str): The base URL of the API. - model (str): The default model to use. - stream (bool): Whether to process responses in a streaming manner. - temperature (float): Sampling temperature. - client (OpenAI): An instance of the OpenAI API client. """ + def __init__( - self, - llm_config:OpenAIConfig + self, + api_key: str, + base_url: str, + model: str, + stream: bool = False, + temperature: float = 0.7, ): - # Initialize the OpenAIClient object - self.api_key = llm_config.api_key - self.base_url = llm_config.base_url - self.model = llm_config.model - self.stream = llm_config.stream - self.temperature = llm_config.temperature - self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) + """ + Initializes the OpenAIClient instance. + + Args: + api_key (str): The API key for accessing the OpenAI API. + base_url (str): The base URL for the OpenAI API. + model (str): The default model to use for requests. + stream (bool, optional): Whether to stream the response. Defaults to False. + temperature (float, optional): The temperature parameter for the model. Defaults to 0.7. 
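New in this version of the client: `call_with_json_parse` is wrapped with tenacity's `@retry(stop=stop_after_attempt(3))`. Since the method swallows JSON parse failures by returning the raw response, the retries fire only on raised exceptions (for example, transport errors from the Ollama call), not on malformed JSON. A minimal illustration of the retry semantics:

```python
from tenacity import retry, stop_after_attempt

attempts = {"n": 0}

@retry(stop=stop_after_attempt(3))
def flaky():
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient failure")
    return "ok"

assert flaky() == "ok" and attempts["n"] == 3  # third attempt succeeds
```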
+ """ + self.api_key = api_key + self.base_url = base_url + self.model = model + self.stream = stream + self.temperature = temperature + self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) + self.check() - def __call__(self, prompt:str, image_url:str=None): + def __call__(self, prompt: str, image_url: str = None): """ Executes a model request when the object is called and returns the result. @@ -71,18 +74,12 @@ def __call__(self, prompt:str, image_url:str=None): if image_url: message = [ {"role": "system", "content": "you are a helpful assistant"}, - {"role": "user", "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - } - ] + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], }, ] response = self.client.chat.completions.create( @@ -93,7 +90,7 @@ def __call__(self, prompt:str, image_url:str=None): ) rsp = response.choices[0].message.content return rsp - + else: message = [ {"role": "system", "content": "you are a helpful assistant"}, @@ -108,6 +105,7 @@ def __call__(self, prompt:str, image_url:str=None): rsp = response.choices[0].message.content return rsp + @retry(stop=stop_after_attempt(3)) def call_with_json_parse(self, prompt): """ Calls the model and attempts to parse the response into JSON format. @@ -123,11 +121,11 @@ def call_with_json_parse(self, prompt): _end = rsp.rfind("```") _start = rsp.find("```json") if _end != -1 and _start != -1: - json_str = rsp[_start + len("```json"): _end].strip() + json_str = rsp[_start + len("```json") : _end].strip() else: json_str = rsp try: json_result = json.loads(json_str) except: return rsp - return json_result \ No newline at end of file + return json_result diff --git a/kag/common/llm/client/vllm_client.py b/kag/common/llm/vllm_client.py similarity index 50% rename from kag/common/llm/client/vllm_client.py rename to kag/common/llm/vllm_client.py index b1154403..6f430f3e 100644 --- a/kag/common/llm/client/vllm_client.py +++ b/kag/common/llm/vllm_client.py @@ -10,46 +10,49 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -import os -import ast -import re + import json -import time -import uuid -import html -from binascii import b2a_hex -from datetime import datetime -from pathlib import Path -from typing import Union, Dict, List, Any -from urllib import request -from collections import defaultdict - -from openai import OpenAI import logging - import requests -import traceback -from Crypto.Cipher import AES -from requests import RequestException - -from kag.common import arks_pb2 -from kag.common.base.prompt_op import PromptOp -from kag.common.llm.config import VLLMConfig - -from kag.common.llm.client.llm_client import LLMClient +from kag.interface import LLMClient +from tenacity import retry, stop_after_attempt # logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) + +@LLMClient.register("vllm") class VLLMClient(LLMClient): - def __init__(self, llm_config: VLLMConfig): - self.model = llm_config.model - self.base_url = llm_config.base_url + """ + A client class for interacting with a language model deployed by VLLM. + + This class provides methods to make synchronous requests to the VLLM server, handle model calls, and parse responses. + """ + + def __init__(self, model: str, base_url: str): + """ + Initializes the VLLMClient instance. 
+ + Args: + model (str): The model to use for requests. + base_url (str): The base URL for the VLLM API. + """ + self.model = model + self.base_url = base_url self.param = {} + self.check() def sync_request(self, prompt): - # import pdb; pdb.set_trace() + """ + Makes a synchronous request to the VLLM API with the given prompt. + + Args: + prompt: The prompt to send to the VLLM API. + + Returns: + str: The content of the response from the VLLM API. + """ self.param["messages"] = prompt self.param["model"] = self.model @@ -66,18 +69,37 @@ def sync_request(self, prompt): return content def __call__(self, prompt): - content = [ - {"role": "user", "content": prompt} - ] + """ + Executes a model request when the object is called and returns the result. + + Parameters: + prompt (str): The prompt provided to the model. + + Returns: + str: The response content generated by the model. + """ + + content = [{"role": "user", "content": prompt}] return self.sync_request(content) + @retry(stop=stop_after_attempt(3)) def call_with_json_parse(self, prompt): + """ + Calls the model and attempts to parse the response into JSON format. + + Parameters: + prompt (str): The prompt provided to the model. + + Returns: + Union[dict, str]: If the response is valid JSON, returns the parsed dictionary; otherwise, returns the original response. + """ + content = [{"role": "user", "content": prompt}] rsp = self.sync_request(content) _end = rsp.rfind("```") _start = rsp.find("```json") if _end != -1 and _start != -1: - json_str = rsp[_start + len("```json"): _end].strip() + json_str = rsp[_start + len("```json") : _end].strip() else: json_str = rsp try: diff --git a/kag/common/llm/client/__init__.py b/kag/common/registry/__init__.py similarity index 60% rename from kag/common/llm/client/__init__.py rename to kag/common/registry/__init__.py index b26f378a..3ab66aed 100644 --- a/kag/common/llm/client/__init__.py +++ b/kag/common/registry/__init__.py @@ -10,15 +10,16 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. -from kag.common.llm.client.openai_client import OpenAIClient -from kag.common.llm.client.vllm_client import VLLMClient -from kag.common.llm.client.llm_client import LLMClient -from kag.common.llm.client.ollama_client import OllamaClient +from kag.common.registry.registrable import Registrable, ConfigurationError +from kag.common.registry.lazy import Lazy +from kag.common.registry.functor import Functor +from kag.common.registry.utils import import_modules_from_path __all__ = [ - "OpenAIClient", - "LLMClient", - "VLLMClient", - "OllamaClient" + "Registrable", + "ConfigurationError", + "Lazy", + "Functor", + "import_modules_from_path", ] diff --git a/kag/common/registry/functor.py b/kag/common/registry/functor.py new file mode 100644 index 00000000..e2286ecd --- /dev/null +++ b/kag/common/registry/functor.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
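The new `functor.py` that begins above makes plain functions registrable. Complementing the docstring example shown below, a sketch of the config round-trip with a hypothetical function: the registered type plus the bound keyword arguments serialize via `to_config` and rebuild via `from_config`.

```python
from kag.common.registry import Functor

@Functor.register("greet")
def greet(name: str, excited: bool = False):
    return f"Hello, {name}{'!' if excited else '.'}"

func = Functor.from_config({"type": "greet", "name": "KAG", "excited": True})
assert func() == "Hello, KAG!"

# to_config keeps the register type and the partial's bound kwargs:
rebuilt = Functor.from_config(func.to_config())
assert rebuilt() == func()
```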
+ +import logging +import collections +from kag.common.registry.registrable import ( + Registrable, + ConfigurationError, + RegistrableType, + create_kwargs, +) +from types import FunctionType + +from typing import Type, Union, Callable, Dict, cast +from functools import partial +from pyhocon import ConfigTree, ConfigFactory + +logger = logging.getLogger() + + +@Registrable.register("functor") +class Functor(Registrable): + """ + A special `Registrable` for functions(NOT classes). + It is used to register user defined functions. The registered function will acquire the + ability of instantiate from configuration. + + e.g.: + + @Functor.register("simple1") + def simple_func1(name: "str", age: list = []): + print(f"name = {name}") + print(f"age = {age}") + return "+".join(age) + conf1 = {"type": "simple1", "name": "zzs", "age": ["1", "2", "3"]} + func = Functor.from_config(conf1) + func() # same as: simple_func1(name = "zzs", age = ["1", "2", "3"]) + + We can also serialize it backto configuration: + + reconstructed_conf = func.to_config() + reconstructed_func = Functor.from_config(reconstructed_conf) + """ + + def __init__(self, function: partial, register_type: str): + self._func = function + self.__register_type__ = register_type + + def __call__(self, *args, **kwargs): + return self._func(*args, **kwargs) + + @classmethod + def register( + cls: Type[RegistrableType], + name: str, + exist_ok: bool = True, + as_default=False, + ): + registry = Registrable._registry[cls] + if as_default: + cls.default_implementation = name + + def add_function_to_registry(func: FunctionType): + # Add to registry, raise an error if key has already been used. + if name in registry: + if exist_ok: + message = ( + f"{name} has already been registered as {registry[name]}, but " + f"exist_ok=True, so overwriting it with {func}" + ) + logger.info(message) + else: + message = ( + f"Cannot register {name} as {cls.__name__}; " + f"name already in use for {registry[name]}" + ) + raise ConfigurationError(message) + registry[name] = func + + return func + + return add_function_to_registry + + @classmethod + def from_config( + cls: Type[RegistrableType], + params: Union[str, Dict, ConfigTree], + constructor_to_call: Callable[..., RegistrableType] = None, + constructor_to_inspect: Union[ + Callable[..., RegistrableType], Callable[[RegistrableType], None] + ] = None, + ) -> RegistrableType: + + if isinstance(params, str): + params = ConfigFactory.from_dict({"type": params}) + elif isinstance(params, collections.abc.Mapping) and not isinstance( + params, ConfigTree + ): + params = ConfigFactory.from_dict(params) + + if not isinstance(params, ConfigTree): + raise ConfigurationError( + f"from_config was passed a `{params}` object that was not able to convert to `ConfigTree`. " + "This probably indicates malformed parameters." + f"This happened when constructing an object of type {cls}." + ) + + # registered_funcs = Registrable._registry.get(cls) + registered_funcs = cls.list_available() + if len(registered_funcs) == 0: + raise ConfigurationError("There are no registered functions.") + + as_registrable = cast(Type[Functor], cls) + default_choice = as_registrable.default_implementation + # call with BaseClass.from_prams, should use `type` to point out which subclasss to use + choice = params.pop("type", default_choice) + choices = as_registrable.list_available() + + if choice not in choices: + message = ( + f"{choice} not in acceptable choices for type: {choices}. " + "You should make sure the class is correctly registerd. 
" + ) + raise ConfigurationError(message) + + function = Registrable._registry[as_registrable][choice] + # setattr(function, "__register_type__", choice) + constructor_to_inspect = cast(Callable[..., RegistrableType], function) + accepts_kwargs, kwargs = create_kwargs( + constructor_to_inspect, + cls, + params, + ) + if accepts_kwargs: + params.clear() + if len(params) > 0: + raise ConfigurationError( + f"These params are not used for constructing {cls}:\n{params}" + ) + + return cls(partial(function, **kwargs), choice) + + def to_config(self) -> ConfigTree: + config = {} + + if hasattr(self, "__register_type__") and self.__register_type__: + config["type"] = self.__register_type__ + + for k, v in self._func.keywords.items(): + if k in self.NonParams: + continue + if hasattr(v, "to_config"): + conf = v.to_config() + else: + conf = self._to_config(v) + config[k] = conf + return ConfigFactory.from_dict(config) diff --git a/kag/common/registry/lazy.py b/kag/common/registry/lazy.py new file mode 100644 index 00000000..1b3f281e --- /dev/null +++ b/kag/common/registry/lazy.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import inspect +from pyhocon import ConfigTree +from typing import Callable, Generic, TypeVar, Type, Union, Any + +T = TypeVar("T") + + +class Lazy(Generic[T]): + """ + This class is for use when constructing objects using `Registrable`, when an argument to a + constructor has a _sequential dependency_ with another argument to the same constructor. + + For example, in a `Trainer` class you might want to take a `Model` and an `Optimizer` as arguments, + but the `Optimizer` needs to be constructed using the parameters from the `Model`. You can give + the type annotation `Lazy[Optimizer]` to the optimizer argument, then inside the constructor + call `optimizer.construct(parameters=model.parameters)`. + + This is only recommended for use when you have registered a `@classmethod` as the constructor + for your class, instead of using `__init__`. Having a `Lazy[]` type annotation on an argument + to an `__init__` method makes your class completely dependent on being constructed using the + `Registrable` pipeline, which is not a good idea. + + The actual implementation here is incredibly simple; the logic that handles the lazy + construction is actually found in `Registrable`, where we have a special case for a `Lazy` type + annotation. 
+ + ```python + @classmethod + def my_constructor( + cls, + some_object: Lazy[MyObject], + optional_object: Lazy[MyObject] = None, + required_object_with_default: Lazy[MyObject] = Lazy(MyObjectDefault), + ) -> MyClass: + obj1 = some_object.construct() + obj2 = None if optional_object is None else optional_object.construct() + obj3 = required_object_with_default.construct() + ``` + + """ + + def __init__( + self, constructor: Union[Type[T], Callable[..., T]], original_params: Any = None + ): + constructor_to_use: Callable[..., T] + + if inspect.isclass(constructor): + + def constructor_to_use(**kwargs): + return constructor.from_config(ConfigTree({}), **kwargs) + + else: + constructor_to_use = constructor + + self._constructor = constructor_to_use + self.original_params = original_params + + def construct(self, **kwargs) -> T: + return self._constructor(**kwargs) diff --git a/kag/common/registry/registrable.py b/kag/common/registry/registrable.py new file mode 100644 index 00000000..7e0bbacd --- /dev/null +++ b/kag/common/registry/registrable.py @@ -0,0 +1,911 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import inspect +import importlib +import logging +import functools +import collections +import traceback + +from pathlib import Path +from pyhocon import ConfigTree, ConfigFactory +from pyhocon.exceptions import ConfigMissingException +from copy import deepcopy +from collections import defaultdict +from typing import ( + TypeVar, + Type, + Callable, + Dict, + List, + Optional, + Tuple, + Union, + cast, + Any, + get_origin, + get_args, + Mapping, + Set, + Iterable, +) +from kag.common.registry.lazy import Lazy + + +class ConfigurationError(Exception): + def __init__(self, message: str): + super().__init__() + self.message = message + + def __str__(self): + return self.message + + +logger = logging.getLogger() + +RegistrableType = TypeVar("RegistrableType", bound="Registrable") + + +def str_to_bool(s): + if isinstance(s, bool): + return s + s = s.lower() + if s == "true": + return True + elif s == "false": + return False + elif s == "none": + return None + elif s == "0": + return False + elif s == "1": + return True + else: + raise ValueError(f"not supported string {s}") + + +def auto_setattr(func, self, args, kwargs): + # handle default values + def try_setattr(attr, val): + try: + setattr(self, attr, val) + except Exception as e: + logger.warning( + f"set attribute {attr} of type {type(self)} error, info: {e}" + ) + + attrs, varargs, varkw, defaults = (inspect.getfullargspec(func))[:4] + if defaults: + for attr, val in zip(reversed(attrs), reversed(defaults)): + try_setattr(attr, val) + # handle positional arguments + positional_attrs = attrs[1:] + for attr, val in zip(positional_attrs, args): + try_setattr(attr, val) + + if kwargs: + for attr, val in kwargs.items(): + try_setattr(attr, val) + + +def autoargs(func): + """A decorator which automatically assign the inputs of the function to self PRIOR to executing + the function.""" + + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + auto_setattr(func, self, args=args, 
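A note on the `Lazy` wrapper defined above, before the `Registrable` helpers continue: when given a class it routes through `from_config`, and when given a plain callable it simply defers the call until `construct` supplies the missing pieces. A minimal sketch with a hypothetical class (the interesting `Lazy[...]` annotation handling lives in `construct_arg` further below):

```python
from kag.common.registry import Lazy


class Greeter:
    def __init__(self, name: str, prefix: str = "Hello"):
        self.name, self.prefix = name, prefix


# Wrapping a plain callable defers construction until kwargs are known:
lazy_greeter = Lazy(lambda **kw: Greeter(**kw))
g = lazy_greeter.construct(name="KAG", prefix="Hi")
assert (g.prefix, g.name) == ("Hi", "KAG")
```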
kwargs=kwargs) + + try: + ret = func(self, *args, **kwargs) + except TypeError as e: + raise TypeError( + "call %s.%s failed, details:%s" + % (type(self).__name__, func.__name__, str(e)) + ) + + return ret + + return wrapper + + +def can_accept_arg(obj, arg: str) -> bool: + """ + Checks whether the provided obj takes a certain arg. + If it's a class, we're really checking whether its constructor does. + If it's a function or method, we're checking the object itself. + Otherwise, we raise an error. + """ + if inspect.isclass(obj): + signature = inspect.signature(obj.__init__) + elif inspect.ismethod(obj) or inspect.isfunction(obj): + signature = inspect.signature(obj) + else: + raise ConfigurationError(f"object {obj} is not callable") + return arg in signature.parameters + + +def can_accept_kwargs(obj) -> bool: + """ + Checks whether a provided object takes in any positional arguments. + Similar to accept_arg, we do this for both the __init__ function of + the class or a function / method + Otherwise, we raise an error + """ + if inspect.isclass(obj): + signature = inspect.signature(obj.__init__) + elif inspect.ismethod(obj) or inspect.isfunction(obj): + signature = inspect.signature(obj) + else: + raise ConfigurationError(f"object {obj} is not callable") + return any( + p.kind == inspect.Parameter.VAR_KEYWORD # type: ignore + for p in signature.parameters.values() + ) + + +def can_construct_from_config(type_: Type) -> bool: + if type_ in [str, int, float, bool]: + return True + origin = getattr(type_, "__origin__", None) + if origin == Lazy: + return True + elif origin: + if hasattr(type_, "from_config"): + return True + args = getattr(type_, "__args__") + return all(can_construct_from_config(arg) for arg in args) + + return hasattr(type_, "from_config") + + +def remove_optional(annotation: type) -> type: + """ + Remove Optional[X](alias of Union[T, None]) annotations by filtering out NoneType from Union[X, NoneType]. + """ + origin = get_origin(annotation) + args = get_args(annotation) + + if origin == Union: + return Union[tuple([arg for arg in args if arg != type(None)])] # noqa + else: + return annotation + + +def extract_parameters( + cls: Type[RegistrableType], + constructor: Union[ + Callable[..., RegistrableType], Callable[[RegistrableType], None] + ] = None, +) -> Dict[str, Any]: + """ + Extracts the parameters from the constructor of a class, excluding any variable positional parameters. + + Args: + cls (Type[RegistrableType]): The class whose constructor parameters are to be extracted. + constructor (Union[Callable[..., RegistrableType], Callable[[RegistrableType], None]], optional): The constructor method to inspect. Defaults to cls.__init__. + + Returns: + Dict[str, Any]: A dictionary containing the parameters of the constructor, excluding any variable positional parameters. 
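Stepping back to `str_to_bool` above: `bool(str)` is truthy for any nonempty string, so config values like `"false"` are coerced explicitly. Its behavior in brief:

```python
from kag.common.registry.registrable import str_to_bool

assert str_to_bool("false") is False   # bool("false") would be True
assert str_to_bool("TRUE") is True     # comparison is case-insensitive
assert str_to_bool("none") is None
assert str_to_bool("0") is False and str_to_bool("1") is True
# str_to_bool("maybe") raises ValueError("not supported string maybe")
```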
+ """ + if constructor is None: + constructor = cls.__init__ + if isinstance(constructor, str): + constructor = getattr(cls, constructor) + signature = inspect.signature(constructor) + parameters = dict(signature.parameters) + + var_positional_key = None + for param in parameters.values(): + if param.kind == param.VAR_POSITIONAL: + var_positional_key = param.name + break + if var_positional_key: + del parameters[var_positional_key] + return parameters + + +def create_kwargs( + constructor: Callable[..., RegistrableType], + cls: Type[RegistrableType], + actual_params: ConfigTree, +) -> Tuple[bool, Dict[str, Any]]: + """ + Given some class, a `Params` object, and potentially other keyword arguments, + create a dict of keyword args suitable for passing to the class's constructor. + + The function does this by finding the class's constructor, matching the constructor + arguments to entries in the `params` object, and instantiating values for the parameters + using the type annotation and possibly a from_config method. + + """ + # Get the signature of the constructor. + + kwargs: Dict[str, Any] = {} + + formal_parameters = extract_parameters(cls, constructor) + accepts_kwargs = False + + # Iterate over all the constructor parameters and their annotations. + for param_name, param in formal_parameters.items(): + if param_name == "self": + continue + if param.kind == param.VAR_KEYWORD: + # if constructor takes **kwargs, we will put all the remaining params to kwargs + accepts_kwargs = True + continue + + # annotation = remove_optional(param.annotation) + constructed_arg = pop_and_construct_arg( + cls.__name__, + param_name, + param.annotation, + param.default, + actual_params, + ) + if constructed_arg is not param.default: + kwargs[param_name] = constructed_arg + + # If we just ended up constructing the default value for the parameter, we can just omit it. + # Leaving it in can cause issues with **kwargs in some corner cases, where you might end up + # with multiple values for a single parameter (e.g., the default value gives you lazy=False + # for a dataset reader inside **kwargs, but a particular dataset reader actually hard-codes + # lazy=True - the superclass sees both lazy=True and lazy=False in its constructor). + # if constructor accepts kwargs, put remainder params to kwargs + if accepts_kwargs: + kwargs.update(actual_params) + return accepts_kwargs, kwargs + + +def pop_and_construct_arg( + class_name: str, + argument_name: str, + annotation: Type, + default: Any, + actual_params: ConfigTree, +) -> Any: + annotation = remove_optional(annotation) + popped_params = ( + actual_params.pop(argument_name, default) + if default != inspect.Parameter.empty + else actual_params.pop(argument_name) + ) + if popped_params is None: + return None + + return construct_arg( + class_name, + argument_name, + popped_params, + annotation, + default, + ) + + +def construct_arg( + class_name: str, + argument_name: str, + popped_params: Any, + annotation: Type, + default: Any, +) -> Any: + origin = get_origin(annotation) + args = get_args(annotation) + + optional = default != inspect.Parameter.empty + # annotation is subclass of Registrable + if hasattr(annotation, "from_config"): + if popped_params is default: + return default + elif popped_params is not None: + # If `popped_params` has already been instantiated, use this object directly. 
+ if isinstance(popped_params, annotation): + return popped_params + return annotation.from_config(ConfigFactory.from_dict(popped_params)) + elif not optional: + # Not optional and not supplied, that's an error! + raise ConfigurationError(f"expected key {argument_name} for {class_name}") + else: + return default + + # If the parameter type is a Python primitive, just pop it off + # using the correct casting pop_xyz operation. + elif annotation == int: + if type(popped_params) in {int, bool, str}: + return annotation(popped_params) + else: + raise TypeError(f"Expected {argument_name} to be a {annotation.__name__}.") + elif annotation == bool: + if type(popped_params) in {int, bool}: + return annotation(popped_params) + # string likes 'true', 'false', 'none' can be convert to bool correctly + # NOTE: bool(str) will always return True for nonempty str. + elif type(popped_params) == str: + return str_to_bool(popped_params) + + elif annotation == str: + # Strings are special because we allow casting from Path to str. + if type(popped_params) == str or isinstance(popped_params, Path): + return str(popped_params) # type: ignore + else: + raise TypeError(f"Expected {argument_name} to be a string.") + elif annotation == float: + # Floats are special because in Python, you can put an int wherever you can put a float. + # https://mypy.readthedocs.io/en/stable/duck_type_compatibility.html + if type(popped_params) in {int, float, str}: + return popped_params + else: + raise TypeError(f"Expected {argument_name} to be numeric.") + + elif annotation == ConfigTree: + if isinstance(popped_params, ConfigTree): + return popped_params + elif type(popped_params) in {collections.abc.Mapping, Mapping, Dict, dict}: + return ConfigFactory.from_dict(popped_params) + else: + raise TypeError(f"Expected {argument_name} to be Dict.") + # This is special logic for handling types like Dict[str, TokenIndexer], + # List[TokenIndexer], Tuple[TokenIndexer, Tokenizer], and Set[TokenIndexer], + # which it creates by instantiating each value from_config and returning the resulting structure. + elif ( + origin in {collections.abc.Mapping, Mapping, Dict, dict} + and len(args) == 2 + and can_construct_from_config(args[-1]) + ): + value_cls = annotation.__args__[-1] + + value_dict = {} + + for key, value_params in popped_params.items(): + value_dict[key] = construct_arg( + str(value_cls), + argument_name + "." + key, + value_params, + value_cls, + inspect.Parameter.empty, + ) + + return value_dict + + elif origin in (Tuple, tuple) and all( + can_construct_from_config(arg) for arg in args + ): + value_list = [] + + for i, (value_cls, value_params) in enumerate( + zip(annotation.__args__, popped_params) + ): + value = construct_arg( + str(value_cls), + argument_name + f".{i}", + value_params, + value_cls, + inspect.Parameter.empty, + ) + value_list.append(value) + + return tuple(value_list) + + elif origin in (Set, set) and len(args) == 1 and can_construct_from_config(args[0]): + value_cls = annotation.__args__[0] + + value_set = set() + + for i, value_params in enumerate(popped_params): + value = construct_arg( + str(value_cls), + argument_name + f".{i}", + value_params, + value_cls, + inspect.Parameter.empty, + ) + value_set.add(value) + + return value_set + + elif origin == Union: + # Storing this so we can recover it later if we need to. + backup_params = deepcopy(popped_params) + + # We'll try each of the given types in the union sequentially, returning the first one that + # succeeds. 
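The container branches above mean annotations like `Dict[str, SomeRegistrable]` are constructed value-by-value from nested config. A self-contained sketch with hypothetical classes:

```python
from typing import Dict
from kag.common.registry import Registrable


class Tool(Registrable):
    def __init__(self, power: int = 1):
        self.power = power


@Tool.register("hammer")
class Hammer(Tool):
    pass


class Workshop(Registrable):
    def __init__(self, tools: Dict[str, Tool]):
        self.tools = tools


# Each value under `tools` is built via Tool.from_config, keyed by name:
shop = Workshop.from_config({"tools": {"main": {"type": "hammer", "power": 3}}})
assert shop.tools["main"].power == 3
```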
+ all_err_msg = [] + for arg_annotation in args: + try: + return construct_arg( + str(arg_annotation), + argument_name, + popped_params, + arg_annotation, + default, + ) + except ( + ValueError, + TypeError, + ConfigurationError, + AttributeError, + ConfigMissingException, + ) as e: + # Our attempt to construct the argument may have modified popped_params, so we + # restore it here. + + popped_params = deepcopy(backup_params) + err_msg = f" Exception caught for constructing {arg_annotation}: {e}\n{traceback.format_exc()}" + all_err_msg.append(err_msg) + # If none of them succeeded, we crash. + info_separatpr = f"{'='*40}\n" + info = ( + f"Failed to construct argument {argument_name} with type {annotation}, details:\n" + f"{'='*80}" + f"\n{info_separatpr.join(all_err_msg)}" + ) + + raise ConfigurationError(info) + elif origin == Lazy: + if popped_params is default: + return default + + value_cls = args[0] + + def constructor(**kwargs): + return value_cls.from_config(params=deepcopy(popped_params), **kwargs) + + return Lazy(constructor, deepcopy(popped_params)) # type: ignore + + # For any other kind of iterable, we will just assume that a list is good enough, and treat + # it the same as List. This condition needs to be at the end, so we don't catch other kinds + # of Iterables with this branch. + elif ( + origin in {collections.abc.Iterable, Iterable, List, list} + and len(args) == 1 + and can_construct_from_config(args[0]) + ): + value_cls = annotation.__args__[0] + + value_list = [] + + for i, value_params in enumerate(popped_params): + value = construct_arg( + str(value_cls), + argument_name + f".{i}", + value_params, + value_cls, + inspect.Parameter.empty, + ) + value_list.append(value) + + return value_list + + else: + return popped_params + + +class Registrable: + """ + This class is motivated by the original work: + https://github.com/allenai/allennlp/blob/main/allennlp/common/from_params.py + """ + + _registry: Dict[Type, Dict[str, Tuple[Type, Optional[str]]]] = defaultdict(dict) + default_implementation: Optional[str] = None + NonParams = [] + + @autoargs + def __init__(self, **kwargs): + pass + + @classmethod + def register( + cls: Type[RegistrableType], + name: str, + constructor: str = None, + exist_ok: bool = True, + as_default=False, + ): + registry = Registrable._registry[cls] + if as_default: + cls.default_implementation = name + + def add_subclass_to_registry(subclass: Type[RegistrableType]): + # Add to registry, raise an error if key has already been used. + if name in registry: + if exist_ok: + message = ( + f"{name} of class {subclass} has already been registered as {registry[name][0].__name__}, but " + f"exist_ok=True, so overwriting with {cls.__name__}" + ) + logger.info(message) + else: + message = ( + f"Cannot register {name} as {cls.__name__}; " + f"name already in use for {registry[name][0].__name__}" + ) + raise ConfigurationError(message) + if inspect.isclass(subclass): + # not wrapped. + if not hasattr(subclass.__init__, "__wrapped__"): + subclass.__init__ = autoargs(subclass.__init__) + + registry[name] = (subclass, constructor) + + return subclass + + return add_subclass_to_registry + + @classmethod + def by_name( + cls: Type[RegistrableType], name: str + ) -> Callable[..., RegistrableType]: + """ + Returns a callable function that constructs an argument of the registered class. Because + you can register particular functions as constructors for specific names, this isn't + necessarily the `__init__` method of some class. 
+ """ + subclass, constructor = cls.resolve_class_name(name) + if not constructor: + return subclass + else: + return getattr(subclass, constructor) + + @classmethod + def resolve_class_name( + cls: Type[RegistrableType], name: str + ) -> Tuple[Type[RegistrableType], Optional[str]]: + if name in Registrable._registry[cls]: + subclass, constructor = Registrable._registry[cls][name] + return subclass, constructor + elif "." in name: + # This might be a fully qualified class name, so we'll try importing its "module" + # and finding it there. + parts = name.split(".") + submodule = ".".join(parts[:-1]) + class_name = parts[-1] + + try: + module = importlib.import_module(submodule) + except ModuleNotFoundError: + raise ConfigurationError( + f"tried to interpret {name} as a path to a class " + f"but unable to import module {submodule}" + ) + + try: + subclass = getattr(module, class_name) + constructor = None + return subclass, constructor + except AttributeError: + raise ConfigurationError( + f"tried to interpret {name} as a path to a class " + f"but unable to find class {class_name} in {submodule}" + ) + + else: + # is not a qualified class name + raise ConfigurationError( + f"{name} is not a registered name for {cls.__name__}. " + "You probably need to use the --include-package flag " + "to load your custom code. Alternatively, you can specify your choices " + """using fully-qualified paths, e.g. {"model": "my_module.models.MyModel"} """ + "in which case they will be automatically imported correctly." + ) + + @classmethod + def list_all_registered(cls, with_leaf_classes: bool = False) -> List[str]: + registered = set() + for k, v in Registrable._registry.items(): + registered.add(k) + if with_leaf_classes: + if isinstance(v, dict): + for _, register_cls in v.items(): + registered.add(register_cls[0]) + return sorted(list(registered), key=lambda x: (x.__module__, x.__name__)) + + @classmethod + def list_available(cls) -> List[str]: + """List default first if it exists""" + keys = list(Registrable._registry[cls].keys()) + default = cls.default_implementation + + if default is None: + return keys + elif default not in keys: + raise ConfigurationError( + f"Default implementation {default} is not registered" + ) + else: + return [default] + [k for k in keys if k != default] + + @classmethod + def list_available_with_detail(cls) -> Dict: + """List default first if it exists""" + register_dict = Registrable._registry[cls] + availables = {} + for k, v in register_dict.items(): + params = extract_parameters(v[0], v[1]) + required_params = [] + optional_params = [] + sample_config = {"type": k} + for arg_name, arg_def in params.items(): + if arg_name.strip() == "self": + continue + annotation = arg_def.annotation + if annotation == inspect.Parameter.empty: + annotation = None + default = arg_def.default + required = default == inspect.Parameter.empty + # if default == inspect.Parameter.empty: + # default = None + if required: + arg_info = ( + f"{arg_name}: {annotation.__name__ if annotation else 'Any'}" + ) + required_params.append(arg_info) + else: + arg_info = f"{arg_name}: {annotation.__name__ if annotation else 'Any'} = {default}" + optional_params.append(arg_info) + if required: + sample_config[arg_name] = f"Your {arg_name} config" + else: + sample_config[arg_name] = default + + # if default != None: + # sample_config[arg_name] = default + + if v[1] is None or v[1] == "__init__": + constructor_doc_string = inspect.getdoc(getattr(v[0], "__init__")) + else: + constructor_doc_string = 
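`resolve_class_name` gives every registrable two addressing modes: a registered short name, or a fully-qualified dotted path that is imported on the fly. For example, assuming the built-in clients have been imported so their registrations ran:

```python
from kag.interface import LLMClient

# Registered short name:
assert LLMClient.by_name("mock").__name__ == "MockLLMClient"

# Fully-qualified fallback, imported on demand:
cls_ = LLMClient.by_name("kag.common.llm.mock_llm.MockLLMClient")
assert cls_ is LLMClient.by_name("mock")
```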
inspect.getdoc(getattr(v[0], v[1])) + availables[k] = { + "class": f"{v[0].__module__}.{v[0].__name__}", + "doc": inspect.getdoc(v[0]), + "constructor": constructor_doc_string, + "params": { + "required_params": required_params, + "optional_params": optional_params, + }, + # "default_config": default_conf, + "sample_useage": f"{cls.__name__}.from_config({sample_config})", + } + return availables + + @classmethod + def from_config( + cls: Type[RegistrableType], + params: Union[str, Dict, ConfigTree], + constructor_to_call: Callable[..., RegistrableType] = None, + constructor_to_inspect: Union[ + Callable[..., RegistrableType], Callable[[RegistrableType], None] + ] = None, + ) -> RegistrableType: + """ + Instantiate the object via parameters. + The `constructor_to_call` and `constructor_to_inspect` arguments deal with a bit of + redirection that we do. We allow you to register particular `@classmethods` on a class as + the constructor to use for a registered name. This lets you, e.g., have a single + `Vocabulary` class that can be constructed in two different ways, with different names + registered to each constructor. In order to handle this, we need to know not just the class + we're trying to construct (`cls`), but also what method we should inspect to find its + arguments (`constructor_to_inspect`), and what method to call when we're done constructing + arguments (`constructor_to_call`). These two methods are the same when you've used a + `@classmethod` as your constructor, but they are `different` when you use the default + constructor (because you inspect `__init__`, but call `cls()`). + """ + + logger.debug( + f"instantiating class {cls} from params {getattr(params, 'params', params)} " + ) + + if params is None: + return None + + if isinstance(params, str): + params = ConfigFactory.from_dict({"type": params}) + elif isinstance(params, collections.abc.Mapping) and not isinstance( + params, ConfigTree + ): + params = ConfigFactory.from_dict(params) + original_params = deepcopy(params) + if not isinstance(params, ConfigTree): + raise ConfigurationError( + f"from_config was passed a `{params}` object that was not able to convert to `ConfigTree`. " + "This probably indicates malformed parameters." + f"This happened when constructing an object of type {cls}." + ) + + registered_subclasses = Registrable._registry.get(cls) + try: + # instantiate object from base class + if registered_subclasses and not constructor_to_call: + as_registrable = cast(Type[Registrable], cls) + default_choice = as_registrable.default_implementation + # call with BaseClass.from_prams, should use `type` to point out which subclasss to use + choice = params.pop("type", default_choice) + choices = as_registrable.list_available() + # if cls has subclass and choice not found in params, we'll instantiate cls itself + if choice is None: + subclass, constructor_name = cls, None + # invalid choice encountered, raise + elif choice not in choices: + message = ( + f"{choice} not in acceptable choices for type: {choices}. " + "You should make sure the class is correctly registerd. " + ) + raise ConfigurationError(message) + + else: + subclass, constructor_name = as_registrable.resolve_class_name( + choice + ) + + # See the docstring for an explanation of what's going on here. 
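`list_available_with_detail` turns the registry into self-documentation: for each registered name it reports the implementing class, its docstrings, and which constructor params are required versus optional. A quick way to explore what is registered:

```python
from kag.interface import LLMClient

detail = LLMClient.list_available_with_detail()
for name, info in detail.items():
    print(name, "->", info["class"])
    print("  required:", info["params"]["required_params"])
    print("  optional:", info["params"]["optional_params"])
```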
+                if not constructor_name:
+                    constructor_to_inspect = subclass.__init__
+                    constructor_to_call = subclass  # type: ignore
+                else:
+                    constructor_to_inspect = cast(
+                        Callable[..., RegistrableType],
+                        getattr(subclass, constructor_name),
+                    )
+                    constructor_to_call = constructor_to_inspect
+
+                retyped_subclass = cast(Type[RegistrableType], subclass)
+
+                instant = retyped_subclass.from_config(
+                    params=params,
+                    constructor_to_call=constructor_to_call,
+                    constructor_to_inspect=constructor_to_inspect,
+                )
+
+                setattr(instant, "__register_type__", choice)
+                setattr(instant, "__original_parameters__", original_params)
+            else:
+                # pop unused type declaration
+                register_type = params.pop("type", None)
+
+                if not constructor_to_inspect:
+                    constructor_to_inspect = cls.__init__
+                if not constructor_to_call:
+                    constructor_to_call = cls
+
+                if constructor_to_inspect == object.__init__:
+                    # This class does not have an explicit constructor, so don't give it any kwargs.
+                    # Without this logic, create_kwargs will look at object.__init__ and see that
+                    # it takes *args and **kwargs and look for those.
+                    accepts_kwargs, kwargs = False, {}
+                else:
+                    # This class has a constructor, so create kwargs for it.
+                    constructor_to_inspect = cast(
+                        Callable[..., RegistrableType], constructor_to_inspect
+                    )
+                    accepts_kwargs, kwargs = create_kwargs(
+                        constructor_to_inspect,
+                        cls,
+                        params,
+                    )
+
+                instant = constructor_to_call(**kwargs)  # type: ignore
+                setattr(instant, "__register_type__", register_type)
+                setattr(
+                    instant,
+                    "__constructor_called__",
+                    functools.partial(constructor_to_call, **kwargs),
+                )
+                setattr(instant, "__original_parameters__", original_params)
+                # if the constructor takes **kwargs, they can't be inferred from its signature.
+                # Therefore we record which attrs were created from kwargs so that `to_config`
+                # can correctly restore the configs.
+                if accepts_kwargs:
+                    remaining_kwargs = set(params)
+                    params.clear()
+                    setattr(instant, "__from_config_kwargs__", remaining_kwargs)
+        except Exception as e:
+            logger.warning(f"Failed to initialize class {cls}, info: {e}")
+            raise e
+        if len(params) > 0:
+            logger.warning(
+                f"These params are not used for constructing {cls}:\n{params}"
+            )
+
+        return instant
+
+    def _to_config(self, v):
+        """Iteratively convert v to params."""
+        v_type = type(v)
+        if hasattr(v, "to_config"):
+            params = v.to_config()
+        elif v_type in {collections.abc.Mapping, Mapping, Dict, dict}:
+            params = {}
+            for subk, subv in v.items():
+                params[subk] = self._to_config(subv)
+        elif v_type in {
+            collections.abc.Iterable,
+            Iterable,
+            List,
+            list,
+            Tuple,
+            tuple,
+            Set,
+            set,
+        }:
+            params = [self._to_config(x) for x in v]
+        else:
+            params = v
+        return params
+
+    def to_config(self) -> ConfigTree:
+        """
+        Convert the object back to params.
+        Note: if the object was not instantiated by `from_config`, it cannot be
+        converted back.
+        """
+        # The user can modify the object after instantiation, so directly returning
+        # the original params may be inaccurate.
+        config = {}
+
+        if hasattr(self, "__register_type__") and self.__register_type__:
+            config["type"] = self.__register_type__
+
+        for k, v in self.__constructor_called__.keywords.items():
+            if k in self.NonParams:
+                continue
+            # we don't directly use the value stored in __constructor_called__.keywords, because
+            # the value could be a Lazy object, which can't convert to params. Instead, we use
+            # attrs of the instance itself.
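+            # This relies on the convention that each constructor argument is stored
+            # on the instance under an attribute of the same name; arguments without
+            # a matching attribute fall back to the recorded keyword value.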
+            if hasattr(self, k):
+                v = getattr(self, k)
+            if hasattr(v, "to_config"):
+                conf = v.to_config()
+            else:
+                conf = self._to_config(v)
+            config[k] = conf
+        return ConfigFactory.from_dict(config)
+
+    def to_config_with_constructor(self, constructor: str = None) -> ConfigTree:
+        """Convert the object back to params.
+        Unlike `to_config`, this function can convert objects that were not instantiated
+        by `from_config`, but it may not always give a correct result. For example, if a
+        class has more than one constructor and the object was instantiated via
+        constructor A, converting it to the params of constructor B will be wrong.
+        Use it with caution.
+        One should always use `from_config` to instantiate the object and `to_config`
+        to convert it back to params.
+        """
+        config = {}
+
+        if hasattr(self, "__register_type__") and self.__register_type__:
+            config["type"] = self.__register_type__
+        if constructor:
+            constructor = getattr(self, constructor)
+        else:
+            constructor = self.__init__
+
+        constructor_params = extract_parameters(type(self), constructor)
+        accepts_kwargs = False
+        for k, v in constructor_params.items():
+            if k in self.NonParams:
+                continue
+
+            if v.kind == v.VAR_KEYWORD:
+                accepts_kwargs = True
+                continue
+            # get the param instance from the corresponding class attr
+            v_instance = getattr(self, v.name, None)
+
+            if hasattr(v_instance, "to_config"):
+                conf = v_instance.to_config()
+            else:
+                conf = self._to_config(v_instance)
+            config[k] = conf
+        if accepts_kwargs:
+            for k in self.__from_config_kwargs__:
+                if hasattr(self, k):
+                    config[k] = getattr(self, k)
+        return ConfigFactory.from_dict(config)
diff --git a/kag/common/registry/utils.py b/kag/common/registry/utils.py
new file mode 100644
index 00000000..247d5845
--- /dev/null
+++ b/kag/common/registry/utils.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 OpenSPG Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied.
+
+import os
+import sys
+import importlib
+import pkgutil
+from pathlib import Path
+from typing import Union
+
+
+def append_python_path(path: Union[os.PathLike, str]) -> None:
+    """
+    Append the given path to `sys.path`.
+    """
+    # In some environments, such as TC, it fails when sys.path contains a relative path, such as ".".
+    path = Path(path).resolve()
+    path = str(path)
+    sys.path.append(path)
+
+
+def import_modules_from_path(path: str) -> None:
+    """
+    Import all submodules under the given package.
+    Users can point this at their custom packages to have their custom
+    classes loaded and registered.
+    """
+    path = os.path.abspath(os.path.normpath(path))
+    importlib.invalidate_caches()
+    tmp = path.rsplit("/", 1)
+    if len(tmp) == 1:
+        module_path = "."
+        package_name = tmp[0]
+    else:
+        module_path, package_name = tmp
+    append_python_path(module_path)
+    # Import at top level
+    module = importlib.import_module(package_name)
+    path = list(getattr(module, "__path__", []))
+    path_string = "" if not path else path[0]
+    # walk_packages only finds immediate children, so we need to recurse.
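+    # Note: the path handling above splits on "/", so this helper assumes
+    # POSIX-style paths. Each recursive call below re-enters this function
+    # once per immediate child package.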
+    for module_finder, name, _ in pkgutil.walk_packages(path):
+        # Sometimes when you import third-party libraries that are on your path,
+        # `pkgutil.walk_packages` returns those too, so we need to skip them.
+        if path_string and module_finder.path != path_string:
+            continue
+        subpackage = f"{path_string}/{name}"
+
+        import_modules_from_path(subpackage)
diff --git a/kag/common/reranker/__init__.py b/kag/common/reranker/__init__.py
index a945c8dd..4d9914d1 100644
--- a/kag/common/reranker/__init__.py
+++ b/kag/common/reranker/__init__.py
@@ -13,7 +13,4 @@
 from kag.common.reranker.bge_reranker import BGEReranker
 from kag.common.reranker.reranker import Reranker
 
-__all__ = [
-    "BGEReranker",
-    "Reranker"
-]
+__all__ = ["BGEReranker", "Reranker"]
diff --git a/kag/common/reranker/bge_reranker.py b/kag/common/reranker/bge_reranker.py
index 45a63615..e74cb022 100644
--- a/kag/common/reranker/bge_reranker.py
+++ b/kag/common/reranker/bge_reranker.py
@@ -20,60 +20,61 @@
 def rrf_score(length, r: int = 1):
     """
     Calculates RRF (Reciprocal Rank Fusion) scores.
-
+
     This function generates a score sequence of the given length, where each score is calculated
     based on the index according to the formula 1/(r+i).
     RRF is a method used in information retrieval and data analysis, and this function provides
     a way to generate weights based on document indices.
-
+
     Parameters:
     length: int, the length of the score sequence, i.e., the number of scores to generate.
     r: int, optional, default is 1. Controls the starting index of the scores. Increasing the value of r
        shifts the emphasis towards later scores.
-
+
     Returns:
     numpy.ndarray, an array containing the scores calculated according to the given formula.
     """
     return np.array([1 / (r + i) for i in range(length)])
 
-
 class BGEReranker(Reranker):
     """
     BGEReranker class is a subclass of Reranker that reranks given queries and passages.
-
+
     This class uses the FlagReranker model from FlagEmbedding to score and reorder passages.
-
+
     Args:
         model_path (str): Path to the FlagReranker model.
        use_fp16 (bool): Whether to use half-precision floating-point numbers for computation. Default is True.
     """
+
     def __init__(self, model_path: str, use_fp16: bool = True):
         from FlagEmbedding import FlagReranker
+
         self.model_path = model_path
         self.model = FlagReranker(self.model_path, use_fp16=use_fp16)
 
     def rerank(self, queries: List[str], passages: List[str]):
         """
         Reranks given queries and passages.
-
+
         Args:
             queries (List[str]): List of queries.
             passages (List[str]): List of passages, where each passage is a string.
-
+
         Returns:
             new_passages (List[str]): List of passages after reranking.
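+
+        Example (illustrative; the checkpoint path is an assumption):
+            reranker = BGEReranker("/path/to/bge-reranker-large")
+            ordered = reranker.rerank(["what is KAG?"], ["passage a", "passage b"])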
""" # Calculate initial ranking scores for passages rank_scores = rrf_score(len(passages)) passage_scores = np.zeros(len(passages)) + rank_scores - + # For each query, compute passage scores using the model and accumulate them for query in queries: scores = self.model.compute_score([[query, x] for x in passages]) sorted_idx = np.argsort(-np.array(scores)) for rank, passage_id in enumerate(sorted_idx): passage_scores[passage_id] += rank_scores[rank] - + # Perform final sorting of passages based on accumulated scores merged_sorted_idx = np.argsort(-passage_scores) - + new_passages = [passages[x] for x in merged_sorted_idx] - return new_passages \ No newline at end of file + return new_passages diff --git a/kag/common/reranker/reranker.py b/kag/common/reranker/reranker.py index 69b97a25..92e6d968 100644 --- a/kag/common/reranker/reranker.py +++ b/kag/common/reranker/reranker.py @@ -43,4 +43,4 @@ def rerank(self, queries: List[str], passages: List[str]): The function is currently not implemented and raises an exception to indicate this. """ - raise NotImplementedError("rerank not implemented yet.") \ No newline at end of file + raise NotImplementedError("rerank not implemented yet.") diff --git a/kag/common/retriever/kag_retriever.py b/kag/common/retriever/kag_retriever.py deleted file mode 100644 index 4bc19aff..00000000 --- a/kag/common/retriever/kag_retriever.py +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import os -from tenacity import retry, stop_after_attempt - -from kag.common.base.prompt_op import PromptOp -from kag.common.vectorizer import Vectorizer -from knext.graph_algo.client import GraphAlgoClient -from kag.interface.retriever.chunk_retriever_abc import ChunkRetrieverABC -from typing import List, Dict - -import numpy as np -import logging - -from knext.reasoner.client import ReasonerClient -from knext.schema.client import CHUNK_TYPE, OTHER_TYPE -from knext.project.client import ProjectClient -from kag.common.utils import processing_phrases -from knext.search.client import SearchClient -from kag.solver.logic.core_modules.common.schema_utils import SchemaUtils -from kag.solver.logic.core_modules.config import LogicFormConfiguration - -logger = logging.getLogger(__name__) - - -class DefaultRetriever(ChunkRetrieverABC): - """ - KAGRetriever class for retrieving and processing knowledge graph data from a graph database. - - this retriever references the implementation of Hippoag for the combination of dpr & ppr, developer can define your Retriever - - Parameters: - - project_id (str, optional): Project ID to load specific project configurations. - - host_addr (str, optional): host addr to load specific server addr configurations. 
- """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - self.schema_util = SchemaUtils(LogicFormConfiguration(kwargs)) - - self._init_search() - - self.ner_prompt = PromptOp.load(self.biz_scene, "question_ner")(language=self.language, project_id=self.project_id) - self.std_prompt = PromptOp.load(self.biz_scene, "std")(language=self.language) - - self.pagerank_threshold = 0.9 - self.match_threshold = 0.8 - self.pagerank_weight = 0.5 - - self.reranker_model_path = os.getenv("KAG_RETRIEVER_RERANKER_MODEL_PATH") - if self.reranker_model_path: - from kag.common.reranker.reranker import BGEReranker - self.reranker = BGEReranker(self.reranker_model_path, use_fp16=True) - else: - self.reranker = None - - self.with_semantic = True - - def _init_search(self): - self.sc: SearchClient = SearchClient(self.host_addr, self.project_id) - vectorizer_config = eval(os.getenv("KAG_VECTORIZER", "{}")) - if self.host_addr and self.project_id: - config = ProjectClient(host_addr=self.host_addr, project_id=self.project_id).get_config(self.project_id) - vectorizer_config.update(config.get("vectorizer", {})) - - self.vectorizer = Vectorizer.from_config( - vectorizer_config - ) - self.reason: ReasonerClient = ReasonerClient(self.host_addr, self.project_id) - self.graph_algo = GraphAlgoClient(self.host_addr, self.project_id) - - - - - @retry(stop=stop_after_attempt(3)) - def named_entity_recognition(self, query: str): - """ - Perform named entity recognition. - - This method invokes the pre-configured service client (self.llm) to process the input query, - using the named entity recognition (NER) prompt (self.ner_prompt). - - Parameters: - query (str): The text input provided by the user or system for named entity recognition. - - Returns: - The result returned by the service client, with the type and format depending on the used service. - """ - return self.llm_module.invoke({"input": query}, self.ner_prompt) - - @retry(stop=stop_after_attempt(3)) - def named_entity_standardization(self, query: str, entities: List[Dict]): - """ - Entity standardization function. - - This function calls a remote service to process the input query and named entities, - standardizing the entities. This is useful for unifying different representations of the same entity in text, - improving the performance of natural language processing tasks. - - Parameters: - - query: A string containing the query with named entities. - - entities: A list of dictionaries, each containing information about named entities. - - Returns: - - The result of the remote service call, typically standardized named entity information. - """ - return self.llm_module.invoke( - {"input": query, "named_entities": entities}, self.std_prompt - ) - - @staticmethod - def append_official_name(source_entities: List[Dict], entities_with_official_name: List[Dict]): - """ - Appends official names to entities. - - Parameters: - source_entities (List[Dict]): A list of source entities. - entities_with_official_name (List[Dict]): A list of entities with official names. 
- - """ - tmp_dict = {} - for tmp_entity in entities_with_official_name: - name = tmp_entity["entity"] - category = tmp_entity["category"] - official_name = tmp_entity["official_name"] - key = f"{category}{name}" - tmp_dict[key] = official_name - - for tmp_entity in source_entities: - name = tmp_entity["entity"] - category = tmp_entity["category"] - key = f"{category}{name}" - if key in tmp_dict: - official_name = tmp_dict[key] - tmp_entity["official_name"] = official_name - - def calculate_sim_scores(self, query: str, doc_nums: int): - """ - Calculate the vector similarity scores between a query and document chunks. - - Parameters: - query (str): The user's query text. - doc_nums (int): The number of document chunks to return. - - Returns: - dict: A dictionary with keys as document chunk IDs and values as the vector similarity scores. - """ - scores = dict() - try: - query_vector = self.vectorizer.vectorize(query) - top_k = self.sc.search_vector( - label=self.schema_util.get_label_within_prefix(CHUNK_TYPE), - property_key="content", - query_vector=query_vector, - topk=doc_nums - ) - scores = {item["node"]["id"]: item["score"] for item in top_k} - except Exception as e: - logger.error( - f"run calculate_sim_scores failed, info: {e}", exc_info=True - ) - return scores - - def calculate_pagerank_scores(self, start_nodes: List[Dict]): - """ - Calculate and retrieve PageRank scores for the given starting nodes. - - Parameters: - start_nodes (list): A list containing document fragment IDs to be used as starting nodes for the PageRank algorithm. - - Returns: - ppr_doc_scores (dict): A dictionary containing each document fragment ID and its corresponding PageRank score. - - This method uses the PageRank algorithm in the graph store to compute scores for document fragments. If `start_nodes` is empty, - it returns an empty dictionary. Otherwise, it attempts to retrieve PageRank scores from the graph store and converts the result - into a dictionary format where keys are document fragment IDs and values are their respective PageRank scores. Any exceptions, - such as failures in running `run_pagerank_igraph_chunk`, are logged. - """ - scores = dict() - if len(start_nodes) != 0: - try: - scores = self.graph_algo.calculate_pagerank_scores( - self.schema_util.get_label_within_prefix(CHUNK_TYPE), - start_nodes - ) - except Exception as e: - logger.error( - f"run calculate_pagerank_scores failed, info: {e}, start_nodes: {start_nodes}", exc_info=True - ) - return scores - - def match_entities(self, queries: Dict[str, str], top_k: int = 1): - """ - Match entities based on the provided queries. - - :param queries: A dictionary containing keywords and their labels. - :param top_k: The number of top results to return. Default is 1. - :return: A tuple containing a list of matched entities and their scores. 
- """ - matched_entities = [] - matched_entities_scores = [] - for query, query_type in queries.items(): - query = processing_phrases(query) - if query_type not in self.schema_util.node_en_zh.keys(): - query_type = self.schema_util.get_label_within_prefix(OTHER_TYPE) - else: - query_type = self.schema_util.get_label_within_prefix(query_type) - typed_nodes = self.sc.search_vector( - label=query_type, - property_key="name", - query_vector=self.vectorizer.vectorize(query), - topk=top_k, - ) - if query_type != self.schema_util.get_label_within_prefix(OTHER_TYPE): - nontyped_nodes = self.sc.search_vector( - label=self.schema_util.get_label_within_prefix(OTHER_TYPE), - property_key="name", - query_vector=self.vectorizer.vectorize(query), - topk=top_k, - ) - else: - nontyped_nodes = typed_nodes - - if len(typed_nodes) == 0 and len(nontyped_nodes) != 0: - matched_entities.append( - {"name": nontyped_nodes[0]["node"]["name"], "type": OTHER_TYPE} - ) - matched_entities_scores.append(nontyped_nodes[0]["score"]) - elif len(typed_nodes) != 0 and len(nontyped_nodes) != 0: - if typed_nodes[0]["score"] > 0.8: - matched_entities.append( - {"name": typed_nodes[0]["node"]["name"], "type": query_type} - ) - matched_entities_scores.append(typed_nodes[0]["score"]) - else: - matched_entities.append( - {"name": nontyped_nodes[0]["node"]["name"], "type": OTHER_TYPE} - ) - matched_entities_scores.append(nontyped_nodes[0]["score"]) - matched_entities.append( - {"name": typed_nodes[0]["node"]["name"], "type": query_type} - ) - matched_entities_scores.append(typed_nodes[0]["score"]) - elif len(typed_nodes) != 0 and len(nontyped_nodes) == 0: - if typed_nodes[0]["score"] > 0.8: - matched_entities.append( - {"name": typed_nodes[0]["node"]["name"], "type": query_type} - ) - matched_entities_scores.append(typed_nodes[0]["score"]) - - if not matched_entities: - logger.info(f"No entities matched for {queries}") - return matched_entities, matched_entities_scores - - def calculate_combined_scores(self, sim_scores: Dict[str, float], pagerank_scores: Dict[str, float]): - """ - Calculate and return the combined scores that integrate both similarity scores and PageRank scores. - - Parameters: - sim_scores (Dict[str, float]): A dictionary containing similarity scores, where keys are identifiers and values are scores. - pagerank_scores (Dict[str, float]): A dictionary containing PageRank scores, where keys are identifiers and values are scores. - - Returns: - Dict[str, float]: A dictionary containing the combined scores, where keys are identifiers and values are the combined scores. - """ - def min_max_normalize(x): - if len(x) == 0: - return [] - if np.max(x) - np.min(x) > 0: - return (x - np.min(x)) / (np.max(x) - np.min(x)) - else: - return x - np.min(x) - - all_keys = set(pagerank_scores.keys()).union(set(sim_scores.keys())) - for key in all_keys: - sim_scores.setdefault(key, 0.0) - pagerank_scores.setdefault(key, 0.0) - sim_scores = dict(zip(sim_scores.keys(), min_max_normalize( - np.array(list(sim_scores.values())) - ))) - pagerank_scores = dict(zip(pagerank_scores.keys(), min_max_normalize( - np.array(list(pagerank_scores.values())) - ))) - combined_scores = dict() - for key in pagerank_scores.keys(): - combined_scores[key] = (sim_scores[key] * (1 - self.pagerank_weight) + - pagerank_scores[key] * self.pagerank_weight - ) - return combined_scores - - def recall_docs(self, query: str, top_k: int = 5, **kwargs): - """ - Recall relevant documents based on the query string. 
- - Parameters: - - query (str): The user's query string. - - top_k (int, optional): The number of documents to return, default is 5. - - Keyword Arguments: - - kwargs: Additional keyword arguments. - - Returns: - - list: A list containing the top_k most relevant documents. - """ - assert isinstance(query, str), "Query must be a string" - - chunk_nums = top_k * 20 - if chunk_nums == 0: - return [] - - ner_list = self.named_entity_recognition(query) - print(ner_list) - if self.with_semantic: - std_ner_list = self.named_entity_standardization(query, ner_list) - self.append_official_name(ner_list, std_ner_list) - - entities = {} - for item in ner_list: - entity = item.get("entity", "") - category = item.get("category", "") - official_name = item.get("official_name", "") - if not entity or not (category or official_name): - continue - if category.lower() in ["works", "person", "other"]: - entities[entity] = category - else: - entities[entity] = official_name or category - - sim_scores = self.calculate_sim_scores(query, chunk_nums) - matched_entities, matched_scores = self.match_entities(entities) - pagerank_scores = self.calculate_pagerank_scores(matched_entities) - - if not matched_entities: - combined_scores = sim_scores - elif matched_entities and np.min(matched_scores) > self.pagerank_threshold: - combined_scores = pagerank_scores - else: - combined_scores = self.calculate_combined_scores(sim_scores, pagerank_scores) - sorted_scores = sorted( - combined_scores.items(), key=lambda item: item[1], reverse=True - ) - logger.debug(f"sorted_scores: {sorted_scores}") - - return self.get_all_docs_by_id(query, sorted_scores, top_k) - - def get_all_docs_by_id(self, query: str, doc_ids: list, top_k: int): - """ - Retrieve a list of documents based on their IDs. - - Parameters: - - query (str): The query string for text matching. - - doc_ids (list): A list of document IDs to retrieve documents. - - top_k (int): The maximum number of documents to return. - - Returns: - - list: A list of matched documents. - """ - matched_docs = [] - hits_docs = set() - counter = 0 - for doc_id in doc_ids: - if counter == top_k: - break - if isinstance(doc_id, tuple): - doc_score = doc_id[1] - doc_id = doc_id[0] - else: - doc_score = doc_ids[doc_id] - counter += 1 - node = self.reason.query_node(label=self.schema_util.get_label_within_prefix(CHUNK_TYPE), id_value=doc_id) - node_dict = dict(node.items()) - matched_docs.append(f"#{node_dict['name']}#{node_dict['content']}#{doc_score}") - hits_docs.add(node_dict['name']) - try: - text_matched = self.sc.search_text(query, [self.schema_util.get_label_within_prefix(CHUNK_TYPE)], topk=1) - if text_matched: - for item in text_matched: - title = item["node"]["name"] - if title not in hits_docs: - if len(matched_docs) > 0: - matched_docs.pop() - else: - logger.warning(f"{query} matched docs is empty") - matched_docs.append(f'#{item["node"]["name"]}#{item["node"]["content"]}#{item["score"]}') - break - except Exception as e: - logger.warning(f"{query} query chunk failed: {e}", exc_info=True) - logger.debug(f"matched_docs: {matched_docs}") - return matched_docs - - def rerank_docs(self, queries: List[str], passages: List[str]): - """ - Re-ranks the given passages based on the provided queries. - - Parameters: - - queries (List[str]): A list of queries. - - passages (List[str]): A list of passages. - - Returns: - - List[str]: A re-ranked list of passages. 
- """ - if self.reranker is None: - return passages - return self.reranker.rerank(queries, passages) diff --git a/kag/common/retriever/retriever.py b/kag/common/retriever/retriever.py deleted file mode 100644 index e125248b..00000000 --- a/kag/common/retriever/retriever.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import json -from pathlib import Path -from abc import ABC, abstractmethod -from typing import Any, Union, Iterable, Tuple - -from typing import Dict -import logging - -logger = logging.getLogger(__name__) - -Item = Dict[str, Any] -RetrievalResult = Iterable[Tuple[Item, float]] - - -class Retriever(ABC): - """ - Retriever indexing a collection of items and supports fast retrieving of the - desired items given a query. - """ - - @classmethod - def from_config(cls, config: Union[str, Path, Dict[str, Any]]) -> "Retriever": - """ - Create retriever from `config`. - - If `config` is a string or path, it will be loaded as a dictionary depending - on its file extension. Currently, the following formats are supported: - - * .json: JSON - * .json5: JSON with comments support - * .yaml: YAML - - :param config: retriever config - :type config: str, Path or Dict[str, Any] - :return: retriever instance - :rtype: Retriever - """ - from kag.common.utils import dynamic_import_class - - if isinstance(config, (str, Path)): - config_path = config - if not isinstance(config_path, Path): - config_path = Path(config_path) - if config_path.name.endswith(".yaml"): - import yaml - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = yaml.safe_load(fin) - elif config_path.name.endswith(".json5"): - import json5 - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json5.load(fin) - elif config_path.name.endswith(".json"): - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json.load(fin) - else: - message = "only .json, .json5 and .yaml are supported currently; " - message += "can not load retriever config from %r" % str(config_path) - raise RuntimeError(message) - elif isinstance(config, dict): - pass - else: - message = "only str, Path and dict are supported; " - message += "invalid retriever config: %r" % (config,) - raise RuntimeError(message) - - class_name = config.get("retriever") - if class_name is None: - message = "retriever class name is not specified" - raise RuntimeError(message) - retriever_class = dynamic_import_class(class_name, "retriever") - if not issubclass(retriever_class, Retriever): - message = "class %r is not a retriever class" % (class_name,) - raise RuntimeError(message) - retriever = retriever_class._from_config(config) - return retriever - - @classmethod - @abstractmethod - def _from_config(cls, config: Dict[str, Any]) -> "Retriever": - """ - Create retriever from `config`. This method is supposed to be implemented - by derived classes. 
- - :param config: retriever config - :type config: Dict[str, Any] - :return: retriever instance - :rtype: Retriever - """ - message = "abstract method _from_config is not implemented" - raise NotImplementedError(message) - - def index(self, items: Union[Item, Iterable[Item]]) -> None: - """ - Add one or more items to the index of the retriever. - - NOTE: This method may not be supported by the retriever. - - :param items: items to index - :type items: Item or Iterable[Item] - """ - message = "method index is not supported by the retriever" - raise RuntimeError(message) - - @abstractmethod - def retrieve( - self, queries: Union[str, Iterable[str]], top_k: int = 10 - ) -> Union[RetrievalResult, Iterable[RetrievalResult]]: - """ - Retrieve items for the given query or queries. - - :param queries: queries to retrieve - :type queries: str or Iterable[str] - :param int top_k: how many most related items to return for each query, default to 10 - :return: retrieval results of the queries - :rtype: RetrievalResult or Iterable[RetrievalResult] - """ - message = "abstract method retrieve is not implemented" - raise NotImplementedError(message) - - diff --git a/kag/common/sharding_info.py b/kag/common/sharding_info.py new file mode 100644 index 00000000..08d7c4cf --- /dev/null +++ b/kag/common/sharding_info.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- +from kag.common.registry import Registrable + + +class ShardingInfo(Registrable): + """ + A class representing sharding information for distributed computing. + + This class provides methods to manage and query sharding information across + multiple machines, instances, and processes. It inherits from the `Registrable` + class. + + Attributes: + machine_id (int): The ID of the current machine. Default is 0. + machine_count (int): The total number of machines. Default is 1. + instance_id (int): The ID of the current instance. Default is 0. + instance_count (int): The total number of instances. Default is 1. + process_id (int): The ID of the current process. Default is 0. + process_count (int): The total number of processes. Default is 1. + shard_id (int, optional): The ID of the current shard. Default is None. + shard_count (int, optional): The total number of shards. Default is None. + shard_by_machine (bool): Whether to shard by machine. Default is True. + shard_by_instance (bool): Whether to shard by instance. Default is True. + shard_by_process (bool): Whether to shard by process. Default is True. + """ + + def __init__( + self, + machine_id: int = 0, + machine_count: int = 1, + instance_id: int = 0, + instance_count: int = 1, + process_id: int = 0, + process_count: int = 1, + shard_id: int = None, + shard_count: int = None, + ): + """ + Initializes a new instance of the ShardingInfo class. + + Args: + machine_id (int): The ID of the current machine. Default is 0. + machine_count (int): The total number of machines. Default is 1. + instance_id (int): The ID of the current instance. Default is 0. + instance_count (int): The total number of instances. Default is 1. + process_id (int): The ID of the current process. Default is 0. + process_count (int): The total number of processes. Default is 1. + shard_id (int, optional): The ID of the current shard. Default is None. + shard_count (int, optional): The total number of shards. Default is None. 
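+
+        Example (illustrative): with 2 machines and 2 processes per machine,
+        machine 1 / process 0 is rank 2 in a world of size 4:
+            info = ShardingInfo(machine_id=1, machine_count=2, process_id=0, process_count=2)
+            info.get_rank()        # 2
+            info.get_world_size()  # 4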
+ """ + self.instance_id = instance_id + self.instance_count = instance_count + self.machine_id = machine_id + self.machine_count = machine_count + self.process_id = process_id + self.process_count = process_count + self.shard_id = shard_id + self.shard_count = shard_count + + self.shard_by_machine = True + self.shard_by_instance = True + self.shard_by_process = True + + def shard_by( + self, machine: bool = True, instance: bool = True, process: bool = True + ): + """ + Configures the sharding strategy by specifying whether to shard by machine, + instance, or process. + + Args: + machine (bool): Whether to shard by machine. Default is True. + instance (bool): Whether to shard by instance. Default is True. + process (bool): Whether to shard by process. Default is True. + """ + self.shard_by_machine = machine + self.shard_by_instance = instance + self.shard_by_process = process + + def get_rank(self): + """ + Returns the rank of the current shard based on the configured sharding strategy. + + Returns: + int: The rank of the current shard. + """ + if self.shard_id is not None: + return self.shard_id + if self.shard_by_machine: + machine_id = self.machine_id + else: + machine_id = 0 + if self.shard_by_instance: + instance_id, instance_count = self.instance_id, self.instance_count + else: + instance_id, instance_count = 0, 1 + if self.shard_by_process: + process_id, process_count = self.process_id, self.process_count + else: + process_id, process_count = 0, 1 + + return process_count * (machine_id * instance_count + instance_id) + process_id + + def get_world_size(self): + """ + Returns the total number of shards in the world based on the configured sharding strategy. + + Returns: + int: The total number of shards. + """ + if self.shard_count is not None: + return self.shard_count + world_size = 1 + if self.shard_by_machine: + world_size *= self.machine_count + if self.shard_by_instance: + world_size *= self.instance_count + if self.shard_by_process: + world_size *= self.process_count + return world_size + + def get_sharding_range(self, total: int): + """ + Returns the range of indices that the current shard is responsible for. + + Args: + total (int): The total number of items to be sharded. + + Returns: + Tuple[int, int]: A tuple containing the start and end indices of the range. + """ + rank = self.get_rank() + world_size = self.get_world_size() + if total % world_size == 0: + workload = total // world_size + else: + workload = total // world_size + 1 + start = workload * rank + end = min(total, workload * (rank + 1)) + return start, end + + @property + def is_master_process(self): + """ + Checks if the current process is the master process. + + Returns: + bool: True if the current process is the master process, False otherwise. + """ + return self.process_id == 0 + + @property + def is_master_instance(self): + """ + Checks if the current instance is the master instance. + + Returns: + bool: True if the current instance is the master instance, False otherwise. + """ + return self.instance_id == 0 + + @property + def is_master_machine(self): + """ + Checks if the current machine is the master machine. + + Returns: + bool: True if the current machine is the master machine, False otherwise. + """ + return self.machine_id == 0 + + def __str__(self): + """ + Returns a string representation of the ShardingInfo object. + + Returns: + str: A string containing the rank, world size, and other sharding details. 
+ """ + content = ( + f"ShardingInfo: rank={self.get_rank()}, world_size={self.get_world_size()}, " + f"machine: {self.machine_id}/{self.machine_count}, " + f"instance: {self.instance_id}/{self.instance_count}, " + f"process: {self.process_id}/{self.process_count}" + ) + return content + + __repr__ = __str__ + + def copy(self): + """ + Creates a copy of the current ShardingInfo object. + + Returns: + ShardingInfo: A new instance of ShardingInfo with the same attributes. + """ + return ShardingInfo( + self.machine_id, + self.machine_count, + self.instance_id, + self.instance_count, + self.process_id, + self.process_count, + self.shard_id, + self.shard_count, + ) + + +ShardingInfo.register("base")(ShardingInfo) diff --git a/kag/common/utils.py b/kag/common/utils.py index 2a6f5ac0..c7c98924 100644 --- a/kag/common/utils.py +++ b/kag/common/utils.py @@ -12,51 +12,30 @@ import re import sys import json -from typing import Type,Tuple -import inspect +import hashlib import os -from pathlib import Path +import tempfile +import requests import importlib +from typing import Tuple +from pathlib import Path + from shutil import copystat, copy2 from typing import Any, Union from jinja2 import Environment, FileSystemLoader, Template from stat import S_IWUSR as OWNER_WRITE_PERMISSION +from tenacity import retry, stop_after_attempt - -def _register(root, path, files, class_type): - relative_path = os.path.relpath(path, root) - module_prefix = relative_path.replace(".", "").replace("/", ".") - module_prefix = module_prefix + "." if module_prefix else "" - for file_name in files: - if file_name.endswith(".py"): - module_name = module_prefix + os.path.splitext(file_name)[0] - import importlib - - module = importlib.import_module(module_name) - classes = inspect.getmembers(module, inspect.isclass) - for class_name, class_obj in classes: - if ( - issubclass(class_obj, class_type) - and inspect.getmodule(class_obj) == module - ): - - class_type.register( - name=class_name, - local_path=os.path.join(path, file_name), - module_path=module_name, - )(class_obj) - - -def register_from_package(path: str, class_type: Type) -> None: - """ - Register all classes under the given package. - Only registered classes can be recognized by kag. 
- """ - if not append_python_path(path): - return - for root, dirs, files in os.walk(path): - _register(path, root, files, class_type) - class_type._has_registered = True +reset = "\033[0m" +bold = "\033[1m" +underline = "\033[4m" +red = "\033[31m" +green = "\033[32m" +yellow = "\033[33m" +blue = "\033[34m" +magenta = "\033[35m" +cyan = "\033[36m" +white = "\033[37m" def append_python_path(path: str) -> bool: @@ -70,6 +49,7 @@ def append_python_path(path: str) -> bool: return True return False + def render_template( root_dir: Union[str, os.PathLike], file: Union[str, os.PathLike], **kwargs: Any ) -> None: @@ -82,7 +62,6 @@ def render_template( if path_obj.suffix == ".tmpl": path_obj.rename(render_path) - render_path.write_text(content, "utf8") @@ -113,7 +92,7 @@ def copyfile(src: Path, dst: Path, **kwargs): _make_writable(dst) if dst.suffix != ".tmpl": return - render_template('/', dst, **kwargs) + render_template("/", dst, **kwargs) def remove_files_except(path, file, new_file): @@ -137,7 +116,6 @@ def load_json(content): try: return json.loads(content) except json.JSONDecodeError as e: - substr = content[: e.colno - 1] return json.loads(substr) @@ -194,8 +172,7 @@ def processing_phrases(phrase): def to_camel_case(phrase): s = processing_phrases(phrase).replace(" ", "_") return "".join( - word.capitalize() if i != 0 else word - for i, word in enumerate(s.split("_")) + word.capitalize() if i != 0 else word for i, word in enumerate(s.split("_")) ) @@ -203,3 +180,98 @@ def to_snake_case(name): words = re.findall("[A-Za-z][a-z0-9]*", name) result = "_".join(words).lower() return result + + +def get_vector_field_name(property_key: str): + name = f"{property_key}_vector" + name = to_snake_case(name) + return "_" + name + + +def split_list_into_n_parts(lst, n): + length = len(lst) + part_size = length // n + seg = [x * part_size for x in range(n)] + seg.append(min(length, part_size * n)) + + remainder = length % n + + result = [] + + # 分割列表 + start = 0 + for i in range(n): + # 计算当前份的元素数量 + if i < remainder: + end = start + part_size + 1 + else: + end = start + part_size + + # 添加当前份到结果列表 + result.append(lst[start:end]) + + # 更新起始位置 + start = end + + return result + + +def generate_hash_id(value): + """ + Generates a hash ID and an abstracted version of the input value. + + If the input value is a dictionary, it sorts the dictionary items and abstracts the dictionary. + If the input value is not a dictionary, it abstracts the value directly. + + Args: + value: The input value to be hashed and abstracted. + + Returns: + Tuple[str, Any]: A tuple containing the hash ID and the abstracted value. + """ + if isinstance(value, dict): + sorted_items = sorted(value.items()) + key = str(sorted_items) + else: + key = value + if isinstance(key, str): + key = key.encode("utf-8") + hasher = hashlib.sha256() + hasher.update(key) + + return hasher.hexdigest() + + +@retry(stop=stop_after_attempt(3)) +def download_from_http(url: str, dest: str = None) -> str: + """Downloads a file from an HTTP URL and saves it to a temporary directory. + + This function uses the requests library to download a file from the specified + HTTP URL and saves it to the system's temporary directory. After the download + is complete, it returns the local path of the downloaded file. + + Args: + url (str): The HTTP URL of the file to be downloaded. + + Returns: + str: The local path of the downloaded file. 
+ + """ + + # Send an HTTP GET request to download the file + response = requests.get(url, stream=True) + response.raise_for_status() # Check if the request was successful + + if dest is None: + # Create a temporary file + temp_dir = tempfile.gettempdir() + temp_file_path = os.path.join(temp_dir, os.path.basename(url)) + dest = temp_file_path + + with open(dest, "wb") as temp_file: + # Write the downloaded content to the temporary file + for chunk in response.iter_content(chunk_size=1024**2): + temp_file.write(chunk) + + # Return the path of the temporary file + return temp_file.name diff --git a/kag/common/vectorize_model/__init__.py b/kag/common/vectorize_model/__init__.py new file mode 100644 index 00000000..1af8cfd3 --- /dev/null +++ b/kag/common/vectorize_model/__init__.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from kag.common.vectorize_model.local_bge_model import ( + LocalBGEVectorizeModel, + LocalBGEM3VectorizeModel, +) +from kag.common.vectorize_model.openai_model import OpenAIVectorizeModel +from kag.common.vectorize_model.mock_model import MockVectorizeModel +from kag.common.vectorize_model.vectorize_model_config_checker import ( + VectorizeModelConfigChecker, +) + + +__all__ = [ + "LocalBGEM3VectorizeModel", + "LocalBGEVectorizeModel", + "OpenAIVectorizeModel", + "MockVectorizeModel", + "VectorizeModelConfigChecker", +] diff --git a/kag/common/vectorize_model/local_bge_model.py b/kag/common/vectorize_model/local_bge_model.py new file mode 100644 index 00000000..b87d9d32 --- /dev/null +++ b/kag/common/vectorize_model/local_bge_model.py @@ -0,0 +1,201 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import os +import logging +import threading +from typing import Union, Iterable +from kag.interface import VectorizeModelABC, EmbeddingVector + +logger = logging.getLogger() + + +LOCAL_MODEL_MAP = {} + + +@VectorizeModelABC.register("bge") +class LocalBGEVectorizeModel(VectorizeModelABC): + """ + A class that extends the VectorizeModelABC base class. + It invokes local BGE embedding models to convert texts into embedding vectors. + """ + + _LOCK = threading.Lock() + + def __init__( + self, + path: str, + url: str = None, + query_instruction_for_retrieval: str = None, + vector_dimensions: int = None, + ): + """ + Initializes the LocalBGEVectorizeModel instance. + + Args: + path (str): The path to the local BGE model. + url (str, optional): The URL to download the model if not found locally. Defaults to None. + query_instruction_for_retrieval (str, optional): The query instruction for retrieval. Defaults to None. 
+ vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. + """ + super().__init__(vector_dimensions) + self.model_path = os.path.expanduser(path) + self.url = url + config_path = os.path.join(self.model_path, "config.json") + if not os.path.isfile(config_path): + if url is None: + message = f"model not found at {path!r}, nor model url specified" + raise RuntimeError(message) + logger.info("Model file not found in path, start downloading...") + self._download_model(self.model_path, self.url) + default_chinese_query_instruction_for_retrieval = "为这个句子生成表示以用于向量检索:" + default_english_query_instruction_for_retrieval = ( + "Represent this sentence for searching relevant passages:" + ) + if "BAAI/bge-base-zh-v1.5" in path: + default_query_instruction_for_retrieval = ( + default_chinese_query_instruction_for_retrieval + ) + else: + default_query_instruction_for_retrieval = ( + default_english_query_instruction_for_retrieval + ) + + if query_instruction_for_retrieval: + self.query_instruction_for_retrieval = query_instruction_for_retrieval + else: + self.query_instruction_for_retrieval = ( + default_query_instruction_for_retrieval + ) + with LocalBGEVectorizeModel._LOCK: + if self.model_path in LOCAL_MODEL_MAP: + logger.info("Found existing model, reuse.") + model = LOCAL_MODEL_MAP[self.model_path] + else: + model = self._load_model(self.model_path) + LOCAL_MODEL_MAP[self.model_path] = model + self.model = model + + def _load_model(self, path): + """ + Loads the BGE model from the specified path. + + Args: + path (str): The path to the BGE model. + + Returns: + FlagModel: The loaded BGE model. + """ + # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. + import sklearn # noqa + from FlagEmbedding import FlagModel + + logger.info( + f"Loading FlagModel from {path!r} with query_instruction_for_retrieval={self.query_instruction_for_retrieval!r}" + ) + model = FlagModel( + path, + query_instruction_for_retrieval=self.query_instruction_for_retrieval, + use_fp16=False, + ) + return model + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Vectorizes text(s) into embedding vector(s). + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). + """ + + result = self.model.encode(texts) + return result.tolist() + + +@VectorizeModelABC.register("bge_m3") +class LocalBGEM3VectorizeModel(VectorizeModelABC): + """ + A class that extends the VectorizeModelABC base class. + It invokes local BGE-M3 embedding models to convert texts into embedding vectors. + """ + + _LOCK = threading.Lock() + + def __init__( + self, + path: str, + url: str = None, + vector_dimensions: int = None, + ): + """ + Initializes the LocalBGEM3VectorizeModel instance. + + Args: + path (str): The path to the local BGE-M3 model. + url (str, optional): The URL to download the model if not found locally. Defaults to None. + vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. 
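+
+        Example config (illustrative; the local path is an assumption):
+            {"type": "bge_m3", "path": "~/models/bge-m3"}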
+ """ + super().__init__(vector_dimensions) + self.url = url + self.model_path = os.path.expanduser(path) + config_path = os.path.join(self.model_path, "config.json") + if not os.path.isfile(config_path): + if url is None: + message = f"model not found at {path!r}, nor model url specified" + raise RuntimeError(message) + self._download_model(path, url) + with LocalBGEM3VectorizeModel._LOCK: + if self.model_path in LOCAL_MODEL_MAP: + logger.info("Found existing model, reuse.") + model = LOCAL_MODEL_MAP[self.model_path] + else: + model = self._load_model(self.model_path) + LOCAL_MODEL_MAP[self.model_path] = model + self.model = model + + def _load_model(self, path): + """ + Loads the BGE-M3 model from the specified path. + + Args: + path (str): The path to the BGE-M3 model. + + Returns: + BGEM3FlagModel: The loaded BGE-M3 model. + """ + # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. + + import sklearn # noqa + from FlagEmbedding import BGEM3FlagModel + + logger.info(f"Loading BGEM3FlagModel from {path!r}") + model = BGEM3FlagModel(path, use_fp16=False) + return model + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Vectorizes text(s) into embedding vector(s). + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). + """ + result = self.model.encode(texts)["dense_vecs"] + return result.tolist() diff --git a/kag/common/vectorize_model/mock_model.py b/kag/common/vectorize_model/mock_model.py new file mode 100644 index 00000000..a930b576 --- /dev/null +++ b/kag/common/vectorize_model/mock_model.py @@ -0,0 +1,51 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import numpy as np +from typing import Union, Iterable +from kag.interface import VectorizeModelABC, EmbeddingVector + + +@VectorizeModelABC.register("mock") +class MockVectorizeModel(VectorizeModelABC): + """ + A mock implementation of the VectorizeModelABC class, used for testing purposes. + + This class provides a method to generate random embedding vectors for given texts. + """ + + def __init__( + self, + vector_dimensions: int = None, + ): + """ + Initializes the MockVectorizeModel instance. + + Args: + vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. + """ + super().__init__(vector_dimensions) + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Generates random embedding vectors for the given texts. + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). 
+ """ + if isinstance(texts, str): + return np.random.rand(self._vector_dimensions).tolist() + else: + return np.random.rand(len(texts), self._vector_dimensions).tolist() diff --git a/kag/common/vectorize_model/openai_model.py b/kag/common/vectorize_model/openai_model.py new file mode 100644 index 00000000..ab26860c --- /dev/null +++ b/kag/common/vectorize_model/openai_model.py @@ -0,0 +1,62 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +from typing import Union, Iterable +from openai import OpenAI +from kag.interface import VectorizeModelABC, EmbeddingVector + + +@VectorizeModelABC.register("openai") +class OpenAIVectorizeModel(VectorizeModelABC): + """ + A class that extends the VectorizeModelABC base class. + It invokes OpenAI or OpenAI-compatible embedding services to convert texts into embedding vectors. + """ + + def __init__( + self, + model: str = "text-embedding-3-small", + api_key: str = "", + base_url: str = "", + vector_dimensions: int = None, + ): + """ + Initializes the OpenAIVectorizeModel instance. + + Args: + model (str, optional): The model to use for embedding. Defaults to "text-embedding-3-small". + api_key (str, optional): The API key for accessing the OpenAI service. Defaults to "". + base_url (str, optional): The base URL for the OpenAI service. Defaults to "". + vector_dimensions (int, optional): The number of dimensions for the embedding vectors. Defaults to None. + """ + super().__init__(vector_dimensions) + self.client = OpenAI(api_key=api_key, base_url=base_url) + + def vectorize( + self, texts: Union[str, Iterable[str]] + ) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: + """ + Vectorizes a text string into an embedding vector or multiple text strings into multiple embedding vectors. + + Args: + texts (Union[str, Iterable[str]]): The text or texts to vectorize. + + Returns: + Union[EmbeddingVector, Iterable[EmbeddingVector]]: The embedding vector(s) of the text(s). + """ + results = self.client.embeddings.create(input=texts, model=self.model) + results = [item.embedding for item in results.data] + if isinstance(texts, str): + assert len(results) == 1 + return results[0] + else: + assert len(results) == len(texts) + return results diff --git a/kag/common/vectorize_model/vectorize_model_config_checker.py b/kag/common/vectorize_model/vectorize_model_config_checker.py new file mode 100644 index 00000000..2932d64c --- /dev/null +++ b/kag/common/vectorize_model/vectorize_model_config_checker.py @@ -0,0 +1,47 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +import json + + +class VectorizeModelConfigChecker: + """ + A class that checks whether the vectorizer configuration is valid. 
+ + This class provides a method to validate the vectorizer configuration and return the embedding vector dimensions if valid. + """ + + def check(self, vectorizer_config: str) -> int: + """ + Checks the vectorizer configuration. + + If the configuration is valid, it returns the actual embedding vector dimensions. + If the configuration is invalid, it raises a RuntimeError exception. + + Args: + vectorizer_config (str): The vectorizer configuration to be checked. + + Returns: + int: The embedding vector dimensions. + + Raises: + RuntimeError: If the configuration is invalid. + """ + try: + config = json.loads(vectorizer_config) + from kag.interface import VectorizeModelABC + + vectorizer = VectorizeModelABC.from_config(config) + res = vectorizer.vectorize("hello") + return len(res) + except Exception as ex: + message = "invalid vectorizer config: %s" % str(ex) + raise RuntimeError(message) from ex diff --git a/kag/common/vectorizer/__init__.py b/kag/common/vectorizer/__init__.py deleted file mode 100644 index b95190e0..00000000 --- a/kag/common/vectorizer/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from kag.common.vectorizer.local_bge_m3_vectorizer import LocalBGEM3Vectorizer -from kag.common.vectorizer.local_bge_vectorizer import LocalBGEVectorizer -from kag.common.vectorizer.openai_vectorizer import OpenAIVectorizer -from kag.common.vectorizer.vectorizer import Vectorizer -from kag.common.vectorizer.vectorizer_config_checker import VectorizerConfigChecker - - -__all__ = [ - "LocalBGEM3Vectorizer", - "LocalBGEVectorizer", - "OpenAIVectorizer", - "Vectorizer", - "VectorizerConfigChecker", -] diff --git a/kag/common/vectorizer/local_bge_m3_vectorizer.py b/kag/common/vectorizer/local_bge_m3_vectorizer.py deleted file mode 100644 index 75b0179b..00000000 --- a/kag/common/vectorizer/local_bge_m3_vectorizer.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import os -import threading -import tarfile -import requests -from typing import Any, Union, Iterable, Dict -from kag.common.vectorizer.vectorizer import Vectorizer - - -EmbeddingVector = Iterable[float] - - -class LocalBGEM3Vectorizer(Vectorizer): - """ - Invoke local bge-m3 embedding models to turn texts into embedding vectors. 
- """ - - _local_model_map = {} - _lock = threading.Lock() - - def __init__(self, config: Dict[str, Any]): - super().__init__(config) - path = config.get("path") - if path is None: - message = "model path is required" - raise RuntimeError(message) - url = config.get("url") - path = os.path.expanduser(path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - if url is None: - message = f"model not found at {path!r}, nor model url specified" - raise RuntimeError(message) - self._download_model(path, url) - self._path = path - self._url = url - with self._lock: - if path in self._local_model_map: - self._model = self._local_model_map[path] - else: - self._model = self._load_model(path) - self._local_model_map[path] = self._model - - @classmethod - def _from_config(cls, config: Dict[str, Any]) -> Vectorizer: - """ - Create vectorizer from `config`. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - vectorizer = cls(config) - return vectorizer - - def _download_model(self, path, url): - res = requests.get(url) - with io.BytesIO(res.content) as fileobj: - with tarfile.open(fileobj=fileobj) as tar: - tar.extractall(path=path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - message = f"model config not found at {config_path!r}, url {url!r} specified an invalid model" - raise RuntimeError(message) - - def _load_model(self, path): - # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. - import sklearn - from FlagEmbedding import BGEM3FlagModel - - print(f"Loading BGEM3FlagModel from {path!r}") - model = BGEM3FlagModel(path, use_fp16=True) - return model - - def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: - """ - Vectorize a text string into an embedding vector or multiple text strings into - multiple embedding vectors. - - :param texts: texts to vectorize - :type texts: str or Iterable[str] - :return: embedding vectors of the texts - :rtype: EmbeddingVector or Iterable[EmbeddingVector] - """ - result = self._model.encode(texts)["dense_vecs"] - return result.tolist() diff --git a/kag/common/vectorizer/local_bge_vectorizer.py b/kag/common/vectorizer/local_bge_vectorizer.py deleted file mode 100644 index 0869df15..00000000 --- a/kag/common/vectorizer/local_bge_vectorizer.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import os -import threading -import tarfile -import requests -from typing import Any, Union, Iterable, Dict -from kag.common.vectorizer.vectorizer import Vectorizer - - -EmbeddingVector = Iterable[float] - - -class LocalBGEVectorizer(Vectorizer): - """ - Invoke local bge embedding models to turn texts into embedding vectors. 
- """ - - _local_model_map = {} - _lock = threading.Lock() - - def __init__(self, config: Dict[str, Any]): - super().__init__(config) - path = config.get("path") - if path is None: - message = "model path is required" - raise RuntimeError(message) - url = config.get("url") - path = os.path.expanduser(path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - if url is None: - message = f"model not found at {path!r}, nor model url specified" - raise RuntimeError(message) - self._download_model(path, url) - default_chinese_query_instruction_for_retrieval = "为这个句子生成表示以用于向量检索:" - default_english_query_instruction_for_retrieval = "Represent this sentence for searching relevant passages:" - if "BAAI/bge-base-zh-v1.5" in path: - default_query_instruction_for_retrieval = default_chinese_query_instruction_for_retrieval - else: - default_query_instruction_for_retrieval = default_english_query_instruction_for_retrieval - query_instruction_for_retrieval = config.get("query_instruction_for_retrieval", default_query_instruction_for_retrieval) - self._path = path - self._url = url - self._query_instruction_for_retrieval = query_instruction_for_retrieval - with self._lock: - if path in self._local_model_map: - self._model = self._local_model_map[path] - else: - self._model = self._load_model(path) - self._local_model_map[path] = self._model - - @classmethod - def _from_config(cls, config: Dict[str, Any]) -> Vectorizer: - """ - Create vectorizer from `config`. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - vectorizer = cls(config) - return vectorizer - - def _download_model(self, path, url): - res = requests.get(url) - with io.BytesIO(res.content) as fileobj: - with tarfile.open(fileobj=fileobj) as tar: - tar.extractall(path=path) - config_path = os.path.join(path, "config.json") - if not os.path.isfile(config_path): - message = f"model config not found at {config_path!r}, url {url!r} specified an invalid model" - raise RuntimeError(message) - - def _load_model(self, path): - # We need to import sklearn at first, otherwise sklearn will fail on macOS with m chip. - import sklearn - from FlagEmbedding import FlagModel - - print(f"Loading FlagModel from {path!r} with query_instruction_for_retrieval={self._query_instruction_for_retrieval!r}") - model = FlagModel(path, - query_instruction_for_retrieval=self._query_instruction_for_retrieval, - use_fp16=True) - return model - - def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: - """ - Vectorize a text string into an embedding vector or multiple text strings into - multiple embedding vectors. - - :param texts: texts to vectorize - :type texts: str or Iterable[str] - :return: embedding vectors of the texts - :rtype: EmbeddingVector or Iterable[EmbeddingVector] - """ - result = self._model.encode(texts) - return result.tolist() diff --git a/kag/common/vectorizer/openai_vectorizer.py b/kag/common/vectorizer/openai_vectorizer.py deleted file mode 100644 index 13894b7a..00000000 --- a/kag/common/vectorizer/openai_vectorizer.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -from typing import Any, Union, Iterable, Dict -from openai import OpenAI -from kag.common.vectorizer.vectorizer import Vectorizer - - -EmbeddingVector = Iterable[float] - - -class OpenAIVectorizer(Vectorizer): - """ - Invoke OpenAI or OpenAI-compatible embedding services to turn texts into embedding vectors. - """ - - def __init__(self, config: Dict[str, Any]): - super().__init__(config) - self.model = config.get("model","text-embedding-3-small") - self.api_key = config.get("api_key") - self.base_url = config.get("base_url") - if not self.api_key: - raise ValueError("OpenAI API key is not set") - self.client = OpenAI(api_key=self.api_key, base_url=self.base_url) - - @classmethod - def _from_config(cls, config: Dict[str, Any]) -> Vectorizer: - """ - Create vectorizer from `config`. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - vectorizer = cls(config) - return vectorizer - - def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]: - """ - Vectorize a text string into an embedding vector or multiple text strings into - multiple embedding vectors. - - :param texts: texts to vectorize - :type texts: str or Iterable[str] - :return: embedding vectors of the texts - :rtype: EmbeddingVector or Iterable[EmbeddingVector] - """ - results = self.client.embeddings.create(input=texts, model=self.model) - results = [item.embedding for item in results.data] - if isinstance(texts, str): - assert len(results) == 1 - return results[0] - else: - assert len(results) == len(texts) - return results diff --git a/kag/common/vectorizer/vectorizer.py b/kag/common/vectorizer/vectorizer.py deleted file mode 100644 index 3a32123d..00000000 --- a/kag/common/vectorizer/vectorizer.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import io -import json -from pathlib import Path -from abc import ABC, abstractmethod -from typing import Any, Union, Iterable, Optional, Dict - -EmbeddingVector = Iterable[float] - - -class Vectorizer(ABC): - """ - Vectorizer turns texts into embedding vectors. - """ - - def __init__(self, config: Dict[str, Any]): - self._config = config - self._vector_dimensions = None - - @classmethod - def from_config(cls, config: Union[str, Path, Dict[str, Any]]) -> "Vectorizer": - """ - Create vectorizer from `config`. - - If `config` is a string or path, it will be loaded as a dictionary depending - on its file extension. 
Currently, the following formats are supported: - - * .json: JSON - * .json5: JSON with comments support - * .yaml: YAML - - :param config: vectorizer config - :type config: str, Path or Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - from kag.common.utils import dynamic_import_class - - if isinstance(config, (str, Path)): - config_path = config - if not isinstance(config_path, Path): - config_path = Path(config_path) - if config_path.name.endswith(".yaml"): - import yaml - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = yaml.safe_load(fin) - elif config_path.name.endswith(".json5"): - import json5 - - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json5.load(fin) - elif config_path.name.endswith(".json"): - with io.open(config_path, "r", encoding="utf-8") as fin: - config = json.load(fin) - else: - message = "only .json, .json5 and .yaml are supported currently; " - message += "can not load vectorizer config from %r" % str(config_path) - raise RuntimeError(message) - elif isinstance(config, dict): - pass - else: - message = "only str, Path and dict are supported; " - message += "invalid vectorizer config: %r" % (config,) - raise RuntimeError(message) - - class_name = config.get("vectorizer") - if class_name is None: - message = "vectorizer class name is not specified" - raise RuntimeError(message) - vectorizer_class = dynamic_import_class(class_name, "vectorizer") - if not issubclass(vectorizer_class, Vectorizer): - message = "class %r is not a vectorizer class" % (class_name,) - raise RuntimeError(message) - vectorizer = vectorizer_class._from_config(config) - return vectorizer - - @classmethod - @abstractmethod - def _from_config(cls, config: Dict[str, Any]) -> "Vectorizer": - """ - Create vectorizer from `config`. This method is supposed to be implemented - by derived classes. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: vectorizer instance - :rtype: Vectorizer - """ - message = "abstract method _from_config is not implemented" - raise NotImplementedError(message) - - def _get_vector_dimensions(self, config: Dict[str, Any]) -> Optional[int]: - """ - Get embedding vector dimensions from `config`. - - * If vector dimensions is not specified in `config`, return None. - - * If vector dimensions is specified in `config` but not a positive integer, - raise an exception. - - :param config: vectorizer config - :type config: Dict[str, Any] - :return: embedding vector dimensions or None - :rtype: Optional[int] - """ - value = config.get("vector_dimensions") - if value is None: - return None - if isinstance(value, str): - try: - value = int(value) - except ValueError as ex: - message = "vector_dimensions must be integer; " - message += "%r is invalid" % (value,) - raise RuntimeError(message) from ex - if not isinstance(value, int) or value <= 0: - message = "vector_dimensions must be positive-integer; " - message += "%r is invalid" % (value,) - raise RuntimeError(message) - return value - - @property - def vector_dimensions(self): - """ - Dimension of generated embedding vectors. - """ - if self._vector_dimensions is not None: - return self._vector_dimensions - try: - example_input = "This is a test." 
-            example_vector = self.vectorize(example_input)
-        except Exception as ex:
-            message = "the embedding service is not available"
-            raise RuntimeError(message) from ex
-        value = self._get_vector_dimensions(self._config)
-        if value is not None and value != len(example_vector):
-            message = "invalid 'vector_dimensions', specified %d; " % value
-            message += "but the actual generated embedding vector is of %d dimensions" % len(example_vector)
-            raise RuntimeError(message)
-        self._vector_dimensions = len(example_vector)
-        return self._vector_dimensions
-
-    @abstractmethod
-    def vectorize(self, texts: Union[str, Iterable[str]]) -> Union[EmbeddingVector, Iterable[EmbeddingVector]]:
-        """
-        Vectorize a text string into an embedding vector or multiple text strings into
-        multiple embedding vectors.
-
-        :param texts: texts to vectorize
-        :type texts: str or Iterable[str]
-        :return: embedding vectors of the texts
-        :rtype: EmbeddingVector or Iterable[EmbeddingVector]
-        """
-        message = "abstract method vectorize is not implemented"
-        raise NotImplementedError(message)
diff --git a/kag/common/vectorizer/vectorizer_config_checker.py b/kag/common/vectorizer/vectorizer_config_checker.py
deleted file mode 100644
index 2177c25d..00000000
--- a/kag/common/vectorizer/vectorizer_config_checker.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2023 OpenSPG Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.
-
-import json
-from kag.common.vectorizer.vectorizer import Vectorizer
-
-
-class VectorizerConfigChecker(object):
-    """
-    Check whether the vectorizer config is valid.
-    """
-
-    def check(self, vectorizer_config: str) -> int:
-        """
-        Check the vectorizer config.
-
-        * If the config is valid, return the actual embedding vector dimensions.
-
-        * If the config is invalid, raise a RuntimeError exception.
-
-        :param vectorizer_config: vectorizer config
-        :type vectorizer_config: str
-        :return: embedding vector dimensions
-        :rtype: int
-        :raises RuntimeError: if the config is invalid
-        """
-        try:
-            config = json.loads(vectorizer_config)
-            vectorizer = Vectorizer.from_config(config)
-            vector_dimensions = vectorizer.vector_dimensions
-            return vector_dimensions
-        except Exception as ex:
-            message = "invalid vectorizer config: %s" % str(ex)
-            raise RuntimeError(message) from ex
diff --git a/kag/examples/2wiki/.gitignore b/kag/examples/2wiki/.gitignore
new file mode 100644
index 00000000..5c920f23
--- /dev/null
+++ b/kag/examples/2wiki/.gitignore
@@ -0,0 +1,3 @@
+ckpt/
+/solver/2wiki_res_*.json
+/solver/2wiki_metrics_*.json
diff --git a/kag/examples/2wiki/README.md b/kag/examples/2wiki/README.md
new file mode 100644
index 00000000..5f9c214a
--- /dev/null
+++ b/kag/examples/2wiki/README.md
@@ -0,0 +1,69 @@
+# KAG Example: TwoWiki
+
+[2WikiMultiHopQA](https://arxiv.org/abs/2011.01060) is a multi-hop QA dataset
+for comprehensive evaluation of reasoning steps. It's used by [KAG](https://arxiv.org/abs/2409.13731)
+and [HippoRAG](https://arxiv.org/abs/2405.14831) for multi-hop question answering
+performance evaluation.
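+
+Each record in the corpus file pairs a passage title with its text. As a
+minimal sketch, you can inspect the bundled sample corpus like this (the
+``title``/``text`` field names are illustrative; check the sample file for
+the exact schema):
+
+```python
+import json
+
+with open("builder/data/2wiki_sub_corpus.json") as f:
+    corpus = json.load(f)  # a list of {"title": ..., "text": ...} records
+
+print(corpus[0]["title"])
+```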
+
+Here we demonstrate how to build a knowledge graph for the 2WikiMultiHopQA dataset,
+generate answers to the evaluation questions with KAG and calculate EM and F1
+metrics of the KAG generated answers compared to the ground-truth answers.
+
+## Steps to reproduce
+
+1. Follow the Quick Start guide of KAG to install the OpenSPG server and KAG.
+
+   The following steps assume the Python virtual environment with KAG installed
+   is activated and the current directory is [2wiki](.).
+
+2. (Optional) Update [indexer.py](./builder/indexer.py) and [evaFor2wiki.py](./solver/evaFor2wiki.py)
+   to use the larger dataset. You may want to skip this step the first time and
+   use the small dataset to get started quickly.
+
+3. Update the ``openie_llm``, ``chat_llm`` and ``vectorize_model`` configurations
+   in [kag_config.yaml](./kag_config.yaml) properly.
+
+4. Restore the KAG project.
+
+   ```bash
+   knext project restore --host_addr http://127.0.0.1:8887 --proj_path .
+   ```
+
+5. Commit the schema.
+
+   ```bash
+   knext schema commit
+   ```
+
+6. Execute [indexer.py](./builder/indexer.py) in the [builder](./builder) directory to build the knowledge graph.
+
+   ```bash
+   cd builder && python indexer.py && cd ..
+   ```
+
+7. Execute [evaFor2wiki.py](./solver/evaFor2wiki.py) in the [solver](./solver) directory
+   to generate the answers and calculate the EM and F1 metrics.
+
+   ```bash
+   cd solver && python evaFor2wiki.py && cd ..
+   ```
+
+   The generated answers are saved to ``./solver/2wiki_res_*.json``.
+
+   The calculated EM and F1 metrics are saved to ``./solver/2wiki_metrics_*.json``.
+
+8. (Optional) To delete checkpoints, execute the following commands.
+
+   ```bash
+   rm -rf ./builder/ckpt
+   rm -rf ./solver/ckpt
+   ```
+
+   To delete the KAG project and its knowledge graph, execute a command similar
+   to the following, replacing the OpenSPG server address and KAG project id
+   with the actual values.
+
+   ```bash
+   curl http://127.0.0.1:8887/project/api/delete?projectId=1
+   ```
+
+9. (Optional) Restart from Step 2 and try the larger dataset.
diff --git a/kag/examples/2wiki/builder/__init__.py b/kag/examples/2wiki/builder/__init__.py
index 94be39bc..7a018e7c 100644
--- a/kag/examples/2wiki/builder/__init__.py
+++ b/kag/examples/2wiki/builder/__init__.py
@@ -11,4 +11,4 @@
 
 """
 Builder Dir.
-"""
\ No newline at end of file
+"""
diff --git a/kag/examples/2wiki/builder/data/__init__.py b/kag/examples/2wiki/builder/data/__init__.py
index 6a8637b9..59bacd4d 100644
--- a/kag/examples/2wiki/builder/data/__init__.py
+++ b/kag/examples/2wiki/builder/data/__init__.py
@@ -11,4 +11,4 @@
 
 """
 Place the files to be used for building the index in this directory.
-"""
\ No newline at end of file
+"""
diff --git a/kag/examples/2wiki/builder/indexer.py b/kag/examples/2wiki/builder/indexer.py
index 8f687ec1..67332d01 100644
--- a/kag/examples/2wiki/builder/indexer.py
+++ b/kag/examples/2wiki/builder/indexer.py
@@ -8,74 +8,27 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied.
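+
+# NOTE: pipeline assembly is now driven by the ``kag_builder_pipeline``
+# section of kag_config.yaml rather than hand-wired components; see
+# buildKB() below.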
-import json
-import logging
 import os
-from typing import Type, List
+import logging
+from kag.common.registry import import_modules_from_path
 
-from kag.builder.component import KGWriter
-from kag.builder.component.extractor import KAGExtractor
-from kag.builder.component.splitter import LengthSplitter
-from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer
-from kag.builder.model.chunk import Chunk
-from kag.examples.utils import generate_hash_id
-from kag.interface.builder.reader_abc import SourceReaderABC
-from knext.common.base.runnable import Input, Output
-from knext.builder.builder_chain_abc import BuilderChainABC
+from kag.builder.runner import BuilderChainRunner
 
 logger = logging.getLogger(__name__)
 
 
-class TwowikiCorpusReader(SourceReaderABC):
-    @property
-    def input_types(self) -> Type[Input]:
-        """The type of input this Runnable object accepts specified as a type annotation."""
-        return str
-
-    @property
-    def output_types(self) -> Type[Output]:
-        """The type of output this Runnable object produces specified as a type annotation."""
-        return Chunk
-
-    def invoke(self, input: str, **kwargs) -> List[Output]:
-        if os.path.exists(str(input)):
-            with open(input, "r") as f:
-                corpus = json.load(f)
-        else:
-            corpus = json.loads(input)
-        chunks = []
-
-        for idx, item in enumerate(corpus):
-            chunk = Chunk(
-                id=generate_hash_id(item['text']),
-                name=item['title'],
-                content=item['text'],
-            )
-            chunks.append(chunk)
-        return chunks
-
-
-class TwowikiBuilderChain(BuilderChainABC):
-    def build(self, **kwargs):
-        source = TwowikiCorpusReader()
-        splitter = LengthSplitter(split_length=2000)
-        extractor = KAGExtractor()
-        vectorizer = BatchVectorizer()
-        sink = KGWriter()
-
-        return source >> splitter >> extractor >> vectorizer >> sink
+def buildKB(file_path):
+    from kag.common.conf import KAG_CONFIG
 
+    runner = BuilderChainRunner.from_config(
+        KAG_CONFIG.all_config["kag_builder_pipeline"]
+    )
+    runner.invoke(file_path)
 
-def buildKB(corpusFilePath):
-    TwowikiBuilderChain().invoke(file_path=corpusFilePath, max_workers=20)
+    logger.info(f"\n\nbuildKB successfully for {file_path}\n\n")
 
-    logger.info(f"\n\nbuildKB successfully for {corpusFilePath}\n\n")
 
+if __name__ == "__main__":
+    import_modules_from_path(".")
+    dir_path = os.path.dirname(__file__)
+    file_path = os.path.join(dir_path, "data/2wiki_sub_corpus.json")
 
-if __name__ == '__main__':
-    filePath = "./data/2wiki_sub_corpus.json"
-    # filePath = "./data/2wiki_corpus.json"
-    corpusFilePath = os.path.join(
-        os.path.abspath(os.path.dirname(__file__)), filePath
-    )
-    buildKB(corpusFilePath)
+    buildKB(file_path)
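+
+# To index the full corpus instead of the bundled sample, point buildKB at the
+# larger file (sketch; the filename mirrors the path commented out in the
+# previous version of this script):
+#     buildKB(os.path.join(dir_path, "data/2wiki_corpus.json"))
diff --git a/kag/examples/2wiki/builder/prompt/ner.py b/kag/examples/2wiki/builder/prompt/ner.py
deleted file mode 100644
index cf5aa897..00000000
--- a/kag/examples/2wiki/builder/prompt/ner.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2023 OpenSPG Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied.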
- -import json -from string import Template -from typing import List, Optional - -from kag.common.base.prompt_op import PromptOp -from knext.schema.client import SchemaClient - - -class OpenIENERPrompt(PromptOp): - - template_en = """ - { - "instruction": "You're a very effective entity extraction system. Please extract all the entities that are important for knowledge build and question, along with type, category and a brief description of the entity. The description of the entity is based on your OWN KNOWLEDGE AND UNDERSTANDING and does not need to be limited to the context. the entity's category belongs taxonomically to one of the items defined by schema, please also output the category. Note: Type refers to a specific, well-defined classification, such as Professor, Actor, while category is a broader group or class that may contain more than one type, such as Person, Works. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.You can refer to the example for extraction.", - "schema": $schema, - "example": [ - { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", - "output": [ - { - "entity": "The Rezort", - "type": "Movie", - "category": "Works", - "description": "A 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger." - }, - { - "entity": "2015", - "type": "Year", - "category": "Date", - "description": "The year the movie 'The Rezort' was released." - }, - { - "entity": "British", - "type": "Nationality", - "category": "GeographicLocation", - "description": "Great Britain, the island that includes England, Scotland, and Wales." - }, - { - "entity": "Steve Barker", - "type": "Director", - "category": "Person", - "description": "Steve Barker is an English film director and screenwriter." - }, - { - "entity": "Paul Gerstenberger", - "type": "Writer", - "category": "Person", - "description": "Paul is a writer and producer, known for The Rezort (2015), Primeval (2007) and House of Anubis (2011)." - }, - { - "entity": "Dougray Scott", - "type": "Actor", - "category": "Person", - "description": "Stephen Dougray Scott (born 26 November 1965) is a Scottish actor." - }, - { - "entity": "Jessica De Gouw", - "type": "Actor", - "category": "Person", - "description": "Jessica Elise De Gouw (born 15 February 1988) is an Australian actress. " - }, - { - "entity": "Martin McCann", - "type": "Actor", - "category": "Person", - "description": "Martin McCann is an actor from Northern Ireland. 
In 2020, he was listed as number 48 on The Irish Times list of Ireland's greatest film actors" - } - ] - } - ], - "input": "$input" -} - """ - - template_zh = template_en - - def __init__( - self, language: Optional[str] = "en", **kwargs - ): - super().__init__(language, **kwargs) - self.schema = SchemaClient(project_id=self.project_id).extract_types() - self.template = Template(self.template).safe_substitute(schema=self.schema) - - @property - def template_variables(self) -> List[str]: - return ["input"] - - def parse_response(self, response: str, **kwargs): - rsp = response - if isinstance(rsp, str): - rsp = json.loads(rsp) - if isinstance(rsp, dict) and "output" in rsp: - rsp = rsp["output"] - if isinstance(rsp, dict) and "named_entities" in rsp: - entities = rsp["named_entities"] - else: - entities = rsp - - return entities diff --git a/kag/examples/2wiki/builder/prompt/std.py b/kag/examples/2wiki/builder/prompt/std.py deleted file mode 100644 index 1dfcfaaa..00000000 --- a/kag/examples/2wiki/builder/prompt/std.py +++ /dev/null @@ -1,114 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import json -from typing import Optional, List - -from kag.common.base.prompt_op import PromptOp - - -class OpenIEEntitystandardizationdPrompt(PromptOp): - template_en = """ -{ - "instruction": "The `input` field contains a user provided context. The `named_entities` field contains extracted named entities from the context, which may be unclear abbreviations, aliases, or slang. To eliminate ambiguity, please attempt to provide the official names of these entities based on the context and your own knowledge. Note that entities with the same meaning can only have ONE official name. Please respond in the format of a single JSONArray string without any explanation, as shown in the `output` field of the provided example.", - "example": { - "input": "American History\nWhen did the political party that favored harsh punishment of southern states after the Civil War, gain control of the House? 
Republicans regained control of the chamber they had lost in the 2006 midterm elections.", - "named_entities": [ - {"entity": "American", "category": "GeographicLocation"}, - {"entity": "political party", "category": "Organization"}, - {"entity": "southern states", "category": "GeographicLocation"}, - {"entity": "Civil War", "category": "Keyword"}, - {"entity": "House", "category": "Organization"}, - {"entity": "Republicans", "category": "Organization"}, - {"entity": "chamber", "category": "Organization"}, - {"entity": "2006 midterm elections", "category": "Date"} - ], - "output": [ - { - "entity": "American", - "category": "GeographicLocation", - "official_name": "United States of America" - }, - { - "entity": "political party", - "category": "Organization", - "official_name": "Radical Republicans" - }, - { - "entity": "southern states", - "category": "GeographicLocation", - "official_name": "Confederacy" - }, - { - "entity": "Civil War", - "category": "Keyword", - "official_name": "American Civil War" - }, - { - "entity": "House", - "category": "Organization", - "official_name": "United States House of Representatives" - }, - { - "entity": "Republicans", - "category": "Organization", - "official_name": "Republican Party" - }, - { - "entity": "chamber", - "category": "Organization", - "official_name": "United States House of Representatives" - }, - { - "entity": "midterm elections", - "category": "Date", - "official_name": "United States midterm elections" - } - ] - }, - "input": "$input", - "named_entities": $named_entities -} - """ - - template_zh = """""" - - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - - @property - def template_variables(self) -> List[str]: - return ["input", "named_entities"] - - def parse_response(self, response: str, **kwargs): - - rsp = response - if isinstance(rsp, str): - rsp = json.loads(rsp) - if isinstance(rsp, dict) and "output" in rsp: - rsp = rsp["output"] - if isinstance(rsp, dict) and "named_entities" in rsp: - standardized_entity = rsp["named_entities"] - else: - standardized_entity = rsp - entities_with_offical_name = set() - merged = [] - entities = kwargs.get("named_entities", []) - for entity in standardized_entity: - merged.append(entity) - entities_with_offical_name.add(entity["entity"]) - # in case llm ignores some entities - for entity in entities: - if entity["entity"] not in entities_with_offical_name: - entity["official_name"] = entity["entity"] - merged.append(entity) - return merged diff --git a/kag/examples/2wiki/builder/prompt/triple.py b/kag/examples/2wiki/builder/prompt/triple.py deleted file mode 100644 index 9e375e2c..00000000 --- a/kag/examples/2wiki/builder/prompt/triple.py +++ /dev/null @@ -1,177 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 OpenSPG Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. - -import json -from typing import Optional, List - -from kag.common.base.prompt_op import PromptOp - - -class OpenIETriplePrompt(PromptOp): - template_en = """ -{ - "instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). 
Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}\n. If there are none, do not list them.\n.\n\nPay attention to the following requirements:\n- Each triple should contain at least one, but preferably two, of the named entities in the entity_list.\n- Clearly resolve pronouns to their specific names to maintain clarity.", - "entity_list": $entity_list, - "input": "$input", - "example": { - "input": "The Rezort\nThe Rezort is a 2015 British zombie horror film directed by Steve Barker and written by Paul Gerstenberger.\n It stars Dougray Scott, Jessica De Gouw and Martin McCann.\n After humanity wins a devastating war against zombies, the few remaining undead are kept on a secure island, where they are hunted for sport.\n When something goes wrong with the island's security, the guests must face the possibility of a new outbreak.", - "entity_list": [ - { - "entity": "The Rezort", - "category": "Works" - }, - { - "entity": "2015", - "category": "Others" - }, - { - "entity": "British", - "category": "GeographicLocation" - }, - { - "entity": "Steve Barker", - "category": "Person" - }, - { - "entity": "Paul Gerstenberger", - "category": "Person" - }, - { - "entity": "Dougray Scott", - "category": "Person" - }, - { - "entity": "Jessica De Gouw", - "category": "Person" - }, - { - "entity": "Martin McCann", - "category": "Person" - }, - { - "entity": "zombies", - "category": "Creature" - }, - { - "entity": "zombie horror film", - "category": "Concept" - }, - { - "entity": "humanity", - "category": "Concept" - }, - { - "entity": "secure island", - "category": "GeographicLocation" - } - ], - "output": [ - [ - "The Rezort", - "is", - "zombie horror film" - ], - [ - "The Rezort", - "publish at", - "2015" - ], - [ - "The Rezort", - "released", - "British" - ], - [ - "The Rezort", - "is directed by", - "Steve Barker" - ], - [ - "The Rezort", - "is written by", - "Paul Gerstenberger" - ], - [ - "The Rezort", - "stars", - "Dougray Scott" - ], - [ - "The Rezort", - "stars", - "Jessica De Gouw" - ], - [ - "The Rezort", - "stars", - "Martin McCann" - ], - [ - "humanity", - "wins", - "a devastating war against zombies" - ], - [ - "the few remaining undead", - "are kept on", - "a secure island" - ], - [ - "they", - "are hunted for", - "sport" - ], - [ - "something", - "goes wrong with", - "the island's security" - ], - [ - "the guests", - "must face", - "the possibility of a new outbreak" - ] - ] - } -} - """ - - def __init__(self, language: Optional[str] = "en"): - super().__init__(language) - - @property - def template_variables(self) -> List[str]: - return ["entity_list", "input"] - - def parse_response(self, response: str, **kwargs): - rsp = response - if isinstance(rsp, str): - rsp = json.loads(rsp) - if isinstance(rsp, dict) and "output" in rsp: - rsp = rsp["output"] - if isinstance(rsp, dict) and "triples" in rsp: - triples = rsp["triples"] - else: - triples = rsp - - standardized_triples = [] - for triple in triples: - if isinstance(triple, list): - standardized_triples.append(triple) - elif isinstance(triple, dict): - s = triple.get("subject") - p = triple.get("predicate") - o = triple.get("object") - if s and p and o: - standardized_triples.append([s, p, o]) - - return standardized_triples diff --git a/kag/examples/2wiki/kag_config.cfg b/kag/examples/2wiki/kag_config.cfg deleted file mode 100644 index 55ded269..00000000 --- a/kag/examples/2wiki/kag_config.cfg 
+++ /dev/null @@ -1,27 +0,0 @@ -[project] -namespace = TwoWiki -host_addr = http://127.0.0.1:8887 -id = 11 - -[vectorizer] -vectorizer = kag.common.vectorizer.OpenAIVectorizer -model = bge-m3 -api_key = EMPTY -base_url = http://127.0.0.1:11434/v1 -vector_dimensions = 1024 - -[llm] -client_type = maas -base_url = https://api.deepseek.com/ -api_key = put your deepseek api key here -model = deepseek-chat - -[log] -level = INFO - -[qa] -force_chunk_retriever = True - -[prompt] -language = en -biz_scene = default \ No newline at end of file diff --git a/kag/examples/2wiki/kag_config.yaml b/kag/examples/2wiki/kag_config.yaml new file mode 100644 index 00000000..ac2c8110 --- /dev/null +++ b/kag/examples/2wiki/kag_config.yaml @@ -0,0 +1,126 @@ +#------------project configuration start----------------# +openie_llm: &openie_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +chat_llm: &chat_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +vectorize_model: &vectorize_model + api_key: key + base_url: https://api.siliconflow.cn/v1/ + model: BAAI/bge-m3 + type: openai + vector_dimensions: 1024 +vectorizer: *vectorize_model + +log: + level: INFO + +project: + biz_scene: default + host_addr: http://127.0.0.1:8887 + id: '7' + language: en + namespace: TwoWiki +#------------project configuration end----------------# + +#------------kag-builder configuration start----------------# +kag_builder_pipeline: + chain: + type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain + extractor: + type: schema_free_extractor # kag.builder.component.extractor.schema_free_extractor.SchemaFreeExtractor + llm: *openie_llm + ner_prompt: + type: default_ner # kag.builder.prompt.default.ner.OpenIENERPrompt + std_prompt: + type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt + triple_prompt: + type: default_triple # kag.builder.prompt.default.triple.OpenIETriplePrompt + reader: + type: dict_reader # kag.builder.component.reader.dict_reader.DictReader + post_processor: + type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor + similarity_threshold: 0.9 + splitter: + type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter + split_length: 100000 + window_length: 0 + vectorizer: + type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer + vectorize_model: *vectorize_model + writer: + type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter + num_threads_per_chain: 1 + num_chains: 16 + scanner: + type: 2wiki_dataset_scanner # kag.builder.component.scanner.dataset_scanner.MusiqueCorpusScanner +#------------kag-builder configuration end----------------# + +#------------kag-solver configuration start----------------# +search_api: &search_api + type: openspg_search_api #kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI + +graph_api: &graph_api + type: openspg_graph_api #kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi + +exact_kg_retriever: &exact_kg_retriever + type: default_exact_kg_retriever # kag.solver.retriever.impl.default_exact_kg_retriever.DefaultExactKgRetriever + el_num: 5 + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +fuzzy_kg_retriever: &fuzzy_kg_retriever + type: default_fuzzy_kg_retriever # kag.solver.retriever.impl.default_fuzzy_kg_retriever.DefaultFuzzyKgRetriever + el_num: 5 + 
vectorize_model: *vectorize_model
+  llm_client: *chat_llm
+  search_api: *search_api
+  graph_api: *graph_api
+
+chunk_retriever: &chunk_retriever
+  type: default_chunk_retriever # kag.solver.retriever.impl.default_chunk_retriever.DefaultChunkRetriever
+  llm_client: *chat_llm
+  recall_num: 10
+  rerank_topk: 10
+
+kag_solver_pipeline:
+  memory:
+    type: default_memory # kag.solver.implementation.default_memory.DefaultMemory
+    llm_client: *chat_llm
+  max_iterations: 3
+  reasoner:
+    type: default_reasoner # kag.solver.implementation.default_reasoner.DefaultReasoner
+    llm_client: *chat_llm
+    lf_planner:
+      type: default_lf_planner # kag.solver.plan.default_lf_planner.DefaultLFPlanner
+      llm_client: *chat_llm
+      vectorize_model: *vectorize_model
+    lf_executor:
+      type: default_lf_executor # kag.solver.execute.default_lf_executor.DefaultLFExecutor
+      llm_client: *chat_llm
+      force_chunk_retriever: true
+      exact_kg_retriever: *exact_kg_retriever
+      fuzzy_kg_retriever: *fuzzy_kg_retriever
+      chunk_retriever: *chunk_retriever
+      merger:
+        type: default_lf_sub_query_res_merger # kag.solver.execute.default_sub_query_merger.DefaultLFSubQueryResMerger
+        vectorize_model: *vectorize_model
+        chunk_retriever: *chunk_retriever
+  generator:
+    type: default_generator # kag.solver.implementation.default_generator.DefaultGenerator
+    llm_client: *chat_llm
+    generate_prompt:
+      type: resp_simple # kag/examples/2wiki/solver/prompt/resp_generator.py
+  reflector:
+    type: default_reflector # kag.solver.implementation.default_reflector.DefaultReflector
+    llm_client: *chat_llm
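+
+# A minimal usage sketch of this solver section (mirrors solver/evaFor2wiki.py;
+# assumes init_env() has already loaded this file into KAG_CONFIG):
+#
+#     from kag.common.conf import KAG_CONFIG
+#     from kag.solver.logic.solver_pipeline import SolverPipeline
+#
+#     pipeline = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"])
+#     answer, trace_log = pipeline.run("When did Lothair Ii's mother die?")
+
+#------------kag-solver configuration end----------------#
diff --git a/kag/examples/2wiki/reasoner/__init__.py b/kag/examples/2wiki/reasoner/__init__.py
index a0c4032b..8b8a3c91 100644
--- a/kag/examples/2wiki/reasoner/__init__.py
+++ b/kag/examples/2wiki/reasoner/__init__.py
@@ -17,4 +17,4 @@
 MATCH (s:DEFAULT.Company)
 RETURN s.id, s.address
 ```
-"""
\ No newline at end of file
+"""
diff --git a/kag/examples/2wiki/schema/__init__.py b/kag/examples/2wiki/schema/__init__.py
index ef3dde6d..8ac86acc 100644
--- a/kag/examples/2wiki/schema/__init__.py
+++ b/kag/examples/2wiki/schema/__init__.py
@@ -15,4 +15,4 @@
 
 You can execute `kag schema commit` to commit your schema to SPG server.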
-""" \ No newline at end of file +""" diff --git a/kag/examples/2wiki/solver/evaFor2wiki.py b/kag/examples/2wiki/solver/evaFor2wiki.py index d6a533b1..76cd4a4f 100644 --- a/kag/examples/2wiki/solver/evaFor2wiki.py +++ b/kag/examples/2wiki/solver/evaFor2wiki.py @@ -7,27 +7,29 @@ from tqdm import tqdm from kag.common.benchmarks.evaluate import Evaluate -from kag.common.env import init_kag_config from kag.solver.logic.solver_pipeline import SolverPipeline +from kag.common.conf import KAG_CONFIG +from kag.common.registry import import_modules_from_path + +from kag.common.checkpointer import CheckpointerManager logger = logging.getLogger(__name__) class EvaFor2wiki: - """ init for kag client """ - def __init__(self, configFilePath): - self.configFilePath = configFilePath - init_kag_config(self.configFilePath) + + def __init__(self): + pass """ qa from knowledge base, """ + def qa(self, query): - # CA - resp = SolverPipeline() + resp = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"]) answer, traceLog = resp.run(query) logger.info(f"\n\nso the answer for '{query}' is: {answer}\n\n") @@ -37,19 +39,29 @@ def qa(self, query): parallel qa from knowledge base and getBenchmarks(em, f1, answer_similarity) """ + def parallelQaAndEvaluate( self, qaFilePath, resFilePath, threadNum=1, upperLimit=10 ): + ckpt = CheckpointerManager.get_checkpointer( + {"type": "zodb", "ckpt_dir": "ckpt"} + ) + def process_sample(data): try: sample_idx, sample = data sample_id = sample["_id"] question = sample["question"] gold = sample["answer"] - prediction, traceLog = self.qa(question) - - evaObj = Evaluate() - metrics = evaObj.getBenchMark([prediction], [gold]) + if question in ckpt: + print(f"found existing answer to question: {question}") + prediction, traceLog = ckpt.read_from_ckpt(question) + else: + prediction, traceLog = self.qa(question) + ckpt.write_to_ckpt(question, (prediction, traceLog)) + + evalObj = Evaluate() + metrics = evalObj.getBenchMark([prediction], [gold]) return sample_idx, sample_id, prediction, metrics, traceLog except Exception as e: import traceback @@ -104,30 +116,28 @@ def process_sample(data): res_metrics[item_key] = item_value / total_metrics["processNum"] else: res_metrics[item_key] = total_metrics["processNum"] + CheckpointerManager.close() return res_metrics if __name__ == "__main__": - configFilePath = os.path.join( - os.path.abspath(os.path.dirname(__file__)), "../kag_config.cfg" - ) - evalObj = EvaFor2wiki(configFilePath=configFilePath) + import_modules_from_path("./prompt") + evalObj = EvaFor2wiki() + start_time = time.time() filePath = "./data/2wiki_qa_sub.json" - # filePath = "./data/2wiki_qa.json" - qaFilePath = os.path.join( - os.path.abspath(os.path.dirname(__file__)), filePath - ) - start_time = time.time() + evalObj.qa("When did Lothair Ii's mother die?") + + qaFilePath = os.path.join(os.path.abspath(os.path.dirname(__file__)), filePath) resFilePath = os.path.join( - os.path.abspath(os.path.dirname(__file__)), f"2wiki_qa_res_{start_time}.json" + os.path.abspath(os.path.dirname(__file__)), f"2wiki_res_{start_time}.json" ) total_metrics = evalObj.parallelQaAndEvaluate( - qaFilePath, resFilePath, threadNum=20, upperLimit=1000 + qaFilePath, resFilePath, threadNum=20, upperLimit=10000 ) - total_metrics['cost'] = time.time() - start_time + + total_metrics["cost"] = time.time() - start_time with open(f"./2wiki_metrics_{start_time}.json", "w") as f: json.dump(total_metrics, f) - print(total_metrics) diff --git a/kag/examples/2wiki/solver/prompt/__init__.py 
b/kag/examples/2wiki/solver/prompt/__init__.py index dadd42a3..dfa931cd 100644 --- a/kag/examples/2wiki/solver/prompt/__init__.py +++ b/kag/examples/2wiki/solver/prompt/__init__.py @@ -11,4 +11,4 @@ """ Place the prompts to be used for solving problems in this directory. -""" \ No newline at end of file +""" diff --git a/kag/examples/2wiki/solver/prompt/resp_generator.py b/kag/examples/2wiki/solver/prompt/resp_generator.py index 70e96cc9..cb8d76ab 100644 --- a/kag/examples/2wiki/solver/prompt/resp_generator.py +++ b/kag/examples/2wiki/solver/prompt/resp_generator.py @@ -3,26 +3,26 @@ from typing import List import logging -from kag.common.base.prompt_op import PromptOp +from kag.interface import PromptABC logger = logging.getLogger(__name__) -class RespGenerator(PromptOp): - template_zh = "基于给定的引用信息回答问题。" \ - "\n只输出答案,不需要输出额外的信息。" \ - "\n给定的引用信息:'$memory'\n问题:'$instruction'" - template_en = "Answer the question based on the given reference." \ - "\nOnly give me the answer and do not output any other words." \ - "\nThe following are given reference:'$memory'\nQuestion: '$instruction'" - - def __init__(self, language: str): - super().__init__(language) +@PromptABC.register("resp_simple") +class RespGenerator(PromptABC): + template_zh = ( + "基于给定的引用信息回答问题。" "\n只输出答案,不需要输出额外的信息。" "\n给定的引用信息:'$memory'\n问题:'$instruction'" + ) + template_en = ( + "Answer the question based on the given reference." + "\nOnly give me the answer and do not output any other words." + "\nThe following are given reference:'$memory'\nQuestion: '$instruction'" + ) @property def template_variables(self) -> List[str]: return ["memory", "instruction"] def parse_response(self, response: str, **kwargs): - logger.debug('推理器判别:{}'.format(response)) + logger.debug("推理器判别:{}".format(response)) return response diff --git a/kag/examples/README.md b/kag/examples/README.md index 6587ce7f..3e629a64 100644 --- a/kag/examples/README.md +++ b/kag/examples/README.md @@ -11,14 +11,14 @@ Create your new knext project from knext cli tool. host_addr = http://localhost:8887 [vectorizer] - vectorizer = kag.common.vectorizer.OpenAIVectorizer + type = openai model = bge-m3 api_key = EMPTY base_url = http://127.0.0.1:11434/v1 vector_dimensions = 1024 [llm] - client_type = ollama + type = ollama base_url = http://localhost:11434/api/generate model = llama3.1 @@ -130,7 +130,6 @@ Create your new knext project from knext cli tool. pass def qa(self, query): - # CA resp = SolverPipeline() answer, trace_log = resp.run(query) diff --git a/kag/examples/2wiki/builder/prompt/__init__.py b/kag/examples/baike/builder/__init__.py similarity index 86% rename from kag/examples/2wiki/builder/prompt/__init__.py rename to kag/examples/baike/builder/__init__.py index 247bb44c..7a018e7c 100644 --- a/kag/examples/2wiki/builder/prompt/__init__.py +++ b/kag/examples/baike/builder/__init__.py @@ -10,5 +10,5 @@ # or implied. """ -Place the prompts to be used for building the index in this directory. -""" \ No newline at end of file +Builder Dir. 
+""" diff --git "a/kag/examples/baike/builder/data/\345\221\250\346\230\237\351\251\260\347\231\276\347\247\221.txt" "b/kag/examples/baike/builder/data/\345\221\250\346\230\237\351\251\260\347\231\276\347\247\221.txt" new file mode 100644 index 00000000..dd363aac --- /dev/null +++ "b/kag/examples/baike/builder/data/\345\221\250\346\230\237\351\251\260\347\231\276\347\247\221.txt" @@ -0,0 +1,29 @@ +周星驰(Stephen Chow),1962年6月22日出生于中国香港,祖籍浙江宁波 [178],华语影视男演员、导演、编剧、监制、制片人、出品人、主持人、国家一级演员、西南民族大学客座教授 [82]、中国人民大学教授 [82]。 +1980年,成为丽的电视台特约演员,从而进入演艺圈 [32]。1981年,出演荧幕处女作《IQ成熟时》 [161]。1988年,将演艺事业的重心转向大银幕,在电影《霹雳先锋》中首次担任男主角 [91]。1990年,凭借喜剧片《一本漫画闯天涯》确立其无厘头的表演风格 [137],之后又凭借喜剧动作片《赌圣》、喜剧片《逃学威龙》两度打破香港电影票房纪录 [1] [142]。1993年上映的古装喜剧片《唐伯虎点秋香》使得周星驰第四次拿到香港电影年度票房冠军 [136]。1994年,周星驰开始转型,他首度出任导演的电影作品是《国产凌凌漆》 [94]。1995年,主演的喜剧爱情片《大话西游》成为周星驰后现代电影的代表作 [92]。2001年,自导自演的喜剧片《少林足球》打破香港电影票房纪录 [24]。2003年,成为美国《时代周刊》封面人物 [4]。2013年,执导古装电影《西游·降魔篇》,该片以2.18亿美元的票房成绩打破华语电影在全球的票房纪录 [5]。2016年,担任科幻喜剧片《美人鱼》的导演、编剧、制片人 [6-7],该片创下中国内地影史单片票房纪录 [81] [84]。 +作为演员,他获得过第21届香港电影金像奖最佳男主角奖,亚太电影节最佳男主角奖等奖项 [3] [139],入选“中国电影百年百位优秀演员”以及“中国电影百年名人堂” [85] [138]。作为导演,他先后获得第21届香港电影金像奖最佳导演奖、第42届台湾电影金马奖最佳导演等奖项 [3] [80]。 +早年经历 +1962年,周星驰出生在香港九龙的贫民区 [200]。母亲凌宝儿引用《滕王阁序》中的诗句“雄州雾列,俊采星驰”给儿子取名周星驰 [200]。周星驰有一个姐姐和一个妹妹,一家人住在一间狭窄的木板房里,全家一个月的生活费是50元 [200]。尽管家境贫寒,但母亲凌宝儿还是尽可能给周星驰买体面的衣服 [126]。周星驰7岁时,父母离异 [8]。周星驰从小就沉默寡言,不爱说话 [8]。父母离婚后,他变得愈发沉默寡言 [180]。周星驰的爱好是看电影以及TVB电视剧,他尤其爱看李小龙的电影。他是因为迷上李小龙才走上电影之路 [8]。为了学习功夫,周星驰还拜了李小龙的授业师兄黄淳梁为师 [180]。他曾经热衷于用炒热的绿豆练习铁砂掌,而且还只用右手练 [126]。周星驰童年时做过很多工作,包括帮老人摆地摊卖指甲钳,去酒楼推着滑轮车卖虾饺,到五金厂打工,在尖沙咀骑自行车兜售报纸 [200]。 +周星驰中学就读于香港圣玛利奥英文书院 [159]。他在学校的成绩不好,除了语文,其它科目成绩都不行,但是周星驰的老师曾经称赞他画画不错 [8]。周星驰16岁时,利用暑假时间卖过点心、眼镜和电器 [142]。中学毕业后,他做过两个月的办公室助理 [126]。后来,他通过姐姐的朋友认识了梁朝伟,两个人都有一个明星梦 [8]。 +演艺经历 +初涉影视 +1980年,周星驰成为丽的电视台的特约演员,从而正式进入演艺圈 [32] [160]。1981年,出演个人首部电视剧《IQ成熟时》 [161]。1982年,周星驰报考了第11期无线电视艺员训练班,但没有考上;同年,他在戚美珍的介绍下进入第11期无线电视艺员训练班夜训班学习 [126]。 +1983年,从无线电视艺员训练班毕业后,周星驰正式成为无线电视台的签约艺员 [126],他被指派担任儿童节目《430穿梭机》的主持人,并且还在节目中与龙炳基共同主演单元剧《黑白僵尸》 [126] [142]。为了实现当演员的梦想,周星驰在主持电视节目之余努力寻找跑龙套的机会。他在跑龙套期间常常跟导演争取展现自己的机会 [142]。终于,他在武侠剧《射雕英雄传》中获得了一个宋兵乙的龙套角色,该片也是他首次和吴孟达合作的作品 [126];周星驰为了剧中角色花了不少心血,在导演的设计中这个角色是被人一掌打死,但他认为这样不真实,于是给角色设计了反抗的动作,但是并没有被导演采纳 [142];之后,他还在时装剧《北斗双雄》中扮演一个问题少年 [126]。 +多面演绎 +1986年,周星驰被调入无线电视台戏剧组;同年,他在单元情景剧《哥哥的女友》中饰演可爱活泼又略带羞涩的潘家伟,这也是他第一次在情景剧中担任男主角;之后,他还在温兆伦、郭晋安等人主演的电视剧中跑龙套。 [126] +1988年,周星驰与万梓良、李美凤共同出演动作片《捕风汉子》,该片是他出演的第一部电影,因为这部电影,他结识了香港演员万梓良。万梓良欣赏周星驰的演技,于是他们之后又合作了时装商战剧《他来自江湖》 [126];其后,周星驰得到电影公司老板李修贤的赏识,在电影《霹雳先锋》中饰演一个浪荡江湖的小弟 [128],该片是周星驰首次在电影中担任男主角 [91],他也凭借该片获得第25届台湾电影金马奖最佳男配角奖、第8届香港电影金像奖最佳配角奖提名 [9] [107]。《霹雳先锋》上映的这一年也是其电视剧播出最密集的一年,他先后有《梦边缘》、《刑警本色》、《大都会》、《斗气一族》等六部电视剧在TVB播出,其无厘头的搞笑方式通过电视剧引起了香港人的注意 [91]。1989年,周星驰和罗慧娟、吴孟达共同出演了TVB古装武侠剧《盖世豪侠》 [126]。 +事业上升 +1990年5月24日,主演的动作喜剧片《咖喱辣椒》上映,周星驰以即兴发挥的方式完成了与张学友的配戏,并且受到广泛好评 [181]。在接下来的几部电影中,周星驰延续了《咖喱辣椒》的表演方式 [181]。周星驰的成名作是喜剧电影《赌圣》,该片不仅让周星驰发挥了无厘头的表演风格,同时也为他所表演的小人物开拓了成型的道路 [134]。周星驰的表演,在传统的英雄人物塑造之外,又添加了戏谑的成分 [91]。该片在香港地区收获4132万港元的票房 [2],不仅获得香港年度票房排行榜冠军 [137],还打破了香港地区的票房纪录 [142],周星驰凭借该片获得第10届香港电影金像奖最佳男主角奖提名 [10]。这一年,他还通过喜剧片《一本漫画闯天涯》确立其无厘头的表演风格 [137],该片也成为周星驰风格形成的重要转折点 [33]。7月,主演时装喜剧《孖仔孖心肝》,周星驰在剧中饰演性格爽朗且具有正义感的王利就 [184]。同年,他还主演了喜剧片《赌侠》,在片中饰演身负多项特异功能绝技的阿星 [78],该片在香港地区的最终票房达到4034万港元,位列香港年度票房排行榜第二名 [11]。1990年的华语片香港票房前十名中有三部是周星驰主演的电影,其中冠军《赌圣》、亚军《赌侠》票房都超过了4000万港元 [14]。 +1991年2月,在爱情喜剧片《整蛊专家》中饰演整人专家古晶 [12],作为无厘头电影,该片无论主题、故事情节还是造型都呈现出一种卡通画的夸张 [185],影片在香港上映后最终票房为3138.8万港元 [186];7月,在喜剧片《逃学威龙》中改变小混混的银幕形象,饰演飞虎队队长周星星 [79],该片在香港地区上映后以4382万港元的票房成绩获得香港年度票房冠军 [11],并打破香港地区的票房纪录 [1];8月,主演喜剧科幻片《赌侠2上海滩赌圣》,该片在香港的票房达到3186万港元;同年,周星驰与成龙、周润发并称为“双周一成” [13]。 
+1992年1月,主演的喜剧片《家有喜事》在香港上映后最终票房为4899万港元,获得香港年度票房排行榜亚军 [11];4月,主演《逃学威龙》系列电影的第二部《逃学威龙2》,在片中饰演以个人身份到学校协助曹达华的交通警察周星星 [187];7月,在古装喜剧片《审死官》中饰演口才和能力出色的状师宋世杰 [188],该片以4988万港元的票房成绩获得香港电影年度票房冠军 [2],并再度打破票房纪录 [200],周星驰亦凭借该片获得第37届亚太电影节最佳男主角奖以及第12届香港电影金像奖最佳男主角提名 [16] [137];9月,与林青霞、李嘉欣共同主演武侠喜剧片《鹿鼎记Ⅱ:神龙教》,在片中饰演护送建宁公主嫁到云南的韦小宝 [202],该片在香港地区的最终票房达到3658万港元 [186],位列香港十大卖座电影第五名 [186];这一年,他还出演了古装电影《武状元苏乞儿》,他所饰演的主人公苏灿在即将考取武状元之际,遭遇歹人设计陷害,被打断全身经脉,沦落成了乞丐 [189-190],该片是他出演的第一部悲喜剧风格的影片 [205],周星驰通过具有特色的表演表现出了苏灿在顺意时期的豪爽性格,与影片后半部分惨淡的遭遇形成鲜明的对比 [205]。在1992年香港年度票房排行榜中,排名前五名的电影全部由周星驰主演,且每一部电影的票房都超过3600万港元。周星驰在这个时期成为了一个符号,而其独特的表演风格则被称为“无厘头文化” [14]。 +1993年,与巩俐、郑佩佩共同主演古装喜剧片《唐伯虎点秋香》,在片中饰演天资聪慧、诗画双绝的江南四大才子之首唐伯虎 [15],影片通过解构手法对经典文本进行重新解读,历史上风流倜傥、才华横溢的才子唐伯虎在片中被周星驰塑造成游手好闲、吊儿郎当的痞子模样 [68]。该片在香港取得4017万港元的票房,获得香港年度票房排行榜冠军 [2]。这一年,他还相继主演了逃学威龙系列电影的第三部《逃学威龙3:龙过鸡年》以及武侠喜剧片《济公》 [174-175]。 +自导自演 +1994年到1999年是周星驰的转型阶段。这时期的周星驰不再满足于无厘头式的创作,而试图在影片中融入更多正剧和悲剧的成分 [135]。1994年,周星驰第一次担任导演,推出个人首部自编自导自演的电影《国产凌凌漆》 [94] [99];该片在香港地区票房达到3752万港元,在香港年度票房排行榜上排名第三 [11];周星驰在片中饰演手持杀猪刀、不走寻常路的特工阿漆 [94],并凭借该片获得第14届香港电影金像奖最佳男主角提名 [17];该片对美国谍战影片007系列进行了戏拟的创作,周星驰所扮演的人物和《007》系列影片中的特工以相同的方式出场,但是同样的镜头却给观众完全不一样的视觉感受 [68]。这一年,他还在自导自演的喜剧动作片《破坏之王》中饰演快餐小子阿星 [100];此外,他还出演了古装喜剧片《九品芝麻官之白面包青天》,在片中饰演候补知县包龙星 [18]。 +1995年1月21日,主演的喜剧片《大话西游之月光宝盒》在中国香港上映 [96],他在片饰演对白晶晶一见钟情的至尊宝 [95];为了演好片中角色,周星驰在导演刘镇伟的建议下专门去看了金·凯瑞主演的电影《变相怪杰》 [70]。随后他又主演了《大话西游》系列电影的下部《大话西游之大圣娶亲》,在片中饰演为寻找紫霞仙子而回到五百年前的至尊宝 [98],并凭借该片获得第1届香港电影金紫荆奖最佳男主角奖、第15届香港电影金像奖最佳男主角提名 [22] [120]。《大话西游》的加长纪念版于2017年在中国内地重映后票房突破1.3亿元,成为首部票房破亿元的华语重映影片 [97]。《大话西游》是周星驰的转型之作,尽管影片在票房上没有达到投资方的预期,但经过VCD等媒介的传播后逐步在中国内地走红 [19] [93]。该片对传统电影进行了解构,成为周星驰后现代风格的代表作之一 [92];通过这部电影,周星驰不仅第一次尝试创作电影,还第一次自己开公司投资拍摄电影。该片也被外界看作是周星驰在电影创作上的分水岭,自此之后周星驰电影不再单纯依靠搞笑 [91];同年,主演科幻片《百变星君》,在片中饰演学业无成却挥金如土的富豪之子李泽星 [20],该片上映后以3533万港元的票房成绩,位列香港年度票房排行榜第三位 [11]。 +1996年,自导自演科幻喜剧片《大内密探零零发》 [194];片中有一场当众扇周星驰耳光的戏是背对着镜头,这场戏周星驰完全可以使用替身,但他仍坚持自己完成 [192];周星驰在影片的情节设置上借鉴了《奇门遁甲》等作品,并重新解构了部分内容 [191],此外他还发挥个人风格,设计了很多有创造性的情节 [193];该片的票房达到3605万港元,位列香港年度票房排行榜第三位 [11];同年,担任喜剧动作片《食神》的导演、编剧、主演 [196],在片中饰演在饮食界享有盛名的史蒂芬·周 [195],该片上映后以4086万港币的票房成绩位列香港年度票房排行榜第二位 [11],此外,影片还被威尼斯国际电影节选为观摩影片 [21];周星驰通过这部电影向导演、制片人转型,并陆续推出了《少林足球》、《功夫》、《长江七号》等影片 [131]。 +1997年,主演喜剧片《97家有喜事》,在片中饰演个性反叛、不修边幅的老恭,该片在香港地区收获4044万港元的票房,获得香港年度票房亚军 [11];之后,他又相继主演了喜剧片《算死草》、贺岁片《行运一条龙》等作品 [162] [165]。 +1999年,周星驰自导自演了带有自传性质的喜剧片《喜剧之王》,在片中饰演虽屡遭失败但仍不气馁的群众演员尹天仇 [197],并在表演上回归到了卓别林式的喜剧风格 [23]。该片表达了周星驰喜剧演员生涯的心路历程,片中“我是一个演员”的台词更是周星驰对于其表演道路的总结 [85]。该片是周星驰主导的第一部影片 [181],它让周星驰从一个演员转变成职业导演 [140]。在《喜剧之王》中,周星驰突破编剧和导演所给予的表演空间,把更多的个人想法融入到作品中。影片在延续周星驰演员时期喜剧风格的同时,也有了更深的文化内涵 [181]。该片在香港上映以后以2984万港元的票房成绩获得香港年度票房冠军 [2]。这一年,周星驰还与张家辉、吴君如合作主演了喜剧片《千王之王2000》,在片中饰演千王之王黄师虎 [158]。 +2001年,周星驰自导自演了喜剧片《少林足球》 [167]。在影片拍摄期间,周星驰因为练习踢球而双腿臃肿,导致他两天无法下床 [200]。《少林足球》是周星驰第一次完全掌控的电影 [176],他将特效、功夫以及足球结合在一起,并借由陈国坤身穿的黄色连体服完成致敬李小龙的愿望 [148]。他在片中饰演的阿星虽然以拾荒为生,但对武术极度痴迷,在足球教练明峰的说服下加入少林足球队,努力实现自己的人生价值 [68]。周星驰在片中突破以往风格,表演也变得内敛 [167],他凭借该片获得第21届香港电影金像奖最佳导演奖、最佳男主角奖以及杰出青年导演奖 [3],而该片亦获得第21届香港电影金像奖最佳电影奖、日本电影蓝丝带奖最佳外语片等奖项 [3] [26-27],并被美国《时代周刊》选为“世界史上25部最佳体育电影之一” [176]。该片在香港地区的最终票房达到6073万港币,不仅获得香港年度票房冠军,还打破了香港地区票房纪录 [24-25]。 +2003年,周星驰成为美国《时代周刊》的封面人物,并入选该杂志评出的“29位亚洲英雄” [4]。2004年,担任喜剧动作片《功夫》的导演、编剧兼主演,该片在全球的总票房达到1.05亿美元,在香港以6127万港元的票房成绩打破香港地区的票房纪录,并创下华语电影在北美上映的单厅票房纪录 [28],获得第24届香港电影金像奖最佳影片奖、第42届台湾电影金马奖最佳剧情片奖、第63届美国金球奖最佳外语片提名、第59届英国电影学院奖最佳非英语片等奖项 [47] [56] [105] [154],而周星驰个人则凭借该片获得第42届台湾电影金马奖最佳导演奖 [47]。《功夫》是周星驰面向国际推出的一部作品,他在片中淡化了原来夸张的演艺方式,通过故事、画面和人物性格来表达想要阐述的东西 [181]。 +2005年,在中国电影表演艺术学会举办的评选活动中,周星驰被选为“中国电影百年百位优秀演员”之一 [85];同年,入选“中国电影百年名人堂” [138] +2008年,自导自演科幻题材的电影《长江7号》,该片是周星驰的转型之作,他摒弃了无厘头,转而走悲剧路线 
[206],该片在香港上映以后以5140万港元的票房成绩获得香港电影年度票房冠军,在中国内地则收获了2.02亿元的票房,获得中国内地上半年票房冠军 [164]。2010年,担任动画电影《长江7号爱地球》的制片人以及编剧 [163]。 + diff --git "a/kag/examples/baike/builder/data/\345\221\250\346\235\260\344\274\246\347\231\276\347\247\221.txt" "b/kag/examples/baike/builder/data/\345\221\250\346\235\260\344\274\246\347\231\276\347\247\221.txt" new file mode 100644 index 00000000..2b9acb82 --- /dev/null +++ "b/kag/examples/baike/builder/data/\345\221\250\346\235\260\344\274\246\347\231\276\347\247\221.txt" @@ -0,0 +1,12 @@ +周杰伦(Jay Chou),1979年1月18日出生于台湾省新北市,祖籍福建省永春县,华语流行乐男歌手、音乐人、演员、导演、编剧,毕业于淡江中学。 +2000年,发行个人首张音乐专辑《Jay》 [26]。2001年,凭借专辑《范特西》奠定其融合中西方音乐的风格 [16]。2002年,举行“The One”世界巡回演唱会 [1]。2003年,成为美国《时代》杂志封面人物 [2];同年,发行音乐专辑《叶惠美》 [21],该专辑获得第15届台湾金曲奖最佳流行音乐演唱专辑奖 [23]。2004年,发行音乐专辑《七里香》 [29],该专辑在全亚洲的首月销量达到300万张 [316];同年,获得世界音乐大奖中国区最畅销艺人奖 [320]。2005年,主演个人首部电影《头文字D》 [314],并凭借该片获得第25届香港电影金像奖和第42届台湾电影金马奖的最佳新演员奖 [3] [315]。2006年起,他连续三年获得世界音乐大奖中国区最畅销艺人奖 [4]。 +2007年,自编自导爱情电影《不能说的秘密》 [321],同年,成立杰威尔音乐有限公司 [10]。2008年,凭借歌曲《青花瓷》获得第19届台湾金曲奖最佳作曲人奖 [292]。2009年,入选美国CNN“25位亚洲最具影响力人物” [6];同年,凭借专辑《魔杰座》获得第20届台湾金曲奖最佳国语男歌手奖 [7]。2010年,入选美国《Fast Company》杂志评出的“全球百大创意人物”。2011年,凭借专辑《跨时代》获得第22届台湾金曲奖最佳国语男歌手奖 [294]。2012年,登上福布斯中国名人榜榜首 [8]。2014年,发行个人首张数字音乐专辑《哎呦,不错哦》 [295]。2023年,凭借专辑《最伟大的作品》成为首位获得国际唱片业协会“全球畅销专辑榜”冠军的华语歌手 [287]。 +周杰伦出生于台湾省新北市,祖籍福建省泉州市永春县 [13]。4岁的时候,母亲叶惠美把他送到淡江山叶幼儿音乐班学习钢琴。初中二年级时,父母因性格不合离婚,周杰伦归母亲叶惠美抚养。中考时,没有考上普通高中,同年,因为擅长钢琴而被淡江中学第一届音乐班录取。高中毕业以后,两次报考台北大学音乐系均没有被录取,于是开始在一家餐馆打工。1997年9月,周杰伦在母亲的鼓励下报名参加了台北星光电视台的娱乐节目《超级新人王》 [26],并在节目中邀请他人演唱了自己独立创作的歌曲《梦有翅膀》;当主持人吴宗宪看到这首歌曲的曲谱后,便邀请周杰伦到阿尔发音乐公司担任音乐助理。当时,全唱片公司只有四位员工,包括周杰伦和创作歌词的作词人方文山 [367]。1998年,周杰伦创作了歌曲《眼泪知道》,公司把这首歌曲给到刘德华后被退歌,后为张惠妹创作的歌曲《忍者》(后收录于周杰伦个人音乐专辑《范特西》中)也被退回 [14]。2000年,音乐人杨峻荣在听到周杰伦独立创作的歌曲《可爱女人》的卡带后便立刻被吸引,并在吴宗宪的支持下争取了2000万元新台币经费来力捧周杰伦 [367]。 +2000年,周杰伦在杨峻荣的推荐下开始演唱自己创作的歌曲 [367];11月7日,发行个人首张音乐专辑《Jay》 [26],并包办专辑全部歌曲的作曲、和声编写以及监制工作 [368],该专辑融合了R&B、嘻哈等多种音乐风格 [369],发行于21世纪元年的当口,周杰伦亦在当时流行音乐疲软之际凭借R&B的音乐风格掀起了一股狂热的R&B潮流 [370],周杰伦尝试着把高难度的西班牙式弦乐演奏表现在了专辑的许多歌曲中 [369],使得整张专辑的意境十分逼近电影配乐 [371],发行后获得IFPI香港唱片销量大奖十大销量国语唱片奖 [372],其中的主打歌曲《星晴》获得第24届十大中文金曲优秀国语歌曲金奖 [15],而他也凭借该专辑在华语乐坛受到关注,并在次年举办的第12届台湾金曲奖颁奖典礼上凭借该专辑获得最佳流行音乐演唱专辑奖 [361]、入围最佳专辑制作人奖 [372],凭借专辑中的歌曲《可爱女人》提名最佳作曲人奖 [371]。 +2001年9月,周杰伦发行个人第二张音乐专辑《范特西》 [26],他除了担任专辑的制作人外,还包办了专辑中所有歌曲的作曲,该专辑是周杰伦确立其音乐风格的作品 [16],其中不仅囊括了抒情R&B歌曲,周杰伦还扩展想象空间,将摇滚、加快版Rap、日本民族风味的音乐风格融入到了该专辑中 [373],专辑中结合中西方音乐元素的主打歌曲《双截棍》成为饶舌歌曲的代表作之一,而该专辑的发行也让周杰伦打开了东南亚地区的音乐市场 [16],并于次年凭借该专辑获得第13届台湾金曲奖最佳专辑制作人奖、最佳流行音乐专辑奖 [241],以及香港唱片销量大奖颁奖典礼十大销量国语唱片等奖项,周杰伦亦凭借专辑中的歌曲《爱在西元前》获得第13届台湾金曲奖最佳作曲人奖 [228];10月,为李玟创作融合中西方音乐元素的歌曲《刀马旦》 [325];12月24日,发行个人音乐EP《范特西plus》,收录了他在桃园巨蛋演唱会上演唱的《你比从前快乐》《世界末日》等歌曲;同年,获得第19届十大劲歌金曲颁奖典礼最受欢迎唱作歌星金奖、叱咤乐坛流行榜颁奖典礼叱咤乐坛生力军男歌手金奖等奖项。 +2002年,参演个人首部电视剧《星情花园》;2月,在新加坡新达城国际会议展览中心举行演唱会;7月,发行个人第三张音乐专辑《八度空间》 [26] [317],除了包办专辑中所有歌曲的作曲外,他还担任专辑的制作人 [17],该专辑以节奏蓝调风格的歌曲为主,并获得g-music风云榜白金音乐奖十大金碟奖、华语流行乐传媒大奖十大华语唱片奖、新加坡金曲奖大奖年度最畅销男歌手专辑奖等奖项 [18];9月28日,在台北体育场举行“The One”演唱会;12月12日至16日,在香港体育馆举行5场“The One”演唱会;12月25日,在美国拉斯维加斯举办“The One”演唱会;同年,获得第1届MTV日本音乐录影带大奖亚洲最杰出艺人奖、第2届全球华语歌曲排行榜最受欢迎创作歌手奖和最佳制作人奖 [350]、第9届新加坡金曲奖亚太最受推崇男歌手奖等奖项 [19]。 +2003年2月,成为美国《时代周刊》亚洲版的封面人物 [2];3月,在第3届音乐风云榜上获得港台年度最佳唱作人奖、年度风云大奖等奖项,其演唱的歌曲《暗号》则获得港台年度十大金曲奖 [236];5月17日,在马来西亚吉隆坡默迪卡体育场举行“The One”演唱会;7月16日,他的歌曲《以父之名》在亚洲超过50家电台首播,预计有8亿人同时收听,而该曲首播的当日也被这些电台定为“周杰伦日” [20];7月31日,发行个人第四张音乐专辑《叶惠美》 [21] [26],他不仅包办了专辑所有歌曲的作曲,还担任专辑的制作人和造型师 [21],该专辑发行首月在亚洲的销量突破200万张 [22],并于次年获得第15届台湾金曲奖最佳流行音乐演唱专辑奖、第4届全球华语歌曲排行榜年度最受欢迎专辑等奖项 [23-24],专辑主打歌曲《东风破》也是周杰伦具有代表性的中国风作品之一,而他亦凭借该曲获得第4届华语音乐传媒大奖最佳作曲人奖;9月12日,在北京工人体育场举行“The One”演唱会;11月13日,发行个人音乐EP《寻找周杰伦》 [25],该EP收录了周杰伦为同名电影《寻找周杰伦》创作的两首歌曲《轨迹》《断了的弦》 
[25];12月12日,在上海体育场举办“The One”演唱会,并演唱了变奏版的《双截棍》、加长版的《爷爷泡的茶》等歌曲;同年,客串出演的电影处女作《寻找周杰伦》上映 [90]。 +2004年1月21日,首次登上中央电视台春节联欢晚会的舞台,并演唱歌曲《龙拳》 [27-28];3月,在第4届音乐风云榜上获得台湾地区最受欢迎男歌手奖、年度风云大奖、年度港台及海外华人最佳制作人等奖项 [326];8月3日,发行融合嘻哈、R&B、古典音乐等风格的音乐专辑《七里香》 [29] [289],该专辑是一张带有浓重东方抒情摇滚风格的音乐作品,并维持了周杰伦一贯的高格调,风格也更加统一,周杰伦也在被流行乐坛一再忽略或曲解的本土文化中调动着民乐所有灵性的想象力,展现了其独树一帜的音乐才华 [29],专辑发行当月在全亚洲的首月销量便突破了300万张 [316],而专辑同名主打歌曲《七里香》则获得了第27届十大中文金曲十大金曲奖、优秀流行国语歌曲奖金奖,以及第5届全球华语歌曲排行榜年度25大金曲奖等奖项 [30],他亦凭借该专辑获得了第16届世界音乐大奖中国区最畅销艺人奖等多个音乐奖项 [320];10月起,在中国台湾省台北市、中国香港、美国洛杉矶、蒙特维尔等地举行“无与伦比”世界巡回演唱会 [374]。 +2005年1月11日,在第11届全球华语榜中榜颁奖盛典上获得港台最佳男歌手奖、港台最受欢迎男歌手奖、港台最佳创作歌手奖等奖项 [31];4月,凭借专辑《七里香》入围第16届台湾金曲奖最佳国语男演唱人奖、最佳流行音乐演唱专辑奖,凭借歌曲《七里香》入围第16届台湾金曲奖最佳作曲人奖;6月23日,由其担任男主角主演的电影《头文字D》上映 [91],他在该片中饰演藤原拓海 [314] [347],这也是他主演的个人首部电影 [314],他也凭借该片获得第42届台湾电影金马奖最佳新演员奖 [3]、第25届香港电影金像奖最佳新演员奖 [315];7月1日,在上海体育场举行“无与伦比巡回演唱会” [32];7月9日,在北京工人体育场举行“无与伦比巡回演唱会” [33]。8月31日,在日本发行个人首张精选专辑《Initial J》 [327],该专辑收录了周杰伦为电影《头文字D》演唱的主题曲《一路向北》和《飘移》 [34];11月1日,发行个人第六张音乐专辑《11月的萧邦》 [296],并包办了专辑中所有歌曲的作曲以及专辑的造型设计 [35],该专辑发行后以4.28%的销售份额获得台湾G-MUSIC年终排行榜冠军;同年,其创作的歌曲《蜗牛》入选“上海中学生爱国主义歌曲推荐目录” [328]。 +2006年1月11日,在第12届全球华语榜中榜颁奖盛典上获得最佳男歌手奖、最佳创作歌手奖、最受欢迎男歌手奖,并凭借歌曲《夜曲》及其MV分别获得年度最佳歌曲奖、最受欢迎音乐录影带奖 [234];1月20日,发行个人音乐EP《霍元甲》 [329],同名主打歌曲《霍元甲》是李连杰主演的同名电影《霍元甲》的主题曲 [36];1月23日,在第28届十大中文金曲颁奖典礼上获得了优秀流行歌手大奖、全年最高销量歌手大奖男歌手奖 [246];2月5日至6日,在日本东京举行演唱会;9月,发行个人第七张音乐专辑《依然范特西》 [290],该专辑延续了周杰伦以往的音乐风格,并融合了中国风、说唱等音乐风格,其中与费玉清合唱的中国风歌曲《千里之外》获得第13届全球华语音乐榜中榜年度最佳歌曲奖、第29届十大中文金曲全国最受欢迎中文歌曲奖等奖项 [37-38],该专辑发行后以5.34%的销售份额位列台湾五大唱片排行榜第一位 [39],并获得中华音乐人交流协会年度十大优良专辑奖、IFPI香港唱片销量大奖最高销量国语唱片奖等奖项 [40];12月,发行个人音乐EP《黄金甲》 [330],该专辑获得IFPI香港唱片销量大奖十大畅销国语唱片奖 [332];同年,获得世界音乐大奖中国区最畅销艺人奖 [4];12月14日,主演的古装动作片《满城尽带黄金甲》在中国内地上映 [331],他在片中饰演武功超群的二王子元杰,并凭借该片获得第16届上海影评人奖最佳男演员奖,而他为该片创作并演唱的主题曲《菊花台》则获得了第26届香港电影金像奖最佳原创电影歌曲奖 [92] [220]。 + diff --git "a/kag/examples/baike/builder/data/\345\221\250\346\266\246\345\217\221\347\231\276\347\247\221.txt" "b/kag/examples/baike/builder/data/\345\221\250\346\266\246\345\217\221\347\231\276\347\247\221.txt" new file mode 100644 index 00000000..356d270e --- /dev/null +++ "b/kag/examples/baike/builder/data/\345\221\250\346\266\246\345\217\221\347\231\276\347\247\221.txt" @@ -0,0 +1,8 @@ +周润发(Chow Yun Fat),1955年5月18日出生于中国香港南丫岛,籍贯广东省江门市开平市 [1],华语影视男演员、摄影家,国家一级演员。 +1976年,出演个人首部电影《投胎人》 [2]。1980年,主演民国剧《上海滩》获得关注 [3]。1985年,凭借电影《等待黎明》获得第22届台湾电影金马奖最佳男主角奖 [114] [123]。1986年,主演的动作片《英雄本色》获得该年度香港电影票房冠军 [124],他凭借该片获得第6届香港电影金像奖最佳男主角奖 [125]。1988年,凭借电影《龙虎风云》获得第7届香港电影金像奖最佳男主角奖 [5]。1989年,主演剧情片《赌神》、动作片《喋血双雄》 [126-127]。1990年,凭借电影《阿郎的故事》获得第9届香港电影金像奖最佳男主角奖 [6]。1991年,主演的剧情片《纵横四海》成为其代表作 [7]。1998年,开始前往美国好莱坞发展 [9]。2000年,主演的剧情片《卧虎藏龙》在国际获得广泛关注 [10]。2003年,获颁特区政府银紫荆星章 [11]。 +2005年,被评为“中国电影百年百位优秀演员”之一 [12]。2007年-2010年间,相继主演《姨妈的后现代生活》《让子弹飞》等多部电影 [13-14] [134]。2011年,凭借电影《孔子》获得第14届中国电影华表奖优秀境外华裔男演员奖 [128]。2012年,获得第15届上海国际电影节华语电影杰出贡献奖 [130];此后,相继主演《铜雀台》《澳门风云》《寒战2》等电影 [129] [131-132]。2018年,主演的剧情片《无双》成为该年度国庆档首部票房突破10亿元的电影 [146],他亦凭借该片获得第14届中美电影节最佳男主角奖 [89]。2023年,主演剧情片《别叫我“赌神”》 [145];同年,获得第28届釜山国际电影节亚洲电影人奖 [142]。 +周润发出生于香港南丫岛的一个农村家庭,籍贯广东省江门市开平市 [1],他的父亲周容允是出海打渔的船员,常年漂泊海上,周润发的母亲种菜养鸡,也经常到别人家里帮佣 [18]。因生活清苦,周润发从小帮母亲打零工贴补家用。童年时父亲因为好赌而输光月薪,因为父亲的薪水都输光了,发薪水时,母亲只能去父亲公司领回一桶油 [19],周润发因此从小厌恶赌博 [20]。1965年,即周润发10岁时,因为家庭生活困难,妈妈带着孩子们去位于九龙的外婆家居住,妈妈去当工人养家糊口 [18]。周润发读到中学三年级的时候,父亲积劳成疾,一病不起,家里再也没有能力供他继续上学,他便过早地踏入社会,寻找工作。在从事演艺工作之前,周润发一直生活在社会底层,仅他干过的职业,就有商行侍役、电子厂童工、酒店服务员、邮差、照相器材售货员等等 [21]。1973年,18岁的周润发在报纸上看到无线电视演员训练班(TVB)的征人广告,便与朋友一同去应征 [22],得到担任考官之一的钟景辉的赏识 [23];之后,他考入了无线电视台第三期艺员训练班,与吴孟达、林岭东是同学。入行之初,周润发跑过一段时间的龙套,并出演了《民间传奇》《红楼梦》《毕业后》等剧集中的配角角色 [147]。 
+1974年,周润发顺利从训练班毕业,经过短暂的龙套生涯,周润发就得到了演出的机会。1975年,出演的由萧笙、李惠民、陈宇超联合执导的古装爱情剧《红楼梦》播出,周润发在剧中饰演蒋玉函 [24]。1976年,出演的由张森执导的剧情电影《投胎人》上映 [2];4月,与刘志荣、黄杏秀合作出演的喜剧《新苏小妹三难新郎》播出,在剧中饰演王安石之子王雱 [25];9月,与余安安、林建明联袂主演的剧情片《池女》首映;同年,与张午郎、黄杏秀合作主演的剧情电影《捞家邪牌姑爷仔》上映 [25]。 +1977年,与汪明荃、南红合作出演了家庭剧《家变》,周润发在剧中饰演廉政公署人员何严明;同年,出演动作犯罪电影《入册》 [2]。1978年1月,与廖咏湘领衔主演的剧情电影《爱欲狂潮》首映 [25];8月,出演的歌舞电视剧《青春热潮》播出;随后,与刘嘉玲、吴孟达等人合作出演的恐怖惊悚剧《幻海奇情》首播;同年,与赛祝娟领衔主演的剧情电影《O女》上映,周润发在电影中饰演澳门富家子管厌平。 +1979年,与任达华领衔主演了剧情类电视剧《有楼收租》,周润发在该剧中饰演黑社会成员阿龙 [25];随后,其主演的悬疑剧《龙潭群英》首播;同年,与廖伟雄、郑裕玲、欧阳佩珊等人联袂主演了家庭爱情剧《网中人》,周润发在剧中饰演中大工商管理系毕业生程纬。 +1980年,主演了由余允抗执导的犯罪喜剧片,在片中饰演年少气盛的阿杰 [2];3月,与吕良伟、赵雅芝领衔主演的民国剧《上海滩》在无线电视台播出,周润发在剧中饰演矛盾、犹豫、冷酷的许文强 [3];4月,与郑裕玲、任达华合作主演的家庭类电视剧《亲情》首播,周润发在剧中饰演石启泰的次子石晖 [26];同年,主演了由王天林执导的动作喜剧电影《懵女大贼傻侦探》,周润发在电影中饰演杀手梁标 [2];随后,领衔主演了喜剧动作片《金榜英雄》(又名《系咁先》),在片中饰演刚从警校毕业的朱嘉华 [2]。 diff --git a/kag/examples/baike/builder/indexer.py b/kag/examples/baike/builder/indexer.py new file mode 100644 index 00000000..e95956dd --- /dev/null +++ b/kag/examples/baike/builder/indexer.py @@ -0,0 +1,32 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. +import logging +from kag.common.registry import import_modules_from_path + +from kag.builder.runner import BuilderChainRunner + +logger = logging.getLogger(__name__) + + +def buildKB(file_path): + from kag.common.conf import KAG_CONFIG + + runner = BuilderChainRunner.from_config(KAG_CONFIG.all_config["kag_builder_pipeline"]) + runner.invoke(file_path) + + logger.info(f"\n\nbuildKB successfully for {file_path}\n\n") + + +if __name__ == "__main__": + import_modules_from_path(".") + file_path = "./data/" + + buildKB(file_path) diff --git a/kag/examples/baike/kag_config.yaml b/kag/examples/baike/kag_config.yaml new file mode 100644 index 00000000..e4ebca6d --- /dev/null +++ b/kag/examples/baike/kag_config.yaml @@ -0,0 +1,128 @@ +#------------project configuration start----------------# +openie_llm: &openie_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +chat_llm: &chat_llm + api_key: key + base_url: https://api.deepseek.com + model: deepseek-chat + type: maas + +vectorize_model: &vectorize_model + api_key: key + base_url: https://api.siliconflow.cn/v1/ + model: BAAI/bge-m3 + type: openai + vector_dimensions: 1024 +vectorizer: *vectorize_model + +log: + level: INFO + +project: + biz_scene: default + host_addr: http://127.0.0.1:8887 + id: '7' + language: zh + namespace: BaiKe +#------------project configuration end----------------# + +#------------kag-builder configuration start----------------# +kag_builder_pipeline: + chain: + type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain + extractor: + type: schema_constraint_extractor # kag.builder.component.extractor.schema_constraint_extractor.SchemaConstraintExtractor + llm: *openie_llm + ner_prompt: + type: spg_entity # kag.builder.prompt.spg_prompt.SPGEntityPrompt + event_prompt: + type: spg_event # kag.builder.prompt.spg_prompt.SPGEventPrompt + std_prompt: + type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt + relation_prompt: + type: spg_relation # 
kag.builder.prompt.spg_prompt.SPGRelationPrompt + reader: + type: txt_reader # kag.builder.component.reader.txt_reader.TXTReader + post_processor: + type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor + similarity_threshold: 0.9 + splitter: + type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter + split_length: 300 + window_length: 0 + vectorizer: + type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer + vectorize_model: *vectorize_model + writer: + type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter + num_threads_per_chain: 2 + num_chains: 4 + scanner: + type: dir_file_scanner # kag.builder.component.scanner.directory_scanner.DirectoryScanner +#------------kag-builder configuration end----------------# + +#------------kag-solver configuration start----------------# +search_api: &search_api + type: openspg_search_api # kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI + +graph_api: &graph_api + type: openspg_graph_api # kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi + +exact_kg_retriever: &exact_kg_retriever + type: default_exact_kg_retriever # kag.solver.retriever.impl.default_exact_kg_retriever.DefaultExactKgRetriever + el_num: 5 + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +fuzzy_kg_retriever: &fuzzy_kg_retriever + type: default_fuzzy_kg_retriever # kag.solver.retriever.impl.default_fuzzy_kg_retriever.DefaultFuzzyKgRetriever + el_num: 5 + vectorize_model: *vectorize_model + llm_client: *chat_llm + search_api: *search_api + graph_api: *graph_api + +chunk_retriever: &chunk_retriever + type: default_chunk_retriever # kag.solver.retriever.impl.default_chunk_retriever.DefaultChunkRetriever + llm_client: *chat_llm + recall_num: 10 + rerank_topk: 10 + +kag_solver_pipeline: + memory: + type: default_memory # kag.solver.implementation.default_memory.DefaultMemory + llm_client: *chat_llm + max_iterations: 3 + reasoner: + type: default_reasoner # kag.solver.implementation.default_reasoner.DefaultReasoner + llm_client: *chat_llm + lf_planner: + type: default_lf_planner # kag.solver.plan.default_lf_planner.DefaultLFPlanner + llm_client: *chat_llm + vectorize_model: *vectorize_model + lf_executor: + type: default_lf_executor # kag.solver.execute.default_lf_executor.DefaultLFExecutor + llm_client: *chat_llm + force_chunk_retriever: true + exact_kg_retriever: *exact_kg_retriever + fuzzy_kg_retriever: *fuzzy_kg_retriever + chunk_retriever: *chunk_retriever + merger: + type: default_lf_sub_query_res_merger # kag.solver.execute.default_sub_query_merger.DefaultLFSubQueryResMerger + vectorize_model: *vectorize_model + chunk_retriever: *chunk_retriever + generator: + type: default_generator # kag.solver.implementation.default_generator.DefaultGenerator + llm_client: *chat_llm + generate_prompt: + type: default_resp_generator # kag.solver.prompt.default.resp_generator.RespGenerator + reflector: + type: default_reflector # kag.solver.implementation.default_reflector.DefaultReflector + llm_client: *chat_llm + +#------------kag-solver configuration end----------------# diff --git a/kag/examples/baike/schema/BaiKe.schema b/kag/examples/baike/schema/BaiKe.schema new file mode 100644 index 00000000..23747bc9 --- /dev/null +++ b/kag/examples/baike/schema/BaiKe.schema @@ -0,0 +1,141 @@ +namespace BaiKe + +Chunk(文本块): EntityType + desc: A chunk refers to a segment of text.
+ properties: + content(内容): Text + index: TextAndVector + +ArtificialObject(人造物体): EntityType + desc: a human-made entity that does not occur naturally. + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Astronomy(天文学): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Building(建筑): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Creature(生物): EntityType + desc: generally refers to any living being, especially animals + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Concept(概念): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Date(日期): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +GeographicLocation(地理位置): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Keyword(关键词): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Medicine(药物): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + + +NaturalScience(自然科学): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Organization(组织机构): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Person(人物): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + job(工作): Text + constraint: MultiValue + +Transport(运输): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +Works(作品): EntityType + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + publisTime(发行时间): Date + authors(作者): Person + desc: authors of work, such as director, actor, lyricist, composer and singer + constraint: MultiValue + +BaikeEvent(事件): EventType + properties: + subject(主体): Person + participants(参与者): Person + desc: the participants of event, such as subject and objects + constraint: MultiValue + time(时间): Date + location(地点): GeographicLocation + abstract(摘要): Text + index: TextAndVector + semanticType(事件语义类型): Text + desc: a more specific and clearly defined type, such as Professor or Actor for the Person type + index: Text + +Others(其他): EntityType + desc: Entities that does not belong to any other type + properties: + info(信息): Text + index: TextAndVector + semanticType(语义类型): Text + index: Text + +SemanticConcept(语义概念): EntityType + properties: + desc(内容): Text + index: Text diff --git a/kag/solver/logic/core_modules/op_executor/op_output/__init__.py b/kag/examples/baike/solver/__init__.py similarity index 100% rename from kag/solver/logic/core_modules/op_executor/op_output/__init__.py rename to kag/examples/baike/solver/__init__.py diff --git a/kag/examples/baike/solver/eval.py b/kag/examples/baike/solver/eval.py new file mode 100644 index 00000000..0ca82830 --- /dev/null +++ b/kag/examples/baike/solver/eval.py @@ -0,0 +1,36 @@ +import json +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor, as_completed + +from tqdm import tqdm + +from kag.common.benchmarks.evaluate import Evaluate +from kag.solver.logic.solver_pipeline import SolverPipeline +from kag.common.conf import 
KAG_CONFIG +from kag.common.registry import import_modules_from_path + +from kag.common.checkpointer import CheckpointerManager + + +def qa(query): + resp = SolverPipeline.from_config(KAG_CONFIG.all_config["kag_solver_pipeline"]) + answer, traceLog = resp.run(query) + + print(f"\n\nso the answer for '{query}' is: {answer}\n\n") # + print(traceLog) + return answer, traceLog + + +if __name__ == "__main__": + import_modules_from_path("./prompt") + queries = [ + "周星驰的姓名有何含义?", + "周星驰和万梓良有什么关系", + "周星驰在首部自编自导自演的电影中,票房达到多少,他在其中扮演什么角色", + "周杰伦曾经为哪些自己出演的电影创作主题曲?", + "周杰伦在春晚上演唱过什么歌曲?是在哪一年", + ] + for q in queries: + qa(q) diff --git a/kag/examples/hotpotqa/builder/prompt/__init__.py b/kag/examples/baike/solver/prompt/__init__.py similarity index 86% rename from kag/examples/hotpotqa/builder/prompt/__init__.py rename to kag/examples/baike/solver/prompt/__init__.py index 247bb44c..dfa931cd 100644 --- a/kag/examples/hotpotqa/builder/prompt/__init__.py +++ b/kag/examples/baike/solver/prompt/__init__.py @@ -10,5 +10,5 @@ # or implied. """ -Place the prompts to be used for building the index in this directory. -""" \ No newline at end of file +Place the prompts to be used for solving problems in this directory. +""" diff --git a/kag/examples/baike/solver/prompt/resp_generator.py b/kag/examples/baike/solver/prompt/resp_generator.py new file mode 100644 index 00000000..cb8d76ab --- /dev/null +++ b/kag/examples/baike/solver/prompt/resp_generator.py @@ -0,0 +1,28 @@ +import re +from string import Template +from typing import List +import logging + +from kag.interface import PromptABC + +logger = logging.getLogger(__name__) + + +@PromptABC.register("resp_simple") +class RespGenerator(PromptABC): + template_zh = ( + "基于给定的引用信息回答问题。" "\n只输出答案,不需要输出额外的信息。" "\n给定的引用信息:'$memory'\n问题:'$instruction'" + ) + template_en = ( + "Answer the question based on the given reference." + "\nOnly give me the answer and do not output any other words." + "\nThe following are given reference:'$memory'\nQuestion: '$instruction'" + ) + + @property + def template_variables(self) -> List[str]: + return ["memory", "instruction"] + + def parse_response(self, response: str, **kwargs): + logger.debug("推理器判别:{}".format(response)) + return response diff --git a/kag/examples/csqa/.gitignore b/kag/examples/csqa/.gitignore new file mode 100644 index 00000000..50e414ac --- /dev/null +++ b/kag/examples/csqa/.gitignore @@ -0,0 +1,3 @@ +ckpt/ +/cs.jsonl +/solver/data/csqa_kag_answers.json diff --git a/kag/examples/csqa/README.md b/kag/examples/csqa/README.md new file mode 100644 index 00000000..ee1cca95 --- /dev/null +++ b/kag/examples/csqa/README.md @@ -0,0 +1,84 @@ +# KAG Example: CSQA + +The [UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain/tree/main) +``cs.jsonl`` dataset contains 10 documents in Computer Science and +100 questions with their answers about those documents. + +Here we demonstrate how to build a knowledge graph for those documents, +generate answers to those questions with KAG and compare KAG generated +answers with those from other RAG systems. + +## Steps to reproduce + +1. Follow the Quick Start guide of KAG to install the OpenSPG server and KAG. + + The following steps assume the Python virtual environment with KAG installed + is activated and the current directory is [csqa](.). + +2. 
(Optional) Download [UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain/tree/main) + ``cs.jsonl`` and execute [generate_data.py](./generate_data.py) to generate data files in + [./builder/data](./builder/data) and [./solver/data](./solver/data). Since the generated files + were committed, this step is optional. + + ```bash + python generate_data.py + ``` + +3. Update the ``openie_llm``, ``chat_llm`` and ``vectorize_model`` configurations + in [kag_config.yaml](./kag_config.yaml) properly. + The ``splitter`` and ``num_threads_per_chain`` configurations may also be updated + to match other systems. + +4. Restore the KAG project. + + ```bash + knext project restore --host_addr http://127.0.0.1:8887 --proj_path . + ``` + +5. Commit the schema. + + ```bash + knext schema commit + ``` + +6. Execute [indexer.py](./builder/indexer.py) in the [builder](./builder) directory to build the knowledge graph. + + ```bash + cd builder && python indexer.py && cd .. + ``` + +7. Execute [eval.py](./solver/eval.py) in the [solver](./solver) directory to generate the answers. + + ```bash + cd solver && python eval.py && cd .. + ``` + + The results are saved to ``./solver/data/csqa_kag_answers.json``. + +8. (Optional) Follow the LightRAG [Reproduce](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#reproduce) + steps to generate answers to the questions and save the results to + [./solver/data/csqa_lightrag_answers.json](./solver/data/csqa_lightrag_answers.json). + Since a copy was committed, this step is optional. + +9. Update the LLM configurations in [summarization_metrics.py](./solver/summarization_metrics.py) + and [factual_correctness.py](./solver/factual_correctness.py) + and execute them to get the metrics. + + ```bash + python ./solver/summarization_metrics.py + python ./solver/factual_correctness.py + ``` + +10. (Optional) To delete checkpoints, execute the following commands. + + ```bash + rm -rf ./builder/ckpt + rm -rf ./solver/ckpt + ``` + + To delete the KAG project and related knowledge graph, execute a command similar to the following, replacing the OpenSPG server address and KAG project id with the actual values. + + ```bash + curl http://127.0.0.1:8887/project/api/delete?projectId=1 + ``` diff --git a/kag/examples/musique/builder/prompt/__init__.py b/kag/examples/csqa/builder/__init__.py similarity index 86% rename from kag/examples/musique/builder/prompt/__init__.py rename to kag/examples/csqa/builder/__init__.py index 247bb44c..7a018e7c 100644 --- a/kag/examples/musique/builder/prompt/__init__.py +++ b/kag/examples/csqa/builder/__init__.py @@ -10,5 +10,5 @@ # or implied. """ -Place the prompts to be used for building the index in this directory. -""" \ No newline at end of file +Builder Dir. +""" diff --git a/kag/examples/csqa/builder/data/__init__.py b/kag/examples/csqa/builder/data/__init__.py new file mode 100644 index 00000000..59bacd4d --- /dev/null +++ b/kag/examples/csqa/builder/data/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2023 OpenSPG Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. + +""" +Place the files to be used for building the index in this directory.
+""" diff --git a/kag/examples/csqa/builder/data/guide_to_java.txt b/kag/examples/csqa/builder/data/guide_to_java.txt new file mode 100644 index 00000000..66839a40 --- /dev/null +++ b/kag/examples/csqa/builder/data/guide_to_java.txt @@ -0,0 +1,9536 @@ +Guide to Java + +James T. Streib and Takako SomaUndergraduate Topics in Computer ScienceGuide to Java2014A Concise Introduction to Programming10.1007/978-1-4471-6317-6© Springer-Verlag London 2014 + +Undergraduate Topics in Computer Science + +Series EditorIan Mackie + +Undergraduate Topics in Computer Science (UTiCS) delivers high-quality instructional content for undergraduates studying in all areas of computing and information science. From core foundational and theoretical material to final-year topics and applications, UTiCS books take a fresh, concise, and modern approach and are ideal for self-study or for a one- or two-semester course. The texts are all authored by established experts in their fields, reviewed by an international advisory board, and contain numerous examples and problems. Many include fully worked solutions. + +For further volumes: http://​www.​springer.​com/​series/​7592 + +James T. Streib and Takako Soma + +Guide to JavaA Concise Introduction to Programming + +James T. Streib + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +Takako Soma + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +ISSN 1863-7310e-ISSN 2197-1781 + +ISBN 978-1-4471-6316-9e-ISBN 978-1-4471-6317-6 + +Springer London Heidelberg New York Dordrecht + +Library of Congress Control Number: 2014931850 + +© Springer-Verlag London 2014 + +Undergraduate Topics in Computer Science + +This work is subject to copyright. All rights are reserved by the Publisher, whether the whole or part of the material is concerned, specifically the rights of translation, reprinting, reuse of illustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, and transmission or information storage and retrieval, electronic adaptation, computer software, or by similar or dissimilar methodology now known or hereafter developed. Exempted from this legal reservation are brief excerpts in connection with reviews or scholarly analysis or material supplied specifically for the purpose of being entered and executed on a computer system, for exclusive use by the purchaser of the work. Duplication of this publication or parts thereof is permitted only under the provisions of the Copyright Law of the Publisher's location, in its current version, and permission for use must always be obtained from Springer. Permissions for use may be obtained through RightsLink at the Copyright Clearance Center. Violations are liable to prosecution under the respective Copyright Law. + +The use of general descriptive names, registered names, trademarks, service marks, etc. in this publication does not imply, even in the absence of a specific statement, that such names are exempt from the relevant protective laws and regulations and therefore free for general use. + +While the advice and information in this book are believed to be true and accurate at the date of publication, neither the authors nor the editors nor the publisher can accept any legal responsibility for any errors or omissions that may be made. The publisher makes no warranty, express or implied, with respect to the material contained herein. 
+ +Printed on acid-free paper + +Springer is part of Springer Science+Business Media (www.springer.com) + +Preface + +## Purpose + +The purpose of this text is to help the reader learn very quickly how to program using the Java programming language. This is accomplished by concentrating on the fundamentals, providing plenty of illustrations and examples, and using visual contour diagrams to illustrate the object-oriented semantics of the language. + +## Comparison to Other Texts + +There are a number of texts on the Java programming language. Some of these texts provide plenty of examples and are very comprehensive, but unfortunately they sometimes seem to cover too many details, which might make it difficult for a beginning programmer to discern which points are the most relevant. There are also other texts that attempt to provide a shortened introduction to the language, but it seems that these texts might not provide the necessary examples and illustrations and might be better suited for readers who have previous programming experience. + +## Need + +This text attempts to fill the gap between the above two types of books. First, it provides plenty of examples and concentrates primarily on the fundamentals of the Java programming language so that the reader can stay focused on the key concepts. Second, by concentrating on the fundamentals, it allows the text to be more concise and yet still accessible to readers who have no prior programming experience. The result is that the reader can learn the Java programming language very quickly and also have a good foundation to learn more complex topics later. + +## Features of This Text + +This text provides many examples and illustrations. It further has an early introduction to object-oriented programming and uses contour diagrams to illustrate various object-oriented concepts. The contour model was originally developed by John B. Johnson [1]. The model was elaborated on by Organick, Forsythe, and Plummer to illustrate subprograms, parameter passing, and recursion in procedural and functional languages [2]. The model seems quite adaptable to newer programming methodologies such as object-oriented programming as illustrated in a paper by the authors of this text [3]. As discussed in that paper, it was shown that the use of contour diagrams can be an effective tool in helping one learn object-oriented concepts in the Java programming language. By acquiring a good working model of objects, there is less chance of possible misconceptions. + +In many paragraphs of the text, questions are asked of the reader to help them interact with the material and think about the subject matter just presented. Hopefully the reader will take a few moments to try to answer these questions on their own before proceeding to the answer that immediately follows. To help further reinforce concepts, each chapter has one or more complete programs to illustrate many of the concepts presented and also to help readers learn how to write programs on their own. In addition, for review and practice, there are summaries and exercises provided at the end of each chapter. Further, in the appendices at the end of the text, there are answers to selected exercises and a glossary of important terms. 
A summary of these features is listed below: + + * Stresses the fundamentals + + * Provides many examples and illustrations + + * Has an early introduction to objects + + * Uses contour diagrams to illustrate object-oriented concepts + + * Asks readers questions to help them interact with the material + + * Has one or more complete programs in every chapter + + * Provides chapter summaries + + * Includes exercises at the end of each chapter, with selected answers in an appendix + + * Has a glossary of important terms + +## Overview of the Chapters + +This text first allows the reader to understand a simple program with the appropriate input, processing, and output, followed by an early introduction to objects. It then looks at selection and iteration structures followed by more object-oriented concepts. Next, strings and arrays are examined. This is followed by recursion, inheritance and polymorphism, and elementary files. The appendices include information on graphical input/output, exception processing, Javadoc, a glossary, and answers to selected exercises. Lastly there are references and useful websites and an index. The following provides a brief synopsis of the chapters and appendices: + + * Chapter 1 provides an introduction to variables, input/output, and arithmetic operations. + + * Chapter 2 introduces objects and contour diagrams. + + * Chapter 3 explains selection structures. + + * Chapter 4 shows how iteration structures work. + + * Chapter 5 revisits object-oriented concepts. + + * Chapter 6 introduces string variables and processing. + + * Chapter 7 illustrates arrays and array processing. + + * Chapter 8 examines recursion. + + * Chapter 9 explores inheritance and polymorphism. + + * Chapter 10 discusses elementary files. + + * Appendix A gives an introduction to graphical input/output. + + * Appendix B discusses elementary exception processing. + + * Appendix C presents the basics of Javadoc. + + * Appendix D lists a glossary of key terms. + + * Appendix E provides answers to selected exercises. + +## Scope + +As mentioned previously, this text concentrates on the fundamentals of the Java programming language such as input/output, object-oriented programming, arithmetic and logic instructions, control structures, strings, arrays including elementary sorting and searching, recursion, and files. As a result, it might not cover all the details that are found in some other texts, and if necessary, these topics can be supplemented by the instructor or reader, or covered in a subsequent text and/or second semester course. + +## Audience + +This text is intended primarily for readers who have not had any previous programming experience; however, this does not preclude its use by others who have programmed previously. It can serve as a text in an introductory programming course, as an introduction to a second language in a practicum course, as a supplement in a course on the concepts of programming languages, or as a self-study guide in either academe or industry. Although no prior programming is assumed, it is recommended that readers have the equivalent of an introduction to functions course that includes trigonometry which will help with problem solving and understanding the examples presented in the text. + +## Acknowledgments + +The authors would like to thank the reviewers Mark E. Bollman of Albion College, James W. Chaffee of the University of Iowa, Naomi E. Hahn of Illinois College, Carroll W. Morrow of Augustana College, and Curt M. White of DePaul University. 
Also, the authors would like to acknowledge the students of Illinois College who have read and used various sections of the text in the classroom. On a personal note, James Streib would like to acknowledge his father William J. Streib for their numerous conversations, and thank his wife Kimberly A. Streib and son Daniel M. Streib for their continued patience. Takako Soma would like to thank her family and friends, near and far. + +Note that Java is a registered trademark of Oracle and/or its affiliates and that Windows is a registered trademark of Microsoft Corporation in the United States and/or other countries. + +## Feedback + +The possibility of errors exist in any text, therefore any corrections, comments, or suggestions are welcome and can be sent to the authors via the e-mail addresses below. In addition to copies of the complete programs presented in the text, any significant corrections can be found at the website below. + +Website: http://​www.​jtstreib.​com/​GuideJavaProgram​ming.​html + +James T. Streib + +Takako Soma + +October 21, 2013 + +Contents + +1 Variables, Input/​Output, and Arithmetic 1 + +1.​1 Introduction 1 + +1.​2 Java Skeleton 5 + +1.​3 Variables and Constants 6 + +1.​4 Assignment Statements 10 + +1.​5 Output 13 + +1.​6 Input 20 + +1.​7 Arithmetic Statements 22 + +1.​8 Comments 29 + +1.​9 Program Design 30 + +1.​10 Complete Program:​ Implementing a Simple Program 33 + +1.​11 Summary 36 + +1.​12 Exercises (Items Marked with an * Have Solutions in Appendix E) 36 + +2 Objects:​ An Introduction 39 + +2.​1 Introduction 39 + +2.​2 Classes and Objects 40 + +2.​3 Public and Private Data Members 41 + +2.​4 Value-Returning Methods 42 + +2.​5 Void Methods and Parameters 42 + +2.​6 Creating Objects and Invoking Methods 44 + +2.​7 Contour Diagrams 45 + +2.​8 Constructors 50 + +2.​9 Multiple Objects and Classes 53 + +2.​10 Universal Modeling Language (UML) Class Diagrams 60 + +2.​11 Complete Program:​ Implementing a Simple Class and Client Program 62 + +2.​12 Summary 63 + +2.​13 Exercises (Items Marked with an * Have Solutions in Appendix E) 65 + +3 Selection Structures 69 + +3.​1 Introduction 69 + +3.​2 If-Then Structure 69 + +3.​3 If-Then-Else Structure 75 + +3.​4 Nested If Structures 78 + +3.​4.​1 If-Then-Else-If Structure 78 + +3.​4.​2 If-Then-If Structure 80 + +3.​4.​3 Dangling Else Problem 82 + +3.​5 Logical Operators 86 + +3.​6 Case Structure 93 + +3.​7 Complete Programs:​ Implementing Selection Structures 98 + +3.​7.​1 Simple Program 98 + +3.​7.​2 Program with Objects 101 + +3.​8 Summary 103 + +3.​9 Exercises (Items Marked with an * Have Solutions in Appendix E) 103 + +4 Iteration Structures 107 + +4.​1 Introduction 107 + +4.​2 Pretest Indefinite Loop Structure 108 + +4.​2.​1 Count-Controlled Indefinite Iteration Structure 109 + +4.​2.​2 Sentinel Controlled Loop 116 + +4.​3 Posttest Indefinite Loop Structure 120 + +4.​4 Definite Iteration Loop Structure 124 + +4.​5 Nested Iteration Structures 127 + +4.​6 Potential Problems 129 + +4.​7 Complete Programs:​ Implementing Iteration Structures 130 + +4.​7.​1 Simple Program 131 + +4.​7.​2 Program with Objects 133 + +4.​8 Summary 138 + +4.​9 Exercises (Items Marked with an * Have Solutions in Appendix E) 138 + +5 Objects:​ Revisited 143 + +5.​1 Sending an Object to a Method 143 + +5.​2 Returning an Object from a Method 146 + +5.​3 Overloaded Constructors and Methods 148 + +5.​4 Use of the Reserved Word this 153 + +5.​5 Class Constants, Variables, and Methods 157 + +5.​5.​1 Local, Instance, and Class Constants 157 + +5.​5.​2 
Local, Instance, and Class Variables 162 + +5.5.3 Class Methods 165 + +5.6 Complete Programs: Implementing Objects 167 + +5.6.1 Program Focusing on Overloaded Methods 167 + +5.6.2 Program Focusing on Class Data Members and Class Methods 175 + +5.7 Summary 179 + +5.8 Exercises (Items Marked with an * Have Solutions in Appendix E) 179 + +6 Strings 185 + +6.1 Introduction 185 + +6.2 String Class 185 + +6.3 String Concatenation 186 + +6.4 Methods in String Class 188 + +6.4.1 The length Method 188 + +6.4.2 The indexOf Method 188 + +6.4.3 The substring Method 189 + +6.4.4 Comparison of Two String Objects 191 + +6.4.5 The equalsIgnoreCase Method 194 + +6.4.6 The charAt Method 195 + +6.5 The toString Method 196 + +6.6 Complete Program: Implementing String Objects 198 + +6.7 Summary 200 + +6.8 Exercises (Items Marked with an * Have Solutions in Appendix E) 201 + +7 Arrays 203 + +7.1 Introduction 203 + +7.2 Array Declaration 203 + +7.3 Array Access 205 + +7.4 Input, Output, Simple Processing, and Methods 206 + +7.4.1 Input 207 + +7.4.2 Output 210 + +7.4.3 Simple Processing 211 + +7.4.4 Passing an Array to and from a Method 212 + +7.5 Reversing an Array 213 + +7.6 Searching an Array 218 + +7.6.1 Sequential Search 218 + +7.6.2 Binary Search 219 + +7.6.3 Elementary Analysis 221 + +7.7 Sorting an Array 221 + +7.7.1 Simplified Bubble Sort 222 + +7.7.2 Modified Bubble Sort 224 + +7.8 Two-Dimensional Arrays 225 + +7.8.1 Declaration, Creation, and Initialization 226 + +7.8.2 Input and Output 228 + +7.8.3 Processing Data 229 + +7.8.4 Passing a Two-Dimensional Array to and from a Method 232 + +7.8.5 Asymmetrical Two-Dimensional Arrays 234 + +7.9 Arrays of Objects 236 + +7.10 Complete Program: Implementing an Array 238 + +7.11 Summary 242 + +7.12 Exercises (Items Marked with an * Have Solutions in Appendix E) 242 + +8 Recursion 245 + +8.1 Introduction 245 + +8.2 The Power Function 245 + +8.3 Stack Frames 253 + +8.4 Fibonacci Numbers 254 + +8.5 Complete Program: Implementing Recursion 264 + +8.6 Summary 266 + +8.7 Exercises (Items Marked with an * Have Solutions in Appendix E) 266 + +9 Objects: Inheritance and Polymorphism 267 + +9.1 Inheritance 267 + +9.2 Protected Variables and Methods 276 + +9.3 Abstract Classes 277 + +9.4 Polymorphism 278 + +9.5 Complete Program: Implementing Inheritance and Polymorphism 284 + +9.6 Summary 288 + +9.7 Exercises (Items Marked with an * Have Solutions in Appendix E) 289 + +10 Elementary File Input and Output 293 + +10.1 Introduction 293 + +10.2 File Input 293 + +10.3 File Output 298 + +10.4 File Input and Output Using an Array 300 + +10.5 Specifying the File Location 303 + +10.6 Complete Programs: Implementing File Input and Output 305 + +10.6.1 Matrix Multiplication 305 + +10.6.2 Sorting Data in a File 307 + +10.7 Summary 309 + +10.8 Exercises (Items Marked with an * Have Solutions in Appendix E) 309 + +Appendix A Simple Graphical Input and Output 311 + +A.1 Message Dialog Boxes 311 + +A.2 Input Dialog Boxes 312 + +A.3 Converting String Input from Input Dialog Boxes to Numbers 314 + +A.4 Confirmation Dialog Boxes 316 + +A.5 Option Dialog Boxes 317 + +Appendix B Exceptions 321 + +B.1 Exception Class and Error Class 321 + +B.2 Handling an Exception 322 + +B.3 Throwing Exceptions and Multiple catch Blocks 325 + +B.4 Checked and Unchecked Exceptions 330 + +Appendix C Javadoc Comments 335 + +C.1 Javadoc 335 + +C.2 More Javadoc Tags 338 + +C.3 Generating Javadoc Documentation from a Command Line 339
+ +Appendix D Glossary 341 + +Appendix E Answers to Selected Exercises 345 + +References and Useful Websites 353 + +Index 355 +James T. Streib and Takako Soma, Undergraduate Topics in Computer Science, Guide to Java: A Concise Introduction to Programming, 2014, 10.1007/978-1-4471-6317-6_1 + +© Springer-Verlag London 2014 + +# 1. Variables, Input/Output, and Arithmetic + +James T. Streib1 and Takako Soma1 + +(1) + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +Abstract + +In addition to an introduction to hardware and software concepts, including the concept of compiling, interpreting, and executing a program, this chapter provides an initial skeleton program from which to create subsequent programs. An introduction to variables, constants, assignment statements, arithmetic operations, and simple input/output using the keyboard and monitor is also provided. Further, there is a discussion concerning errors, comments, and program design. A simple complete program is included at the end of the chapter. + +## 1.1 Introduction + +As many readers may already know from using applications software such as word processing, a computer system is composed of two major parts: hardware and software. The hardware is the physical computer that includes five basic components: the central processing unit (CPU), the random access memory (RAM) or just memory for short, input (typically a keyboard), output (typically a monitor), and storage (often a disk) as shown in Fig. 1.1. + +Fig. 1.1 + +Computer hardware + +In order for computer hardware to perform, it is necessary that it have software. Essentially, software (often called a program) is the set of instructions that tells the computer what to do and when to do it. A program is typically loaded from storage into the computer's RAM for subsequent execution in the computer's CPU. As the program executes or runs, it will typically ask the user to input data, which will also be stored in RAM; the program will then process the data, and various results will be output to the monitor. This input, process, output sequence is sometimes abbreviated as IPO. + +The only type of instruction a computer can actually understand is low-level machine language, where different types of CPUs can have different machine languages. Machine language is made up of ones and zeros, which makes programming in machine language very tedious and error prone. An alternative to using machine language is assembly language, which is also a low-level language that uses mnemonics (or abbreviations) and is easier to use than ones and zeros [4]. However, if the only language that the computer can directly understand is machine language, how does the computer understand assembly language? The answer is that the assembly language is converted into machine language by another program called an assembler (see Fig. 1.2). Note that there is a one-to-one correspondence between assembly language and machine language, and for every assembly language instruction, there is typically only one machine language instruction. However, even though assembly language is easier to program in than machine language, different types of CPUs can also have different types of assembly languages, so the assembly language of one machine can be different from that of another machine. + +Fig. 1.2 + +Assemblers and compilers + +The solution to making programming easier and allowing programs to be used on different machines is through the use of high-level languages, which are more English-like and math-like.
One of the first high-level programming languages was FORTRAN (FORmula TRANslation), which was developed in the early 1950s to help solve mathematical problems. There have been a number of high-level languages developed since that time to meet the needs of many different users. Some of these include COBOL (COmmon Business Oriented Language) developed in the 1950s for the business world, BASIC (Beginners All-purpose Symbolic Instruction Code) developed in the 1960s for beginning programmers, Pascal in the 1970s previously used for teaching computer science students, C in the 1970s for systems programming, and C++ in the 1980s for object-oriented programming. + +The program needed to convert or translate a high-level language to a low-level language is either a compiler or an interpreter. Although there is a one-to-one correspondence between assembly language and machine language, there is a one-to-many correspondence between a high-level language and a low-level language. This means that for one high-level language instruction, there can be many low-level assembly or machine language instructions. Even though different CPUs need different compilers or interpreters to convert a particular high-level language into the appropriate machine language, compilers and interpreters allow the same high-level language to be used on different CPUs. + +The difference between a compiler and an interpreter is that a compiler will translate the high-level language instructions for the entire program to the corresponding machine language for subsequent execution, whereas an interpreter will translate and then execute each instruction one at a time. Further, a compiler might translate directly to machine language, or it might translate the high-level language to assembly language, and then let an assembler convert the assembly language program to machine language as shown in Fig. 1.2. Once the machine language is created, it is subsequently loaded into the computer's RAM and executed by the CPU. + +As mentioned above, an interpreter works slightly differently than a compiler. Instead of converting an entire high-level program into machine language all at once and then executing the machine language, an interpreter converts one line of the high-level program to machine language and then immediately executes the machine language instructions before proceeding with the converting and executing of the next high-level instruction (see Fig. 1.3). The result is that compiler-generated code executes faster than interpreted code because the program does not need to be converted each time it is executed. However, interpreters might be more convenient in an education or development environment because of the many modifications that are made to a program, which would require the program to be converted each time a change is made. + +Fig. 1.3 + +Compilers and interpreters + +The Java programming language was developed at Sun Microsystems (which is now a subsidiary of Oracle Corporation) and was released in 1995. The intent of the language was for portability on the World Wide Web. It does not contain some of the features of C++ (such as operator overloading and multiple inheritance, where overloading and inheritance will be discussed in Chaps. 5 and 9), so it is an easier language to learn. Object-Oriented Programming (OOP) is a programming methodology that makes it more convenient to reuse software as will be discussed further in Chaps. 2, 5, and 9.
Although no prior programming experience is necessary to learn Java in this text, programmers with experience in C or C++ will recognize a number of similarities between Java and these languages. Conversely, programmers learning Java first will also notice a number of similarities should they subsequently learn C or C++. The reason for this similarity between these languages is that both Java and C++ are based on C. + +Java is somewhat unique in that it uses both a compiler and an interpreter to convert the high-level instructions to machine language. A compiler is used to convert the Java instructions into an intermediate-level language known as bytecode, and then the bytecode is converted into machine language using an interpreter. The advantage of using both a compiler and an interpreter is that most of the translation process can be done by the compiler, and when bytecode is sent to different types of machines, it can be translated by an interpreter into the machine language of the particular type of machine the code needs to be run on (see Fig. 1.4). Note that just as there can be a one-to-many relationship between high-level and low-level instructions, there can be a one-to-many relationship between Java and bytecode. However, unlike the one-to-one relationship between assembly language and machine language, there can be a one-to-many relationship between bytecode and machine language, depending on the machine for which the bytecode is being interpreted. + +Fig. 1.4 + +Java instructions, bytecode, and machine language + +When learning a new programming language, one should distinguish between the syntax and the semantics of a program. Simply stated, the syntax is the grammar of the language, and the semantics is the meaning or what each instruction does. To explain further, syntax is the spelling of the individual words, where the semicolons go, and so on. If mistakes are made, the compiler will detect what are known as syntax errors, generate messages to the programmer, and the program will not be compiled or executed. Although syntax is very important, there is a tendency for first-time programmers to spend too much time learning syntax to avoid syntax errors. However, there must be equal time spent on semantics to ensure that the program does what the programmer intended it to do. Even though there might not be any syntax errors, there can be what are called execution errors or run-time errors, such as division by zero. When these types of errors occur, the appropriate error messages are generated and execution stops. Even worse, there can also be logic errors, which are mistakes in the logic of the program and the program does not do what was intended. The unfortunate aspect of logic errors is that they do not produce any error messages which can make them extremely difficult to find and fix. The process of finding and fixing logic errors is known as debugging. When learning to program, one must be attentive not only to the syntax of the language but also to the semantics of the language. Both are stressed in this text, and with time and practice, a beginning programmer can get better at both. + +## 1.2 Java Skeleton + +Probably the best way to understand a programming language is to start right away with a sample program. Although the following program does not do anything, it will serve as a skeleton to add instructions in the future and provide a starting point to understand the basic layout of a Java program. At first the program in Fig. 
1.5 might look a bit intimidating, but examining and discussing each of the statements should help one understand it better. Although some of the descriptions discussed below might be a little advanced and confusing now, it helps to realize that each of the words in the program has an important purpose and each of them will be discussed later in detail throughout the text. As one learns more about Java and starts to fill in the skeleton with other instructions, it will become less intimidating. + +Fig. 1.5 + +Java skeleton program + +The first line in the program begins with the reserved word class. A reserved word is one that has a special meaning in a program and cannot have its meaning changed by the programmer nor used for identifiers (or names) of packages, classes, variables, or methods. A package is like a folder in which classes can be stored. A class is a definition of a group of objects that includes data members (places to store data) and methods (places to put the program logic). Although classes and objects will be discussed further in Chap. 2, for now think of a class as a blueprint for a house and the houses built from the blueprint as objects. The word Skeleton is the name of the class that is provided by the programmer. Usually class names begin with a capital letter. Braces are used to identify blocks of code and data and require matching opening and closing braces. The entire definition of the class, Skeleton, should be placed between the first opening brace and the last closing brace. + +This class has one method definition starting on the second line. Typically the method is indented to improve the readability of the program. The first three words in the second line are reserved words. The word public is one of the access or visibility modifiers, which will also be discussed further in Chap. 2. The main method is always defined using public visibility, so that the program can be executed by the interpreter. The word static means this is a class method, and the main method is always declared static so that it can be executed without creating an object of the class as will be discussed further in Chap. 5. The word void means that main is a non-value-returning method as will be discussed further in Chap. 2. Next, the word main is the name of the method. When a program is run, the system will search for the main method and start executing instructions in the main method first. Inside the parentheses after the name of the method, parameters are listed along with their types to allow the method to receive values as will be discussed further in Chap. 2. The main method has a parameter called args which is an array of type String, and the square brackets indicate that args is an array, where strings and arrays will be discussed further in Chaps. 6 and 7, respectively. The definition of the main method starts with an opening brace and ends with a closing brace. Inside the braces, a sequence of instructions would be placed. + +For now, the method does not have any instructions other than a comment line. Comments will not be compiled and executed when the program is run. They are used to make programs easier for other programmers to understand. Comments can start with // symbols and continue to the end of the line, or be placed between /* and */ symbols. The // symbols are used for a single-line comment, and /* and */ are used when the comments run over multiple lines.
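+Since Fig. 1.5 itself is not reproduced here, the following sketch is reconstructed from the description above (a class named Skeleton whose main method contains only a comment), so its exact formatting may differ slightly from the original figure:
+
+class Skeleton {
+    public static void main(String[] args) {
+        // instructions will be placed here (sketch of Fig. 1.5 as described in the text)
+    }
+}
+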
The above program should compile without any syntax errors and run without any execution errors, except it does not do anything. + +Again the above description should give the reader some insight into the meaning of various words in the skeleton program. As should be noticed, there were several references to subsequent chapters. What might be helpful to the reader is to return to this section later after reading the subsequent chapters and see that the above is more understandable. For now it should be understood that each of the words has a particular meaning and that the program serves as a skeleton in which to insert code as will be done in the following sections. + +## 1.3 Variables and Constants + +One of the first things that need to be added to the skeleton are memory locations so that data can be stored, and another name for a memory location is a variable. Since the contents of the memory location can vary, just as a variable in mathematics, these two terms can be used interchangeably. + +In order to understand variables and how data is stored in memory, it is oftentimes very helpful to draw a picture of the memory location. A memory location can be thought of as a mailbox that has two main parts. One part is the contents, which includes the letters that are inside the mailbox, and the other is the address of the mailbox as shown in Fig. 1.6. + +Fig. 1.6 + +Representation of memory + +The address of the mailbox is usually a number, like the address of a memory location in a computer. At the machine language level, the address is in ones and zeros, just like the machine language instructions mentioned in the first section of this chapter. However, using numbers to represent the address of a memory location can be quite confusing, especially if there are hundreds of memory locations in a program. Instead it is helpful to use characters to form names, called symbolic addressing, to make it easier to remember what data is stored in what memory location as shown in Fig. 1.7. In this example, the name number is used to describe the contents of the corresponding memory location. This is one of the primary advantages of using assembly language over machine language, and this is also true of all high-level languages including Java. + +Fig. 1.7 + +Using names for memory locations + +Instead of a three-dimensional representation of a mailbox to represent a memory location, it is much easier to draw a two-dimensional representation. Further, instead of using ones and zeros to represent the contents of the memory location, it is easier to use the decimal number system to represent values as follows: + +Although not as crucial in high-level languages (like Java) as low-level languages (machine and assembly languages), it is important to remember that a memory location has two features: its address and its contents. In Java, the programmer is typically concerned about its contents. + +Given the above representation of variables, how are they actually created or declared? When a variable is declared, there are two things that must be done. First, a variable needs to be given a name so that it can be referred to by various instructions in the program, and second, the type of data that will be stored in the memory location needs to be indicated. The reason for this is that although all the data is stored as ones and zeros as discussed above, different types of data are stored using different combinations of ones and zeros. 
A single one or zero is called a binary digit (abbreviated as a bit), and a group of 8 bits is called a byte. Typically the more bytes that make up a memory location, the larger the number that can be stored in the location. Although how the data is actually stored is beyond the scope of this text, Table 1.1 shows some of the types of data, the size, and the range of values that can be stored for each type. + +Table 1.1 + +Data types + +Type | Size | Range + +---|---|--- + +byte | 1 byte | −128 to 127 + +short | 2 bytes | −32,768 to 32,767 + +int | 4 bytes | −2,147,483,648 to 2,147,483,647 + +long | 8 bytes | −9,223,372,036,854,775,808 to 9,223,372,036,854,775,807 + +float | 4 bytes | −3.40282347 × 10^38 to 3.40282347 × 10^38 + +double | 8 bytes | −1.79769313486231570 × 10^308 to 1.79769313486231570 × 10^308 + +char | 2 bytes | one character + +String | 2 or more bytes | one or more characters + +Typically the types int, double, char, and String are the ones that are used the most frequently. For example, should one want to declare a variable named number and have it store an integer, it would be declared as follows: + +int number; + +First the type is indicated, in this case int for integer, and then the identifier or name of the variable number is given, followed by a semicolon. The name of the variable can be almost anything except for a reserved word, but there are certain rules that need to be followed as well as some suggestions that should be followed. The length of the variable name can be from 1 to any number of characters long. Further, the variable name can be composed of letters, numbers, underscores _, and dollar signs $, but must begin with a letter. Also, the variable name is case sensitive, meaning that cat, Cat, and CAT are separate variable names and correspond to separate memory locations. + +Typically a variable name should not be too long, because long names can be difficult to read, but by the same token, it should not be too short either, for it could become difficult to remember what it represents. For example, if the letter n were used instead of number, then it might not be clear whether n stood for name, number, or numeral. Exceptions to this are variables from a mathematical expression. For example, the variables x, y, and z are commonly used to represent the points of a Cartesian coordinate system, or i, j, or k are used for loop control variables as will be discussed in Chap. 4. Although most of the time this text will avoid the use of shorter names, on occasion shorter names might be used to save space or for the sake of simplicity to concentrate on other aspects of a code segment. If a variable name is too long, it can be difficult to read, as in the following: numberofcatsanddogs. Common practice in Java is not to capitalize the first letter of a variable but to capitalize the first letter in all subsequent words, as in numberOfCatsAndDogs. Notice that it is a little easier to read that way. Also on occasion, abbreviations can be used, such as num instead of number, but be sure to use good abbreviations, and this text will occasionally show some of the more commonly used ones. + +Variables of other types can be declared as well, such as a variable of type float or double. Although numbers of type float take up less space in the computer's memory, they are less precise and can sometimes cause inaccuracy in calculations. Even though they take up more memory, this text will use double variables to alleviate some possible problems later.
For example, should one want to declare a variable to hold a double precision value, it would be declared as follows: + double average; + Further, it could contain a value and would look like the following: + Notice that instead of showing the number zero as an integer, it is represented as a real number with a decimal point, to indicate its type as a double. + All of the types given in Table 1.1, other than the String type, are known as primitive data types, meaning that when they are declared, the memory needed to store the associated data is allocated at that time. However, a String data type is a reference data type. When a variable of type String is declared, the memory allocated is not used to store the data, but rather only to store a reference to the data. String data types are unique in that although they are technically objects, they can be used syntactically as if they were primitive data types. + The first part of this text will use strings in a very limited capacity. An understanding of strings is much easier once one has had an introduction to, and practice with, objects, so a full description of how string objects are created and manipulated is presented in Chap. 6. However, for now, this text will represent strings "as if" they were primitive data types, and the following shows a character primitive data type and a simplified view of the string data type. For example, a character and a string could be declared as follows: + char initial; String name; + and would be represented with values as follows, respectively: + Note that the char data type is represented using single quotation marks and that the String is represented using double quotation marks. Although a character could be represented as a String of length one, it is usually better to use the char data type. Further, there are also ways to extract a single char type from a String data type. Again, a full description will be deferred until Chap. 6. + In contrast to variables, a constant can be declared so that its value cannot be changed. Although not nearly as useful as variables, constants have their place in a program when a value does not need to be changed, nor should it be changed. For example, if an integer N always needs to remain a 7, then it could be declared as follows, where the use of the reserved word final indicates that N is a constant: + final int N = 7; + Typically constant names are declared in all capital letters to help other programmers distinguish them from variables. As another example, suppose a number like PI needs only two digits after the decimal point; it could then be declared as follows: + final double PI = 3.14; + Although the use of constants might not be readily apparent at this time, it will become clearer in subsequent examples after discussing assignment statements in the next section. + ## 1.4 Assignment Statements + In the previous section, all the drawings of the memory locations had values in them. How did those values get there? By default, Java technically initializes variables that are declared as part of a class to 0 for type int and 0.0 for type double; likewise, such char variables are initialized to the null character '\u0000', and String variables are initialized to null, as will be discussed further in Chap. 6. However, this does not apply to the local variables declared in this chapter, and although default values can be helpful in some instances, in many other languages variables do not have a default value.
The variables in such languages contain whatever was in that memory location from the last time it was used, which could be interpreted as junk to another program, cause logic errors, and be difficult to debug. Variables with unknown initial values are said to be indeterminate. As a result, many programmers do not rely on default values and assume instead that the initial values of variables are indeterminate, which will also be the assumption of this text. So instead of initially showing an integer variable with the number 0 in it, this text will show the variable as indeterminate with a dashed line in it as shown below: + Does this mean that all variables need to be initialized to some value? Not necessarily. As will be seen, only those variables that need an initial value for subsequent processing should be initialized. Initializing a variable to a value when it does not need to be initialized could be confusing to other programmers reading the code, as will be discussed later in this chapter and in Chap. 4 on iteration structures. + So if a variable is assumed not to be initialized, how does one initialize a variable to a value such as 0, or any other value for that matter, such as 5? After a variable is declared, it can be given a value in an assignment statement using an assignment symbol. The assignment symbol is the equal sign. However, when one first starts to use the equal sign, one must remember that it does not mean that the variable on the left is "equal to" the value on the right, but rather that the value on the right is copied into or assigned to the variable on the left. Again, this is best shown by way of an example: + int number; + number = 5; + After the variable number is declared as type int, the second statement indicates that the integer 5 is assigned or copied into the variable number, and the memory location would then appear as follows: + Again, the assignment statement is not really saying that number is equal to 5 or equals 5, but rather that the variable number is assigned a 5 or takes on the value of 5. Although it is tempting to say that number equals 5, and even though most people will understand what is meant, try to avoid saying it, and there will be less difficulty in the future, as shown in Sect. 1.7 on arithmetic statements. + Note that it is possible to combine the previous two statements into one statement as shown below. It looks similar to the definition of a constant in the previous section but without the word final in the statement: + int number = 5; + The above syntax is perfectly legal and saves a line when writing a program. However, when first learning a language, it helps to reinforce the distinction between the declaration of a variable and the assignment of a value to a variable. Of course, if one's instructor does not mind the above shortcut, or if one is studying this text independently and likes the shortcut, then go ahead and use it. However, this text will use the previous two-line method, at least for the next few chapters, to help reinforce the distinction between the declaration of a variable and the assignment of a value to a variable. + Continuing, what if one wanted to take the contents of number and copy it into another memory location named answer? For example, consider the following code segment: + int number, answer; + number = 5; + answer = number; + After both number and answer have been declared in the first line, the variable number is then assigned the value 5 in the second line, and answer will still be indeterminate.
The memory locations would look as follows: + The third line then takes a copy of the contents of number and places it into the memory location answer as shown below: + Note that the assignment statement does not remove the 5 from number and put it into answer, but rather it takes a copy of the 5 and puts it into answer. The original 5 in number does not disappear. Why does it copy and not move the value? The reason is that it is actually faster for the computer to copy the value and not take the time to delete the original. This is a fundamental concept in most computer languages and will become more important later in the writing of subsequent programs. + Again, the important point to notice is that the copying of values is from right to left, not left to right. This sometimes causes confusion among beginning programmers, possibly because they are used to reading from left to right. The reason why Java and many earlier languages copy from right to left is that they mimic the assembly languages of many machines. Ideally it would be nice if languages used an arrow to show how values are copied as shown below: + However, most keyboards do not have an arrow character, so an equal sign is used. Just remember that values are copied from right to left, and there should not be any problems. + Assigning variables of type double is similar to the above and will not be shown here; however, a couple of points need to be made concerning assigning variables of different types. For example, what would happen if a variable of type int were assigned to a variable of type double as shown below? + int number; + double result; + number = 5; + result = number; + As before, the contents of the memory locations after the assignment of 5 to number would be as follows: + Then, when the next assignment statement is executed, the int value of 5 would be copied, converted to a double value of 5.0, and assigned to result as follows: + Would the value in number be converted to a 5.0? The answer is no, as shown above, because only the variable to the left of the assignment symbol is altered by an assignment statement. The 5 in number is not converted, but rather when it is copied, it is converted to the proper type so that it can be assigned to result. + If an int value can be stored in a variable of type double, is the reverse true? The answer is no, because, for example, how could the number 5.7 be stored as an integer without the fractional part? A way around this problem is to use a typecast operator. A typecast operator allows a value of one type to be converted to another type. In the case below, the typecast operator (int) converts the double value in number to type int so it can be assigned to result. As before, the value in number would not change and would still contain a 5.7. However, what happens to the fractional part? The result is that it is truncated and a 5 is stored in result: + double number; + int result; + number = 5.7; + result = (int) number; + What if the value needed to be rounded instead? Fortunately, Java has the Math class, which contains a method named round. A method is somewhat like a function in mathematics. The name of the class, Math, is followed by a period and the name of the method, round. Parentheses are placed after the method name and contain the argument, number, which is sent to the method.
The code segment from above is rewritten below: + +double number; + +int result; + +number = 5.7; + +result = (int) Math.round(number); + +Unfortunately, when the round method is sent a value of type double, it returns a value of type long, but the typecast operator (int) can again be used to convert the value of type long to type int. Since number contains 5.7, the variable result would contain a 6. Again, the value in number would not change and would still contain a 5.7. Of course if the precision of the type double is needed, the better solution would be to change the type of result to double to preserve the fractional part of number. The round method is one of the many methods available in the Math class which is discussed in more detail in Sect. 1.7 on arithmetic statements. + +## 1.5 Output + +Unless a program performs some type of output, it is not particularly useful. Output can be of many forms including output to a screen, a printer, a disk, or even some form of movement such as a robot on an assembly line. In this section, only output to a screen will be considered. Although there are several ways to output data to the screen, this section will examine the simplest of them to get started. More advanced methods of output will be examined in Chap.​ 10 and Appendix A, and one can jump to these locations and learn these methods if one is reading this text independently or at the discretion of one's instructor. However, this text will use the methods introduced in this chapter for the sake of simplicity. + +One of the more common first programs written when learning a new language is the infamous "Hello World!" program. The advantage of this program is to make sure that one is writing a program correctly and using the compiler properly. This program can be written as shown in Fig. 1.8. + +Fig. 1.8 + +Hello World! + +This program looks very similar to the original Skeleton program in Sect. 1.2, except that the class name has been changed from Skeleton to Output and the comment line has been replaced with the System.out.println("Hello World!"); statement. This statement outputs the string contained within the double quotation marks to the monitor. Java uses System.out to refer to console output and the standard output device by default is the monitor. To perform console output, one simply uses the println method to display a primitive value or a string to the monitor. The println method is part of the Java Application Programming Interface (API) which is a predefined set of classes that can be used in any Java program. The classes and methods in the Java API provide a variety of fundamental services that are not part of the language itself. + +The method name println is often pronounced as "print line," even though it is not spelled that way. The print portion of println causes the information in the parentheses to be output to the computer screen, and then the ln portion of println causes the cursor on the screen to move down to the next line. In this case, the only information in the parentheses is the string "Hello World!". Of course, the statement is terminated with a semicolon just as the declaration statements and assignment statements were in Sects. 1.3 and 1.4, respectively. Go ahead and try typing in this program on your computer using the IDE (Integrated Development Environment) installed in your lab, home computer, or place of employment and then compile and execute the program. 
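Based on the description above, the program in Fig. 1.8 would look essentially as follows (a sketch using the class name Output mentioned in the text): + class Output { + public static void main(String[] args) { + System.out.println("Hello World!"); + } + }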
Provided there are no syntax errors, the output should appear similar to the following, where the underscore represents the location of the cursor on the screen: + Hello World! + _ + Notice that the quotation marks are not output to the screen and the cursor appears on the next line. Also note that the cursor might not appear on the screen, since there is no input as of yet, but in this example, it still serves to illustrate where any subsequent output would appear. However, what would happen should one leave off the ln portion of the println, as shown below? + System.out.print("Hello World!"); + Given the previous description concerning the println above, the output would be as follows: + Hello World!_ + At first glance, this does not appear to be much different from the original sample output. However, if one looks carefully, note the location of the cursor. It is not on the second line but rather at the end of the string. The statement outputs the string to the screen, but with the absence of the ln, the cursor does not move down to the next line. In fact, if the cursor does not show up on the screen, one might not notice the difference. Even though it might not be detected on the screen, it is important to know where the cursor is located, so that subsequent output is correct. For example, what if one split the string so that it appears on two separate lines? This can be accomplished by using two separate System.out.println statements as follows: + System.out.println("Hello"); + System.out.println("World!"); + As one might suspect, the output would appear as follows: + Hello + World! + _ + The string "Hello" is output and the cursor moves down to the next line. Then, the string "World!" is output, and again the cursor moves down to the next line in preparation for the subsequent line to be output. However, what if one accidentally used two separate System.out.print statements instead? + System.out.print("Hello"); + System.out.print("World!"); + The output would appear as given below: + HelloWorld!_ + Note that this output appears similar to using a single System.out.print statement as shown previously. Why are they similar? After the first System.out.print statement output the word Hello, the cursor stayed on the same line and did not move to the second line. So when the second System.out.print was executed, the word World! was output on the same line, and since there was no ln in the second statement, the cursor stayed on the same line. One might also notice that there is no space between the two words. Why did this happen? Since there is no space at the end of the first string within the double quotes, nor a space at the beginning of the second string, no space appeared in the output. + Although this is similar to the example using the single System.out.print, could it be changed to mimic the first example in this section? The answer is yes, as in the following example: + System.out.print("Hello "); + System.out.print("World!"); + System.out.println(); + In this case, the word Hello followed by a space would be output, and then the word World! would be output. The last line would output nothing, because there is no string in the parentheses, but the ln would cause the cursor to move down to the next line as shown below: + Hello World! + _ + Although the above three-line code segment produces the same output as the original single-line statement, why would one want to use this latter example? Usually one would not, and the single line is preferable to using multiple lines.
However, there are instances where one needs to break up an output line into multiple lines for the sake of convenience as will be illustrated in the next section on input and in Chap.​ 3 on selection statements. + +As a further example of formatting output, what if one wanted to output the following with a blank line between the two words and the cursor at the bottom? + +Hello + +World! + +_ + +The following code segment would accomplish this task: + +System.out.println("Hello"); + +System.out.println(); + +System.out.println("World!"); + +The first statement outputs the word Hello and moves the cursor down to the second line. The second statement does not output anything, so the ln of the System.out.println statement causes the cursor to move down to the third line and the blank line to appear on output. Lastly, the word World! is output and the cursor moves down to the fourth line. What if one wanted to output two blank lines, would the following code segment work? + +System.out.print("Hello"); + +System.out.println(); + +System.out.println(); + +System.out.println("World!"); + +At first glance, it might appear to work, but look carefully. Notice that the first statement does not contain a println but rather only a print. The result would be exactly the same as the previous code segment since the first statement outputs the word Hello, but does not move the cursor down to the next line on the screen. The second statement is a System.out.println, and it moves the cursor down from the first line to the second line of output. The second System.out.println creates a single blank line. + +Unfortunately, this is a mistake that is sometimes made by beginning Java programmers, where they assume that anytime there is a System.out.println(); a blank line is produced. The only time a blank line is produced is when there is not a preceding System.out.print statement. This is yet another reason why one should tend to avoid using the System.out.print statement unless under special circumstances, again discussed in the next section and Chap.​ 3. The correct code segment to produce two blank lines is given below. Note that the first statement is a System.out.println: + +System.out.println("Hello"); + +System.out.println(); + +System.out.println(); + +System.out.println("World!"); + +Although the above code segments are useful for outputting strings and formatting output, how does one output integers and real numbers? Combining the information learned in the previous two sections, one can then have a program as shown in Fig. 1.9. + +Fig. 1.9 + +Outputting an int precision number + +This program declares the variable num to be of type int, assigns the value 5 to num, and then outputs the contents of the variable num. Note that the variable num is not enclosed in quotation marks, so the word num is not output, but rather the contents of the variable num are output. Unfortunately, only the integer 5 would be output to the screen which would not be very useful. Instead, it is helpful to output some other information for the user to identify and understand the information on the screen. + +The output statement in the program in Fig. 1.9 can be modified to include the string "The number is " followed by a plus sign prior to the variable num as shown in Fig. 1.10. A plus sign between two strings or between a string and any other type of data means concatenation. In other words, the string "The number is " and the contents of num are output as if they are one string. 
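Combining the pieces so far, the essential statements in the body of the program in Fig. 1.10 would be along the following lines (a sketch assuming the variable name num used in the text): + int num; + num = 5; + System.out.println("The number is " + num);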
It should be noted that one needs to be careful when only two integers are separated by a plus sign, because then it means addition, as will be discussed in Sect. 1.7. However, provided a string or a concatenated string appears to the left, the item to the right of the plus sign will be concatenated instead of added. Note that there is a space within the quotes at the end of the string so that the contents of the variable num are separated from the word is in the string. The result is that the output of this program would appear as follows: + The number is 5 + _ + Fig. 1.10 + Outputting an int precision number with description of output + What happens if one outputs a number of type double using the same format shown in Fig. 1.10? For example, Fig. 1.11 outputs the contents of the variable num of type double. + Fig. 1.11 + Outputting a double precision number without formatting + As will be discussed further in Sect. 1.7, the / means division, and num will take on the value of one third. When the above program is compiled and executed, the screen displays + The number is 0.3333333333333333 + Although using high precision is necessary during computation, it may not be needed when a number of type double is displayed. How can one limit the number of digits after the decimal point in a floating-point number? A predefined method in the Java API called printf can be used. The general syntax of the printf method is as follows: + printf(control string, expr, expr,...) + where control string is a string that may consist of substrings and format specifiers, and an expr represents a variable, expression, or constant value. A format specifier indicates how an expr should be displayed. The specifier %d is used for a decimal integer, %f for a floating-point number, %c for a character, and %s for a string. For numbers, the total width and precision can be indicated in a specifier. For example, the specifier %10d outputs an integer value with a width of at least 10. The specifier %10.2f outputs a floating-point number with a width of at least 10, including the decimal point, and with two digits after the decimal point. The width of character and string values can also be indicated. For example, the specifier %3c outputs a single character and adds two spaces before it, and %10s outputs a string with a width of at least 10 characters. If there is more than one expr to be output, they must match the specifiers within the control string in order, number, and type. Using the formatting information described above, the program in Fig. 1.11 can be rewritten as shown in Fig. 1.12. + Fig. 1.12 + Formatting a double precision number + The floating-point number stored in the variable num will be output with two digits after the decimal point. Since a space is included before the specifier in the string after the word is, there will be a space between is and the number as shown below: + The number is 0.33 + Also notice that the printf method does not move the cursor to the next line, just like the print method, so a System.out.println(); statement needs to be added at the end of the program in order to have the same effect as the program in Fig. 1.11. + Some characters cannot simply be included between double quotes for output. In order to output a double quotation mark, two characters, a backslash and a double quote, need to be used: \". The following statement + System.out.println("He said \"Hello\"."); + will output + He said "Hello". + Similarly, a backslash can be output by placing an extra backslash in front of one as shown below: + System.out.println("How to output a backslash, \\"); + This will produce an output of + How to output a backslash, \
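Tying the formatting ideas of this section together, a short sketch (with illustrative values, not taken from the text's figures) combines a width specifier, a precision specifier, and a trailing println: + int count; + double price; + count = 42; + price = 3.14159; + System.out.printf("count = %5d, price = %8.2f", count, price); + System.out.println(); + Here %5d right-justifies the integer in a field of five characters, and %8.2f rounds the value to two digits after the decimal point in a field of eight characters, so the output would be count =    42, price =     3.14 with the cursor then moved down to the next line.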
## 1.6 Input + The ability to declare variables, assign values to them, and output strings and variables is very important but does not allow for many useful programs. As it stands, anytime one wants to change the output of a program, one has to edit and recompile the program before executing it. What is needed is a way to input data into a program. As with output, input can come from a variety of sources such as the keyboard, the mouse, a disk, or even sensors such as those that might be on a robot on an assembly line. Although other methods for input can be found in Chap. 10 and Appendix A, this section will deal with the simplest form of input. + As in the last section, it is best to start with a simple example, based on the previous program in Fig. 1.10 and modified as shown in Fig. 1.13. Although the description of the first few lines of the following program might be a little complicated due to the nature of input in Java, the actual statements that perform the input are less complicated, as will be seen shortly. + Fig. 1.13 + Program to input an integer + Notice the addition of the import statement in the first line. The import statement is added in order to use a predefined method for input. All the predefined classes and methods in the Java API are organized into packages, and the import statement identifies those packages that will be used in a program. For example, the following statement imports the Scanner class of the java.util package: + import java.util.Scanner; + A second option uses an asterisk to indicate that any class inside the package might be used in the program. Thus, the statement + import java.util.*; + allows any of the classes in the java.util package to be referenced in the program. The second option is used in the program shown in Fig. 1.13. + Recall that when the System.out.println, System.out.print, and System.out.printf statements were used in the previous section for output, the java.lang package, which includes the System class, was not imported at the beginning of the program. This is because the java.lang package, which includes the System and Math classes, is used extensively, and it is automatically imported into all Java programs. + Returning to Fig. 1.13, in order for input to work properly, one needs a place to store the data entered. The first statement in the body of the main method declares the variable num as type int. The next statement is the declaration of the variable scanner of type Scanner as shown below: + Scanner scanner; + Scanner is not a primitive data type like int or double, but rather it is a class. As discussed briefly at the beginning of Sect. 1.2, and as will be discussed further in Chap. 2, a class is like a set of blueprints for a building. The following statement + scanner = new Scanner(System.in); + creates a new instance of the Scanner class, or in other words a Scanner object. This can be thought of as how an individual building might be constructed from a set of blueprints. Java uses System.in to refer to the standard input device, which is the keyboard. Unlike output, input is not directly supported in Java; however, the Scanner class can be used to create an object to get input from the keyboard.
The above statement then assigns a reference to the new object to the variable scanner. Again, although this might be a little confusing at this point, the important thing is to be sure to include the import statement and the above two statements in any program that needs to input data. + The next statement below shows how the Scanner object is used to scan the input for the next integer. The method nextInt will make the system wait until an integer is entered from the keyboard, and then the integer input is assigned to the variable num: + num = scanner.nextInt(); + The last statement in the program is the same as before, where the value of num is output to the computer screen. However, if one were to enter, compile, and run this program as given, the result might be a little confusing. The reason is that there would only be a blinking cursor on the screen as the system waits for input, and there would be no indication of what should be input without looking at the program. To solve this problem, it is usually best to provide a prompt to let the user know what should be input. A prompt is just a message output to the user to help them understand what is expected to be input. The program in Fig. 1.14 includes a prompt just prior to the input. + Fig. 1.14 + Prompting a user to input a number + As can be seen, the prompt is nothing more than the output of a string to indicate what the program is expecting in terms of input. Notice that a System.out.print statement is used so that the input appears on the same line as the prompt. Further, a prompt should be formatted well. Note that there is a space after the colon so that the cursor is separated from the prompt. After the data is entered and the user presses the enter key, the cursor moves to the next line. + Furthermore, a prompt should be user friendly. A user-friendly prompt is one that clearly describes what the user should input, as in the case above where it asks for an integer. A user-friendly prompt can be polite, such as "Please enter a number: ", but typically a prompt should avoid words like "I" and "you", as in "I would like you to...", since the computer is a machine, not a human. + Now would be a good time to enter, compile, and run the program in Fig. 1.14 to see how it works. The results should be similar to the following: + Enter an integer: 5 + The integer is 5 + _ + In addition to nextInt, the method nextDouble reads a number of type double, the method next reads a word of type String that ends prior to a space, and the method nextLine reads an entire line of text of type String, including all the spaces, until the user presses the enter or return key. All of these methods work similarly to the method nextInt.
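For reference, a complete version of the program in Fig. 1.14 can be sketched from the description above (the class name Input is assumed here for illustration): + import java.util.Scanner; + class Input { + public static void main(String[] args) { + int num; + Scanner scanner; + scanner = new Scanner(System.in); + System.out.print("Enter an integer: "); + num = scanner.nextInt(); + System.out.println("The integer is " + num); + } + }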
## 1.7 Arithmetic Statements + The ability to input data, copy data from one memory location to another, and output data is fundamental to almost every computer program. However, unless there is the capability to manipulate and process data to convert it into information that can be output and used, the power of the computer has hardly been tapped. One of the first things computers were used for, and continue to be used for, is arithmetic computation, which is the subject of this section. + The four basic operations of arithmetic, namely addition, subtraction, multiplication, and division, can be accomplished in Java by the use of the binary operators +, -, *, and /, respectively. The word binary in this case does not refer to the binary number system, but rather indicates that these operators have two operands (such as variables and constants) that are manipulated by the operators. As before, the best way to illustrate this is through an example. Consider the following code segment: + int num1, num2, sum; + num1 = 5; + num2 = 7; + sum = num1 + num2; + After the variables num1 and num2 have been assigned the values 5 and 7, respectively, the contents of the memory locations would appear as follows: + What occurs next is that the expression on the right side of the last assignment statement is evaluated. The contents of num1 are brought into the CPU, and then the contents of num2 are added to them in the CPU. Once the expression on the right side of the assignment symbol has been evaluated, the result of the expression in the CPU is then copied into the variable to the left of the assignment symbol. As in Sect. 1.4, the copying goes from right to left, so the expression is always on the right side of the equal sign and there can only be one variable on the left side. The results of this evaluation and assignment can be seen below: + Of course, the values for num1 and num2 in the above segment could have been input from the keyboard, and the result in sum could be output to the screen, but for now simple assignment statements are used to initialize num1 and num2, and the value of sum is not output, to keep the segment simple. The examples that follow will use this same pattern; however, a complete program using input and output will be shown in Sect. 1.10. + Similar statements can be written using subtraction, multiplication, and division, and examples incorporating these operators will follow later in this section. Still, a few comments need to be made about mixing variables of different types. As shown above, when two variables of the same type are used, the result is of that type. However, should one or both of these operands be of type double, then the result will also be of type double. For example, if num1 is of type int and num2 is of type double, then the result of the expression would be of type double. Of course, if the result of the expression is of type double, then it could not be assigned to the variable sum of type int. Either the round method would need to be used or the type of sum would need to be changed to double. + There is also a unique aspect to the division operation depending on the types of its operands. As with the other operators, if either or both of the operands are of type double, then the result of the division is also of type double. So, for example, 7.0 divided by 2 would be 3.5. If both operands are of type int, the result will of course be of type int. Although this does not pose a problem with the other arithmetic operators, the result of division in ordinary arithmetic often has a fractional component, and one would write it as 3½, 3.5, or possibly 3 with a remainder of 1. However, if the result of the division operation in Java is of type int, the fractional part is discarded and the result is simply 3.
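The difference between integer division and floating-point division can be verified with two quick statements (a check added here for illustration): + System.out.println(7 / 2); // outputs 3 + System.out.println(7.0 / 2); // outputs 3.5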
Although one does not get the fractional part with integer division, what if one wanted to determine the remainder? That can be done with the mod operator, which is represented by the percent sign, %. To illustrate, consider the following code segment, where all variables are of type int: + int num1, num2, quotient, remainder; + num1 = 7; + num2 = 2; + quotient = num1 / num2; + remainder = num1 % num2; + Upon completion of the segment, the respective memory locations would contain the following: + Although it is relatively easy to create some simple instructions that contain only one operator, what about expressions with more than one operator? In that case, an awareness of the precedence of the various operators is needed. The precedence in Java is the same as in mathematics, on a calculator, or in a spreadsheet application program. First, the multiplication and division operators have precedence over addition and subtraction. For example, given the following code segment, what are the contents of answer? + int answer, x, y, z; + x = 2; + y = 3; + z = 4; + answer = x + y * z; + Unfortunately, if one guessed 20, that would be wrong. Remember that multiplication has precedence over addition, so the result of the multiplication of y and z, which contain 3 and 4, would be 12, and adding the contents of x, which is 2, gives 14. + However, what if one wanted to perform the addition first? As in arithmetic, one can always use parentheses to override the precedence of the operators, so that + answer = (x + y) * z; + would result in answer containing a 20. If there is more than one set of parentheses, then the innermost nested ones are evaluated first, and if the parentheses are not nested, the parentheses are evaluated from left to right. In fact, if there is a tie of any sort, such as two addition symbols, or an addition symbol and a subtraction symbol, the order is also from left to right. + Given all this information, what would be the answers in the following segment? + int answer1, answer2, x, y, z; + x = 3; + y = 4; + z = 5; + answer1 = x - y + 6 / z; + answer2 = (x * (y + 2)) % 2 - 1; + First, note that there are some constants in the mathematical expressions on the right side of the assignment statements, and this is perfectly acceptable. In the first expression, the 6 / z is evaluated first, and the result would be 1. After that, which operation is performed second? Since there is a tie in precedence between the subtraction and the addition, and the subtraction is on the left, it is performed first, where 3 minus 4 is -1. Lastly, the 1 from the division is added to the -1 from the subtraction, so the answer is 0. + In the second expression, which operation is performed first? Since there are nested parentheses, the y + 2 is performed first, with an answer of 6. Then the 3 in x is multiplied by the 6 for a value of 18. Then the 18 is divided by 2, where the remainder is 0, and lastly the 1 is subtracted from the 0 for a final answer of -1. + When trying to evaluate expressions, it is sometimes helpful to draw a line underneath each of the sub-expressions to help one remember which parts of the expression have been evaluated and what their respective values are. For example, in the first expression above, it would appear as follows: + Since parentheses override the order of precedence, why can't one just use parentheses all of the time and avoid having to remember the order of precedence? One could do that, but the resulting expressions would have an inordinate number of parentheses, and they could be quite difficult to read. Further, since the precedence rules in most languages are fairly similar and most programmers use parentheses sparingly, it is to one's advantage to learn and use them correctly. For further practice, see the exercises at the end of this chapter.
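As a quick check of the evaluations above, the two expressions can also be printed directly (a small test snippet added for illustration, not one of the text's figures): + int x, y, z; + x = 3; + y = 4; + z = 5; + System.out.println(x - y + 6 / z); // outputs 0 + System.out.println((x * (y + 2)) % 2 - 1); // outputs -1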
Just as there are binary operators that have two operands, there also exist unary operators that have only one operand. The two most common are the plus sign and the minus sign, where the latter is used more frequently, as in the following example: + z = -x + y; + The thing to remember about unary operators is that they have a higher priority than binary operators. So in the above statement, the negative of the value contained in x is added to the value in y, and the result is placed in the variable z. Should one want to negate the entire quantity, then parentheses would need to be used, as in the following example, where the values in x and y are added together first, then negated, and the result placed in z. + z = -(x + y); + There are of course other arithmetic expressions to be learned, including how the contents of a variable can be incremented or decremented by 1 or more. There are a couple of ways to do this, and the method that is applicable in most programming languages will be examined first. One way is to first get the contents of a variable, add or subtract 1, and then copy the new number back to the variable as follows: + int x, y; + x = 0; + y = 0; + x = x + 1; + y = y - 1; + At first the fourth and fifth statements above might appear unusual to the beginning programmer. The fourth statement seems to be saying that x is equal to x + 1, which would be impossible in algebra. How could a value in x be equal to itself plus 1? The answer is that it cannot. The reason why this might look unusual is that one might be mistaking the equal sign in Java for an equal sign in algebra, which it is not. If one recalls from Sect. 1.4, the equal sign in Java is the assignment symbol, which takes a copy of the result on the right side and places it in the variable on the left. + In this case, the value in x, which is a 0 as shown above, plus a 1 is 1, and that is the value placed into x. So prior to the execution of the fourth statement, the value in x is a 0, and after the execution of the fourth statement, the value in x is a 1. The same sort of process occurs with the statement using subtraction, where the final value in y would be a -1. Also note that since both variables appear on the right side of the assignment symbol, they must be initialized to some value and should not be indeterminate. At first these statements might be a little confusing, but with time they become second nature. Statements like these are often used to increment and decrement variables that are used as counters and will be discussed in detail in Chap. 4. + Since these operations are fairly commonplace, the languages C, C++, and Java have shortcuts for them as follows: + ++x; x++; + --y; y--; + These operators are very convenient. The operators on the left side work the same way as those on the right when they are used as standalone statements. The style on the right is seen more often and will be used again extensively in Chap. 4. However, when used as part of a larger expression, the two styles have entirely different meanings. For example, consider the following two statements: + a = ++x; b = y++; + If x and y originally contain a 2, their respective memory locations would initially appear as follows: + At first it might seem that all four variables would contain a 3, but that would be incorrect.
When the ++ (or --) appears prior to a variable, the increment is performed before the assignment or any other operation that might be in the expression. On the other hand, if the ++ (or again --) appears after the variable, then any other operations are performed first, including the assignment operation. The result is that in the example on the left, the value of x is incremented by 1, which makes x contain a 3, and then the new value of x is assigned to a, which then also contains a 3. In the example on the right, the value in the variable y, which is a 2, is first assigned to b. Then the value in y is incremented to 3, and the value in b is still a 2, as shown below: + As mentioned above, as standalone operators, the ++ and -- can be fairly useful and easy to use, and this text will use them more frequently in Chap. 4. However, the simpler initial approach, such as x = x + 1;, is common in almost all languages, so this text will tend to use it initially to help reinforce how an expression like this works. Further, when these operators are used in more complicated expressions, their use becomes much more difficult to understand, and it is for this reason that this text will tend to avoid the use of the ++ or -- operators in this fashion. However, be aware that intermediate and advanced texts often use these operators more frequently in complicated expressions, so one needs to know how they work and also be careful when reading code containing them.
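The difference can be confirmed with a short snippet (added here for illustration, using the variable names from the example above): + int x, y, a, b; + x = 2; + y = 2; + a = ++x; + b = y++; + System.out.println(a + " " + x); // outputs 3 3 + System.out.println(b + " " + y); // outputs 2 3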
As shown at the beginning of this section, when two variables are added together, the sum is often stored in a third variable. However, similar to counting, when a constant such as a 1 is added to a variable in the process of trying to find a total, one variable is added to another variable. For example, consider the following segment: + int total, num; + total = 0; + num = 5; + total = total + num; + where the initial contents of the respective memory locations would appear as follows: + As with incrementing by 1 previously, it might look a little odd to see the variable total on both sides of the equal sign. Again, the equal sign does not mean equality but assignment, where the expression on the right is evaluated first and the result is then stored in the variable on the left. Also, since the variable total appears on both sides of the assignment symbol, it needs to be initialized with a value prior to the statement. After the 0 and 5 are added together, the result is then placed back into total as follows: + Just as the increment operation has a shortcut, the ability to find a total also has a shortcut. This shortcut is as follows and has the same effect as the instruction above: + total += num; + Similar shortcuts can also be used with the subtraction, multiplication, and division operators, but they are used less frequently than the one for addition. As with the previous shortcuts, these appear only in languages like C, C++, and Java and not in all languages. Likewise, since they do not appear in all languages and do not illustrate as readily how values can be totaled, this text will tend not to use these shortcuts as often. + Although all the basic arithmetic operations are available in the Java programming language, there are a number of other functions that would be helpful to have available. In addition to the constants PI and E for pi and e, respectively, many extra functions are provided in the Math class. Along with the round method previously introduced in Sect. 1.4, some of the other methods include square root, the power function, and the trigonometric functions. These methods, along with some others, are shown in Table 1.2. + Table 1.2 + Various methods in the Math class + Method | Function performed | Arguments | Value returned + ---|---|---|--- + cos(x) | cosine | double (in radians) | double + pow(x,y) | x to the power of y | double | double + round(x) | round | float (or double) | int (or long) + sin(x) | sine | double (in radians) | double + sqrt(x) | square root | double | double + tan(x) | tangent | double (in radians) | double + toDegrees(x) | convert radians to degrees | double | double + toRadians(x) | convert degrees to radians | double | double + To illustrate a few of these functions, examine the program segment in Fig. 1.15. + Fig. 1.15 + Sample Math class constants and methods + The methods should be fairly straightforward given their descriptive names and the reader's requisite mathematical background. After execution of the segment, the answers stored in the variables power, sqRoot, sine, and cosine would be 8.0, 2.0, 0.0, and -1.0, respectively. Note that the value in z is in terms of PI, because the trigonometric functions work with radians instead of degrees. If the initial value in z were in degrees, the method toRadians could be used. + ## 1.8 Comments + Although comments were discussed briefly in Sect. 1.2, there are a few more items that should be discussed. As mentioned previously, a comment is either preceded by two slashes //, in which case the remainder of the line is considered a comment by the compiler, or begins with a slash and an asterisk /* and ends with an asterisk and a slash */, which allows a comment to extend over multiple lines in a program. Single-line comments are helpful in explaining an individual line or multiple lines of code. Although a single-line comment can be placed off to the right-hand side of the statement it is describing, the line can sometimes get crowded once code is indented, as shown in subsequent chapters. As a result, this text will usually place comments just prior to the line of code or code segment being documented. For example, the following comment helps the reader of the program understand what the subsequent statement accomplishes: + // calculate the area of a rectangle + areaRect = base * height; + Multiple-line comments are also helpful for creating what are called headings at the beginning of programs and methods in class definitions. The format of these headings can vary in different computer courses and companies, so be sure to determine the local requirements. An example of one such heading might be as follows: + /* name: your name + class : cs 1xx + prog : one + date : mm/dd/yyyy + */ + Once filled with the corresponding information, this heading identifies the author of the program, the class it was written for, the program number, and the date written. As can be seen, comments are good for documenting what various sections of code do in a program and for identifying who wrote a program, among other things. Having comments within a program explaining what the program does is known as internal documentation, whereas having explanations that appear in manuals (whether online or in physical form) is known as external documentation. Internal documentation tends to be more specific and is helpful to programmers, whereas external documentation tends to be more general and is useful to users, customers, and managers who may not understand programming.
Although at first some of the simpler programs will not appear to need comments, it becomes imperative to include comments as programs become larger and more complex. If the original programmer is on vacation or is no longer with the company, documentation is essential to help other programmers understand how the program works. Although many of the programs written in a first programming course might not be too complex, it is helpful to include comments to gain practice in good commenting techniques. To that end, the complete programs at the end of each chapter will include comments to help the reader understand the program and learn some commenting techniques. + There is also another way to document a program, using Javadoc. This technique is very useful with larger programs that have many classes and methods, and an introduction is presented in Appendix C. Again, many computer science departments and computer science professors have different documentation standards, as do many different companies. Although they share some commonalities, there can also be a number of differences. Find out what your professor's or company's standards are and be sure to follow them closely. + ## 1.9 Program Design + When writing a program for the first time, there is a tendency to want to just start keying the program into the computer and get it to work. Initially this method appears to work fairly well with the small programs at the beginning of a text and of a course. As mentioned previously, many beginning programmers focus primarily on the syntax of their program, and they want to avoid getting syntax errors. However, as problems get more complex, they become more difficult to solve, and programs written this way will tend to have not only more syntax errors but also complicated logic errors, which are more difficult to correct since no error messages are provided. + As an analogy, an individual might be able to build a small storage shed by just sawing and nailing some lumber together without worrying about the overall design of the project. However, with a larger project such as a house, an apartment building, or an office building, that methodology would not be sufficient. Instead there are many other people who must be consulted, including the original customer who wants the building built, the architects who work with the customer, the contractors, and the carpenters. The same holds true in the world of programming, which involves customers, users, and managers. + What are needed are various strategies and tools to help write programs correctly. Just as blueprints and plans are used by the architect in the above example, there are techniques that can be used by analysts, software engineers, and programmers. Although the complete process for developing software might not be needed initially with smaller programs, it does not hurt to practice the various techniques on smaller programs to gain familiarity, so that when one advances to more difficult projects, one is comfortable with many of the techniques. Although the following techniques are used primarily with non-object-oriented programs, they can be augmented with the object-oriented design techniques introduced in the next chapter and used in larger programs. + There are many different methodologies, and a number of stages within the various methodologies, for solving problems that can be found in different texts, but upon closer examination they are all rather similar.
They tend to include at least four stages, and they are usually comparable to the following: + 1. + Analysis + 2. + Design + 3. + Implementation + 4. + Maintenance + The analysis stage is where the needs of the user or customer are first determined. Questions concerning the form and quantity of the input, the type of processing that needs to be done, the storage requirements of the data, and the type of output needed are asked and clarified at this stage. This would be similar to a customer in a construction project trying to determine what type of building should be built. In a first-semester programming class, this stage may or may not be included. Sometimes a professor might have already completed the analysis stage and included what is needed in the programming assignment. However, at other times, a professor might require this stage, and the student will need to ask a number of questions. This might be especially true when working on a team project in a senior capstone course. + The design stage is where a project begins to take shape. It is similar to the architect creating a set of blueprints and models for the user to examine, because changes are much easier to make on paper or with the model than once the construction of the building has started. Various tools such as UML diagrams (discussed in the next chapter) and pseudocode (discussed later in this section) are used by analysts, software engineers, and programmers to help design the program. Again, it is much easier to make changes during the design phase than once the programming has begun. + The implementation stage is where the code is actually written and compiled, and errors are corrected. Once the code is free of syntax errors, it is thoroughly tested. This includes testing the various components of the program to be sure each section is working properly. If not, then the code needs to be debugged to correct any logic errors. In addition to the various components, the entire program needs to be tested to ensure that all the components work together as planned. Sometimes errors are a result of not following the design, whereas at other times it is not necessarily the code but rather the design itself that has the error, in which case one has to go back and correct the error in the design. The result is that the stages above are not steps that need to be rigorously adhered to in order, but rather one may need to return to a previous stage for clarification or to fix a possible error. + Although it is tempting to jump directly to the implementation stage, this tendency should be avoided. It is important to take the time to properly design the algorithm first before starting to key in a program. An algorithm is a step-by-step sequence of instructions, not necessarily implemented on a computer. Once an algorithm is implemented in a specific language, it is then a program. By taking the time to design a well-thought-out algorithm, there will be fewer logic errors in the program. Although it might seem to take longer to include the design stage, the savings will be more than made up for by less time spent debugging logic errors later. + The maintenance stage is where all the modifications and updates take place. In an industrial-strength program, more time is spent in the maintenance phase than in all three of the preceding stages. This is because once a program is up and running, there can be numerous changes that need to be made over the lifetime of a program.
This is another reason why a program should be designed well, in order to facilitate modifications later in the life of the program. Unfortunately, beginning programmers do not often experience this stage of a program, because once the concepts are learned from one programming assignment, the program is often not used again, and another program is assigned to introduce the next set of concepts. However, in some upper-level courses, the assignments get longer, existing programs might be modified and reused, and students get to have some experience with the maintenance stage of programs. Regardless, it helps even beginning students to design well-thought-out programs to gain practice, in the event that a professor decides it might be easier to modify an existing program rather than design a new program from scratch, as is done in the real world. + One technique that can help during the design stage is the use of pseudocode. Pseudocode is a combination of English and a programming language. Since it is not really a programming language, this is the reason for its name, "pseudo" code. The advantage of using pseudocode is that one can concentrate on the logic of an algorithm and not worry about the syntax of a particular programming language. In fact, well-written pseudocode should be understood by any programmer regardless of the programming language that they use, and they should be able to convert the pseudocode into their particular programming language. However, there can be many different versions and levels of detail included in pseudocode, so it is best to check with one's instructor or company whether there are any preferences or standards that are employed. In this text, when pseudocode is used, it will be written with as much detail as possible so as not to be ambiguous and to help with the translation into Java. As a simple example, consider the following pseudocode on the left and the corresponding Java statement on the right (using the rectangle calculation from Sect. 1.8 for illustration): + areaRect ← base × height          areaRect = base * height; + Note first that an arrow is used instead of an equal sign to indicate an assignment statement. This helps illustrate the direction of assignment, since some languages use symbols other than an equal sign to illustrate assignment. Also notice that a mathematical symbol is used instead of an asterisk to illustrate multiplication. Lastly, a semicolon is not used, since not all other languages use them to terminate statements. The result is that the pseudocode is more generic and helps in the translation to other languages, not just the Java programming language. Again, this is just one sample of pseudocode, so be sure to check your local guidelines and requirements. + Even when all attempts to write a logically correct program are followed, the possibility of logic errors still exists. When this occurs, a programmer should not start to randomly alter code in the hope that the error might be fixed. Although this might work occasionally with smaller programs, it rarely works as programs become larger and more complex. Instead, one should look for patterns in the output in an attempt to isolate the problem. Further, one needs to carefully check the program by walking through the code to ensure that it is doing what was originally intended. To assist in this process, many IDEs include debuggers that can trace the contents of various memory locations to help locate a logic error. However, do not rely on the debugger alone to help correct the problem, but rather use it as a tool to assist in tracing the logic of the program.
Note first that an arrow is used instead of an equal sign to indicate an assignment statement. This helps illustrate the direction of the assignment, since some languages use symbols other than an equal sign for assignment. Also notice that a mathematical symbol is used instead of an asterisk to illustrate multiplication. Lastly, a semicolon is not used, since not all languages use them to terminate statements. The result is that the pseudocode is more generic and helps in the translation to other languages, not just the Java programming language. Again, this is just one sample of pseudocode, so be sure to check your local guidelines and requirements.

Even when all attempts to write a logically correct program are followed, the possibility of logic errors still exists. When this occurs, a programmer should not start to randomly alter code in the hope that the error might be fixed. Although this might work occasionally with smaller programs, it rarely works as programs become larger and more complex. Instead, one should look for patterns in the output in an attempt to isolate the problem. Further, one needs to carefully check the program by walking through the code to ensure that it is doing what was originally intended. To assist in this process, many IDEs include debuggers that can trace the contents of various memory locations to help locate a logic error. However, do not rely on the debugger alone to correct the problem, but rather use it as a tool to assist in tracing the logic of the program. If a debugger is not available, well-placed output statements at critical points in the program can help in the debugging process. In the end, it is the programmer reading the code carefully to see what the code is actually doing, rather than what one thinks it is doing, that will ultimately fix logic errors in a program.

## 1.10 Complete Program: Implementing a Simple Program

Combining all the material from Chap. 1, one can now write a simple program to prompt for and input various numbers, perform a wide variety of calculations, and output answers as needed. In this section, a program that calculates the two roots of a quadratic equation ax² + bx + c = 0 will be developed and implemented. As might be recalled from mathematics, the two roots are defined as follows:

x = (-b + √(b² - 4ac)) / (2a)

and

x = (-b - √(b² - 4ac)) / (2a)

Problem statement: Write a program to calculate the two roots of a quadratic equation. Assume that a ≠ 0 and that the relationship b² ≥ 4ac holds, so there will be real number solutions for x.

Once a problem statement has been given, the requirements can be determined by analyzing the problem. The program will:

* Prompt a user to enter values for a, b, and c

* Compute the two roots

* Display the two roots

During the design stage, pseudocode can be used to outline the program. At this point, one does not need to be concerned with the details of the implementation, such as the name of the class or the parameters in the main method; the pseudocode simply lists the steps that need to be taken to accomplish the task. The following is the pseudocode for a program calculating the two roots of a quadratic equation:

* declare a, b, c, root1, root2

* input (a)

* input (b)

* input (c)

* root1 ← (-b + √(b² - 4ac)) / (2a)

* root2 ← (-b - √(b² - 4ac)) / (2a)

* output (root1, root2)

Observe in the formulas for the roots that the expression under the square root sign is called the discriminant and is used in calculating both roots. Therefore, the square root of the discriminant can be calculated prior to the computation of root1 and root2, so that it does not need to be calculated twice. The augmented pseudocode is

* declare a, b, c, root1, root2, sqrtDiscr

* input (a)

* input (b)

* input (c)

* sqrtDiscr ← √(b² - 4ac)

* root1 ← (-b + sqrtDiscr) / (2a)

* root2 ← (-b - sqrtDiscr) / (2a)

* output (root1, root2)

After the design phase comes the implementation phase. Consider the following program that is derived from the pseudocode above:
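The program itself appeared as a figure that is not reproduced here. The following sketch is consistent with the pseudocode above and the sample input and output given below; the class name QuadraticRoots and the use of the Scanner class for input are assumptions:

import java.util.Scanner;

class QuadraticRoots {
    public static void main(String[] args) {
        double a, b, c, root1, root2, sqrtDiscr;
        Scanner scanner = new Scanner(System.in);
        // input section: prompt for and input the three coefficients
        System.out.print("Enter a: ");
        a = scanner.nextDouble();
        System.out.print("Enter b: ");
        b = scanner.nextDouble();
        System.out.print("Enter c: ");
        c = scanner.nextDouble();
        // calculate the square root of the discriminant once, then both roots
        sqrtDiscr = Math.sqrt(Math.pow(b, 2) - 4.0 * a * c);
        root1 = (-b + sqrtDiscr) / (2.0 * a);
        root2 = (-b - sqrtDiscr) / (2.0 * a);
        // output section: a blank line first, then the equation and its roots
        System.out.println();
        System.out.println("Two roots of the equation, "
                + a + "*x*x + " + b + "*x + " + c + " = 0, are");
        System.out.printf("%.2f and %.2f.%n", root1, root2);
    }
}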
Observe the formula for the discriminant used in calculating root1 and root2. The methods sqrt and pow are defined in the Math class and are used to calculate the square root of the discriminant and the number b raised to the power of 2. All the parentheses are necessary to obtain the answer, which is accurate to at least two decimal places. In the output section of the program, println is called at the beginning in order to have a blank line between the input and the output. The specifiers for root1 and root2 do not include the width, to avoid any extra space before the roots are output, since an extra space is already included in the string. Given the above program, sample input and output are shown below:

Enter a: 2.0

Enter b: -5.0

Enter c: -3.0

Two roots of the equation, 2.0*x*x + -5.0*x + -3.0 = 0, are

3.00 and -0.50.

## 1.11 Summary

* Machine language and assembly language are low-level languages, where the former uses ones and zeros and the latter uses mnemonics. High-level languages are more English-like, where C, C++, and Java are examples of high-level languages.

* Compilers convert the entire high-level language program into machine language before executing the machine language program, whereas interpreters convert a high-level language program one instruction at a time and then execute only the corresponding machine language instructions before converting the next high-level instruction.

* Java is a hybrid system, where the Java instructions are converted into an intermediate language called bytecode using a compiler, and then the bytecode is converted into machine language using an interpreter.

* System.out.print leaves the cursor on the same line, whereas System.out.println moves the cursor to the next line.

* Just because there are no arguments in a System.out.println, it does not mean a blank line is output. A blank line is output with a System.out.println only when there are no preceding System.out.print statements.

* Remember that multiplication and division have a higher precedence than addition and subtraction and that unary operators have an even higher precedence.

* Parentheses can override any operator precedence, where the innermost nested parentheses have the highest precedence. It is also good practice not to use unnecessary parentheses.

* Whenever there is a tie at any level of precedence, the operators or parentheses are evaluated from left to right.

* The ++ and −− operators are an easy shortcut when used as standalone statements. However, great care must be taken when they are used in assignment statements or with other operators. In that case, if the ++ or −− precedes a variable, it is performed first, but if it appears after the operand, it is performed last.

## 1.12 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Indicate whether the following statements are syntactically correct or incorrect. If incorrect, indicate what is wrong with the statement:

A. integer num1, num2;

*B. double num3;

C. 7.8 = num3; (Assume that the variable num3 has been declared correctly.)

*D. int j;
j = 5.5;

2. Assume the following declaration and initialization of variables:

int i, j;
double d;
i = 1;
j = 5;
d = 2.34;

Determine the value for each of the following expressions, or explain why it is not a valid expression:

*A. i / j;

B. j + d;

C. Math.pow(j);

D. i - j * d

E. i + d * (j * 3 - 2) / 4

3. Assuming the following declaration and initialization of variables,

int i;
double d;
i = 3;
d = 2.34;

determine the value assigned to the variable in each of the following assignment statements, or explain why it is not a valid assignment statement:

A. i = d;

*B. d = i + d;

C. d = Math.pow(5, Math.sqrt(Math.pow(i, 2)));

4. Implement each of the following statements in the Java language:

A. Declare a variable weight of type double.

*B. Declare a constant EULER_NUMBER of type double and assign it the value 2.7182.

5. Given the following Java program, what will be output to the screen? Be sure to line everything up properly. Use an underscore to represent a blank and the words "blank line" to represent a blank line:

class OutputTest {
    public static void main(String[] args) {
        System.out.println("alpha ");
        System.out.println();
        System.out.print(" beta");
        System.out.println(" gamma");
    }
}

*6. Write code to output the following pattern:

** **
** **
****
****
****
****
** **
** **
*7. After the following statements are executed, what is stored in value1, value2, and value3?

int value1 = 5;
int value2 = 9;
int value3 = 4;
value1 = value2;
value2 = value3;
value3 = value1;

8. Write an equivalent Java assignment statement for each of these mathematical expressions.

A.

*B.

C.

9. Write a complete program to prompt for and input a number, and then compute 2 to the power of the number that was input. The form of the input and output can be found below, and as always be careful with the vertical and horizontal spacing.

* Input and Output:

* Enter the number: 4.0

* Two to the power of 4.0 is 16.0.

James T. Streib and Takako Soma, Guide to Java: A Concise Introduction to Programming, Undergraduate Topics in Computer Science, DOI 10.1007/978-1-4471-6317-6_2, © Springer-Verlag London 2014

# 2. Objects: An Introduction

James T. Streib and Takako Soma
Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

This chapter introduces classes and objects. Public and private data members along with value-returning methods and void methods with parameters are discussed. How objects are created and how methods are invoked are illustrated using contour diagrams. Contours help the reader gain a better understanding of object-oriented concepts by providing a visual representation of objects. Constructors are introduced along with multiple objects and classes. Lastly, UML (Unified Modeling Language) class diagrams are illustrated, and a complete program implementing a simple class and client program is provided.

## 2.1 Introduction

Having written a complete Java program in the preceding chapter, one should have a basic understanding of how a program works. However, as programs get larger, they can become very difficult to modify. It would be similar to trying to write a paper or book as just one long paragraph without any chapters, sections, or paragraphs. To help make a program easier to modify and maintain, it can be broken up into sections, much like a book is divided up into chapters. Further, if a section of a book needed to be referred to many times, instead of repeating that section over and over again, it could be placed in an appendix, and then the appendix could be referred to as necessary. Similarly, if a section of a program needs to be used again, the program can be broken up into subprograms. Instead of having to rewrite the code, a program can just call the same subprogram repeatedly, thus saving the time spent rewriting the code and saving memory as well.

However, what if the repeated code is only slightly different from the code that has been previously written? One could rewrite the code again with only slight modifications, but the chance of making mistakes would increase. There would also be time wasted rewriting existing code and memory wasted to store it.

Instead of the above scenario, the programming methodology called object-oriented programming (OOP) could be used. OOP allows programmers to identify the common memory locations and code and then create what is known as a class. Then, as variations of the class are needed, they can be made based on the original class. This allows for the reuse of the software that was initially created in the original class, and the new classes are just variations on the theme of the original class.
A class is essentially a definition of an object or group of objects. For example, in the real world, the drawings, plans, or blueprints for a house are a definition for a single house or a group of houses. Although blueprints could be drawn up for a single custom-built house, many times there might be a set of master blueprints for a group of houses. A subdivision could be built with houses that are all very similar but have various subtle differences so that they do not all look the same. For example, some houses might be built with different color siding, with windows in different locations, with one- or two-car garages, and so on. The reason for doing this is to keep the cost of the individual houses reasonable. Should a major change in the blueprint need to be made for all the houses, then only the master blueprints would need to be changed. However, if a change only needs to be made to some of the houses, such as only those houses that have fireplaces, then only the individual supplement that contains the plans for fireplaces would need to be changed. This idea is called inheritance and will be explored further in Chap. 9. However, before learning more about that topic, the fundamentals of object-oriented programming must be discussed first.

## 2.2 Classes and Objects

In object-oriented terminology, the master blueprint would be called the class definition, and an actual house would be an instance of that class, or what is known as an object, as shown in Fig. 2.1. This can be a source of confusion for some beginning programmers, who sometimes use the words class and object interchangeably. However, if one keeps the distinction between the plans or blueprints as the class and the individual houses as instances of the class, or the objects themselves, it makes the learning of object-oriented programming easier in the long run.

Fig. 2.1

Classes and objects using blueprints and houses

Although a class can be placed in the same file right before or after the class that contains the main program, it is often placed into a separate file. This eventually helps when there are a number of different classes and when there is more than one programmer working on a project. However, this text will show classes immediately after the main program in order to save space.

As with the initial skeleton of the main program in Chap. 1, the introduction of classes will also start with an empty class called Number as shown below:

class Number {
}

As can be seen, a class is somewhat similar to the main program except it is much simpler. As before, the word class is a reserved word, Number is the name of the class, and the opening and closing braces indicate the body of the class.

## 2.3 Public and Private Data Members

As before, an empty class is not very useful until code is added to it. Two of the most important items in a class definition are its data members and methods. A data member is similar to the declaration of a variable in the previous chapter. An important difference is that data members need to be declared using one of the access modifiers, public or private. A public data member is one that can be seen and used by an object of the class in which it was declared but can also be used outside the object, such as by the main program. A private data member is one that can only be seen or used within an object of the class and cannot be used externally, such as by the main program.
As shown below, the variable or data member x is declared as private, and the data members y and z are declared as public.

class Number {
    private int x;
    public int y, z;
}

At first, one might be tempted to declare all data members as public to allow for easy access from the calling program. However, this would contradict the reason one creates a class in the first place. One of the important aspects of OOP is data encapsulation. This means that the data in an instance of a class is encapsulated within the object and not directly accessible from the outside world. For example, in an automobile there are various parts which are inaccessible when one is driving, such as the fuel tank. However, through a gauge on the dashboard, one can tell whether there is fuel in the fuel tank. This is similar to public and private data members, where in many instances one does not want the main program having direct access to the data members. So although it is possible to declare data members as public, they will most often be declared as private.

If a data member is not directly accessible when it is declared as private, how does one gain access to it? The answer is through a method, specifically a public method, which can indirectly allow access to private data members. Although methods are sometimes declared as private, for now most of the methods will be declared as public. If a method just accesses and examines the contents of a data member, it is known as an accessor. Should a method alter a data member, it is known as a mutator. An accessor method is often used to get the contents of a data member, and a mutator is often used to set the contents of a data member. In particular, an accessor method is implemented as a value-returning method, and a mutator as a void method, as discussed in the next two sections.

## 2.4 Value-Returning Methods

First, consider a method that returns the contents of a private integer data member x as follows:

public int getX() {
    return x;
}

The word public means that the method can be accessed from the main program. If the data member is private, then the method invoked from the main program to access the data member is declared as public. (How the method is invoked will be discussed shortly.) The word int is the type of the value that will be returned to the main program. The name of the method is getX, and it is used in the main program to invoke the method. The area inside the opening and closing parentheses () is known as the parameter list and is used for sending information to the method. Since this method is an accessor and not a mutator, there is no information being sent to the method, so the parameter list is empty. The opening and closing braces {} indicate the body of the method that contains the instructions, just as in the main program. The return instruction followed by the variable x indicates what value will be returned to the main program. Although there can be more than one return statement in a method, it is good programming practice to include only one return statement, typically as the last statement in the method, as will be discussed later in Chap. 3. Returning to the automobile example, the getX accessor method is somewhat like the fuel gauge on the dash panel of a car that displays the amount of fuel in the fuel tank.
## 2.5 Void Methods and Parameters

As an example of a void method, consider the following:

public void setX(int a) {
    x = a;
}

As with the value-returning method, the void method is also public so it can be invoked from the main program. The word void indicates that the method will not return a value. Similarly, setX is the name of the method that will be used when invoking the method from the main program, as will be discussed in the next section.

Unlike the previous method, this method has a parameter (sometimes called a formal parameter) between the parentheses. Notice that it looks similar to a variable declaration, and in a sense, it is like a variable declaration with a type and a variable name. However, what is unique about a parameter is that it can accept a value from the calling program. This is accomplished through an invoking statement, where there is another variable or constant called an argument (sometimes called an actual parameter), and the value of the argument is passed to the parameter. This is not unlike how the value on the right side of an assignment symbol = is copied into the variable on the left side. This copying of a value from an argument to a parameter is known as pass-by-value; in other words, this type of parameter is known as a value parameter. A value parameter provides one-way communication from the main program to the method. Other programming languages have additional parameter passing mechanisms that provide two-way communication, but Java has only value parameters, which makes the task of learning parameters a little easier. A visual example of how this works will be demonstrated in the section on contour diagrams later in this chapter. Lastly, the only statement in the method is x = a; which is a simple assignment statement that takes a copy of the contents of the parameter a and copies it into the data member x, as discussed in Chap. 1.

A question that might be asked is where the data member x is declared, since it does not appear in either of the two methods. If a variable is used by only one of the two methods, it should be declared locally in that method, but if the value in the variable is needed in both methods, it should be declared as a data member in the class. If a variable is declared in a method, it is sometimes referred to as a local variable, since only that method has access to it. However, if a variable is declared as a data member, it is sometimes referred to as a global variable, since it is accessible by all the methods in the object. In this example, since the variable x is used by both methods, it is declared as a data member so that both methods have access to it. To illustrate a complete class using both the data member x and the two methods above, the class definition of Number is shown in Fig. 2.2.

Fig. 2.2

Number class
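Since the figure itself is not reproduced here, assembling the private data member with the two methods discussed above yields the following sketch of the class, with the mutator listed before the accessor in keeping with the convention described next:

class Number {
    private int x;            // data member: accessible by both methods

    // mutator: copies the value of the parameter into the data member
    public void setX(int a) {
        x = a;
    }

    // accessor: returns the current value of the data member
    public int getX() {
        return x;
    }
}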
Unlike the previous skeleton, the new class Number above contains only the private data member x. Also, the order of the methods is irrelevant. Sometimes the methods are put in alphabetical order, but this text will typically list the mutators first, followed by the accessors, and then order them alphabetically within each group. The use of comments and line spacing helps with the readability of the class, although they will sometimes be omitted to save space in this text.

## 2.6 Creating Objects and Invoking Methods

Given the discussion of classes and methods in the previous sections, how are instances of classes created and the methods invoked? The best way is to show an example of a complete main program. Using the skeleton program from Chap. 1 with the appropriate code added, consider the program in Fig. 2.3.

Fig. 2.3

Invoking program

Note that there are two variables named y and z declared as type int, but there is also a variable named num that is declared as type Number. Just as different variables can be declared as primitive data types, variables can also be declared as a type of a class. Similar to the primitive types, the contents of the class variables are initially indeterminate. In order to create a new instance of a class, in other words a new object, the new operator must be used, and then a reference to the new object is typically placed into a variable. The statement num = new Number(); performs these two tasks. First, a new object is created via the new Number() section of the statement. Then a reference to that new object is placed in the variable num through the assignment symbol =. It is important to remember that simply declaring a variable is not sufficient to create an object; rather, after the variable is declared, a new object must be created and then assigned to the variable. A shorter way of doing this is as follows:

Number num = new Number();

Although this technique might occasionally be used later in the text to save space, for now the two statements as shown below will be used to reinforce the concepts of variable declaration, object creation, and the assignment of references to variables.

Number num;
num = new Number();

This also reinforces the idea concerning the separate declaration and assignment of variables presented in Chap. 1. If one's instructor prefers using a single statement, or if one is reading this text independently and wants to use just one statement, then of course do so.

## 2.7 Contour Diagrams

As indicated in the preface, contour diagrams are a very useful mechanism to visualize how objects, data members, methods, and parameters work. By building a good working visual model of objects, there will be less of a chance of having misconceptions about how objects work. Building a solid foundation in the fundamental concepts also makes it easier to understand more complex ideas in the future.

The purpose of using contours is not only to show the data members, similar to the variables that were drawn in Chap. 1, but also to show the scope of where the data members are accessible. The scope of a local variable is the method where it is declared, and the scope of a data member is all of the methods in the object.

Although not required, it is also helpful to include the type of the variable in the contour to avoid confusion among the many different types of variables. In addition to the variables, contours can also show how parameters are represented in the methods. Lastly, contours show the dynamic or changing nature of a program as it executes.

As before, it is helpful to start with an example. The program from Fig. 2.3 is combined with the class from Fig. 2.2 to create Fig. 2.4, with each line numbered in a comment to the right for convenience in the description that follows.

Fig. 2.4

Invoking program and Number class
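Figure 2.4 is likewise not reproduced here; however, the line numbers cited in the walkthrough that follows allow it to be reconstructed approximately as shown below. The exact output statement on Line 9 is an assumption:

class Invoke {                                 // Line 1
    public static void main(String[] args) {  // Line 2
        int y, z;                              // Line 3
        Number num;                            // Line 4
        y = 5;                                 // Line 5
        num = new Number();                    // Line 6
        num.setX(y);                           // Line 7
        z = num.getX();                        // Line 8
        System.out.println(z);                 // Line 9
    }                                          // Line 10
}                                              // Line 11

class Number {                                 // Line 12
    private int x;                             // Line 13
    public void setX(int a) {                  // Line 14
        x = a;                                 // Line 15
    }                                          // Line 16
    public int getX() {                        // Line 17
        return x;                              // Line 18
    }                                          // Line 19
}                                              // Line 20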
The contour diagram in Fig. 2.5 shows the state of execution just prior to the execution of Line 5 in the main program.

Fig. 2.5

State of execution just prior to Line 5

The outer contour represents the class Invoke, and the inner contour around the boxes shows the scope of the variables in the main program. Although the contours do not indicate much presently, the use of the contours will become clear shortly. Further, note that although technically the Invoke contour should be drawn for each of the following figures, it is not very useful at this time and will not be drawn for the rest of this chapter in order to simplify the drawings. However, it will be reintroduced and discussed further in Chap. 5.

Continuing, the first column of boxes on the left indicates the names of the variables, and the boxes in the middle indicate the types of the variables, where y and z are of type int, and num is of type Number. Lastly, the boxes on the right indicate the current contents of the variables. Note that the state of execution is just prior to Line 5, not after its execution. Although data members of an object are automatically initialized by the system, local variables such as y and z are not; in either case, this text will continue to assume that the variables do not contain an initial value and are indeterminate, as discussed in Chap. 1.

Although rather simplistic here, once Line 5 is executed, the variable y contains the value 5. Figure 2.6 shows the state of execution just prior to the execution of Line 6 and also does not show the outer contour for the Invoke class.

Fig. 2.6

State of execution just prior to Line 6

However, when Line 6 is executed, things start to get interesting. Just as the Invoke contour was drawn in Fig. 2.5, when a new instance of the Number class is created, a new corresponding contour is also created. Although, as mentioned previously, the contour for Invoke is not very useful at this time, the contour for Number is necessary for the following discussion. Note that there is one data member in the class, and it is shown within the Number contour. Once the instance is created, a reference to the object is assigned to the variable num. This reference is illustrated as an arrow in the contour diagram, where the arrow points to the new contour and the tail of the arrow is placed in the variable num. Figure 2.7 shows the state of execution just prior to Line 7 in main.

Fig. 2.7

State of execution just prior to Line 7

The next line to be executed is Line 7, which invokes the method setX. Prior to having the flow of control go from Line 7 to Line 15 in the setX method, a number of things need to occur. Just as when a new object is created and a corresponding contour is drawn, the same holds true when a method is invoked. Since the method is part of the instance of the class Number, this is where the corresponding contour appears. A convenient way of remembering this is that whenever there is a dot in the invocation of a method, one needs to follow the reference or arrow to the corresponding contour. With the instruction num.setX(y); one just starts with the variable num, then follows the arrow to the Number contour, and then within the Number contour creates another contour for the setX method, as shown in Fig. 2.8, which illustrates the state of execution just prior to Line 15 in setX.

Fig. 2.8

State of execution just prior to Line 15

Note that the contour setX has a memory location associated with it for the parameter a. As mentioned in Sect. 2.5, a parameter is essentially a variable that takes on the value of the corresponding argument. Since the value contained in the variable y, which is used as an argument in the main program, is a 5, the corresponding parameter takes on a copy of that same value, similar to an assignment statement.
This also illustrates why parameters in Java are called value parameters, because they merely take on the value of the corresponding argument. Note that an argument and the corresponding parameter can have the same name or different names. In this example, the argument y and the parameter a have different names, illustrating that the two do not have to be the same. Then, when Line 15 is executed, Fig. 2.9 shows the state of execution just prior to Line 16 in setX.

Fig. 2.9

State of execution just prior to Line 16

Note that Line 15 is the assignment statement x = a; where the contents of the parameter a will be copied into the variable x. However, notice that the parameter a is inside the contour for setX and the variable x is in the contour for the object or instance of Number. Is it okay for the contents of a to be assigned to x? The answer is yes. The reason is that when executing a statement that contains a variable, the system first looks for the variable within the innermost contour. If it is found, it uses that variable or parameter. If it is not, then the system looks at the variables contained within the next most encompassing contour. If the variable is found there, it is used; however, if the variable is not found in any encompassing contour, then a syntax error will be generated at compile time. It is very important to note that although the system will look outward at any encompassing contour, it cannot look into another contour.

Another way of looking at this is to say that the scope of the variable a includes only the method setX, whereas the scope of the variable x includes both the object num and the method setX. The word scope is just a way of expressing in which objects and methods a variable is accessible. Problems can occur when there are two variables of the same name, and examples will be illustrated later in Chap. 5, but for now this text will use different variable names to avoid this difficulty.

Although Line 16 is not an instruction, it does represent the end of the method setX. When the method is done executing, control is transferred back to the main program. Since setX is a void method, control is transferred back to the line just after the one that invoked the method. The result is that Fig. 2.10 represents the state of execution just prior to Line 8 in the main program.

Fig. 2.10

State of execution just prior to Line 8

Note that the contour for the setX method is shaded light gray. The reason for this is to indicate that the contour is deallocated, where the memory locations associated with the method are no longer accessible. Although the contour can be, and often is, simply erased as shown in Fig. 2.11, it is sometimes helpful to show the contour as shaded prior to erasing it so that the contents of the memory locations can still be seen by others. Although shading a contour might be difficult when drawing a contour by hand, an alternative is to just very lightly cross it out while still allowing its contents to be seen.

Fig. 2.11

State of execution just prior to Line 8 (alternative)

So what happens when Line 8 is executed? In a manner similar to, but somewhat different from, the invocation of the void method setX, the value-returning method getX is invoked, and the state of execution just prior to Line 18 is shown in Fig. 2.12.

Fig. 2.12

State of execution just prior to Line 18

Note that there are no memory locations allocated in the contour for getX.
The reason for this is that there are no parameters in the parameter list, nor are there any local variables declared within the method, as will be discussed later. As a result, no memory locations are allocated within the contour. So what happens when the return x; statement is executed? Since there is no variable declared by the name x in the getX contour, the system looks outside the contour to find the variable x in the Number contour. The number 5 in the variable x is the value returned to the main program. Since this is a value-returning method, control does not return to the line after the one that invoked the method, but rather control is returned to the same line from which it was invoked, so that the value returned can be assigned to a variable or possibly output. When the return is executed, control is transferred back to Line 8, where the number 5 is assigned to the variable z in the main program.

Figure 2.13 shows the state of execution just prior to Line 9, with the contour for getX shaded as discussed previously. Alternatively, the contour for getX need not be shaded, or even drawn, as shown in Fig. 2.14.

Fig. 2.13

State of execution just prior to Line 9

Fig. 2.14

State of execution just prior to Line 9 (alternative)

Since Line 9 is just a print statement and does not contribute to the understanding of objects, the state of execution after Line 9 is not shown here. Although almost every contour was drawn to illustrate the intricate details in the preceding example, this will not always be the case. In the future, some of the more simplistic contours might be skipped, but should they be needed, they will be drawn in order to explain a particular concept, as in the next section on constructors.

## 2.8 Constructors

When a new object is created, it is sometimes nice to have the various private data members initialized to specific values. This is convenient and allows variables to have default values in case a programmer forgets to initialize them. The mechanism needed to accomplish this task is known as a constructor. A constructor is a special method that is automatically invoked once, at the time an object is created via the new instruction. It looks similar to other methods, but instead of having its own unique name as determined by the programmer, it has the same name as the class. Although this can be confusing at first, it helps to remember that when a new object of a class like Number is created, the method that serves as the constructor for the class has the same name, Number, and does not have a return type. Again, it is best to show an example. In this case the constructor initializes the data member x to the default value 0, again assuming that the initial value of variables is indeterminate, as discussed in Chap. 1.

public Number() {
    x = 0;
}

Including the above constructor, the previous class would look as shown in Fig. 2.15, where typically constructors are located after the data members but prior to all the other methods.

Fig. 2.15

The Number class with a constructor

Using the first 11 lines of the main program in Fig. 2.4 and replacing Lines 12 through 20 with the code from Fig. 2.15, the program in Fig. 2.16 is the revised version of Fig. 2.4 that now incorporates a constructor. Instead of walking through the entire program as was done in the last section, only the first few lines of the program will be executed to illustrate how a constructor works.

Fig. 2.16

Invoking program and Number class with a constructor
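The figure is not reproduced here; combining the first 11 lines of the sketch of Fig. 2.4 with the class from Fig. 2.15 as described above gives approximately the following program. As before, the output statement on Line 9 is an assumption:

class Invoke {                                 // Line 1
    public static void main(String[] args) {  // Line 2
        int y, z;                              // Line 3
        Number num;                            // Line 4
        y = 5;                                 // Line 5
        num = new Number();                    // Line 6
        num.setX(y);                           // Line 7
        z = num.getX();                        // Line 8
        System.out.println(z);                 // Line 9
    }                                          // Line 10
}                                              // Line 11

class Number {                                 // Line 12
    private int x;                             // Line 13
    public Number() {                          // Line 14: constructor
        x = 0;                                 // Line 15
    }                                          // Line 16
    public void setX(int a) {                  // Line 17
        x = a;                                 // Line 18
    }                                          // Line 19
    public int getX() {                        // Line 20
        return x;                              // Line 21
    }                                          // Line 22
}                                              // Line 23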
After executing Line 5, the contour in Fig. 2.17 shows the state of execution just prior to the execution of Line 6 in the main program. If the contour looks familiar, it is because it is the same contour that appeared previously in Fig. 2.6.

Fig. 2.17

State of execution just prior to Line 6

However, what happens when Line 6 is executed is different from the previous program. As before, a contour is created for an instance of the Number class, which contains the variable x. Recall from the discussion above that a constructor is automatically executed when a new instance of an object is created. As a result, a contour is also created for the constructor, as shown in Fig. 2.18, which shows the state of execution just prior to Line 15 in the constructor for the class Number.

Fig. 2.18

State of execution just prior to Line 15

Notice that the contour is empty, since there are no local variables or parameters, as was the case previously with the getX() method. Also note that there is no arrow pointing to the contour either. That is because while the constructor is executing, the reference to the object has not yet been assigned to the variable num.

After Line 15 is executed, the state of execution looks as shown in Fig. 2.19. Notice that the variable x has been initialized to 0. Since there is not a variable named x in the constructor, the system looks outside to find the variable x in the class Number, similar to the setX method as discussed previously. Once Line 16 is finished, the contour for the constructor is deallocated and shaded in gray. The flow of control then returns back to Line 6 in the main program, and the reference to the object is assigned to the variable num as shown in Fig. 2.20.

Fig. 2.19

State of execution just prior to Line 16

Fig. 2.20

State of execution just prior to Line 7

The program then continues to execute Line 7 just as it did previously, where the only difference is that the variable x has been initialized to the number 0 instead of being indeterminate. Although the initialization could have been accomplished by invoking the setX method with an argument of 0, the advantage of using a constructor is that a programmer does not need to explicitly invoke a method and does not run the risk of forgetting to do so, which under some circumstances might cause a logic error. Although this is a simple example, as programs become more complicated, the role of a constructor will become more important. When one begins to learn more about data structures in later courses, the role of the constructor as a mere initializer will diminish, and it will take on roles more befitting its name as a constructor. For now, it is good practice to use constructors when possible to gain more familiarity and become more comfortable with their use and function.

## 2.9 Multiple Objects and Classes

Is it possible to have more than one instance of a class, or more than one class? The answer is yes, and this section will address some of the issues with multiple objects and classes. For example, if one wanted to have two instances of the preceding Number class, the program could be written as in Fig. 2.21. In the interest of simplifying the contours, the number of variables has been reduced in this example. For instance, instead of using local variables as arguments as done in the previous section, constants are used as arguments in Lines 6 and 7. Also, note that the values returned from getX are not stored in variables, but rather simply output, as shown in Lines 8 and 9. Again, these shortcuts are not generally encouraged, but they do save some space in the contour diagrams and hopefully help the reader see the points currently under consideration more clearly.

Fig. 2.21

Program to create multiple instances of the same class
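Figure 2.21 is not reproduced here; based on the line numbers referenced in the discussion below, it can be reconstructed approximately as follows. The constant passed to num2 on Line 7 and the exact form of the println statements are assumptions:

class Invoke {                                 // Line 1
    public static void main(String[] args) {  // Line 2
        Number num1, num2;                     // Line 3
        num1 = new Number();                   // Line 4
        num2 = new Number();                   // Line 5
        num1.setX(5);                          // Line 6: constant argument
        num2.setX(10);                         // Line 7: constant assumed
        System.out.println(num1.getX());       // Line 8
        System.out.println(num2.getX());       // Line 9
    }                                          // Line 10
}                                              // Line 11

class Number {                                 // Line 12
    private int x;                             // Line 13
    public Number() {                          // Line 14
        x = 0;                                 // Line 15
    }                                          // Line 16
    public void setX(int a) {                  // Line 17
        x = a;                                 // Line 18
    }                                          // Line 19
    public int getX() {                        // Line 20
        return x;                              // Line 21
    }                                          // Line 22
}                                              // Line 23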
Notice that there are now two variables of type Number on Line 3. As before, it is helpful to use contour diagrams to assist in the understanding of the code. In this case, only the first part of the code will be executed, and the remainder of the code is left as an exercise at the end of the chapter. Figure 2.22 shows the state of execution after Line 5 but just prior to Line 6.

Fig. 2.22

State of execution after creating two instances, prior to Line 6

Note that after the constructor has been invoked twice, there are now two instances of the class Number. There are also two variables with the same name, x, but does this cause any problems during the execution of the program? The answer is no, because each variable x is in a different instance of the Number class, where one of the variables is in the object referenced by num1 and the other in the object referenced by num2. Upon completion of Line 6, Fig. 2.23 shows the state of execution after the execution of Line 18, but prior to the execution of Line 19, in the setX method.

Fig. 2.23

State of execution just prior to Line 19

As before, the contents of the parameter a have been placed in the data member x. However, is there any confusion as to where the setX method contour should appear? No, there is not: since the method call was num1.setX(5); the system knows to execute the setX method in the contour referenced by num1. As discussed previously in Sect. 2.7, an easy way of reading the code num1.setX(5); is to first go to the variable name in the contour, in this case num1, and when there is a dot after the variable name in the code, follow the corresponding reference or arrow to the appropriate contour. In other words, a dot in the line of code corresponds to a reference or arrow in the contour diagram. After following the reference to the corresponding contour diagram, the contour for the method setX is created. This also reinforces that it is very important to create the initial object contour and corresponding reference correctly when the new instruction is first executed, because all subsequent code depends upon it.

Although the creation of two instances of the same class is fairly straightforward, one must be careful when manipulating the two instances. For example, what if one wanted to take a copy of the integer 5 in the variable x in num1 and put it in the variable x in num2? At first it would seem to be a simple assignment operation from Chap. 1, for example, a = b; to copy an integer from the variable b into the variable a. However, when dealing with objects, the results might not be what one expects. For example, what if one wrote the code num2 = num1? The contents of num1 would be copied into num2, but remember, what exactly is in num1? It is not the integer 5, but rather a reference to the corresponding object that contains the integer 5. What is copied is not the integer 5 but rather the reference, so num2 would point to the same object as num1, and the object that num2 previously referenced would be deallocated, as shown in Fig. 2.24.

Fig. 2.24

Results of num2 = num1;
Given that the simple assignment statement above does not accomplish the intended task, how then could the integer 5 be copied from the x in num1 to the x in num2? Although another technique will be shown later in Chap. 5, for now a temporary variable temp could be used, and the contents of x in num1 could be retrieved using the method getX. Then the corresponding x in num2 could be set with the method setX, as shown in the following code segment:

int temp;
temp = num1.getX();
num2.setX(temp);

Alternatively, the temporary variable might not be used, and the call to getX could be used as an argument for the setX method, as shown in the following shortened segment:

num2.setX(num1.getX());

Here the getX method is invoked first, and then the result returned is used as an argument sent to the setX method. Although the above shortcut works well, for now this text will occasionally use a temporary variable to help make the code a little easier to read.

Just as it is possible to have multiple instances of a single class, it is also possible to have multiple instances of multiple classes. To elaborate further on the Number class and make it a little more interesting, suppose one class is defined with methods to calculate the area of a square, and another class has methods to define and calculate the area of a rectangle. Although it could be argued that a square is just a special case of a rectangle, for now they will be defined as two separate classes, and this will pave the way to help explain the concept of inheritance later in Chap. 9.

The class Square will need a method to set the length of the sides and another to calculate the area of the square. Although the method that calculates the area could also return the area (see Sect. 2.11 for the alternative technique), for now an accessor method will be used to return the area of the square, and all three methods are shown in Fig. 2.25.

Fig. 2.25

Square class

Note that instead of a single data member as in the previous example, there are now two private data members, one for the side and one for the area. Except for the different variable names, note that the constructor, setSide, and getArea methods are similar to the constructor, setX, and getX methods in the previous example. The only real difference is the inclusion of the calcArea method, which calculates the area of the square and is implemented as a void method.

The Rectangle class can be implemented similarly to the Square class. The major difference between these two classes is that with a rectangle, it is possible for the two sides to be of different lengths, so two variables are needed instead of one to represent the sides, in this case sideX and sideY, as shown in Fig. 2.26.

Fig. 2.26

Rectangle class

Notice the use of three variables in the bodies of the constructor and the calcArea method. Also, since the setSide method is modifying more than one side, the body of that method is also changed, but more importantly, the setSide method has two parameters instead of just one. Lastly, the getArea method remains unchanged.

Both classes can now be implemented and used with a main program, as illustrated in Fig. 2.27. As with the last program, and again not generally encouraged, in order to help save space in the contours, note that in Lines 7 and 8 constants are used as arguments, and in Lines 11 and 12 the get methods are located in the println statements.

Fig. 2.27

The main program along with the Square and Rectangle classes
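Figures 2.25, 2.26, and 2.27 are not reproduced here; the following sketch is consistent with the descriptions above. The class name Multiple comes from Exercise 5 at the end of the chapter, while the calls to calcArea in the main method, the parameter names, and the wording of the output are assumptions, and the line numbering of the original figure is not preserved exactly:

class Multiple {
    public static void main(String[] args) {
        Square square;
        Rectangle rect;
        square = new Square();
        rect = new Rectangle();
        square.setSide(2);        // constant argument (Line 7 in the original)
        rect.setSide(3, 4);       // constant arguments (Line 8 in the original)
        square.calcArea();
        rect.calcArea();
        System.out.println("The area of the square is " + square.getArea());
        System.out.println("The area of the rectangle is " + rect.getArea());
    }
}

class Square {
    private int side, area;

    public Square() {
        side = 0;
        area = 0;
    }

    public void setSide(int s) {
        side = s;
    }

    public void calcArea() {      // void method: computes but does not return the area
        area = side * side;
    }

    public int getArea() {        // accessor: returns the previously calculated area
        return area;
    }
}

class Rectangle {
    private int sideX, sideY, area;

    public Rectangle() {
        sideX = 0;
        sideY = 0;
        area = 0;
    }

    public void setSide(int x, int y) {   // two parameters, one per side
        sideX = x;
        sideY = y;
    }

    public void calcArea() {
        area = sideX * sideY;
    }

    public int getArea() {
        return area;
    }
}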
As before, in order to see the difference between instances of multiple classes, it is helpful to walk through the contour diagrams, at least part of the way. The contour in Fig. 2.28 illustrates the state of execution after Line 6 and before the execution of Line 7 in the main program.

Fig. 2.28

State of execution just prior to Line 7

Previously in Fig. 2.22, the two object contours were identical because they were two instances of the same class. However, here in Fig. 2.28 the two object contours are different because they are instances of different classes. After executing Line 7, Fig. 2.29 shows the state of execution just prior to Line 23 in the setSide method.

Fig. 2.29

State of execution prior to Line 23

Is there any confusion as to where the setSide method contour appears? No: since the method call was square.setSide(2); the system knows to execute the setSide method in the Square class, because square is of type Square. Although somewhat different, this is similar to the previous example in Fig. 2.23, where there were two variables of the same name, but in that example there were two instances of the same class. In this case, there are two methods of the same name, but they are in two different classes. As before, an easy way of reading the code and the contour diagram is to go to the variable name, in this case square, and when there is a dot after the variable name in the code, follow the corresponding reference or arrow to the appropriate contour and then create the method contour in the corresponding object contour.

After returning to Line 8 in the main program, the rect.setSide(3, 4); statement is executed, and control is transferred to Line 39 in the corresponding setSide method in the Rectangle class. Figure 2.30 then shows the state of execution just prior to Line 41.

Fig. 2.30

State of execution just prior to Line 41

Note that this time the setSide method contour appears in the Rectangle class contour, and there are two parameters instead of one. Later it will be seen that there can be several methods within a class with the same name; however, they can be distinguished by having a different number, type, or order of the types of parameters. This concept is called method overloading and will be discussed in detail in Chap. 5. In the current example, although there are two methods that have the same name, it is not a problem because the two methods are in different classes. As with the previous example, the completion of the contours is left as an exercise at the end of the chapter.

## 2.10 Unified Modeling Language (UML) Class Diagrams

Whereas contours are helpful in examining how a specific object works, when an application becomes larger and includes several classes, it is helpful to get a better picture of the relationships among the various classes using Unified Modeling Language (UML) diagrams. UML diagrams can help one see not only the relationships between classes but also the relationships among the objects of different classes. UML is a language specifying a graphical notation for describing software designs in an object-oriented style. It gives one an overall view of a complex system more effectively than a Java program, which may provide too much detail. Again, whereas contour diagrams are helpful when trying to understand the execution of a program, UML diagrams are helpful when trying to design a program.
The class definitions and objects discussed in the previous sections can be illustrated using UML class diagrams. Figure 2.31 shows how the Number class in Fig. 2.16 can be displayed using UML class diagram notation.

Fig. 2.31

UML class diagram of the Number class

In the UML class diagram, both data members and methods are included. A class is displayed as a box that includes three sections: the top section gives the class name, the middle section includes the data members for individual objects of the class, and the bottom section includes the methods that can be applied to objects. In this example, the middle section represents the data member x, and the type of the data member is specified by placing a colon : followed by the name of the type. The methods in the Number class include the constructor Number along with the two other methods, a mutator setX and an accessor getX. Methods are denoted in the following format:

methodName(parameterName: parameterType): returnType

Notice that if there is no information being sent to the method, the inside of the parentheses will be empty, and if the method does not return a value, the returnType will not be included. In Fig. 2.31, the type of the return value is specified after the colon, similar to the type of the data members. The parameter list (a: int) for the method setX indicates that information is sent to the method and that the value of a, which is of type int, is assigned to the data member. By having an empty parameter list in the parentheses, the getX method does not accept any information and returns a value of type int, which is the value stored in the data member x.

Similar to contour diagrams, but not as detailed, UML notation can also be used to illustrate objects graphically. In the main method of Fig. 2.16, an object named num is instantiated from the class Number. Then the value 5 is assigned to the data member of the object num through a mutator method. The UML notation for the object after Line 7 is executed is shown in Fig. 2.32.

Fig. 2.32

UML notation for object num of the Number class

In the diagram, the top section gives the object name followed by the class name after the colon, all of which is underlined. The bottom section lists the data members. In this example, the variable x contains the value 5.

## 2.11 Complete Program: Implementing a Simple Class and Client Program

Combining all the material from this chapter, one can now define a simple class and use an instance of the class in a client program. In this section, a program to calculate the area of a circle will be developed.

Problem Statement: Write a program to calculate the area of a circle.

Once a problem statement is given, the requirements can be established by analyzing the problem. The program will:

* Accept a radius from the user

* Compute the area of the circle using the given radius

* Display the area

Next, some further issues can be considered. Since the area of more than one circle may need to be calculated, a class describing a circle should be defined separately from the main program. In the definition of a circle, only the value of the radius, which is the main characteristic of a circle, should be kept. In some circumstances where a calculation is very complex, it might be better to calculate the result just once and invoke a method to get the result each time it is needed, thus saving compute time.
But since the calculation for the area of the circle is not very complex, it can be computed at any time using the value of the radius, and it does not need to be stored in the object.

Having addressed some of the issues, the design of the application can proceed. The definition of the Circle class in UML notation is shown in Fig. 2.33.

Fig. 2.33

UML class diagram of the Circle class

According to the diagram, a Circle object has a data member radius of type double, shown in the middle section, which is the property that characterizes a circle. The behavior of an object is defined by the methods in the bottom section. The first method is a constructor, which performs the initialization of the data members when a new object is created. Each circle can assign a value to its radius by performing the setRadius method, invoke the computeArea method to return its area, and return the value of its radius using the getRadius method.

After the design phase comes the implementation phase. Figure 2.34 contains the code defining the class for a Circle object.

Fig. 2.34

Circle class

A client program to test the functionality of the Circle class is given in Fig. 2.35.

Fig. 2.35

A client program for the Circle class

When the above program is compiled and executed using the sample input of 2.0, the output of the program looks like this:

Enter the radius: 2.0

The area of the circle with a radius of 2.00 cm is 12.57 square cm.

In this example, an object circle was instantiated from the class Circle, and the user provided 2.0 for the value of the radius of the circle. The UML notation for the object circle is shown in Fig. 2.36.

Fig. 2.36

UML notation for the object circle of the Circle class

As before, the top section contains the object name circle followed by the class name Circle after the colon, all of which is underlined. The bottom section lists the data member radius of the object circle. In this example, the variable radius has a value of 2.0.
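Figures 2.34 and 2.35 are not reproduced here; the following sketch is consistent with the UML class diagram of Fig. 2.33 and the sample output above. The client class name CalcAreaCircle comes from Exercise 2 below, while the use of the Scanner class and the parameter name r are assumptions:

import java.util.Scanner;

class CalcAreaCircle {
    public static void main(String[] args) {
        Circle circle;
        double radius;
        Scanner scanner = new Scanner(System.in);
        circle = new Circle();
        // prompt for and input the radius, then store it in the object
        System.out.print("Enter the radius: ");
        radius = scanner.nextDouble();
        circle.setRadius(radius);
        // the area is computed on demand rather than stored in the object
        System.out.printf("The area of the circle with a radius of %.2f cm is %.2f square cm.%n",
                circle.getRadius(), circle.computeArea());
    }
}

class Circle {
    private double radius;

    public Circle() {                  // constructor: initializes the data member
        radius = 0.0;
    }

    public void setRadius(double r) {  // mutator
        radius = r;
    }

    public double computeArea() {      // value-returning method: the area is not stored
        return Math.PI * radius * radius;
    }

    public double getRadius() {        // accessor
        return radius;
    }
}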
## 2.12 Summary

* Remember that a class is like a definition, whereas an instance of a class is an object.

* Private data members and methods can only be accessed internally within an object of a class, whereas public data members and methods can be accessed both internally and externally.

* A value-returning method is used to return a value back to the invoking statement.

* It is best to use only one return statement in a value-returning method and also to place the return statement as the last statement in the method.

* A void method is usually used to set values in an object.

* Arguments in an invoking statement are used to send values to a method, and the corresponding parameters are used to receive those values within the method.

* Each time an object is created or a method is invoked, a corresponding contour should be drawn.

* The new instruction creates a new instance of a class, and the reference to the new instance is often assigned to a variable.

* A constructor is automatically invoked when the new instruction is executed and is often used to initialize data members. Remember that a constructor has the same name as the class and does not have a return type.

## 2.13 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Indicate whether the following statements using the Circle class in Fig. 2.34 in Sect. 2.11 are syntactically correct or incorrect. If incorrect, indicate what is wrong with the statement:

*A. Circle circle = new circle();

B. Circle circle
Circle = new Circle(5);

*C. circle.getRadius(); (Assume that an object circle has been declared and created correctly.)

D. circle.setRadius("two"); (Assume that an object circle has been declared and created correctly.)

E. circle.setRadius(); (Assume that an object circle has been declared and created correctly.)

2. Draw contour diagrams to show the state of execution prior to the following line numbers of the CalcAreaCircle class in Fig. 2.35 in Sect. 2.11:

A. Line 8

B. Line 12 (assume an input value of 2.0)

3. Draw contour diagrams to show the state of execution prior to Line 8 of the Invoke class in Fig. 2.21 in Sect. 2.9.

4. Answer questions A–D about the following declaration of the class Circle:

*A. Declare and create a variable of type Circle called innerCircle.

B. Write a statement using the setRadius method to change the value of innerCircle's data member, radius, to 10.0.

*C. Write a statement using the getRadius method to output the value of innerCircle's data member, radius, preceded by the phrase "The value of radius is ".

D. Write a statement using the computeCircumference method to output the value of innerCircle's circumference, preceded by the phrase "The value of the circumference is ".

5. Draw contour diagrams to show the state of execution prior to Line 11 of the class Multiple shown in Fig. 2.27 in Sect. 2.9.

6. Write a complete program to calculate the volumes of a cone and a hollow cylinder. The shape of a hollow cylinder is shown below, where r is the radius of the inner cylinder and R is the radius of the outer cylinder.

First, draw a UML diagram similar to Fig. 2.31 for a class named Cone as described below and then write the code to implement the Cone class:

*A. The Cone class has two private data members, radius and height, of type double.

B. Write code for a constructor to set the data members to default values of 0.0.

C. Write code for the accessor methods, getRadius and getHeight, that return the value of the appropriate data member.

*D. Write code for the mutator methods, setRadius and setHeight, that each have one formal parameter which is stored as the value of the data member.

E. Write a method named computeVolume to compute the volume of a cone and return the computed volume to the client. The formula to find the volume of a cone is πr²h/3.

Second, draw a UML diagram similar to Fig. 2.31 for a class named HollowCylinder as described below and then write the code to implement the HollowCylinder class:

F. The HollowCylinder class has three private data members, innerRadius, outerRadius, and height, of type double.

G. Write code for a constructor to set the data members to 0.0.

H. Write code for the accessor methods, getInnerRadius, getOuterRadius, and getHeight, that return the value of the appropriate data member.

I. Write code for the mutator methods, setInnerRadius, setOuterRadius, and setHeight, that each have one formal parameter which is stored as the value of the data member.

J. Write a method named computeVolume to compute the volume of a hollow cylinder and return the computed volume to the client. The formula to find the volume of a hollow cylinder is πh(R² − r²).

Third, write a client program to test the Cone and HollowCylinder classes as defined above. Name this class CalcVolume. The main method should perform the following tasks:
Allow the user to enter a radius of the cone.

L.

Allow the user to enter a height of the cone.

M.

Declare and create a Cone object setting the data members to the values entered by the user.

N.

Allow the user to enter an inner radius of the hollow cylinder.

O.

Allow the user to enter an outer radius of the hollow cylinder.

P.

Allow the user to enter a height of the hollow cylinder.

Q.

Declare and create a HollowCylinder object setting the data members to the values entered by the user.

R.

Output the phrase "The volume of the cone with a radius of XX cm and a height of XX cm is XX cubic cm.", where the XXs are the input values and the value returned from the method.

S.

Output the phrase "The volume of the hollow cylinder with an inner radius of XX cm, an outer radius of XX cm, and a height of XX cm is XX cubic cm.", where the XXs are the input values and the value returned from the method.

  * Here is some sample input and output:

Input for the cone

Enter the radius: 2.0

Enter the height: 3.0

Input for the hollow cylinder

Enter the inner radius: 2.0

Enter the outer radius: 4.0

Enter the height: 3.0

The volume of the cone with a radius of 2.00 cm and

a height of 3.00 cm is 12.57 cubic cm.

The volume of the hollow cylinder with an inner radius

of 2.00 cm, an outer radius of 4.00 cm, and

a height of 3.00 cm is 113.10 cubic cm.

  * Finally, draw a UML diagram similar to Fig. 2.32 for the objects created in the main method.

James T. Streib and Takako Soma, Guide to Java: A Concise Introduction to Programming, Undergraduate Topics in Computer Science, Springer, 2014. DOI 10.1007/978-1-4471-6317-6_3

© Springer-Verlag London 2014

# 3. Selection Structures

James T. Streib and Takako Soma

Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

Selection structures are explained in this chapter using flowcharts, pseudocode, and the corresponding Java code. The if-then, if-then-else, and nested if structures, including if-then-else-if and if-then-if structures, are introduced. The dangling-else problem is also discussed. Logical operators are presented followed by the introduction of the case structure. Two complete programs are provided: one with objects and one without.

## 3.1 Introduction

Chapter 1 showed how to perform input, arithmetic, and output, which are fundamental to many subsequent programs. Chapter 2 introduced elementary object-oriented programming, which allows programs to be designed using objects and methods. Although invoking a method causes a program to branch to another subprogram and this alters the flow of control, the order in which the methods are executed can be determined by examining the code to see the order in which they are invoked. In other words, each time the program is executed, it would have the same order of execution regardless of what was input. What gives software some of its power is the ability to alter the flow of control of a program, so that during different executions of the program with different input, it will behave in a different fashion. This ability is a result of a program being able to use control structures.

The word "structure" is a generic description of statements regardless of the programming language, whereas "statements" are the individual instructions, which can vary from language to language.
Control structures can alter the flow of control of a program and can be classified into two main groups: selection structures and iteration structures. Selection structures, sometimes also called decision structures, allow the program to take two or more different paths based on different conditions, whereas iteration structures, sometimes called repetition structures, allow a program to repeat a part of the code many times. In this chapter, various forms of the selection structures will be examined along with the associated Java statements.

## 3.2 If-Then Structure

The most basic of the selection structures is the if-then structure. If a particular condition is true, the then portion of the structure is executed; otherwise the then portion of the structure is not executed. It is very similar to natural languages, where one might say "If it is hot today, then I'll buy ice cream." If it was actually hot later in the day, then one would buy ice cream; otherwise one would not buy ice cream. Before looking at specific Java code for this example, it is helpful to look at a visual representation using a flowchart. There are many different types of flowcharts, where Fig. 3.1 shows the type of flowchart that will be used in this text.

Fig. 3.1

Flowchart representing an if-then structure

In Fig. 3.1, the diamond shape represents a selection structure and the arrows represent the flow of control. The arrow at the top represents entrance into the selection structure. The statement inside the diamond is a question, and its result is either true or false. The two labeled arrows exiting the diamond represent the flow of control should the condition be true or false. The true branch is known as the then branch, which contains a rectangle representing a statement, and there are no statements in the false branch. The rectangles can be used to hold various statements such as input, output, and assignment statements. In this example, the question is asked "Is it hot?", and if the answer is true, the then or true branch is taken and one would "Buy Ice Cream." Should the answer to the question be false, the false branch is taken and one does not buy ice cream.

However, the example shown in Fig. 3.1 is not very precise for writing a program. It is not clear what is classified as hot, so it might be better to specify a particular temperature. To make it easier to write a program, it would be best to use a variable such as temp for temperature, where temp would first need to be input. It could then be tested in an if-then structure. For example, if it is 90° Fahrenheit or above, the message "Buy Ice Cream" could be output. Although not necessary now, for convenience later a message indicating "End of Program" can also be output as shown in Fig. 3.2.

Fig. 3.2

Flowchart using the variable temp

Specifically, the flowchart in Fig. 3.2 first inputs the value of temp. Next it tests if the value in temp is greater than or equal to 90. If it is true, it outputs the message "Buy Ice Cream", and if it is false, it does not output the message "Buy Ice Cream". In either case, the flow of control continues on to the end of the if-then structure and the message "End of Program" is output.

The comparison between temp and 90 is known as a conditional expression, and the greater than or equal to symbol is known as a relational operator, which could be any of the relational operators that one has previously learned in mathematics.
For example, one could also say temp "greater than" 89, where 90 would still output the message "Buy Ice Cream", and a temp "equal to" 89 would not. However, what if the variable temp was of type double? Then, a temp of 89.5 would cause the message "Buy Ice Cream" to be output, and this might not be what was intended. As a result, it is a good idea not to change what is given and to implement what was originally intended.

Although flowcharts are good for visually depicting the logic of a program, sometimes they are cumbersome to draw. As an alternative to flowcharts, pseudocode can be used to create the logic for a program as discussed previously in Chap. 1. The above flowchart could be implemented in pseudocode as follows:

  * input temp
  * if temp ≥ 90 then
  *     output "Buy Ice Cream"
  * output "End of Program"

After temp is input, the word if indicates an if-then structure. The condition appears between the words if and then, and the word then is optional. If the condition is true, the indented statement immediately following the if is executed, and execution then proceeds to the statement after the then section. Note that the true section of the structure is indented to visually indicate the then section. If the condition is false, control branches or jumps over the indented then section and the last statement is executed.

Given the above flowchart and pseudocode, how could they be implemented in Java? The code would look as shown below:

System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90)
   System.out.println("Buy Ice Cream");
System.out.println("End of Program");

The input and output statements should look familiar from Chap. 1. What is new and different is the if-then statement. Note that there are parentheses around the conditional expression and the word then does not appear in the code. Although the word then does not and should not appear in Java, the true section of an if-then statement is still referred to as the then section. Also, just like the pseudocode, it is a good idea to indent the true or then section, but be aware that indenting the code does not affect the flow of control in the program. It is done as a courtesy for other programmers to help improve the readability and maintainability of the code.

Lastly, note that the ≥ symbol has been replaced with the >= symbols. This is because the mathematical symbol ≥ does not exist in the Java programming language and the >= symbols need to be used instead. As one might suspect, some of the other mathematical symbols do not exist in Java either, as indicated in Table 3.1.

Table 3.1

Relational symbols

Mathematical symbol | Java symbol

---|---

> | >

≥ | >=

< | <

≤ | <=

= | ==

≠ | !=

In addition to the "less than or equal to" symbols, notice the "equal to" symbol. Instead of a "single" equal sign, it is represented in Java as a "double" equal sign. The reason for this is to distinguish the check for equality == from the assignment symbol =. Using the wrong symbol is a common mistake for beginning Java programmers, so extra care must be taken when writing a conditional expression in a control structure. Although not as problematic as the "equal to" symbol, notice that the "not equal to" symbol is !=.

To illustrate a complete program that can be keyed into the computer to test the current if-then statement, see Fig. 3.3. This program can also be modified to test subsequent selection statements introduced in this chapter.

Fig. 3.3
Complete program using the if-then statement

It should further be pointed out that syntactically there can be only one statement in the then section of an if statement in Java. But if there can be only one statement in Java, how can more than one statement be placed in the then section? Taking a minute to think about it, a way this problem can be solved has already been presented in Chap. 2. Yes, multiple statements could be placed in a method, and then an invoke statement could be placed in the then section. However, if a method was not being used to solve this problem, how could more than one statement be put into the then section?

With flowcharts and pseudocode, there is no restriction to using only one statement as there is in Java. In a flowchart, additional boxes can be placed in the then branch, and each box represents a new statement. For example, in addition to the message "Buy Ice Cream", the message "Buy Lemonade" could be added as shown in Fig. 3.4.

Fig. 3.4

Flowchart with two statements in the then section

In pseudocode, if more than one statement is needed in the then section, it is simply inserted and indented to visually indicate to the reader that the additional statements are part of the then section and do not belong after the then section, such as in the following:

  * input temp
  * if temp ≥ 90 then
  *     output "Buy Ice Cream"
  *     output "Buy Lemonade"
  * output "End of Program"

However, if one attempted to write the above pseudocode in Java as follows, there would be a logic error:

// *** Caution: Incorrectly Implemented Code ***
System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90)
   System.out.println("Buy Ice Cream");
   System.out.println("Buy Lemonade");
System.out.println("End of Program");

Although this might look correct, it is a common error made by beginning programmers. By merely moving the "Buy Lemonade" statement to the left as shown below, there is no change in the logic of the segment, and the true flow of control is made more obvious, where the "Buy Lemonade" message is output regardless of the temperature:

// *** Caution: Incorrectly Implemented Code ***
System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90)
   System.out.println("Buy Ice Cream");
System.out.println("Buy Lemonade"); // <--- Unindented
System.out.println("End of Program");

As stated previously, the indentation of the code does not affect the flow of control of the program in Java. So how does one indicate that there is more than one line of code in the then section? The answer is through the use of a compound statement. A compound statement is indicated by the use of opening and closing braces, { and }. For example, the above pseudocode would be correctly implemented as follows:

// *** Correctly Implemented Code ***
System.out.print("Enter a temperature: ");
temp=scanner.nextInt();
if(temp >= 90) {
   System.out.println("Buy Ice Cream");
   System.out.println("Buy Lemonade");
}
System.out.println("End of Program");

The compiler sees the compound statement, which allows more than one statement to be in the then section. Although syntactically to the compiler there is still only one statement, specifically the compound statement, there are now logically two statements in the then section.
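For reference, here is a minimal complete program along the lines of Fig. 3.3, extended with the compound then section; this is only a sketch, and the class name IceCream is an assumption:

import java.util.Scanner;

public class IceCream {
   public static void main(String[] args) {
      // create a Scanner to read the temperature from the keyboard
      Scanner scanner = new Scanner(System.in);
      int temp;
      System.out.print("Enter a temperature: ");
      temp = scanner.nextInt();
      // compound statement: both messages belong to the then section
      if(temp >= 90) {
         System.out.println("Buy Ice Cream");
         System.out.println("Buy Lemonade");
      }
      System.out.println("End of Program");
   }
}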
Notice that the opening brace appears just after the closing parenthesis of the conditional expression and the closing brace lines up with the if statement. Although there are a number of other styles, this text will use the style shown above. However, should one's instructor or place of employment use a different style, be sure to follow it.

## 3.3 If-Then-Else Structure

The if-then structure is helpful when there is something that needs to be done in addition to the normal flow of control. However, what if one wanted to have a program do one thing in one case and another thing in an alternative case? Using a new example, assume that if the number of credit hours input, using the variable credits, is 120 or greater, the program should output the message "Graduate"; otherwise the program should output "Does not graduate".

Is it possible to solve this problem using only if-then structures? The answer is yes, by using two if-then structures in the pseudocode that follows:

  * if credits ≥ 120 then
  *     output "Graduate"
  * if credits < 120 then
  *     output "Does not graduate"

Although this solution works, the problem with this method is that it has to ask two questions. For example, if the number of credit hours is equal to 120, then the message "Graduate" would be output. However, even though the message has already been output, the code still needs to check to see if the number of credit hours is less than 120 and branch around the output "Does not graduate" message. It should be clear that if one of the options is true, the other one is false, so there is no need to check the opposite condition. This can be accomplished with the use of the if-then-else structure. An example of the flowchart for this scenario is shown in Fig. 3.5.

Fig. 3.5

If-then-else structure

Note that unlike the flowchart in the previous section, the false section is no longer empty. Instead, it contains a box to output the message "Does not graduate". The false section of the flowchart is also called the else section. The pseudocode for this flowchart is shown below:

  * input credits
  * if credits ≥ 120 then
  *     output "Graduate"
  * else
  *     output "Does not graduate"
  * output "End of Program"

Notice that the word else lines up with the word if and that the else section of the pseudocode lines up with the then section. The Java code to implement the pseudocode is as follows:

System.out.print("Enter the credit hours: ");
credits=scanner.nextInt();
if(credits >= 120)
   System.out.println("Graduate");
else
   System.out.println("Does not graduate");
System.out.println("End of Program");

As with the pseudocode, notice that the word if and the word else line up and the then and else sections line up. What if there needs to be more than one statement in either the then or else sections? As before with the if-then statement, a compound statement must be used.

It is possible to reverse the above then and else sections, but one needs to be cautious and reverse the conditional expression correctly. What is the opposite of greater than or equal to? Be careful, it is not less than or equal to. If one used less than or equal to, then those students who had exactly 120 credit hours would be listed as not graduating, much to their dismay!
Instead, the opposite of greater than or equal to is simply less than, as shown below:

System.out.print("Enter the credit hours: ");
credits=scanner.nextInt();
if(credits < 120)
   System.out.println("Does not graduate");
else
   System.out.println("Graduate");
System.out.println("End of Program");

Although the above code performs identically to the previous code, why should one be chosen over the other? Unless there is a compelling reason to do otherwise, such as when the original description is unduly confusing, it is usually better to write the code to follow the original specifications as given. However, if either way is acceptable, then code is often written to have the most common occurrence in the then section and the exception in the else section. In the above example, most seniors will probably have 120 credit hours or more at graduation, so using the original code segment is probably the best choice.

When writing if-then structures, it is important to write them so that they not only work correctly but are also efficient in terms of memory utilization. For example, consider the following code segment:

if (a > 0) {
   b = b + 1;
   a = a - b;
   c = c + a;
}
else {
   b = b + 1;
   a = a + b;
   c = c + a;
}

Note that the first and last statements in both the then and else sections are the same. The only statement that is different between the two is the middle statement in each segment. Given that the other statements are the same, why are they duplicated in the then and else sections? The answer is that they should not be, and they can be moved. Not only do they take up more memory, they also present a possible problem when someone attempts to modify the code, where a programmer might accidentally modify a statement in one section and fail to modify the corresponding statement in the other section, which might lead to a subsequent logic error. Although this does not appear to present as much of a problem here in a small code segment, it could be much more serious in larger code segments.

If the duplicate statements are to be consolidated and moved, where should they be relocated? By examining the above code segment, the variable b, modified in the first statement in each segment, is used by the second statement, so that statement should be moved prior to the if statement. Similarly, the variable a used in the last statement is modified by the middle statement, so the last statement should be relocated after the if-then-else statement. In other words, care must be used to ensure that the logic is not altered when moving statements to optimize an if-then-else statement, or any code segment for that matter. Below is the modified code segment that clearly is less cluttered without the braces, uses less memory, and would be easier to modify in the future. The result is that once one has written code that works correctly, one should take the time to ensure that it is also well-written code.

b = b + 1;
if(a > 0)
   a = a - b;
else
   a = a + b;
c = c + a;

Note further that it is also possible to write an if-then structure as an if-then-else with either an empty else section or an empty then section. In both cases, leaving an empty else or then section in Java requires a semicolon in either section, which might lead subsequent programmers to wonder what might have been accidentally left out. Unless there is intent to fill in the empty section in the immediate future, it is best to just write the code simply as an if-then.
If code is written with an empty else section, the else section should be removed. In the case of an empty then section, it is usually best to carefully reverse the conditional expression and again write the code as an if-then. + +## 3.4 Nested If Structures + +If there is only one selection, the if-then is the best choice, and should there be two selections, the if-then-else structure is the obvious choice. But what if there are three or more choices? Sure, a series of if-then structures could be used, but although this "works," it is a very inefficient solution as discussed in the previous section. Instead, a series of if-then-else structures could be nested. There are two ways if-then-else structures can be nested: the subsequent if-then-else statements could be nested in the else section or in the then section of the previous if-then-else. The first form of nesting is called an if-then-else-if structure and the second is called an if-then-if structure. Note that there are no Java statements that correspond to each of these two structures, but rather they can be created fairly easily from a pair of if-then-else statements. Of the two, the former tends to be used more often and will be discussed first. + +### 3.4.1 If-Then-Else-If Structure + +As mentioned above, an if-then-else-if structure is created when an if-then-else is nested in the else section of an if-then-else. Using a new example, assume that the temperature is input in degrees Celsius and messages are to be output as to whether water is in the form of steam, water, or ice. At 100° or greater, water is in the form of steam, and at 0° or less, it is in the form of ice; otherwise it is in its liquid state. As before, it is helpful to view the structure in the form of a flowchart as shown in Fig. 3.6. + +Fig. 3.6 + +Nested if-then-else-if structure + +Notice that the second if statement appears in the else section of the first statement. The dotted lines are not part of the flowchart, but rather are included to help one see that the inner if-then-else is contained in the else section of the outer if-then-else. If the first condition is true, the message "Steam" is output and no further testing is necessary. If the first condition is false, then further testing occurs in the nested if-then-else structure. Given the flowchart in Fig. 3.6, the corresponding pseudocode would appear as follows: + +As with the flowchart, the dashed lines are not part of the pseudocode. Rather, they are included to allow one to see how the inner if-then-else structure is nested in the else portion of the outer if-then-else structure. In particular, note that the nested if and else line up with the output statement in the then section of the outer if-then-else structure. Again, if the first condition is true, the then section is executed and no further testing occurs, but if the first condition is false, the nested if is executed. + +As would be expected, the Java code looks very similar: + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp >= 100) + +System.out.println("Steam"); + +else + +if(temp > 0) + +System.out.println("Water"); + +else + +System.out.println("Ice"); + +System.out.println("End of Program"); + +The dashed lines are not included in the Java code so that one can concentrate on the indentation and the syntax. As with the pseudocode, note how the inner if and else line up with the System.out.println statement in the then section of the outer if statement. 
Since there appears to be more than one statement in the else section of the outer if-then-else structure, does there need to be a pair of braces, { and }, in that section? In other words, does a compound statement need to be used? The answer is no, because an if-then-else statement is syntactically considered to be a single statement. Although it would not cause a syntax error to include the braces, it could cause some programmers to wonder if a second statement was forgotten and not included. Some instructors might not care whether the extra pair of braces is included, but this text will omit them to help the reader get used to this programming style.

Does it matter which test is first? If all the groups are equal, then the answer is no. However, if one of the groups occurs more frequently, then it would be best to put it first so that fewer tests would need to be done. This is especially true when an if statement is inside an iteration structure as will be seen in Chap. 4. What if the middle section occurs more often? This could prove to be a problem at this point, but it will be discussed further in Sect. 3.5 on logical operators.

### 3.4.2 If-Then-If Structure

Since it is possible to nest an if-then-else structure in the else section of an outer if-then-else structure, is it possible to nest an if-then-else structure in the then section of an outer if-then-else structure? The answer is yes, and this type of structure is called an if-then-if structure. Again, there is no Java statement called an if-then-if, but rather this name merely indicates in which section the subsequent if-then-else is nested. The flowchart for an if-then-if that implements the example from the previous section is shown in Fig. 3.7.

Fig. 3.7

Nested if-then-if structure

As before, the dashed lines are not part of the flowchart but help indicate how the if-then-else is nested in the then section of the outer if-then-else. In particular, notice how the relational expression in the first if is changed from ≥100 to >0. The reason is that previously, when temp was 100 or greater, the then section would be executed and the message "Steam" would be output. However, with the if-then-if structure, the then section now contains a nested if and has two groups that need to be further subdivided. The relational expression in the outer if structure is changed to >0, so when temp is zero or less than zero, execution proceeds to the else section. As discussed previously in Sect. 3.2, be careful to write the relational expression properly; otherwise a logic error could occur. After checking for a temperature greater than zero, the nested if checks whether the temperature is greater than or equal to 100, and if so, the message "Steam" is output; otherwise the message "Water" is output. As before, the pseudocode for the nested if-then-if can be found below:

Notice the nested if-then-else in the then section of the outer if-then-else and note the level of indentation. As should be expected, the Java code follows. Again pay attention to the indentation and the absence of braces:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0)
   if(temp >= 100)
      System.out.println("Steam");
   else
      System.out.println("Water");
else
   System.out.println("Ice");
System.out.println("End of Program");

Since the if-then-else-if and the if-then-if structures can perform the same tasks, which is the better choice? In one sense it depends on the circumstances.
If the original specifications are written in such a fashion as to make it easier to implement with one or the other structure, then the most appropriate structure should be used. However, often the original specifications are written in a way that is easier to communicate to other users and programmers, and this tends to be in an if-then-else-if fashion. For example, assume there were equal numbers of coins of different denominations and someone wanted to remove all of the one cent pieces. Ordinarily a person would not try to remove all the other coins to leave only the one cent coins; instead it would be easier to merely remove the one cent coins. If there were subsequent coins to be removed, such as the five cent pieces, they would be the next to be removed, and so on.

This is similar to the previous example, where instead of checking for temperatures above freezing and then checking for temperatures that produce steam or water, it is more natural to check first for the temperatures that are greater than or equal to 100°. In other words, the if-then-else-if structure is often chosen over the if-then-if structure because that is the way people often speak and tend to write specifications. Further, it is helpful to have the program written similarly to the specifications to assist other programmers who might be maintaining and modifying the program in the future. There is yet another reason why the if-then-else structure is used more often than the if-then-if, as discussed in the next section.

### 3.4.3 Dangling Else Problem

The if-then-if structure also suffers from an occasional problem due to the nature of the Java syntax. For example, assume that one wanted to modify the previous temperature example to implement the flowchart in Fig. 3.8, in which only the messages "Steam" and "Ice" are to be output.

Fig. 3.8

"Ice" or "Steam" flowchart

The flowchart can also be implemented as shown in the following pseudocode:

In both cases, what is intended is that if the temperature is greater than or equal to 100, then the first and second if statements are true and the message "Steam" is output. If the temperature is 0 or less than 0, the first if is false and the message "Ice" is output. However, if the temperature is greater than 0 but less than 100, then the first if statement would be true and the second if would be false, and since there is no code in the else section of the second if, no message is output. It would appear that the code for the above could be implemented as follows:

// *** Caution: Incorrectly Implemented Code ***
System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0)
   if(temp >= 100)
      System.out.println("Steam");
else
   System.out.println("Ice");
System.out.println("End of Program");

However, what appears to be correctly implemented code is not accurately implementing the logic from the flowchart and pseudocode. If the pseudocode follows from the flowchart, and the code follows from the pseudocode, how can this be? The problem is that the pseudocode is relying on indentation to indicate which parts belong in the then and else sections, but recall from Sect. 3.2 that indentation does not affect the flow of control in Java, or in most languages for that matter. This is known as the "dangling else" problem. It might not be clear with which if statement the else is paired.
If the above code segment has the else and the subsequent System.out.println indented, note that the code presents itself entirely differently: + +// *** Caution: Incorrectly Implemented Code *** + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp > 0) + +if(temp >= 100) + +System.out.println("Steam"); + +else // Indented - - - -> + +System.out.println("Ice"); // Indented - - - -> + +System.out.println("End of Program"); + +Instead of the else appearing to belong to the outer if, it now seems to belong to the inner if statement. If indenting doesn't affect the flow of control, which of the above two code segments is correct? The answer is neither, but the second one more accurately represents the flow of control, because an else is always matched with the closest if statement. The result is that the flowchart for the above code segment is as shown in Fig. 3.9. + +Fig. 3.9 + +Flowchart representing the "Dangling Else" problem + +If temp is less than or equal to 0, then nothing is output, and if the temperature is greater than 0, but less than 100, then the message "Ice" is output, which is clearly incorrect. Although indenting is a useful way of indicating flow of control in pseudocode, it is only useful in illustrating the flow of control in Java when it is done properly. If indenting will not help correct the above problem, what can be done to correct the code? There are a couple of solutions. One is to include braces to force the else to match up with the outer if instead of the inner if as shown below: + +// *** Correctly Implemented Code *** + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp > 0) { + +if(temp >= 100) + +System.out.println("Steam"); + +} + +else + +System.out.println("Ice"); + +System.out.println("End of Program"); + +Note that in addition to the braces, the else is moved to the left to line up with the outer if to improve readability. But doesn't the inclusion of braces contradict the suggestion from Sect. 3.2 to not use braces for a single statement and use them only when they are necessary? No, not in this case, because although the if-then structure in Java is only a single statement, the braces are necessary in this case to force the else to match with the proper if statement. + +In fact, some might suggest that braces should always be used to avoid a special case such as this. However, it seems somewhat counterintuitive to use braces everywhere for only a single potential error, since too many braces might clutter up a program and hurt the overall readability. There is another solution and that is to generally avoid the use of the if-then-if structure and instead primarily use the if-then-else-if structure, which does not suffer from this problem, as shown below: + +// *** Correctly Implemented Code *** + +System.out.print("Enter the temperature: "); + +temp=scanner.nextInt(); + +if(temp >= 100) + +System.out.println("Steam"); + +else + +if(temp <= 0) + +System.out.println("Ice"); + +System.out.println("End of Program"); + +Again, does this mean one should never use the if-then-if structure? No, as mentioned previously use the if-then-if structure only when the nature of the problem lends itself to its usage and use extra caution to ensure that the code written actually implements the intended logic. Further, an example of the use of the if-then-if structure is shown in the next two sections. 
However, it might appear that the initial cause of the above problem results from the indentation used in the previous pseudocode. Does this mean that one should not rely on indentation when writing pseudocode and braces should be used to help indicate nesting? The answer is largely left up to the individual, the instructor of a class, or the standards in a company. As long as one is aware of the potential problem, indentation can be used in pseudocode to indicate the flow of control. Also, if one wants to ensure that a mistake does not occur in writing subsequent Java from the pseudocode, then the inclusion of braces in the above instance would provide extra insurance that the pseudocode is not accidentally implemented incorrectly. However, this text will not use braces in pseudocode to save space and help the reader better understand the potential problems.

## 3.5 Logical Operators

Although nested if statements are very useful in the circumstances discussed in the previous section, there are techniques that can make them even more useful. For example, assume that the message needing to be output is the opposite of the example presented in the previous section. If the temperature is greater than 0° and less than 100°, only the message "Water" needs to be output. This could be done with either an if-then-if structure or an if-then-else-if structure, where the former is shown below:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0)
   if(temp < 100)
      System.out.println("Water");
System.out.println("End of Program");

However, does the use of an if-then-if above go against the suggestion in the previous section to use the if-then-else-if? No, not really, because this is one of those cases that lends itself better to the use of the if-then-if. The use of an if-then-else-if would result in an empty then section, which should be avoided as discussed in Sect. 3.3 and as shown below:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp >= 100);
else
   if(temp > 0)
      System.out.println("Water");
System.out.println("End of Program");

Note the semicolon at the end of the first if statement indicating an empty then section, which can be quite confusing. Clearly in this instance the if-then-if structure is a better solution than the if-then-else-if structure. However, by using logic there is an even better solution to this problem, and before presenting the solution, it is best to look over the fundamentals of logic operations.

Logical operators are also known as Boolean operators, which are named after George Boole (1815–1864), an English-born mathematician and logician. The results of Boolean operations are the values true or false, which can be stored in variables of type boolean as shown below:

boolean flag;
flag = true;

Further, any relational or logic operation can be assigned to a boolean variable, and that variable can be used subsequently in an if statement. Although not used as often, it is sometimes helpful to evaluate a relation in one part of a program, store the result in a boolean variable (often called a flag and coded as flag), and then test the flag later in another part of the program.
The result is that both of the following code segments are equivalent: + +if(x == 0) + +System.out.println("x equals 0"); + +flag = x == 0; + +if(flag) + +System.out.println("x equals 0"); + +Although at first the assignment statement of the second segment might look a little strange, if one thinks about it for a minute, the comparison of x == 0 results in either true or false. The true or false is then assigned to the boolean variable flag. Lastly, when the if statement is executed, should the value in flag be true, the then portion of the if is executed. Otherwise the value in flag is false, the then portion is skipped, and any statement that might follow is executed. In the second instance, does the variable flag need to be compared to the Boolean values of true or false? The answer is no, because the variable flag is of type boolean and already contains either the value true or false, so the comparison is unnecessary. Although the first example is more common, again the second is useful to set a flag in one part of a program and test it in another part of a program. + +Continuing, there are three fundamental logic operations called and, or, and not. The first of these three has a value of true when both conditions are true. For example, a graduation requirement for a major in computer science might include that a student takes both a course in calculus and discrete mathematics. If one takes one course but not the other, or takes neither course, then the major will not be complete. This can be represented in the form of a truth table, where all the possible combinations of the two courses are listed on the left side and the result of the and operation is listed on the right in Table 3.2. The variables c and d are used to represent the calculus and discrete mathematics courses, respectively, and the letters T and F are used to represent the values true and false, respectively. Note that result is true only when both c and d are true. + +Table 3.2 + +Truth table for the and operation + +c | d | c and d + +---|---|--- + +F | F | F + +F | T | F + +T | F | F + +T | T | T + +As an example of the or operation, suppose that in order to complete a major in computer science a student must take one of two electives, such as a course in artificial intelligence or a course in computer graphics. If a student takes one course or the other, then the student has fulfilled the requirement. But what if both courses are taken? In the case of the or operation under consideration here, known as an inclusive-or, the results are true when one or the other, or both are true. The result is that a student would have also fulfilled the requirement if both courses were taken. On the other hand, an exclusive-or is true when only one or the other is true, but not both. Although some other languages have both types of or operators, Java only has the inclusive-or as illustrated in the truth table in Table 3.3, where the letter a represents artificial intelligence and the letter c represents computer graphics. As can be seen, if either a or c is true, or both are true, the result is true. If neither is true, the result is false. + +Table 3.3 + +Truth table for the or operation + +a | c | a or c + +---|---|--- + +F | F | F + +F | T | T + +T | F | T + +T | T | T + +The last of the logic operators is the not operator, which when applied to something that is true initially, the result is false and vice versa. 
For example, if one has taken an introduction to computer science course, then the result is true, but if one has not taken the course, the result is false. In Table 3.4 the letter c represents the introduction to computer science course. Since there is only one variable, there are only two entries in the truth table. In fact, to determine the number of entries needed in a truth table, just count the number of variables in the expression and raise 2 to that power. For example, if there were three variables in a logical expression, how many entries would be needed? The answer is 2 raised to the 3rd power, which is equal to 8.

Table 3.4

Truth table for the not operation

c | not c

---|---

F | T

T | F

In Java the and, or, and not operations are represented using the &&, ||, and ! symbols, as shown in Table 3.5.

Table 3.5

Logic operations and Java symbols

Logic operation | Java symbol

---|---

and | &&

or | ||

not | !

Using this information, how can the if-then-if structure presented at the beginning of this section be simplified? Instead of checking first whether temp is greater than 0 and subsequently checking whether temp is less than 100, it would make sense to use the and operation. Although it would be nice to use a range such as 0 < temp < 100 as done in mathematics, note that this would cause a syntax error in Java. Instead, the relation must be written with two separate comparisons, each using the variable temp, as in temp > 0 && temp < 100. The previous if-then-if structure can now be written as follows:

System.out.print("Enter the temperature: ");
temp=scanner.nextInt();
if(temp > 0 && temp < 100)
   System.out.println("Water");
System.out.println("End of Program");

Could the above if statement have been written as if(temp >= 1 && temp <= 99)? Given that the variable temp is of type int in the past couple of examples, the answer is yes. However, what if the variable temp was a double? Then, a temperature such as 0.5° would not be output as "Water," which would be incorrect. Again, as discussed previously in Sect. 3.3, it is usually better to write a program with the proper endpoints and relations, even when programming with integers, to help prevent a possible future logic error should the program be modified later.

Although the basic operations of logic are fairly simple, expressions can become quite complex as the number of operations increases, so extra care must be taken when creating Boolean expressions. For example, suppose someone had originally coded the following if statement with an empty then section to check for a correct battery voltage in order for a system to operate correctly. Further, suppose that one wanted to convert the if-else structure to an if-then structure; how could that be accomplished?

if(voltage < 10.5 || voltage > 14.0);
else
   System.out.println("Correct Voltage");

The message needs to be moved from the else section to the then section. In other words, the message should be output when the condition is true, not when it is false. The simplest way to convert the condition is to add a not operator in front of the conditional expression and remember to remove the semicolon from the end of the if statement, as follows:

if(!(voltage < 10.5 || voltage > 14.0))
   System.out.println("Correct Voltage");

However, one must be careful with the not, because just as arithmetic operators have precedence rules, so too do logical operators.
The not operator has the highest priority, the and operator has the second highest priority, and the or operator has the lowest priority. Further, just as with arithmetic operators, when there is a tie between two operators, the order is from left to right, and parentheses can be used to override any precedence rules, where the expression in the innermost nested parentheses is evaluated first. The order of precedence for logical operators is summarized in Table 3.6.

Table 3.6

Logical operator precedence

Operator | Precedence

---|---

innermost nested ( ) | Highest

! |

&& |

|| |

Tie – left to right | Lowest

As a result, note that when the not is added, there is a set of parentheses around the original logical operator and its operands from the previous if statement, because without them the result would be different. A truth table is a convenient way to prove that the two are different. To simplify the above relations, the Boolean variables a and b are used in the truth table below:

a | b | !a | a || b | !a || b | !(a || b)

---|---|---|---|---|---

F | F | T | F | T | T

F | T | T | T | T | F

T | F | F | T | F | F

T | T | F | T | T | F

Notice that the intermediate columns are shown to help ensure that there are no mistakes, or if one is made, it is easy to see where it occurred. Further, note that comparing the last two columns shows that !a || b is not equal to !(a || b). Specifically, the values in the second and fourth lines down are not equal, and although the other two are equal, it takes only one instance to prove that the two expressions are not equivalent. Further, something like this might be difficult to catch when testing a program. If these particular instances are not tested, a program could subsequently have a logic error and no error message would be generated.

Returning to the if statement, what if one didn't want to have the not symbol in the if statement? Could it be rewritten without the ! symbol? The answer is yes, but again one must be careful when changing a logical expression. Similar to what can be done in arithmetic with a minus sign, the not symbol can be distributed over the terms in the parentheses. Although similar, it is different from arithmetic, and De Morgan's laws must be used, which were formulated by Augustus De Morgan (1806–1871), a British mathematician and logician. Simply stated, if a not is distributed over either an and operator or an or operator, the operands must be complemented. Further, the operators must be changed to an or operator or an and operator, respectively. To help understand these laws better, they are listed in Table 3.7.

Table 3.7

De Morgan's laws

not (a and b) = not a or not b

not (a or b) = not a and not b

To show that the laws are indeed correct, a truth table can be used to prove that they are equal using the techniques shown above, and this is left as an exercise at the end of the chapter. To show how De Morgan's laws can be used in Java in the previous if statement, first the ! symbol is distributed over the operands and then the || operator is changed to an && operator as shown below:

if(!(voltage < 10.5) && !(voltage > 14.0))
   System.out.println("Correct Voltage");

Since there are now two not symbols, the relations can be changed to their opposites, thus eliminating the need for the two not symbols. Of course, one has to be careful to reverse the relationals correctly as has been discussed previously. The final if statement without the !
symbols is shown below: + +if(voltage >= 10.5 && voltage <= 14.0) + +System.out.println("Correct Voltage"); + +Given some of the potential problems above, if a code segment can be written without using logical operators, then generally it is better to do so to avoid the added complexity and the potential for errors. When creating nested if structures, it is helpful not to have the first if contain a logical operator and instead rewrite the if structure to use a simple expression first. For example, in a code segment concerning temperatures, instead of starting with the water range and using an and operator, it is better to start with the steam or ice range which do not require a logical operator. + +Another potential complexity often occurs when some beginning programmers feel compelled to include a logical operator on subsequent if statements. However, this is often unnecessary as shown previously in the temperature example where the first if checks for temperatures of 100° and above. Since the higher temperatures have already been removed by the first if statement, it is not necessary to include the logical operators in the subsequent if statement to check whether the temperatures are below 100°. As a general rule, if the logical operators are necessary or they help to reduce the number of if statements, then they should be included. However, if the code can be written without the use of logical operators, it is best not to include them. An example of when to use or not use logical operators can be found at the beginning of the next section. + +As one writes logical operators with conditional expressions as operands, care must also be taken which conditional expression comes first. For example, the following code segment checks to make sure that i is not equal to 0 and that the results of the division operation are positive before outputting a message. What would happen if both i and total contained a 0? + +if(i != 0 && total / i >= 0 ) + +System.out.println("The average is positive"); + +Since i is equal to 0, the result of the first operand is false. However, does it matter what the results of the second operand are? Since false && false is false and false && true is also false, there is no need to check the second operand. This averts the division by zero error and the then portion of the if statement would not be executed. This is known as a short circuit, where if the first operand of an && operation is false, there is no need to check the second operand. + +So given the above, what would happen if the operands were reversed as follows and the value i and total were still 0? + +if(total / i >= 0 && i != 0) + +System.out.println("The average is positive"); + +At first, it seems to be okay because the if statement is still checking to see if i is not equal to 0. However, although both tests are included in the if statement, recall from the discussion above that the operand on the left is evaluated first. Further if i was not equal to 0, there would not be a problem, but in the instance where i is equal to 0, there would be a division by zero error before the comparison of i to 0 in the second operand. + +A similar problem can occur with the || operator, where if the first operand is true, there is no need to check the second operand. The reason this occurs with both the && and || operators is the result of the underlying machine language generated by the compiler and the interpreter. For a further explanation, see Guide to Assembly Language: A Concise Introduction [4]. 
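A small sketch makes the difference in operand order concrete; the class name ShortCircuit is an assumption. With i equal to 0, the first test below runs safely, while the reversed version, left commented out, would throw an ArithmeticException:

public class ShortCircuit {
   public static void main(String[] args) {
      int i = 0;
      int total = 0;
      // Safe order: i != 0 is false, so the right operand total / i
      // is never evaluated and no division by zero occurs
      if(i != 0 && total / i >= 0)
         System.out.println("The average is positive");
      // Unsafe order: the left operand is evaluated first, so this
      // version would divide by zero when i is 0
      // if(total / i >= 0 && i != 0)
      //    System.out.println("The average is positive");
      System.out.println("End of Program");
   }
}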
Although this short circuit evaluation of statements can be helpful in some instances, it can cause a problem if one is not careful with the order of the operands. So when writing logical operators, in addition to being careful with the precedence of logical operators and De Morgan's laws, one should also be careful with the order of the operands. + +## 3.6 Case Structure + +As can be imagined, if the number of nested if statements becomes too deep, the resulting code might be difficult to read and maintain. For example, consider when a student's quiz score is input and a message is output indicating how well the student performed as implemented in the following code segment: + +Notice the use of an or operation in the first if statement to test for a score of either 9 or 10 and the output of the message "Very Good". Note that an and operator could have been used instead as in if(score >= 9 && score <= 10), but since the range is only two integers, it is probably better represented using an or operator. However, with the last if statement above, it is easier to use the and operator to test for the range of numbers instead of listing out each of the possibilities. Lastly, notice that if the score does not fall between 0 and 10 inclusive, then a message is output indicating that it is an invalid quiz score. + +Although the above code segment works, what if there were more levels of scores to check and corresponding messages to be output? The level of indentation could become quite ungainly and the code might become more difficult to read and modify. Luckily, most languages have what is known as a case structure to help with these situations. In Java this structure is known as the switch statement. A switch statement is like a multi-way if statement. The contents of a simple variable or the result of an expression causes the flow of control to branch to one of the many particular cases, and the corresponding code is then executed. The above nested if-then-else-if structure can be implemented using a switch statement as follows: + +The first thing to be aware of is that the variable score cannot be of type double or float. Although it is possible to use typecast operators with these types, in these instances the use of nested if structures might be a better choice. This is one of the drawbacks of the switch statement, where typically only variables or expressions of type int and char can be used. The second thing to note in the switch statement is that the variable score is not part of a relational expression (using >, >=, etc.) as it can be in an if statement. Instead, the contents of the variable score are compared with each of the case statements that follow. If a match is found, then control is transferred to the corresponding case statement, and the code that follows is executed. For example, if the value in the variable score is a 10, then control is transferred to case 10: and the code that follows is executed. As mentioned above, an expression can be used instead of a variable, and an example of this follows later. + +Syntactically, there is one set of braces which indicate the beginning and end of the entire switch statement; however, note that there are no braces in each of the individual case sections even when there is more than one statement. The reason for this is that at the end of each case section, a break statement is included. The use of the break statement causes the flow of control to be transferred to the end of the switch statement. 
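As a sketch of what such a switch statement could look like: the case 10, case 9, and default sections follow the description above, while the messages for the lower scores are placeholder assumptions, since only "Very Good" and the invalid score message are specified:

switch(score) {
   case 10:
   case 9:
      System.out.println("Very Good");
      break;
   case 8:
   case 7:
      System.out.println("Good");                // placeholder message
      break;
   case 6:
   case 5:
      System.out.println("Average");             // placeholder message
      break;
   case 4:
   case 3:
   case 2:
   case 1:
   case 0:
      System.out.println("Below Average");       // placeholder message
      break;
   default:
      System.out.println("Invalid quiz score");  // placeholder wording
}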
Without it, the flow of control would fall through to the code that follows the next case statement. Although it is legal to write code that does not use a break statement, the need to do so is very rare and is considered to be of poor programming style. Doing so usually makes code difficult to debug or modify and should be avoided. + +The last section of the switch statement is the default statement, which is executed when a matching case is not found. Although a default can be placed anywhere within the switch statement, it is typically placed at the end of the switch statement. It should be noted that switch statements are not required to have a default statement. However, if a switch statement does not have a default statement and the particular value is not found in the cases given, then nothing will be executed in the switch statement, and in the previous example, nothing would be output. Although this might be what was intended, a value that is not part of the data to be processed might cause a logic error later on in the program. As a result, default statements are usually included as a precautionary measure. + +Notice that the default case does not have a break statement. If there were no default statement, then the last case section would not need to have a break statement either. The reason is that upon completion of executing the code in the last case or default, the flow of control will simply fall through to the next statement following the switch statement. Although a break statement could be included, it is not necessary and will not be included in this text. + +With respect to indenting, there are a number of styles that can be followed, but typically the individual case statements are indented three spaces, and the code in each section is lined up after the colons. Again, should one's instructor or place of employment have a different style, be sure to follow that style. + +Also, note that each of the individual possible values of the variable score has its own case statement. Unfortunately a relation cannot be used in the case statements and this is another of the switch statement's drawbacks. However, there are on occasion a few ways around this limitation as will be seen later. + +For example, instead of having quiz scores of 10 through 0, what if the variable score was used to hold an exam score from 100 through 0, where a score of 100 through 90 inclusive was to output a message "Very Good", 89 through 80 was to output the message "Good", and so on? For a nested if structure, the solution is fairly simple. Instead of just checking for one or two integers as in the previous nested if structure, it could be modified to check for a range of integers using an and logical operator as in the following segment: + +Note that each if statement has an && to check for a range of values. However, wasn't it suggested in Sect. 3.5 to avoid this? Yes it was, but in previous examples, such as the temperature example, there were no upper and lower bounds, but in this case there are the bounds of 0 and 100. Although it appears necessary to include a range in each if statement in this example, is there a way that it could be rewritten to avoid having to include an and operator in every if statement? The answer is yes, where an extra if statement can be placed prior to the other if statements. 
The first thing to be aware of is that the variable score cannot be of type double or float. Although it is possible to use typecast operators with these types, in these instances the use of nested if structures might be a better choice. This is one of the drawbacks of the switch statement, where typically only variables or expressions of type int and char can be used. The second thing to note in the switch statement is that the variable score is not part of a relational expression (using >, >=, etc.) as it can be in an if statement. Instead, the contents of the variable score are compared with each of the case statements that follow. If a match is found, then control is transferred to the corresponding case statement, and the code that follows is executed. For example, if the value in the variable score is a 10, then control is transferred to case 10: and the code that follows is executed. As mentioned above, an expression can be used instead of a variable, and an example of this follows later.

Syntactically, there is one set of braces which indicates the beginning and end of the entire switch statement; however, note that there are no braces in each of the individual case sections, even when there is more than one statement. The reason for this is that at the end of each case section, a break statement is included. The use of the break statement causes the flow of control to be transferred to the end of the switch statement. Without it, the flow of control would fall through to the code that follows the next case statement. Although it is legal to write code that does not use a break statement, the need to do so is very rare and is considered to be of poor programming style. Doing so usually makes code difficult to debug or modify and should be avoided.

The last section of the switch statement is the default statement, which is executed when a matching case is not found. Although a default can be placed anywhere within the switch statement, it is typically placed at the end of the switch statement. It should be noted that switch statements are not required to have a default statement. However, if a switch statement does not have a default statement and the particular value is not found in the cases given, then nothing will be executed in the switch statement, and in the previous example, nothing would be output. Although this might be what was intended, a value that is not part of the data to be processed might cause a logic error later on in the program. As a result, default statements are usually included as a precautionary measure.

Notice that the default case does not have a break statement. If there were no default statement, then the last case section would not need to have a break statement either. The reason is that upon completion of executing the code in the last case or default, the flow of control will simply fall through to the next statement following the switch statement. Although a break statement could be included, it is not necessary and will not be included in this text.

With respect to indenting, there are a number of styles that can be followed, but typically the individual case statements are indented three spaces, and the code in each section is lined up after the colons. Again, should one's instructor or place of employment have a different style, be sure to follow that style.

Also, note that each of the individual possible values of the variable score has its own case statement. Unfortunately, a relation cannot be used in the case statements, and this is another of the switch statement's drawbacks. However, there are on occasion a few ways around this limitation, as will be seen later.

For example, instead of having quiz scores of 10 through 0, what if the variable score was used to hold an exam score from 100 through 0, where a score of 100 through 90 inclusive was to output a message "Very Good", 89 through 80 was to output the message "Good", and so on? For a nested if structure, the solution is fairly simple. Instead of just checking for one or two integers as in the previous nested if structure, it could be modified to check for a range of integers using an and logical operator as in the following segment:
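In sketch form, with the messages for the two lowest ranges assumed:

```java
// score is an int exam score, previously input
if(score >= 90 && score <= 100)
   System.out.println("Very Good");
else if(score >= 80 && score <= 89)
   System.out.println("Good");
else if(score >= 70 && score <= 79)
   System.out.println("Fair");
else if(score >= 60 && score <= 69)
   System.out.println("Okay");                 // assumed message
else if(score >= 0 && score <= 59)
   System.out.println("Poor");                 // assumed message
else
   System.out.println("Invalid exam score.");
```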
Note that each if statement has an && to check for a range of values. However, wasn't it suggested in Sect. 3.5 to avoid this? Yes, it was, but in previous examples, such as the temperature example, there were no upper and lower bounds, whereas in this case there are the bounds of 0 and 100. Although it appears necessary to include a range in each if statement in this example, is there a way that it could be rewritten to avoid having to include an and operator in every if statement? The answer is yes, where an extra if statement can be placed prior to the other if statements.

This can be written as an if-then-else-if structure starting with if(score < 0 || score > 100) and with the error message at the beginning, or it can be coded as an if-then-if, which allows the error message to be written at the end. To reflect the preferred order of the switch statement, the latter if structure is chosen.

Note the use of De Morgan's rules in the first if statement, where if(score < 0 || score > 100) becomes if(score >= 0 && score <= 100): the || is replaced with an && and the relations are reversed. With an if statement checking the range of the scores added at the beginning, there is no longer a need to have an and operator in each of the subsequent if statements, which simplifies the code. Also, the extra if at the beginning makes it so the last if statement checking for the range from 0 to 59 can be eliminated, since after all the previous if statements, the only scores left would be in that range. Although an if-then-if is used as the outer if, the last nested if has its own else statement, and therefore the problem of a dangling else is avoided.

As can be seen, the exam score problem can be implemented relatively easily using nested if statements, but how could this be implemented using a switch statement? Does there need to be a separate case for each of the 101 possibilities? Without using an arithmetic expression, the answer would be yes. However, since the messages output are based upon exam scores in multiples of 10, if one thinks about it for a minute, there is a solution to this problem. What if each number is divided by 10? For example, if the score 98 is divided by 10, then the answer appears to be 9.8. But wasn't it said previously that the switch statement can't be used with floating point numbers? The answer is yes. However, recall that an integer divided by an integer is an integer, so the answer above would be just 9, not 9.8. Since each division results in an integer, the control can be transferred to the appropriate case. As another example, what if the value in score is a 70 or 79? Then, 70/10 is 7 and 79/10 is also 7, so in both cases a message of "Fair" could be output.

But what about values that fall outside the range, such as −10 and 110? When divided by 10, they result in −1 and 11, respectively, and would be caught by the default statement. However, what about numbers like −1 and 101? When divided by 10, they would result in 0 and 10, respectively, so clearly this would not work. The solution is similar to the preceding nested if structure as shown below:
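A sketch of this solution follows, again with the messages for the two lowest ranges assumed; its structure matches the description in the next paragraph:

```java
if(score >= 0 && score <= 100)
   switch(score / 10) {
      case 10:
      case 9:  System.out.println("Very Good");
               break;
      case 8:  System.out.println("Good");
               break;
      case 7:  System.out.println("Fair");
               break;
      case 6:  System.out.println("Okay");     // assumed message
               break;
      case 5:
      case 4:
      case 3:
      case 2:
      case 1:
      case 0:  System.out.println("Poor");     // assumed message
   }
else
   System.out.println("Invalid exam score.");
```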
Notice that there are no braces around the switch statement in the then section of the if-then-else statement, because it is syntactically only one statement. Since the value in score is being divided by 10, will the value in score be altered? No, because as discussed in Chap. 1, the variable score is not being assigned a new value. Also notice that there is no default statement, because the error message is part of the else section of the if-then-else statement. Lastly, note that since case 0: is the last statement, the break statement is not included prior to the closing brace of the switch statement.

Given that it appears that the switch statement can solve this problem, when should the switch statement be used instead of nested if statements? Granted, the above solution was helpful in this instance because each of the message categories spanned a multiple of 10. If other problems involve multiples of other particular values, then the switch statement can be just as useful. However, if the categories are not all of the same multiple, then the switch statement might not be as useful, and nested if statements are probably a better solution to the problem.

In general, if statements can work in all instances, and the switch statement has various limitations. If there are only one or two alternatives, then the if-then or if-then-else structures are probably the best choice, because using the switch statement is probably overkill. Likewise, if there are only three or possibly four alternatives, then the if-then-else-if will be used by this text to give the reader practice with using nested if statements. If the problem has five or more alternatives, then the switch statement can be the better choice. However, if the number of cases for each alternative is too large, then nested if statements might again provide the best solution.

## 3.7 Complete Programs: Implementing Selection Structures

The first program in this section is a simple program that does not include objects, whereas the second program incorporates objects to help reinforce concepts learned in Chap. 2.

### 3.7.1 Simple Program

Hurricanes are classified into five categories by the US National Oceanic and Atmospheric Administration (NOAA) based on the speed of the wind as shown below:

| Category | Wind speed (mph) |
|---|---|
| 1 | 74–95 |
| 2 | 96–110 |
| 3 | 111–130 |
| 4 | 131–155 |
| 5 | Over 155 |

In this section, a program that uses selection structures to categorize a hurricane will be developed. As in the past two chapters, this program will be developed step by step. First, the problem that will be solved is:

Problem Statement: Write a program to classify a hurricane.

Once a problem statement is given, the requirements can be established by analyzing the problem. The program will:

* Accept the wind speed of a hurricane from a user

* Determine the category of the hurricane

* Display the category of the hurricane

Because of the nature of the problem, a selection structure will be used. Since there are five alternatives, five separate if statements could be used to check the range of the wind speed. Assuming the wind speed is stored in the variable windSpeed, a possible solution is shown below:

```java
if(windSpeed >= 74 && windSpeed <= 95)
   System.out.println("The hurricane is category 1.");
if(windSpeed >= 96 && windSpeed <= 110)
   System.out.println("The hurricane is category 2.");
if(windSpeed >= 111 && windSpeed <= 130)
   System.out.println("The hurricane is category 3.");
if(windSpeed >= 131 && windSpeed <= 155)
   System.out.println("The hurricane is category 4.");
if(windSpeed > 155)
   System.out.println("The hurricane is category 5.");
```

Is this a good design? The answer is no, because all five conditions will be checked every time the program is run, as was discussed in Sect. 3.3. This means a nested if structure would be a better choice. How can the conditions be nested? One solution keeps the conditions in the same order and nests each subsequent if statement in the else section of the previous one.

Is this a good design? It is better than the first solution, because whenever a condition becomes true, the rest of the conditions will not be checked. However, it is always a good idea to reduce the number of logical operators. The complete code shown below checks the wind speed in reverse order, so that a logical operator is not required in the first if statement nor in the subsequent if statements:
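The following sketch of the complete program assumes the class name and the handling of wind speeds from 0 through 73, which the discussion does not specify:

```java
import java.util.Scanner;

public class HurricaneCategory {                 // class name is an assumption
  public static void main(String[] args) {
    Scanner scanner = new Scanner(System.in);
    int windSpeed;

    System.out.print("Enter the wind speed (mph): ");
    windSpeed = scanner.nextInt();

    if(windSpeed > 155)
      System.out.println("The hurricane is category 5.");
    else
      if(windSpeed >= 131)
        System.out.println("The hurricane is category 4.");
      else
        if(windSpeed >= 111)
          System.out.println("The hurricane is category 3.");
        else
          if(windSpeed >= 96)
            System.out.println("The hurricane is category 2.");
          else
            if(windSpeed >= 74)
              System.out.println("The hurricane is category 1.");
            else
              if(windSpeed >= 0)
                System.out.println("The storm is not a hurricane.");  // assumed handling of 0-73
              else
                System.out.println("Invalid wind speed.");
  }
}
```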
Notice that the code is indented only two spaces instead of three to help conserve space. Although three spaces is preferred, when using a number other than three, be sure to be consistent. When the above program is compiled and executed using the sample input of 125, the output of the program looks like this:

```
Enter the wind speed (mph): 125
The hurricane is category 3.
```

The first two conditions were false, and since the third condition was true, the program found that the hurricane was category 3. The flow of control skipped the rest of the conditions in the nested selection structure and reached the end of the program. The program also checks for an invalid wind speed, which is any negative value. When the program is executed with -50 as the wind speed, the output looks as shown below:

```
Enter the wind speed (mph): -50
Invalid wind speed.
```

### 3.7.2 Program with Objects

How can the concept of objects, discussed in Chap. 2, be incorporated into the program in the previous section? If an object for a hurricane is created, information about a particular hurricane, such as its wind speed and category, can be stored inside of the object, and two hurricanes can be compared. Figure 3.10 contains the code defining the class for a Hurricane object.

Fig. 3.10

Hurricane class
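A sketch of such a class, consistent with the description that follows, is shown below; the setCategory and getCategory methods are named in the text, whereas the data members and the setWindSpeed mutator are assumptions:

```java
public class Hurricane {
  private int windSpeed;                 // assumed data member
  private int category;                  // assumed data member

  public void setWindSpeed(int speed) {  // assumed mutator
    windSpeed = speed;
  }

  // uses the wind speed stored in the object, so no parameters are needed
  public void setCategory() {
    if(windSpeed > 155)
      category = 5;
    else if(windSpeed >= 131)
      category = 4;
    else if(windSpeed >= 111)
      category = 3;
    else if(windSpeed >= 96)
      category = 2;
    else if(windSpeed >= 74)
      category = 1;
    else
      category = 0;                      // assumed: not a hurricane or invalid
  }

  public int getCategory() {
    return category;
  }
}
```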
Notice the setCategory method uses the value of windSpeed, which is stored in the object, to determine the category of the hurricane. As a result, the setCategory method does not require any parameters. In the main program shown in Fig. 3.11, two hurricane objects are created. After a user enters the wind speeds of both hurricanes, the program determines the categories and outputs them. Then, it compares the categories of the two hurricanes to determine the stronger storm.

Fig. 3.11

A client program for Hurricane class

The stronger hurricane can be found by comparing the categories of the two hurricanes. Since the value of the category is stored in each object, it can be retrieved by using an accessor, the getCategory method. When the above program is compiled and executed using the sample input of 100 and 160, the output of the program looks as given below:

```
Enter the wind speed (hurricane1): 100
Enter the wind speed (hurricane2): 160
Hurricane1 is category 2.
Hurricane2 is category 5.
Hurricane2 is stronger.
```

## 3.8 Summary

* The then and else sections of an if statement can syntactically contain only one statement. Should more than one statement need to be included, use a compound statement by putting two or more statements in braces. If there is only one statement in the then or else section, braces are not needed and should not be used.

* Empty then or else sections should be avoided in if-then-else statements, and the code should be rewritten as an if-then.

* When nesting if statements, the if-then-else-if structure tends to be used more often than the if-then-if structure. When using the if-then-if structure, be careful to avoid the dangling else problem.

* Logical operator precedence from highest to lowest is: parentheses (innermost nested first), !, &&, and ||; operators of equal precedence are evaluated left to right.

* De Morgan's laws are: not (a and b) = (not a) or (not b), and not (a or b) = (not a) and (not b).

* The switch statement works well with integer and character data but is not as useful with floating point or double precision data.

* Generally, be sure to include a break statement after every case section, except for the last one, unless there is a default statement at the end.

* Although a default statement is not required in a switch statement, it is usually a good idea to include one at the end, and it does not need a break statement.

* Should there be only one or two alternatives, use an if-then or if-then-else statement, respectively, and avoid the use of a switch statement. If there are three or four alternatives, a switch could be used, but in this text nested if statements will be used. Lastly, if there are five or more alternatives, a switch statement should be used if possible.

## 3.9 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Given the code segment below, indicate the output for the following initial values of y:

```java
int x = 50;
if(y > 10)
   x = 30;
if(y < 20)
   x = 40;
System.out.println(x);
```

*A. What is the output if the integer variable y contains 10?

B. What is the output if the integer variable y contains 15?

C. What is the output if the integer variable y contains 30?

2. Given the code segment below, indicate the output for the following initial values of x and y:

A. What is the output if the integer variable x contains 10 and y contains −15?

*B. What is the output if the integer variable x contains 100 and y contains 20?

C. What is the output if the integer variable x contains 200 and y contains −100?

3. Given the code segment below, indicate the output for the following initial values of x, y, and z:

A. What is the output if the integer variable x contains 1, y contains 0, and z contains 2?

B. What is the output if the integer variable x contains 0, y contains 1, and z contains −1?

*C. What is the output if the integer variable x contains 1, y contains 2, and z contains 1?

4. Declare a Boolean variable, isEligible, and assign it a value of false.

5. Evaluate each Boolean expression as true or false. Show intermediate steps. Assume int num1 = 5, int num2 = -2, int num3 = 0, boolean flag1 = true, and boolean flag2 = false.

*A. num1 > num2 || flag2

B. num1 < num2 && num3 >= 0

*C. num2 < 0 || flag1 && flag2

D. (num2 < 0 || flag1) && flag2

*E. (num2 < 0 || !flag1) && flag2

F. num1 != 0 && num2 != 0 && num3 != 0

6. Using a truth table, show that the first of De Morgan's laws discussed in Sect. 3.5 is correct.

7. Using a truth table, show that the second of De Morgan's laws discussed in Sect. 3.5 is correct.

*8. Write a code segment to ask a user to enter a number between 1 and 4, and print the name of the class (Freshman, Sophomore, Junior, and Senior) corresponding to the number. Use a case structure.

*9. Repeat the previous exercise using a selection structure instead of a case structure.

10. Write a code segment to ask a user to enter a number between 1 and 12, and print the name of the month corresponding to the number. Use a selection structure.

11. Repeat the previous exercise using a case structure instead of a selection structure.

12. In Sect. 3.5 it was mentioned that a mathematical expression like 0 < temp < 100 would cause a syntax error if used as a condition in an if-then structure in a Java program. Explain why.

13. The dew point temperature is a good indicator of how humid it feels during a hot day. The US National Weather Service (NWS) summarizes the human perception of humidity using the dew point temperatures shown in the table below.

| Dew point temperature (°F) | Human perception |
|---|---|
| 75 or higher | Extremely uncomfortable |
| 70–74 | Very humid |
| 65–69 | Somewhat uncomfortable |
| 60–64 | OK |
| 55–59 | Comfortable |
| 50–54 | Very comfortable |
| 49 or lower | A bit dry |

Write a complete program using a selection structure to output how a person feels for a given dew point temperature. The program should perform the following tasks:

a. Allow the user to enter a dew point temperature.

b. Determine the human perception for a given dew point temperature.

c. Output the corresponding phrase from the table.

Here is some sample input and output:

```
Enter a dew point temperature (F): 55
Comfortable

Enter a dew point temperature (F): 30
A bit dry

Enter a dew point temperature (F): 90
Extremely uncomfortable

Enter a dew point temperature (F): 65
Somewhat uncomfortable
```

14. Repeat the previous exercise using a case structure instead of a selection structure.

15. Write a complete program to compare the temperatures of three different cities and find the hottest city. First, implement a class called Thermometer as described below:

A. Thermometer has one private data member, temperature, of type double.

B. Write code for a constructor to set the data member to the default value of 0.0.

C. Write code for an accessor method, getTemperature, which returns the value of the appropriate data member.

D. Write code for a mutator method, setTemperature, which has one formal parameter and stores it as the value of the data member.

Then, write a client program to test the Thermometer class defined above. Call this class Temperatures. The main method should perform the following tasks:

E. Allow the user to enter the temperatures of three cities.

F. Declare and create three Thermometer objects, setting the instance data member to the values entered by the user.

G. If city1 is the hottest city among the three cities, output a phrase like "City1 is the hottest city."

Here is some sample input and output:

```
Enter the temperature of city1: 93.4
Enter the temperature of city2: 76.1
Enter the temperature of city3: 85.8
City1 is the hottest city.

Enter the temperature of city1: 76.5
Enter the temperature of city2: 85.2
Enter the temperature of city3: 66.9
City2 is the hottest city.
```

© Springer-Verlag London 2014

# 4. Iteration Structures

James T. Streib¹ and Takako Soma¹

(1) Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

This chapter shows how iteration structures work using flowcharts, pseudocode, and Java. It includes pretest indefinite loop structures: both count- and sentinel-controlled while loops. The posttest indefinite do-while loop and the definite iteration for loop are also discussed. Nested loops and potential problems are examined, and complete programs both with and without objects are included.

## 4.1 Introduction

Selection structures, discussed in Chap. 3, allow a program to follow one of two or more paths. Iteration structures, sometimes called repetition structures, allow a program to repeat a section of code many times.
It is this capability to repeat or loop that gives the computer the ability to perform a task over and over again.

Any type of loop will generally have three parts: an initialization, a test, and a change. When performing a repetitive task, one typically does not think about the particular steps of the repetition, but taking a moment to think about the process, one can recognize these three components. For example, if a student needs to do a number of homework problems for a mathematics class, they might count each of the problems, starting with the number one. This can be seen as the initialization phase, which is performed just once. As the student starts to do the first problem, they might look at their notes to see how many problems they need to do, where in this example the student might need to do ten problems. Noticing that the count of one has not passed the number ten, the student realizes the assigned homework is not completed. This is known as the test phase. As the student finishes the first problem, the student then counts to the next number, two, and this act of counting is the change phase of the repetitive process. The student again compares the count to the number of problems to be completed. This process of counting and comparing is the repetitive process of change and test. The process continues until the student has finished the tenth problem and the iterative process stops. Although this detailed analysis is much more than what a person does when performing a repetitive task, it is what the computer needs to do to perform a loop.

In particular, this chapter will examine indefinite and definite loop structures. The first type of loop iterates an unknown number of times, whereas the second type loops a fixed number of times. The first of these two can be divided into what are known as pretest and posttest loop structures, where the former has the test or conditional expression at the beginning of the loop and the latter has the conditional expression at the end of the loop. Since the pretest indefinite loop structure is probably the most versatile, it is discussed first.

## 4.2 Pretest Indefinite Loop Structure

A pretest indefinite loop structure is a loop that has the test or conditional expression at the beginning of the loop and can iterate an indefinite number of times. An indefinite loop structure can also be made to loop a fixed number of times, and this is one of the reasons it is a very useful loop structure. The pretest indefinite loop structure in Java is known as a while loop. The while loop can generically be represented in a flowchart as shown in Fig. 4.1.

Fig. 4.1

Generic while loop

At first glance, the flowchart of the while loop might appear similar to the flowchart for the if structure presented in the last chapter. This might be because of the diamond-shaped conditional expression near the top of the flowchart, but upon closer examination, one should be able to see a number of differences. The first box is for the initialization of a variable, which occurs just once. That is followed by the diamond-shaped box where the test of the variable occurs. Note that like the if structure, there is a true and a false branch, but instead of the true branch going off to the right, it is pointing downward.
Further, note that the two branches do not meet together at the bottom, but instead the false branch goes to the box with the "End of Program" message and the true branch ultimately ends up going back to the test. It is the true branch that forms the actual loop. The first section in the loop is known as the body of the loop. It is here that any task or tasks that need to be performed repetitively can be placed. This can be any sort of input, processing, or output that needs to be performed. The body of the loop can also include nested if structures or even nested loops as will be shown later in this chapter. Lastly, the change to the variable occurs before the flow of control loops back to the test. Although the change can occur anywhere in the loop, it is best to be consistent in its placement, and for now it is the last thing that is done in the loop.

### 4.2.1 Count-Controlled Indefinite Iteration Structure

Although the generic flowchart is fine for understanding the basic layout and concept of a loop, it is helpful to see exactly how the loop performs. In the next flowchart, the initialize, test, and change are replaced with more specific statements. In this case, the loop is known as a count-controlled loop and the variable controlling the loop is sometimes called the Loop Control Variable (LCV). In this example, the LCV will be the variable i as shown in Fig. 4.2.

Fig. 4.2

Count-controlled while loop

To understand the loop, the best thing to do is walk through the logic. First, the variable i in the flowchart is initialized to 1. Then, the variable i is tested to see if it is less than or equal to 3, which is true. The body of the loop is executed for the first time and the value of i is incremented by 1, so that the value of i is equal to 2. The flow of control is returned back to the test, where i is less than or equal to 3. The body of the loop is executed for the second time, and the value of i is incremented to 3. The value is tested again and i is still less than or equal to 3, so the body of the loop is executed for the third time and the value of i is incremented to 4. The next time the value is tested, it is no longer less than or equal to 3, so the false branch is taken and the message "End of Program" is output. In the end, the final value of i is 4 and the body of the loop was executed three times.

As in the previous chapter on if structures, it is nice to examine the pseudocode equivalent of the while structure as seen below:

* i ← 1

* while i ≤ 3 do

* //body of loop

* i ← i + 1

* output "End of Program"

First, note that the while is written as while i ≤ 3 do, where while-do is a common way to describe the while loop structure. Of course, if one wanted to write it as while (i ≤ 3) to make the pseudocode look more like the Java language as will be seen shortly, that is okay. However, it is recommended that whatever style of pseudocode is chosen, it should be consistent. As with if structures, note that the body of the loop, including the increment, is indented approximately three spaces. Lastly, note that the output statement is not in the loop so it is not indented.
As one might suspect, the Java syntax is similar to the pseudocode as shown below:

```java
i = 1;
while(i <= 3) {
   // body of loop
   i = i + 1;
}
System.out.println("End of Program");
```

The first line is the initialization, the second line is the test with the conditional expression in parentheses like an if statement, and the increment of the variable i is inside the compound statement. Note that the statement i++ could be used instead as shown in Chap. 1, and this style is often used in loops. Notice that braces are being used around the comment concerning the body of the loop and also the increment. Are these braces required in this particular code segment? At first the answer might seem to be yes, because there appear to be two statements in the loop. However, recall from Chap. 1 that comments are ignored by the compiler, so technically there is only one statement in the loop and the answer to the question is no. Why then are there braces included in the above segment? The reason is that in addition to the increment, there are usually other statements in the body of the loop. It is uncommon to see only one statement in a while loop, so braces are included in the above example in anticipation of more statements being added later.

What if the user wanted to loop a different number of times other than three? That would require the user to modify and recompile the program, but many users do not have knowledge of programming. To expand upon the above, the value 3 could be changed to an integer variable n, and the value for n could be prompted for and input from the user as shown below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
i = 1;
while(i <= n) {
   // body of loop
   i = i + 1;
}
System.out.println("End of Program");
```

If the user entered the value 3, the loop would still iterate three times as it did before. Further, the user now has the option to enter any other number for the value of n which allows the loop to have more versatility. However, what if the user entered a value of 0 instead? One other important thing about a while loop is that it is known as a pretest loop, meaning that the test is at the beginning of the loop. In this particular case, the variable i is initialized to 1 and then the comparison would be performed. Since the 1 in the variable i is not less than or equal to the 0 in the variable n, the result would be false and the body of the loop would not be executed. This is one of the important features about a pretest loop because the body of the loop might be executed anywhere from zero to many times. This is a reason why the while loop is one of the more versatile loops as will be seen below.

As an example of how the while loop structure can be used to solve a problem in the Java language, consider a user who wants to add a series of numbers. If there are a relatively small fixed number of integers to be added, then a loop might not be necessary. Consider the following program that would add three numbers entered by the user:

```java
int num1, num2, num3, total;

System.out.print("Enter an integer to be summed: ");
num1 = scanner.nextInt();
System.out.print("Enter an integer to be summed: ");
num2 = scanner.nextInt();
System.out.print("Enter an integer to be summed: ");
num3 = scanner.nextInt();
total = num1 + num2 + num3;
System.out.println("The total is " + total);
```

Although the above works, what if there were a large number of integers to be added, say 1,000?
The number of variables, prompts, and inputs would be overwhelming when writing the code, and the program would also take up a lot of memory. Returning to the example above where only three numbers need to be added, the number of variables used to store the input could be reduced to one. This would make the task a little easier, but more importantly it paves the way to see how the problem could be solved using a loop.

Using only a single variable num instead of three variables, the first integer could be prompted for, input, and placed into the variable total. The second integer could be input into the variable num and added to the variable total. The same would occur with the third integer, and then the sum in total would be output.

In that version, the three prompts and inputs look the same, but the assignment of the first integer into total makes the first group of statements different from the subsequent ones. The last two groups of statements could be placed in a loop, but the first group could not be placed in the loop. It would be convenient if there did not need to be an exception, so instead of assigning the first value input into total, the variable total could be initialized to zero; thus, the first value input into num could be added to the variable total just as all the other integers are.

The first group is no longer a special case, so it can also be put into a loop that iterates three times. The body of the loop would contain a prompt and input for the integer num, followed by the variable num added to the variable total. However, to allow for the first time num is added, the variable total would need to be initialized to zero prior to the loop. Then each time through the loop, the current value in num could be added to the previous value in the variable total. The first time through the loop, the value in num would be added to the zero in total, the second time to the previous value in total, and so on until the loop terminates, and the final value in the variable total is the sum of all the integers input.
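Assuming, as before, a Scanner object named scanner, the resulting segment might look like the following sketch:

```java
int num, total, i;

total = 0;
i = 1;
while(i <= 3) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
System.out.println("The total is " + total);
```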
Notice that the basic loop is the same as the loop presented earlier, with the initialization, test, and change of the variable i. Also note that the variable total is initialized to zero so that the integers input can be summed. Lastly, notice that three statements from the previous code segment are no longer written three times, but rather only once, because the loop will iterate three times and accomplish the same task.

How does one know what belongs inside the loop and what belongs outside the loop? If outside the loop, does it belong before or after the loop? By looking for patterns on a smaller number of items, one should be able to see those items that need to be repeated and those items that need to be executed only once. In the above example, the variables for counting and the total need to be initialized only once, and they should be placed prior to the loop. Since the output of the total needs to occur only once, it should be placed outside and after the loop. Further, since there are three integers to be prompted for, input, and summed, that code should be placed inside the loop. An advantage of the above code segment is that whether just three values were being input or 1,000 values were being input, the only thing that would need to be changed is the number 3 in the while statement. This version of the code is much easier to write than straight-line code and also takes up less memory.

The previous code segment is a significant step forward in utilizing the power of the computer to perform repetitive tasks; however, it can be improved. As it is currently written, if the user wants to input and sum four integers instead of three, the user would have to edit and recompile the program. Since most users are not programmers, is there a way to make this program easier to use? The answer is yes. As before, a prompt and input can be placed prior to the loop to allow the user to input the number of integers to be summed as shown below:

```java
int num, total, i, n;

total = 0;
i = 1;
System.out.print("Enter the # of integers to be summed: ");
n = scanner.nextInt();
while(i <= n) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
System.out.println("The total is " + total);
```

Notice the prompt and input of the variable n prior to the while statement, and also notice that the number 3 in the while statement has been changed to the variable n. Again, this makes the program much more useful, since it does not require the user to make changes to the program. For example, if the user started the program and then decided that they did not want to sum any integers, the user could just enter the number 0, and since the while loop is a pretest loop, the user would never be prompted to input any integers. Further, since total was initialized to 0, the message indicating a total of 0 would be output also.

There are of course other tasks that could be added to the above program. For example, what if the user wanted to find the average of the integers entered? How would this be written? Since total needs to be divided by the number of items, one thought is to use the value in the variable i. However, its final value is one more than the number of items entered. Since it was initialized to 1, if three items were input, it would contain the number 4 at the end of the loop. That value could be decremented by one to make it the correct number, but why use the counter when the variable n contains the number of items, which was originally entered by the user? The answer is that the use of the variable n is the better choice, as shown in the following code segment:
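The following sketch reconstructs the segment under discussion; note that it deliberately contains the problems examined next:

```java
int num, total, i, n;
double average;

total = 0;
i = 1;
System.out.print("Enter the # of integers to be summed: ");
n = scanner.nextInt();
while(i <= n) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
average = total / n;      // the problems discussed below occur here
System.out.println("The total is " + total);
System.out.println("The average is " + average);
```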
First, notice that average is declared as type double. Also, note that the calculation of the average is outside the loop at the end of the segment, because the average only needs to be calculated once. Offhand, the above segment appears to be fairly good. However, there are a few problems with it. If the program were executed using a 3 for the first prompt and then using the three integers 5, 7, and 8 for the values to be summed and averaged, what would the answer be? Using a calculator, one would say 6.666..., but is this the answer that the program would generate? The answer is no, because the program would output the answer 6.0, which is incorrect. The variable average is of type double, so that is not the problem. However, look carefully at the division on the right side of the assignment symbol. Recall from Chap. 1 that an integer divided by an integer is an integer, which in this case is 6. The assignment of the integer to a variable of type double causes the 6 to be changed to 6.0, which is the number that is output. How can this be corrected? The answer from Chap. 1 is to use a (double) typecast operator on one of the variables involved in the division, which will force the answer to be of type double. Also, it would help to format the output so that it would not be a repeating decimal.

There is another problem with the previous code segment that might not be as readily apparent. What would happen if the user entered a 0 for the number of items to be summed and averaged? As discussed previously, the user would not be prompted for integers to be entered. The problem occurs after the loop in the division statement. The value in n would be a 0, which would cause an execution error, or in other words a run-time error. How could this problem be solved? An if statement could be included so that division would not occur unless the value in n is positive. Should the average message still be output? That would depend on the original specifications. In this case it would not hurt to still output the message, but it would probably be a good idea to ensure that the variable average contained the value 0. The updated program with all of the above changes can be seen below:
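A sketch of the updated program follows; the use of printf to format the average to two decimal places is one possible way to avoid the repeating decimal:

```java
int num, total, i, n;
double average;

total = 0;
i = 1;
System.out.print("Enter the # of integers to be summed: ");
n = scanner.nextInt();
while(i <= n) {
   System.out.print("Enter an integer to be summed: ");
   num = scanner.nextInt();
   total = total + num;
   i = i + 1;
}
if(n > 0)
   average = (double) total / n;   // typecast forces floating point division
else
   average = 0.0;                  // avoids division by zero
System.out.println("The total is " + total);
System.out.printf("The average is %.2f%n", average);
```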
Although typically users will not enter a negative number or the number 0 as the number of items to be summed, programmers need to write programs that work correctly under such circumstances. The old adage "If something can go wrong, it will" applies to software development as well. As a result, these sorts of possibilities should also be addressed in the design and specifications of programs so that they will be taken care of properly when a program is written. This sort of programming is known as robust programming and will be discussed at various points throughout the text. However, at other times it will not be included, to keep the focus on a new concept being introduced and to save space. When encountering an assignment or specifications for a programming project that lack robustness, it is always advisable to check with the user, or the instructor when in a classroom setting.

### 4.2.2 Sentinel Controlled Loop

The use of a prompt in the previous program to indicate how many integers will be entered is better than having the number "hard coded" into the program. A disadvantage of the previous loop structure is that it requires the user to know in advance how many integers will be entered prior to running the program. If the user miscounts the number of integers, the program will not work correctly. For example, if the user overcounts the number of integers, then the user will have one or more extra prompts to enter data and the average will be off, which is unacceptable. If the user undercounts the number of integers, then the user will have leftover data and again the average will be off. In these cases the only real alternative is for the user to restart the program from the beginning. Although this is not much of a problem for a small data set, it is clearly impractical for a large number of data items.

Instead of having the user count all the data items prior to running the program, wouldn't it be useful to have the program do the counting for the user? This can be accomplished using a sentinel controlled loop, or what is sometimes called an End of Data (EOD) loop, which is usually implemented using a while loop. The idea is that the user continues to enter data until a sentinel value or end of data indicator is entered, indicating that the end of data has been reached. The key is that the sentinel or EOD indicator must be a value that is different from the other data values. Using the above example, if only nonnegative integers were entered, then a negative integer such as -1 could be used as a sentinel.

The main disadvantage of this method is that sometimes there is not an acceptable value that can serve as a sentinel, but in those instances where a sentinel is available, the sentinel controlled loop is better than the previous count-controlled loop. Although a count is not necessary to control the loop anymore, a count can be added to the program to help calculate the average as will be seen later.

As always, it is helpful to begin with an example as shown in the following code segment:

```java
System.out.print("Enter a non-negative integer or -1 to stop: ");
num = scanner.nextInt();
while(num != -1) {
   // body of loop
   System.out.print("Enter a non-negative integer or -1 to stop: ");
   num = scanner.nextInt();
}
System.out.println("End of Program");
```

The first thing to notice is that the variable i is no longer controlling the loop. Since the while loop does not need a counter, it is called an indefinite loop structure. Whereas in the previous section one could tell how many times the loop would iterate merely by looking at it, such as looping 3 times or in some cases n times, here the number of times is not readily apparent and the code could loop indefinitely.

At first this loop might appear a little confusing, because the value num is prompted for and input in two places, once outside prior to the loop and another time inside at the end of the loop. However, if one takes a little time to think about the loop, it is not as confusing as it looks. First, the prompt and input outside prior to the loop is sometimes called a priming read. This can be thought of as the initialization section of the loop. The test portion of the loop includes the comparison of the value input into the variable num to the sentinel value of -1. If the value input is equal to the sentinel, then the loop is not executed; otherwise, the data can be processed in the body of the loop. The second prompt and input is the change portion of the loop, where all subsequent values are input. Again, if a subsequent value input is not equal to the sentinel, the value is processed; otherwise, the loop terminates.

A disadvantage of the above loop is that as written, only a value of -1 will terminate the loop. What would happen if the user input a -2? As can be seen, all other negative values would be processed in the body of the loop, which might not be what was intended. Instead, the prompt and test could be rewritten to include all negative numbers as sentinel values as shown below:

```java
System.out.print("Enter a non-negative integer ");
System.out.print("or a negative integer to stop: ");
num = scanner.nextInt();
while(num >= 0) {
   // body of loop
   System.out.print("Enter a non-negative integer ");
   System.out.print("or a negative integer to stop: ");
   num = scanner.nextInt();
}
System.out.println("End of Program");
```

Note that due to the length of the prompts, they are split into separate print statements, and the while statement now checks to see if num is greater than or equal to 0. Again, as long as the sentinel value is not part of the data to be processed, the sentinel controlled loop can prove to be a nice alternative to count-controlled loops.
To help illustrate the usefulness of this loop, the following code segment shows how it can be used to implement the calculation of total in the example from the previous section:

```java
int num, total;

total = 0;
System.out.print("Enter a non-negative integer to be summed ");
System.out.print("or a negative integer to stop: ");
num = scanner.nextInt();
while(num >= 0) {
   total = total + num;
   System.out.print("Enter a non-negative integer to be summed ");
   System.out.print("or a negative integer to stop: ");
   num = scanner.nextInt();
}
System.out.println("The total is " + total);
```

As before, the value of total should be initialized to 0 prior to the loop. Notice that adding num to total is the first line in the body of the loop. Is this correct? At first this might look a little strange, but it is correct. Remember that the priming read will input the first value to be summed. Also, sometimes beginning programmers think there should be an if statement before adding num to total because they think that the sentinel value might be included in the total. However, an if statement is not necessary because the while loop is a pretest loop, and if a sentinel value is input, the loop would terminate.

Can this loop be further expanded to include the calculation of the average as done previously? Yes, but a count will need to be added to the loop so that the total can be divided by the number of integers that are input as shown below:
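A sketch of this version follows (the exact output messages are assumed):

```java
int num, total, i;
double average;

total = 0;
i = 1;
System.out.print("Enter a non-negative integer to be summed ");
System.out.print("or a negative integer to stop: ");
num = scanner.nextInt();
while(num >= 0) {
   i = i + 1;
   total = total + num;
   System.out.print("Enter a non-negative integer to be summed ");
   System.out.print("or a negative integer to stop: ");
   num = scanner.nextInt();
}
if(i - 1 > 0)
   average = (double) total / (i - 1);
else
   average = 0.0;
System.out.println("The total is " + total);
System.out.println("The average is " + average);
```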
First notice that the value of i is initialized to 1 as has been done previously, and again it is incremented at the beginning of the loop prior to when total is calculated. Although the increment could be placed elsewhere, it is usually a good idea to keep all calculations together for ease of reading and modification of the code. Another thing to notice is that the variable i does not appear in the parentheses of the while statement. This again is because it is a sentinel controlled loop and not a count-controlled loop. Further, note the i - 1 in the if statement, because the final value in i is one more than the number of times the loop was executed. Also notice that the total is divided by (i - 1), because without the parentheses the division would be incorrect. However, instead of using i - 1 twice, it might be more convenient to subtract 1 from i and then use just i as shown in the code segment below:

```java
i = i - 1;
if(i > 0)
   average = (double) total / i;
else
   average = 0.0;
```

Although this method works, there is a more convenient way of solving this problem. Even though individuals tend to start counting from the number 1, it is often more helpful to have programs start counting from the number 0. By starting the count from 0, the final value in i will no longer be off by 1 at the end of the segment. This will become even more apparent in Chap. 7 on arrays, because an array actually starts at location 0. With this change, i is initialized to 0 instead of 1, the increment stays in the loop, and at the end of the loop i contains exactly the number of integers entered, so it can be used directly in the division.

So far the count-controlled loop and the sentinel controlled loop have been introduced separately. Is it possible to combine both in one loop? Given the information presented in Sect. 3.5 on logic operations, the answer is yes. For example, what if one wanted to have a sentinel controlled loop that would accept up to a maximum of 10 numbers? In other words, the user could keep entering data until a sentinel value was entered, but if a sentinel value was not entered, the loop would stop after 10 numbers had been entered.

The result is that the tests for the sentinel value and the count would need to occur in the while statement. Looking at a portion of the previous program, an && operator could be added to the while statement, as in while(num >= 0 && i < 10), so that the body of the loop is executed only when both the value in num is not equal to a sentinel value and the count is less than 10.

Note that the test for i is less than 10 instead of less than or equal to 10. This is because the variable i now begins at 0 instead of 1. If the value in num is greater than or equal to 0 and the count is less than 10, then the body of the loop is executed. However, if either the value in num is a sentinel value or the value in i is 10 or greater, then the loop will not be executed.

What if there isn't an acceptable value that can be used as a sentinel value? Another possibility is to repeatedly prompt the user and ask if there is any data to be entered. A prompt asking the user to enter a Y or N, for yes or no, respectively, could be output using a sentinel controlled loop. Then, if there is more data, the user could be prompted to input data for each iteration through the loop as shown below:
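In the sketch below, the variable name reply, the prompt wording, and the processing placeholder are assumptions:

```java
char reply;
int num;

System.out.print("Is there data to be entered? (Y/N): ");
reply = scanner.next().charAt(0);
while(reply == 'Y' || reply == 'y') {
   System.out.print("Enter an integer to be processed: ");
   num = scanner.nextInt();
   // process num here
   System.out.print("Is there more data to be entered? (Y/N): ");
   reply = scanner.next().charAt(0);
}
System.out.println("End of Program");
```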
Note that the while loop checks for either an uppercase Y or a lowercase y to make it convenient for the user. Also, notice that if the user does not respond with either Y or y, it is assumed that the user entered either N or n and the loop terminates. Further, the prompts for more data can be different as necessary, as shown by the inclusion of the word more in the last prompt above. The disadvantage of this program segment is that the user has to enter a character each time before entering the actual data to be processed, but if a suitable sentinel value cannot be found, then this might be the only alternative.

## 4.3 Posttest Indefinite Loop Structure

In addition to the pretest indefinite loop structure of the previous section, Java also has a posttest indefinite loop structure called the do-while structure. Whereas a pretest loop has its test at the beginning and the body of the loop may be executed zero to many times, the posttest loop structure has its test at the end of the loop and the body of the loop will be executed one to many times. In other words, regardless of the result of the test, the body of the posttest loop will be executed at least once. As before, looking at the flowchart is a good place to start, as shown in Fig. 4.3.

Fig. 4.3

Count-controlled do-while loop

It is easy to notice that the test condition is now located at the end of the loop instead of the beginning, thus showing it is a posttest loop structure. The body of the loop is executed while the condition is true, and when it is false, the flow of control falls through to the next statement. The above flowchart can be written in pseudocode as follows:

* i ← 1

* do

* //body of loop

* i ← i + 1

* while i ≤ 3

* output "End of Program"

As with previous pseudocode, the indenting indicates the body of the loop. As should be suspected, the Java code looks similar, as follows:

```java
i = 1;
do {
   // body of loop
   i = i + 1;
} while(i <= 3);
System.out.println("End of Program");
```

Notice the use of a compound statement, the { }, which is not optional within the do-while statement. Even if there is only one statement between the words do and while, a compound statement must be included. However, since the body of a do-while almost always has more than one statement, it is unlikely that one would forget to include the braces.

Modifying the above code segment to prompt the user to enter the number of times to loop, similar to the last section, results in the code segment below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
i = 1;
do {
   // body of loop
   i = i + 1;
} while(i <= n);
System.out.println("End of Program");
```

How many times would the body of the loop be executed in the above code segment if the user entered a value of 0 for n? The answer is one. Unlike the answer of zero for the pretest loop structure, the body of the loop is executed at least once with a posttest loop structure, because the comparison is at the end after the body of the loop has been executed. If one did not want the above code to iterate once in the event that someone entered a value of 0 for n, how would the code need to be modified? If one thinks about it, an if statement would need to be added at the beginning of the body of the loop or just prior to the loop to check for a value of zero or a negative number. Of these two choices, the if would be better placed outside the loop so that it does not need to be checked through each iteration of the loop and is executed only once prior to the loop as shown below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
if(n >= 1) {
   i = 1;
   do {
      // body of loop
      i = i + 1;
   } while(i <= n);
}
System.out.println("End of Program");
```

Although the above code segment solves the problem of iterating once through the loop when the value of n is 0 or negative, it does appear a little cumbersome with the use of both an if and a do-while statement. The above code segment can be easily implemented using a simple while loop as presented in the previous section and repeated below:

```java
System.out.print("Enter the number of times to loop: ");
n = scanner.nextInt();
i = 1;
while(i <= n) {
   // body of loop
   i = i + 1;
}
System.out.println("End of Program");
```

Clearly, the second example above using only the while loop is simpler than the previous example using an if and a do-while statement. This is not to say that the many examples in the previous section and other problems cannot be implemented using a do-while and an if statement (see the exercises at the end of the chapter). Rather, it is oftentimes simpler to use just the while statement instead. It is for this reason that the while statement tends to be used more often than the do-while statement.

Although in most cases having the test at the beginning is more convenient, there are some special cases where the do-while can be quite useful. For example, assume that for input a user has to input an integer between 0 and 10, inclusive. If the user enters a number outside the range, then the user needs to be re-prompted to input the number again. At first this might seem to be a good application for the if statement, but what if the user continues to enter the wrong number? A single if statement would allow the user only one chance to reenter a correct number. Instead, a loop would be a better choice. The problem could be solved using a while loop, but since the user has to be prompted at least once, the do-while might be a good choice as seen below:

```java
do {
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
} while(number < 0 || number > 10);
```

The above loop provides a simple way to give a user multiple attempts to correct a problem with the input data.
However, a disadvantage of the above loop is that the user might continue on indefinitely entering the wrong number. A solution, sketched below, is to add a counter so that after a certain number of attempts, the loop stops. Then, an if statement after the loop can check the number of attempts and either use a default value or exit the program.
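In the following sketch, the limit of three attempts and the default value of 5 are assumptions for illustration:

```java
int number, attempts;

attempts = 0;
do {
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
   attempts = attempts + 1;
} while((number < 0 || number > 10) && attempts < 3);

if(number < 0 || number > 10) {
   number = 5;                               // assumed default value
   System.out.println("Using the default value of " + number);
}
```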
Another disadvantage of the above code segment is that the subsequent message output is the same as the first one, so the user might not understand what they did incorrectly. If a more detailed message is needed, an if could be added to the body of the loop to check a flag and offer a different message, as shown below:

```java
firstAttempt = true;
do {
   if(firstAttempt)
      firstAttempt = false;
   else
      System.out.println(number + " is an incorrect number");
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
} while(number < 0 || number > 10);
```

Note the firstAttempt flag is set to true prior to the loop in order to indicate the first attempt, and once in the loop, the flag is set to false to indicate subsequent attempts. In the case of a subsequent attempt, a message is output to the user indicating what was input so that they might see what was incorrect. Notice that regardless of whether it was the first attempt or a subsequent attempt, a number needs to be prompted for and input, so the prompt and input statements come after the if statement. However, the use of the flag and if statement might seem a little clumsy, so possibly a while loop could be used instead. The advantage here is that the message in the body of the loop can be different from the initial message used in the priming read as follows:

```java
System.out.print("Enter a number between 0 and 10, inclusive: ");
number = scanner.nextInt();
while(number < 0 || number > 10) {
   System.out.println(number + " is an incorrect number, try again");
   System.out.print("Enter a number between 0 and 10, inclusive: ");
   number = scanner.nextInt();
}
```

As suggested previously, a count could also be added so that after a certain number of attempts, the loop would stop. Again in this case, the pretest loop seems to be a little more appropriate than the posttest loop. In any event, a programmer should analyze the requirements and specifications of the program to be written and use the type of loop that best suits the task at hand.

## 4.4 Definite Iteration Loop Structure

As discussed in Sect. 4.2.1, the while loop can be used as a count-controlled loop. Since loops often need to iterate a fixed number of times, most languages include what is known as a definite iteration loop structure, or what is sometimes called a fixed iteration loop structure. In Java, this is called a for loop, and like the while loop, it is a pretest loop.

The for loop has a flowchart similar to the one shown previously in Fig. 4.2. However, instead of having the initialization and test as separate statements as they are in the while loop, they are included as part of the for loop statement. To help illustrate this in flowchart form, the diamond that has only the test portion of a while loop can be replaced with a rectangle that contains all three parts typically present in a loop (Fig. 4.4).

Fig. 4.4

Definite iteration loop flowchart

Notice that the initialization, test, and change are all located in one rectangle, signifying that all three operations are written in the same statement.

The optional internal arrows illustrate how the flow of control occurs within the statement. Notice that the order of operations is the same as with the previous flowchart for the while statement. The initialization is done just once prior to the loop. The test is done prior to the body of the loop, and the change occurs after the body of the loop.

The pseudocode for the for loop can be written as follows:

* for i ← 1 to 3 incremented by 1 do

* //body of loop

* output "End of Program"

In the for loop, the initialization is indicated as i ← 1, the to 3 is the test, and the change is the incremented by 1. Note that the use of the word do is optional and the body of the loop is indented. As before, the Java code follows:

```java
for(i=1; i<=3; i++)
   // body of loop

System.out.println("End of Program");
```

After the for in the parentheses are the initialization i=1, the test i<=3, and the change or increment i++, all separated by semicolons. Note that the increment is using the shortcut i++, which is common in a for statement. Also notice that there are no braces in this example around the body of the loop, because if there is only one statement, they are unnecessary. Since the change or increment of the variable i is in the for statement itself, it is not uncommon that there might be only one statement in the body of a for loop. However, if there is more than one statement in the body of the loop, the use of a compound statement is necessary. In the above example, it is assumed that the variable i is declared elsewhere, but it is also possible to declare the variable i within the for statement itself by preceding the initialization of i with the word int, as in for(int i=1; i<=3; i++). This is also a fairly common practice and will be used on many occasions in the future.

Note that it is possible to have more than one statement in each of the three sections that are separated by semicolons within the parentheses, where the statements in a section would be separated by commas. This gives the for statement quite a bit of flexibility, but it can become quite confusing and is considered by some to be poor programming practice. Since anything that can be done with a for loop can also be done with a while loop, should such a complex for loop need to be written, the programmer is usually better off writing the loop as a while loop. That being said, when should the for loop be used instead of a while loop? Since the for loop is typically thought of as a fixed iteration structure, it is in those situations where a fixed number of tasks need to be done that the for loop should be used.

As an example of using the for loop, assume that Java did not contain the pow function in the Math class. How could the power function be implemented using iteration? As before, whenever trying to solve a problem using iteration, it helps to write down an example using specific values to see if a pattern can be found, followed by a more general solution. For example, when trying to calculate x^n, where x is the number 2 and n is an integer greater than or equal to zero, then the following is a list of possible results:

* 2^0 = 1

* 2^1 = 1 * 2 = 2

* 2^2 = 1 * 2 * 2 = 4

* 2^3 = 1 * 2 * 2 * 2 = 8

* ...

* 2^n = 1 * 2 * 2 * 2 * ... * 2 (n times)

Further, if x is considered to be a positive nonzero integer in this example, then the above can be rewritten more generally as follows:

* x^0 = 1

* x^1 = 1 * x

* x^2 = 1 * x * x

* x^3 = 1 * x * x * x

* ...
+ + * x n = 1* x * x * x *... * x (n times) + +As stated above, when solving a problem, it is helpful to try and see if there is a pattern present. In the above example, it can be seen that 20 and x 0 are defined to be 1, so that might be a good starting point for initialization. Further, note that for any value of n, there appears to be that number of multiplications present. For example, 23 is 2 multiplied by itself 3 times. This might be useful in the test part of the loop where the loop might need to iterate n times. Further, since the loop will iterate a fixed number of times, this would be a good fit for the for loop. Using this information, the loop skeleton from above can be modified to solve the problem. + +First, four variables will need to be declared, the loop control variable i, variables for both x and n, and a variable for the result which could be named answer as shown below: + +int i,x,n,answer; + +The values for x and n would need to be prompted for and input from the user as in the following: + +System.out.print("Enter a value for x: "); + +x = scanner.nextInt(); + +System.out.print("Enter a value for n: "); + +n = scanner.nextInt(); + +Next, if the loop needs to loop n times, then instead of having the relational expression compare the loop control variable i to 3 as was done previously, couldn't it instead be compared to n? The answer is yes, where the loop would not iterate 3 times, but rather n times. Also note that the answer for x 0 is 1. Further, each line in the definition for x n begins with the number 1, so this might be a good initial value for the variable answer. The result is that the following code segment could implement the power function: + +int i,x,n,answer; + +System.out.print("Enter a value for x: "); + +x = scanner.nextInt(); + +System.out.print("Enter a value for n: "); + +n = scanner.nextInt(); + +answer = 1; + +for(i=1; i<=n; i++) + +answer = answer * x; + +System.out.println(x + " raised to the " + n + " power = " + answer); + +Notice that answer is initialized to 1, that the loop iterates n times, and that each time through the loop answer is multiplied by x. Also note that there is only one statement in the body of the for loop so a compound statement is not used. What would happen if 0 or a negative value were entered for the value of n? The result would be that the initial value 1 in the variable i would not be less than or equal to the value 0 in n. Since the for loop is a pretest loop structure, the loop would not iterate, and the initial value 1 in answer would be output. Could this problem have been solved using a count-controlled while loop? Yes, but since the loop needs to iterate a fixed number of times, the for loop is the better choice. As will be seen later, the for loop will be especially useful with arrays in Chap.​ 7. + +## 4.5 Nested Iteration Structures + +As seen in Sect. 4.3, iteration structures can be nested within selection structures, and the reverse can also occur. Further, iteration structures can also be nested within other iteration structures, and when using nested loops, they require some special considerations. To start, consider the following nested while loops: + +int i,j; + +i = 1; + +while(i <= 3) { + +j = 1; + +while(j <= 2) { + +System.out.println("i = " + i + " j = " + j); + +j = j + 1; + +} + +i = i + 1; + +} + +System.out.println("End of Program"); + +First, notice that the loop control variable for the outer loop is the variable i and the loop control variable for the inner loop is the variable j. 
Although it is okay to reuse the same variable when the loops are not nested, if the same variable is used for both loops in a nested loop, it might cause what is known as an infinite loop, as discussed in the next section. Given the above code segment, how many times will the inner println output its message? The answer is six times. If the outer loop iterates 3 times and the inner loop iterates 2 times, then one can multiply the number of times each loop iterates to get the answer, where 3 times 2 is 6. The output of the above code segment can be seen below:

i = 1 j = 1
i = 1 j = 2
i = 2 j = 1
i = 2 j = 2
i = 3 j = 1
i = 3 j = 2
End of Program

Note that the variable j counts to 2 and then starts over again when the value of i changes. In describing this behavior, it is often said that the inner loop control variable varies more rapidly, while the outer loop control variable varies more slowly. Looking at another segment, how many times would the message generated by the inner println be output in the following example?

int n,count;
System.out.print("Enter a value for n: ");
n = scanner.nextInt();
count = 0;
for(int i=1; i<=n; i++)
    for(int j=1; j<=n; j++)
        System.out.println("count = " + count++);
System.out.println("End of Program");

Although one might answer that it depends on the value in n, one can still give an answer in terms of n. Given the previous example, where the number of times the body of the loop was executed was equal to the number of iterations of the outer loop times the number of iterations of the inner loop, the same principle applies here. The outer loop iterates n times and the inner loop iterates n times, so n times n equals n². As a particular example, if the value of n was 6, then the body of the inner loop would execute 36 times.

First, note that the variables i and j are declared in the for statements. Second, notice that there are no compound statements in either for loop in the above code segment. The reason is that the inner for loop has just one statement in the body of its loop, and the inner for loop is itself just one statement in the body of the outer for loop, so braces are unnecessary. Lastly, note the use of count++, which increments the value of count after it has been output.

At present, the need for nested loops is not as great, but later in Chap. 7 nested loops will be important when data needs to be sorted, for example, in ascending order. Nested loops will also be important when dealing with what are known as two-dimensional arrays.

## 4.6 Potential Problems

There are a number of problems that can occur with loops, some of which have already been alluded to earlier in this chapter. For example, if the relation in the test section of a loop is incorrect, the loop might iterate more or fewer times than was originally intended. The best way to check for this is to walk through the code segment using a number small enough to make the walkthrough easy but big enough so that any pattern in the code can be observed. A good number to test with is the number 3, as has been used frequently in this chapter.

Just as it is important to check that the final number is correct, it is also important to ensure that the initial value is correct. For example, switching from the number 1 to the number 0 as the initial value usually requires a change in the relation in the test, as discussed in Sect. 4.2.

Another consideration is to be sure that the loop control variable is initialized in the first place.
If one forgets to initialize it, the value in the loop control variable would be indeterminate, and the loop might iterate an unknown number of times. Probably a more serious problem is when one forgets to include a change in the body of a loop. Even though the loop control variable has been initialized properly and is tested correctly, if there is no change in the loop, one has what is called an infinite loop, meaning the loop never stops. This can make it seem that the computer is "locked up" and not responding, or the program might ask for input or output messages without stopping.

Other concerns arise when incrementing the loop control variable by a value other than 1, such as counting by 2 and testing for only a particular value instead of a range of values, as in the following code segment:

i = 0;
while(i != 3) {
    // body of loop
    i = i + 2;
}
System.out.println("End of Program");

Notice that the value of i starts with the number 0, then is incremented to 2, and then 4, so the value in i is never equal to the number 3. Although it is okay to increment by values other than 1, it is important that the comparison use a range of numbers, such as <=3, and that the loop iterate the expected number of times.

One might have noticed that the loop control variables used so far have always been integers. A variable of type char can also be used, as will be shown in the next section. Although real numbers can be used, the computer cannot always represent real numbers exactly. For example, the number 0.1 cannot be represented exactly on a computer, because it is a repeating fraction in the binary number system (base 2); only an approximation of 0.1 is stored, and small rounding errors accumulate as the values are added. If one wrote a program such as the following and added the value 0.1 ten times, the result would not be equal to 1.0:

double i;
i = 0.0;
while(i < 1.0) {
    // body of loop
    i = i + 0.1;
}
System.out.println("End of Program");

Instead of looping ten times as might be expected, the above program actually iterates eleven times, because after ten additions the accumulated sum is slightly less than 1.0. Again, real numbers can be used, but it is generally not good practice; a common remedy is sketched below.
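A common remedy, shown here as a minimal sketch, is to control the loop with an integer counter and derive the real value from it only when needed, so that the termination test never depends on accumulated floating-point error:

int count;
double value;
for(count = 0; count < 10; count++) {
    value = count * 0.1;  // derive the real value from the exact integer counter
    // body of loop using value
}
System.out.println("End of Program");

Here the loop is guaranteed to iterate exactly ten times, because the test count < 10 uses only integers.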
As said previously, when writing loops, or any code for that matter, it is important to check programs carefully with smaller data sets and to also test the program thoroughly with actual data on the computer to help avoid the possibility of logic errors.

## 4.7 Complete Programs: Implementing Iteration Structures

As in Chap. 3, the first example does not use objects and the second example includes objects.

### 4.7.1 Simple Program

Using iteration structures and selection structures, one can write programs that are more complex and robust. Suppose that a program needs to be developed to find the average and the highest of the test scores in a course. This program will:

  * Allow a user to enter student exam scores, assuming a score is an integer value between 0 and 100
  * Compute the average and find the highest score
  * Display the average and the highest score

Since there will be more than one score that needs to be processed, instead of storing each score in a different variable, a loop will be used to input them. What kind of loop should be used? Because most likely every class has a different number of students, the number of iterations will not be known in advance. The program could ask the user to enter the number of students before the loop and use a while loop or a for loop. On the other hand, since the range of scores is given, a sentinel value can be easily identified in order to use a sentinel-controlled loop. It is not a good idea to use a do-while loop, because there may be no scores to be processed. Using a sentinel of −1, a pretest indefinite sentinel-controlled loop structure will be used here. When no score is entered, there is no reason to compute an average, find the highest score, or display them. Therefore, in that case the message "No scores were entered." will be output. Finding the average of numbers using a loop was discussed in Sect. 4.2, but what about finding the highest score? Since all of the scores are not saved, the highest value cannot be determined after the loop has terminated by looking at all the data at once. Then, how can the highest score be found as the scores are input? The answer is to keep track of the highest score among the scores entered so far. Assuming all the variables are declared appropriately, the following code finds the highest value entered:

// priming read
System.out.print("Enter a score or -1 to stop: ");
score = scanner.nextInt();
highestScore = score;
// loop to enter scores
while(score != -1) {
    if(highestScore < score)
        highestScore = score;
    System.out.print("Enter a score or -1 to stop: ");
    score = scanner.nextInt();
}

Notice that the first score input is used to initialize the variable highestScore, which keeps the highest value up to that point. If the first score is not −1, then in the loop the score is checked against the highest score. At this point, only one test score has been entered; therefore, the values of score and highestScore are the same, meaning the condition of the if statement is false. If the second value entered is not equal to −1, the body of the loop will be executed again. The second input is compared with the value of highestScore, which at this point holds the first value input. If the condition is false, it means the first value input is greater than the second. If the condition is true, it means the most recent value input is greater than the highest one so far, so highestScore needs to be updated. This process is repeated until the user enters the sentinel value of −1. At the end, the value of highestScore is the largest of all the scores input. The complete program is shown below:
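Since the original listing does not appear here, the following is a minimal reconstruction based on the surrounding description; the class name Scores is an assumption, and the loop is placed in the else section of the if-then-else as described next:

import java.util.Scanner;

public class Scores {
    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        int score, count, highestScore;
        double sum;
        count = 0;
        sum = 0.0;
        // priming read
        System.out.print("Enter a score or -1 to stop: ");
        score = scanner.nextInt();
        highestScore = score;
        if(score == -1)
            System.out.println("No scores were entered.");
        else {
            // loop to enter scores
            while(score != -1) {
                count = count + 1;
                sum = sum + score;
                if(highestScore < score)
                    highestScore = score;
                System.out.print("Enter a score or -1 to stop: ");
                score = scanner.nextInt();
            }
            System.out.printf("Average score is %.2f.%n", sum / count);
            System.out.println("The high score is " + highestScore + ".");
        }
    }
}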
First, notice the prompt and input prior to the loop, which is the priming read. It is necessary to determine whether to enter the loop or not by checking the first input value against the sentinel. The prompt and input in the loop determine whether the loop should continue to iterate. As was discussed in Sect. 4.6, it is important to make sure that the loop will eventually terminate to avoid an infinite loop; in this program a sentinel value of −1 will stop the loop. If there are no scores and the user enters −1 at the very beginning, the program will not execute the body of the loop in the else section of the if-then-else, thus ensuring that division by 0 will not occur in the calculation of the average. With the input value of −1 the output is as follows:

Enter a score or -1 to stop: -1
No scores were entered.

With values other than −1, the variable count is incremented by 1 inside the loop body to keep track of the number of scores and is used to find the average. Notice that sum, which holds the total of all the scores, is declared as type double. Although score is of type int, by declaring sum as type double, the result of the calculation sum/count to find the average will be of type double, since it is a double divided by an int. An example of the output with three scores is shown below:

Enter a score or -1 to stop: 88
Enter a score or -1 to stop: 97
Enter a score or -1 to stop: 65
Enter a score or -1 to stop: -1
Average score is 83.33.
The high score is 97.

### 4.7.2 Program with Objects

Next consider an example that involves objects. An object that keeps a distribution of scores for a particular exam is useful for figuring out how many students made a grade of A, B, C, D, or F. The Grades class defines data members, a constructor, and three methods: enterGrade, getNumStudents, and getPercent. The definition of the Grades class is shown below, and the actual implementation of the three methods is discussed shortly:

Since the cutoff for the grade of A is 90, scores between 90 and 100 will receive a grade of A. Scores between 80 and 89 will result in a grade of B because the cutoff for the grade of B is 80, and so on. If a score is outside the range of 0–100, it is simply ignored in the enterGrade method. For example, what happens if the score is 95? Since it is a valid input inside the range of 0–100, the count is incremented by 1 to keep track of the number of scores entered. Then, the counter for the A group is incremented by 1. The enterGrade method shown in Fig. 4.5 is used to distribute the scores entered by the instructor into the correct grade group.

Fig. 4.5

Implementation of enterGrade method

The getNumStudents method in Fig. 4.6 returns the number of scores assigned to a particular grade and is implemented using a switch statement. It takes a grade (A, B, etc.) in a variable of type char as a parameter and returns a value of type int.

Fig. 4.6

Implementation of getNumStudents method

The getPercent method in Fig. 4.7 finds the percentage of scores assigned to a designated grade level and is also implemented using a switch statement. It takes a char value and returns a value of type double. Notice that the value 100.0 of type double is multiplied by the number of scores for the particular grade, which is a value of type int, to make the result of type double. The result is divided by the value of type int stored in count, which results in a percentage of type double. If an invalid character is passed as a parameter, the value −1, which represents an invalid value, is returned. A sketch of these three methods follows.

Fig. 4.7

Implementation of getPercent method
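Since the figures themselves are not reproduced here, the following is a minimal sketch of how the three methods might be implemented from the descriptions above; the counter names numA through numF, and the cutoffs of 70 for C and 60 for D, are assumptions:

class Grades {
    private int count, numA, numB, numC, numD, numF;

    // distribute a valid score into the correct grade group (cf. Fig. 4.5)
    public void enterGrade(int score) {
        if(score >= 0 && score <= 100) {
            count++;
            if(score >= 90)
                numA++;
            else if(score >= 80)
                numB++;
            else if(score >= 70)   // assumed cutoff for C
                numC++;
            else if(score >= 60)   // assumed cutoff for D
                numD++;
            else
                numF++;
        }
    }

    // return the number of scores for a grade (cf. Fig. 4.6)
    public int getNumStudents(char grade) {
        switch(grade) {
            case 'A': return numA;
            case 'B': return numB;
            case 'C': return numC;
            case 'D': return numD;
            case 'F': return numF;
            default:  return -1;  // invalid grade
        }
    }

    // return the percentage of scores for a grade (cf. Fig. 4.7)
    public double getPercent(char grade) {
        switch(grade) {
            case 'A': return 100.0 * numA / count;
            case 'B': return 100.0 * numB / count;
            case 'C': return 100.0 * numC / count;
            case 'D': return 100.0 * numD / count;
            case 'F': return 100.0 * numF / count;
            default:  return -1;  // invalid grade
        }
    }
}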
Like the previous Scores program, the client program using a Grades object outputs the message "No scores were entered." if there were no scores, as shown below:

Enter a score or -1 to stop: -1
No scores were entered.

An example of the output with eight scores is shown below:

The client program will create an object of the Grades class named class1, and each score is processed as it is entered. The exam scores are input using a while loop since the number of scores is indefinite. The result is output using a for loop because the number of lines is known. The table displays the distribution and percent for each grade. The complete client program is shown below:

The first line of the table contains the column titles, which are printed prior to the for loop. The second through fifth lines output the grade, distribution, and percent for the grades A, B, C, and D using a for loop. Notice that the char variable letter is used as the loop control variable in the for loop. It is initialized to 'A' at the beginning of the for loop, and when it is incremented by one, the value of letter is updated to the next character in alphabetical order, such as A to B, and B to C. Because there is a gap between D and F, the information for the grade of F needs to be printed outside the for loop at the end. The conversion characters c, d, and f (as in %c, %d, and %f) are used in the control string of the first printf statement to output the variables of type char, int, and double, respectively, in order to format the table as described in Chap. 1.

## 4.8 Summary

  * The while loop and the do-while loop are known as indefinite iteration loop structures.
  * The for loop is known as a definite or fixed iteration loop structure.
  * The do-while loop is a posttest loop structure and can iterate one to many times.
  * The while loop and the for loop are pretest loops which can iterate zero to many times.
  * The do-while loop must always use a compound statement in the body of the loop, whether there is one statement or there are many statements.
  * The bodies of the for and while loops only need to use a compound statement when there is more than one statement in the body of the loop. If there is only one statement, the compound statement is unnecessary.
  * When nesting loops, be sure to use a different loop control variable for each loop.

## 4.9 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Identify the syntax errors in the following code segment:

int sum, i;
sum = 0;
i = 0;
while(i >= 0); {
    sum = sum + i;
    i = i + 2;
}

*2. Identify the syntax errors in the following code segment:

int product;
product = 1;
for(i=1, i <= n, i++)
    product = product * i;

*3. Determine the output from the following code segment:

4. Determine the output from the following code segment:

5. Determine the output from the following code segment:

*6. Determine the output from the following code segment:

int i, j;
for(i=1; i<=5; i++) {
    for(j=1; j<=5-i; j++)
        System.out.print(" ");
    for(j=1; j<=2*i; j++)
        System.out.print("*");
    System.out.println();
}

7. Rewrite the following for loop as a

A. while loop

*B. do-while loop

int total, count;
total = 0;
for(count = 1; count <= 40; count+=3) {
    total += count;
}

8. Assuming n is input, rewrite the following while loop as a(n)

*A. for loop

B. if statement and a do-while loop

int total, count, n;
total = 0;
count = 0;
n = 5;
while(count < n) {
    total += count;
    count++;
}

9. A store is having a sale and items are either 30, 50, or 70% off. Assuming all the items priced between $5.00 and $50.00 are on sale, output the following table using nested loops. Using correct formatting, make sure that the output is exactly as shown below:

Original Price | 30% off | 50% off | 70% off
---|---|---|---
$ 5.00 | $ 3.50 | $ 2.50 | $ 1.50
$10.00 | $ 7.00 | $ 5.00 | $ 3.00
$15.00 | $10.50 | $ 7.50 | $ 4.50
$20.00 | $14.00 | $10.00 | $ 6.00
$25.00 | $17.50 | $12.50 | $ 7.50
$30.00 | $21.00 | $15.00 | $ 9.00
$35.00 | $24.50 | $17.50 | $10.50
$40.00 | $28.00 | $20.00 | $12.00
$45.00 | $31.50 | $22.50 | $13.50
$50.00 | $35.00 | $25.00 | $15.00

10. Repeat Exercise 15 in Chap. 3 to allow the user to enter temperatures for any number of cities using the best iteration structure.
11. The Fibonacci sequence is the series of numbers in which each number is found by adding up the two numbers before it, as shown below:

  * 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, ...

Write a complete program to compute the Fibonacci number for an integer.

12. Given two numbers, the largest integer that divides both of them is known as the greatest common divisor. For example, the positive divisors of 36 are 1, 2, 3, 4, 6, 9, 12, 18, and 36, and the positive divisors of 8 are 1, 2, 4, and 8. Thus, the common divisors of 36 and 8 are 1, 2, and 4. It follows that the greatest common divisor of 36 and 8 is 4. Write a complete program to compute the greatest common divisor of two integers.

© Springer-Verlag London 2014

# 5. Objects: Revisited

James T. Streib and Takako Soma

Department of Computer Science, Illinois College, Jacksonville, IL, USA

Abstract

Objects are revisited in this chapter. Sending an object to and returning an object from a method is illustrated using contours. Overloaded constructors and methods are discussed, and the reserved word this is introduced. Local, instance, and class constants and variables, along with class methods, are shown using contour diagrams. Two complete programs, one with a focus on overloaded methods and another with class data members and methods, are included.

Having learned in the previous two chapters about selection and iteration structures, both of which allow for more complex programs, it is time to return to the topic of objects that was introduced in Chap. 2. Objects allow programs to be created in a more modular way that makes complex programs easier to understand. In this chapter, topics such as passing objects to and from a method, constructor and method overloading, class data members and methods, and the use of the reserved word this will be discussed. At first, this chapter will use only simple objects to illustrate these concepts so that the details can more readily be understood, and then more complex examples will be included in the complete programs at the end of the chapter.

## 5.1 Sending an Object to a Method

So far, all that has been discussed is how primitive data types can be sent to a method. However, data is often more complex than just a simple data type, so it would be helpful to have a way to send not just an item or two but rather an entire object to a method. For example, consider a method to determine the length of a line segment. It would need to be sent the two endpoints of the line, each consisting of x and y coordinates, which would require four arguments to be sent to the method. Since each point has two coordinates, this lends itself to the creation of a simple class. Although in Java there is a Point class in the java.awt package, a point is a simple enough concept to help explain the sending of an object to a method that this text will define its own class for a point. Whereas the Java class Point uses integers, the class defined here will use double precision numbers and will be called PointD. Consider the preliminary definition of the class in Fig. 5.1.

Fig. 5.1

Preliminary definition of PointD class

The PointD class definition is fairly simple, with the usual get and set methods, along the lines of the sketch below.
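Since Fig. 5.1 itself is not reproduced here, the following is a minimal sketch of what the preliminary PointD class might look like, based on the get and set methods used in the rest of the chapter:

class PointD {
    private double x, y;

    public PointD() {
        x = 0.0;  // default coordinates at the origin
        y = 0.0;
    }

    public double getX() {
        return x;
    }

    public double getY() {
        return y;
    }

    public void setX(double xp) {
        x = xp;
    }

    public void setY(double yp) {
        y = yp;
    }
}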
However, what will make it more interesting is the introduction of a method which allows an invocation to send an object of type PointD. For this example, assume the existence of a method called distance which will calculate the distance between two points. Since the method will be defined within the PointD class, it can be invoked by an object of type PointD and also take an argument of type PointD. Assuming the existence of two points p1 and p2 of type PointD, the method could be invoked as dist = p1.distance(p2);. What would such a method look like? Recall from algebra that the distance formula for two points (x₀, y₀) and (x₁, y₁) is

d = √((x₁ − x₀)² + (y₁ − y₀)²)

Then the code for the method could be as follows:

public double distance(PointD p) {
    double dist;
    dist = Math.sqrt(Math.pow(x-p.getX(),2)
        + Math.pow(y-p.getY(),2));
    return dist;
}

First, notice that the method returns a value of type double. Second, note that the parameter is not of type double but rather of type PointD. Lastly, although the local variable dist is not strictly required, declaring it makes the subsequent contour diagram easier to follow when illustrating how objects are passed. Using all the information above combined into a complete program, it could appear as shown below:

Utilizing contour diagrams, the passing of objects can easily be illustrated. Note that some steps will be skipped since many of them were discussed thoroughly in Chap. 2. The state of execution prior to Line 11 in the main program would be as shown in Fig. 5.2.

Fig. 5.2

State of execution prior to Line 11

Since the method distance is invoked from p1, the contour for the method appears in the contour referenced by p1 as shown in Fig. 5.3, indicating the state of execution just prior to Line 40 in the distance method.

Fig. 5.3

State of execution prior to Line 40

In addition to the local variable dist, the method also contains a memory location for the parameter p. Note that when passing an object to a method via a parameter, the parameter does not contain the entire object. Rather, since the argument p2 has a reference to an object, the parameter p contains a copy of the reference to that same object. Although a straight arrow could have been drawn directly to the object, it would have covered up some of the information within the contour, so in this example, it is drawn around the contour diagram for the sake of neatness. However, in the future the arrows may be drawn over parts of contours in order to save space. Note that both the argument p2 and the parameter p are pointing to the same contour. When the calculation for dist is performed, the references to x and y are to the ones globally accessible within the object pointed to by p1, whereas the getX and getY methods access the variables in the object referenced by p.

## 5.2 Returning an Object from a Method

If an object can be passed to a method, can an object be returned from a method? The answer is yes, as will be demonstrated in the example that follows. Whereas the previous example returned dist of type double, this example will determine the midpoint of a line segment. The equations to determine the midpoint are as follows:

x_mid = (x₀ + x₁) / 2
y_mid = (y₀ + y₁) / 2

Since the midpoint consists of x and y coordinates, this lends itself to the creation of a method to return an object of type PointD.
The method midPoint below implements the equations above: + +public PointD midPoint(PointD p) { + +PointD mid; + +mid = new PointD(); + +mid.setX( (x+p.getX()) / 2 ); + +mid.setY( (y+p.getY()) / 2 ); + +return mid; + +} + +Notice that in addition to the parameter, the return type is also of type PointD. The method also creates an instance of type PointD and assigns the reference to the variable mid which is also declared of type PointD. The method then calculates the midpoint and sets the x and y coordinates in mid prior to the return of the object to the invoking program. + +This method can be added to class PointD, and in Fig. 5.4, it replaces the previous method distance in order to save space. + +Fig. 5.4 + +Complete program returning an object from a method + +Prior to the execution of Line 11, the contour diagram would look similar to Fig. 5.2 in the previous example, except the variable dist of type double would be replaced with the variable middle of type PointD. After invoking the midPoint method, the contour diagrams would appear similar to the ones shown in Fig. 5.3 in the previous section, except that in addition to the variable middle appearing in the main program, the distance contour would be replaced with the midPoint contour and the variable dist in the contour would be replaced with the variable mid of type PointD which would be indeterminate. However, once the body of the method midPoint is executed, that is when the significant differences can be seen when a new object is created in Line 39. Figure 5.5 illustrates this by showing the state of execution prior to the return statement in Line 42. + +Fig. 5.5 + +Contour just prior to the execution of the return statement in Line 42 + +Notice that in addition to the contour referenced by the parameter p, there is another contour referenced by the local variable mid that contains the coordinates of the midpoint. As with the passing of a reference to an object via a parameter, the entire contour will not be returned to the main program, but rather only the reference to the contour will be returned as illustrated in Fig. 5.6 which shows the state of execution prior to Line 12. + +Fig. 5.6 + +Contour after returning to the main program prior to Line 12 + +Notice that the contour for the method midPoint no longer exists after returning to the main program. However, the value in mid was returned back to the invoking statement on Line 11 and assigned to the variable middle, which now contains the reference to the object containing the midpoint values. When the output statements refer to the getX and getY methods of the appropriate objects, the correct values will be output. + +## 5.3 Overloaded Constructors and Methods + +The constructor in the previous example initializes the variables x and y to 0.0 as a default value. In addition, a constructor could have been created to initialize the instance variables to the values wanted by a programmer as shown in the following: + +public PointD(double xp, double yp) { + +x = xp; + +y = yp; + +} + +A programmer could then initialize x and y via the constructor when the object was created as shown below: + +p1 = new PointD(4.0,4.0); + +The advantage of this method is that a programmer does not need to invoke the setX and setY methods to initialize the variables in the object. Does this mean that the set methods could be deleted from the class definitions? If the values in the variables did not need to change, then yes the set methods could be deleted. 
However, what if after initializing the variables, their values needed to be changed later in the program? Then of course the set methods would need to be retained in the class definition. + +Given the previous constructor and the new constructor above, which of the two is better and which one should be included in the class definition? The answer depends on what needs to be done. For example, if the values are going to be changed often, then the first constructor and the set methods are the best choice, but if the values are going to be set just once, then the second constructor is probably the better choice. + +However, when the class is written, it might not be known which type of constructor would be the best one to include. Wouldn't it be nice to include both constructors and allow the programmer a choice? But further, could this cause a syntax error by having two constructors with the same name? The answer to the first question is yes and the answer to the second question is no. The reason why this would not cause an error is because even though the name of the constructor is the same, the number of parameters is different because the first constructor does not have any parameters and the second one has two parameters. This is known as overloading. In other words, even though constructors have the same name, they can differ by the number of parameters, the types of the parameters, or the order of the different types of parameters. When used carefully, overloading can be a very useful technique. + +Using the knowledge gained from Sect. 5.1, it is also possible to pass an object to a constructor. For example, if an object was already created and a copy of that object was needed, then that object could be passed via a parameter to another constructor to create the copy. Such a constructor would look as shown below: + +public PointD(PointD p) { + +x = p.getX(); + +y = p.getY(); + +} + +Notice that instead of two parameters of type double, there is now only one parameter of type PointD. In the body of the constructor, the coordinates are retrieved from the object sent using the getX and getY methods and placed into the x and y variables of the current object. The result is that if one wanted to create two objects with the same set of coordinates, instead of writing the following code: + +p1 = new PointD(1.0,1.0); + +p2 = new PointD(1.0,1.0); + +one would merely need to write the following: + +p1 = new PointD(1.0,1.0); + +p2 = new PointD(p1); + +Given the two new constructors, the original PointD class could be rewritten as follows: + +Using this new class, a programmer could create three different instances of the PointD class as follows: + +PointD p1, p2, p3; + +p1 = new PointD(); + +p2 = new PointD(1.0,1.0); + +p3 = new PointD(p2); + +Notice that the objects are being created using three different constructors. The only difference is the number of arguments. Further, since the first constructor ensures that coordinates referenced by p1 will be initialized to 0.0, the second constructor initializes the variables referenced by p2 via the arguments, and the third constructor makes a copy of the previous object which will be referenced by p3, the set methods do not need to be called. However, if the values in the points need to be changed later, the set methods are still there if necessary. + +If a constructor is not included in a class by the programmer, the system will generate a default constructor. 
Should the programmer include a constructor without any parameters, then this constructor overrides the default constructor generated by the system. Although a bit confusing, this constructor provided by the programmer is also sometimes called a default constructor since it overrides the system default constructor. However, if one writes the two new constructors above and a default constructor is not included by the programmer, then the system will not generate a default constructor. In such a case, were one to code a p1 = new PointD(); statement, a syntax error would occur. The result is that if one wants to override the system default constructor, it is a good idea to override it with a programmer-defined default constructor to avoid a possible syntax error. Even if overloading is not being used in the class, it is generally best for a programmer to include a default constructor and not rely on the system default constructor.

Just as constructors can be overloaded, so can methods. As with constructors, the name of the method can be the same, but the number of parameters, the types of the parameters, or the order of the different types of parameters must be different. For example, take the distance method from Sect. 5.1, which requires one parameter, as shown again below:

public double distance(PointD p) {
    double dist;
    dist = Math.sqrt(Math.pow(x-p.getX(),2)
        + Math.pow(y-p.getY(),2));
    return dist;
}

What if another method was needed to determine the distance of a point from the origin? Certainly one could invoke the method above by having one of the two points be the origin, using the new constructors introduced in this section as follows:

PointD p1, p2;
p1 = new PointD();
p2 = new PointD(3.0,4.0);
dist = p2.distance(p1);

In this example, the default constructor initializes the coordinates of p1 to 0.0, and the second constructor initializes the coordinates of p2 to 3.0 and 4.0. But the assumption could be that the distance will be calculated from the origin, and it would be convenient not to need a parameter in the distance method. Such a method would look as follows:

public double distance() {
    double dist;
    dist = Math.sqrt(Math.pow(x,2) + Math.pow(y,2));
    return dist;
}

Instead of invoking the previous method with the dist = p2.distance(p1); statement, it could be invoked using the new method as follows:

dist = p2.distance();

Again, the name of the method is the same, but the number of parameters is different. As mentioned earlier, it is also possible to have the same number of parameters but different types of parameters or a different order of the different types of parameters.

For example, assume a method of the Student class was to be sent two parameters: one for the number of credit hours and another to indicate whether the student has graduated. In the main program below, notice that in one case an integer is in the first argument position and in the second case a Boolean value is in the first argument position. Would this cause a problem?

If there were only one method named setInformation, the answer would be yes. However, notice that the setInformation method is overloaded, as in the sketch below. The parameters are reversed in the second method so that the order of the arguments in the calling program does not matter. Thus, if a programmer accidentally puts the arguments in the wrong order, there is no error. As stated previously, overloading can sometimes be helpful if used carefully and not excessively.
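Since the original listing is not reproduced here, the following is a minimal sketch of the overloaded pair and a fragment of the calling code; the field names hours and graduated are assumptions:

class Student {
    private int hours;
    private boolean graduated;

    // version 1: credit hours first, graduation status second
    public void setInformation(int creditHours, boolean hasGraduated) {
        hours = creditHours;
        graduated = hasGraduated;
    }

    // version 2: same name, parameters reversed
    public void setInformation(boolean hasGraduated, int creditHours) {
        hours = creditHours;
        graduated = hasGraduated;
    }
}

In the main program, both of the following calls compile and have the same effect, because the compiler selects the version whose parameter types match the argument order:

Student student = new Student();
student.setInformation(120, true);
student.setInformation(true, 120);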
+ +## 5.4 Use of the Reserved Word this + +In looking at portion of the original PointD class from Fig. 5.1 shown below, the parameter names in the constructor and in the two set methods are listed as xp and yp. + +class PointD { + +private double x, y; + +public PointD(double xp, double yp) { + +x = xp; + +y = yp; + +} + +public void setX(double xp) { + +x = xp; + +} + +public void setY(double yp) { + +y = yp; + +} + +} + +What would happen if the names of the variables xp and yp were changed to x and y, respectively? What would x and y refer to, the data members or the parameters? + +/** Caution: Incorrectly Implemented code **/ + +class PointD { + +private double x, y; + +public PointD(double x, double y) { + +x = x; + +y = y; + +} + +public void setX(double x) { + +x = x; + +} + +public void setY(double y) { + +y = y; + +} + +} + +The answer to the second question is that the parameters and local variables declared in a method take precedence over any globally declared variables in the object. The answer to the first question is that the contents of the parameters x and y would merely be assigned back into the memory locations associated with the parameter. The result is that the private data members would not contain the new values sent from the invoking program, and this is probably not what was intended. + +Is it possible to use the same variable names for both the parameters and the instance data members? The answer is yes. In any particular instance, the reserved word this can be used to refer to the instance. Java uses this as a self-referencing pointer to refer to the current object. Using the reserved word this, the previous class can be rewritten as shown below: + +class PointD { + +private double x, y; + +public PointD(double x, double y) { + +this.x = x; + +this.y = y; + +} + +public void setX(double x) { + +this.x = x; + +} + +public void setY(double y) { + +this.y = y; + +} + +} + +So, for example, consider the shortened skeleton of the program presented at the beginning of this chapter that uses only the setX and getX methods shown below: + +In the setX method, x refers to the parameter, and the value in x is assigned to this.x which is the data member x in the object. In a sense, this is a pointer to the current object as illustrated in the contour in Fig. 5.7 showing the state of execution just prior to Line 17. + +Fig. 5.7 + +State of execution prior to Line 17 + +Notice the arrow pointing back to the object PointD. It illustrates the word this and shows how the data member x is referenced. Although the example in Fig. 5.7 includes the cell for this and a self-referencing arrow, it tends to clutter up the contour diagrams, so in general it will not be included because its existence is understood. Notice that the constructor and the getX do not use the reserved word this on Lines 12, 13, and 19. In this case the word this is not necessary. Although one could still include the word this, it can be distracting to use it when it is not needed. As a result, this text will not use the word this unless it is necessary. + +The reserved word this can also be used in situations beyond just referring to variables. It can refer to constructors and methods as well. For example, consider the three constructors presented in the previous section and relisted below using the reserved word this in the second constructor: + +In one sense, the first constructor is just a special case of the second constructor, so it could be defined in terms of the second constructor. 
In other words, it could invoke the second constructor with the values 0.0 for the x and y coordinates. But how could it invoke the second constructor? Again, since it is the current object that needs to be referenced, the reserved word this could be used as shown below: + +public PointD() { + +this(0.0,0.0); + +} + +Even the third constructor could be written to invoke the second constructor as: + +public PointD(PointD p) { + +this(p.getX(),p.getY()); + +} + +Since an object of type PointD is being passed to the constructor, the methods getX and getY can be invoked to retrieve the values in x and y, which in turn can be sent as arguments to the second constructor. In order to invoke the second constructor, it is referred to using this. + +The advantage of the above technique is that if later a change needs to be made to the constructors, it might not need to be made to all three constructors, but possibly only one of them. This reduces the possibility of introducing unintended errors into the program, and the result of the modifications introduced in this section can be seen below: + +As with variables and constructors, it is possible to use the word this when referring to methods in the same object. For example, suppose that a method needed to access another method such as the previous distance method within the same class. It could be invoked as this.distance(), but although the method can be invoked using the reserved word this, there is no need to do so. As a result, the use of the word this prior to the invoking of a method should be avoided. + +## 5.5 Class Constants, Variables, and Methods + +This section will discuss how constants, variables, and methods can be declared not only within a method and in each instance of a class but also how they can be declared in the class itself. First, it looks at constants, then variables, and lastly methods. + +### 5.5.1 Local, Instance, and Class Constants + +If a constant needs to be used only within a single method, then it can be declared within that method. However, if several methods in the same class use the same constant, it could be declared within each method but that will take up more memory. If that constant needs to be changed, then it will need to be changed in more than one location. Although there already exists the Math.PI constant discussed in Sect.​ 1.​7, consider for example, the following program which includes the user-defined constant PI: + +In addition to the existence of the local variables c and a to help with understanding the contour diagrams, notice that both methods have their own locally declared constant PI at Lines 24 and 30. When each method is executed, its own copy of the constant is allocated. The contour diagram in Fig. 5.8 illustrates that each method has its own copy and shows the state of execution prior to Line 33. + +Fig. 5.8 + +State of execution prior to Line 33 + +Even though one contour is deallocated (indicated by the shaded contour) before the next one is invoked, it still had to allocate the constant. While this is only a minor problem now, any local constants can take up much more space in a recursive algorithm as will be discussed in Chap. . Since there is a potential for wasted memory, it would be better if the constant were not associated with each method, but rather with the object as illustrated in the following section showing the Circle class: + +Only the class is shown here because the main program has not changed. 
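Since the listing itself is not reproduced here, the following is a minimal sketch of such a Circle class; the value 3.14159 for PI and the method bodies are reconstructed from the surrounding discussion, and the line numbers referenced in the text belong to the original listing:

class Circle {
    private final double PI = 3.14159;  // instance constant, one copy per object
    private double r;

    public Circle() {
        r = 0.0;
    }

    public void setRadius(double radius) {
        r = radius;
    }

    public double circumference() {
        double c;            // local variable kept to match the contour diagrams
        c = 2 * PI * r;
        return c;
    }

    public double area() {
        double a;            // local variable kept to match the contour diagrams
        a = PI * r * r;
        return a;
    }
}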
Again, the local variables in the method remain to help with the contour diagrams, but notice that the declaration of the constant is no longer within each method, but rather in the class at Line 16. An immediate and obvious advantage is that should the constant need to change, it needs only to be changed in one location. The contour diagram representing the state of execution prior to Line 32 is shown in Fig. 5.9.

Fig. 5.9

State of execution prior to Line 32

Note that the constant PI no longer appears in each of the methods, but rather is located in an instance of the Circle class. The advantage of declaring the constant in the class as opposed to each individual method is that the constant only needs to be allocated once per object.

However, what if more than one object was declared? Then there would be one constant allocated within each of the objects. Consider the following modification to the main program that declares and allocates two objects:

double radius1, radius2;  // Line 3
Circle c1, c2;            // Line 4
c1 = new Circle();        // Line 5
c2 = new Circle();        // Line 6
radius1 = 3.0;            // Line 7
radius2 = 4.0;            // Line 8
c1.setRadius(radius1);    // Line 9
c2.setRadius(radius2);    // Line 10

Using the same Circle class as before, without invoking any of the methods except for the constructor, note the state of execution just prior to Line 9 in the main program in Fig. 5.10.

Fig. 5.10

State of execution prior to Line 9

Notice that the constant PI appears in both instances of the Circle class. Just as when the constant was moved out of the individual methods, wouldn't it be nice if the constant could be moved so that it would be accessible by both objects? This can be accomplished by using what is known as a class constant. Showing the new complete program below, a class constant is created by using the reserved word static as shown in Line 24 below:

Executing the first few lines of the program as done previously, the contour diagram in Fig. 5.11 shows the state of execution just prior to Line 9. Notice that each of the instances no longer has a local constant PI. As mentioned previously in Sect. 2.7, just as the main program has a contour around it, as shown in Fig. 5.11, so does the class Circle. Using the word static creates the class constant PI that does not get allocated each time a new instance of the class Circle is created. When there is a reference to the constant PI, it is not found in the instance, but rather in the class. As can be seen, this saves memory, especially when many objects are created.

In contour diagrams, how can one distinguish the contour for the class itself from the contours associated with the instances of the class? One way is to note that variables of type Circle point to the instances of the Circle class. However, another way to help the reader is to allow the contour associated with the class itself to have the name of the class (in this case Circle) and then use a superscript for each instance of the class to indicate the order in which the objects were created, as shown in Fig. 5.11. When necessary to help make this distinction clear, this text will use superscripts.

Fig. 5.11

State of execution prior to Line 9

Just as this text has previously not drawn the contour around the main program in the interest of saving space, it would also help to save space to not draw the contour around all the instances of each object. As can be seen in Fig. 5.11, it could get rather cumbersome to draw such large contours.
However, on occasion it is still helpful to draw a contour to represent the class, so instead of drawing it around all the instances, it is sometimes convenient to draw it separately, with the understanding that all the instances are within that contour. This second alternative is shown in Fig. 5.12. + +Fig. 5.12 + +Alternative contour diagram illustrating class constants + +Figure 5.11 is the ideal drawing and it will be used as necessary. However, generally and if needed, the contour for the class using a class constant will be drawn as shown in Fig. 5.12, with the understanding that all instances will be within that contour. + +### 5.5.2 Local, Instance, and Class Variables + +Local and instance variables are similar to local and instance constants. In fact, the variables c and a representing the circumference and area in the previous section are local variables in the methods, and the variable r representing the radius in a Circle object is an instance variable. In trying to decide where a variable needs to be declared, it helps to ask which methods need access to the variable. For example, the variables c and a were used only by the circumference and area methods, so it made sense to declare them there. However, the variable r is used by both methods; hence, it makes sense to declare it once within the object instead of in both methods. + +Although using the two local variables wasted a little memory, it made understanding the contours easier, and in this case it is not much of a problem. In fact, these variables are not even needed, because the expression to calculate each value could have been included in the return statement, as shown below: + +public double circumference () { + +return 2 * PI * r; + +} + +public double area() { + +return PI * r * r; + +} + +It is sometimes helpful to write the initial version of the code using extra memory to help understand how it works and help debug any logic errors, and then later the extra memory locations can be removed to make the code more efficient. This technique will become even more helpful when learning about recursion in Chap. . + +As with the constants in the previous section, just as some variables are better placed in the object as instance variables instead of as local variables in the methods, there are cases where some variables should be declared as class variables instead of as instance variables. For example, what if one wanted to count each time a new object was created? Although this could be done in the main program, what if an object other than the main program was also creating the objects to be counted? In this case, the main program could not count them, nor could an instance variable be used, because each instance could not count how many other objects of its own type were created. As one might suspect, this would be a good candidate for a class variable. + +A class variable is declared similarly to a class constant except the reserved word final is not used as shown in Line 15 of the following program which simulates a program that creates objects for charge cards that contain an account number: + +Although it would be nice to create an indefinite number of objects, that would be difficult to illustrate using contours and would also be difficult to implement without the use of arrays which will be introduced in Chap. . Instead, this program creates only three ChargeCard objects to help illustrate the class variable cardCount. Notice that their class variable is initialized by the compiler to 0 in Line 15. 
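Since the listing is not reproduced here, the following is a minimal sketch consistent with the description; the accountNumber field and the constructor parameter are assumptions, and the line numbers referenced in the text belong to the original listing:

class ChargeCard {
    public static int cardCount;   // class variable, shared by all instances; defaults to 0
    private int accountNumber;     // instance variable, one per object

    public ChargeCard(int number) {
        accountNumber = number;
        cardCount = cardCount + 1; // count each card as it is created
    }

    public int getAccountNumber() {
        return accountNumber;
    }
}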
Then each time a new instance of the class is created, the class variable cardCount is incremented in the constructor. The contour in Fig. 5.13 illustrates the state of execution just prior to Line 10 in the main program. + +Fig. 5.13 + +State of execution prior to Line 10 in main + +As can be seen, the class variable is shown in the ChargeCard contour which is accessible by all of the instances of that class, as discussed in the previous section. Also note that instead of using a variable such as card1 to gain access to a class variable, the name of the class ChargeCard in Line 11 is used instead. Further, the reader might have noticed that whereas the class constant in the previous section was declared as private, the class variable cardCount is declared as public. In one sense this might seem convenient, because the class variable is accessible in the main program in Line 11. However, as mentioned in Chap. and as will be discussed in the next section, it is usually better to declare variables as private and access them using a public method. + +### 5.5.3 Class Methods + +Although declaring a class variable as public allowing it to be accessed from the main program works, it is not necessarily the best way to access class variables. Just as it is not a good idea to declare instance variables as public, the same applies to class variables. As before, it is better to declare class variables as private and then access them via a public class method. This is accomplished by declaring a method using the reserved word static as shown in the following modified program: + +First, notice that the method getCardCount has been added at Line 27. The use of the reserved word static makes it a class method instead of an instance method. Also note that the method is declared as public and the class variable cardCount at Line 15 is now declared as private. Next, notice in Line 11 that instead of accessing the class variable, the class method getCardCount is invoked to return the value of cardCount. As before, the class method is invoked using the class name ChargeCard. + +What is interesting to see is that when the main program invokes the class method getCardCount, the contour is not in one of the objects, but rather in the contour for the class ChargeCard as illustrated in Fig. 5.14 which shows the state of execution prior to Line 28 in the class method getCardCount. When Line 28 in the class method getCardCount is executed, it has access to the private class variable cardCount and will return the value 3 back to Line 11 in the main program. + +Fig. 5.14 + +State of execution prior to Line 28 in the getCardCount method + +Given the above, one needs to plan carefully where various constants, variables, and methods are declared. As a general rule, it makes sense to declare constants as class constants since they cannot be modified, they are accessible to all methods in the objects within the class, and they save memory. As another rule of thumb, it is generally a good idea to declare all variables as locally as possible. This helps organize a program and makes it easier to understand and maintain. However, if a method or object needs to communicate information with other methods or objects, then declaring the variables as instance or class variables makes sense. Although it might seem easy and be tempting to declare all variables as instance and class variables, this can make a program difficult to maintain and debug in the future. 
Likewise with methods, they should usually be declared as instance methods unless individual objects need to share a method, and then it should be declared as a class method. The key is to take the time when designing and creating a program to determine where each variable and method should be declared. + +## 5.6 Complete Programs: Implementing Objects + +The first complete program implements overloaded methods, and the second utilizes class data members and class methods. + +### 5.6.1 Program Focusing on Overloaded Methods + +After defining the PointD class earlier this chapter which represents a point, a class that represents a line will be developed in this section. Since a line consists of points, the PointD class can also be used. The main program will: + + * Set points and lines + + * Compare two lines + + * Find the distance between a line and a point + +A line can be defined in slope-intercept form y = mx + b, where m is the slope and b is the y-intercept, and the class will be named LineSI. The slope and y-intercept are kept in private instance variables, slope and intercept. + +Because a user may like to define a line in several different ways and to reinforce the concept of overloaded constructors, six constructors will be provided. The default constructor without any parameters will set the value of the slope and the y-intercept to 0.0. The next constructor accepts the value for the slope as a parameter and sets the y-intercept to 0.0 creating a line going through the origin. The third constructor receives a LineSI object and copies the slope and y-intercept of the line to the new object, essentially creating an identical line. This constructor is sometimes referred as a copy constructor. The fourth constructor accepts two parameters and assigns these values to the instance variables, slope and intercept. A line can also be defined in two-point form as + +where (x 0 , y 0) and (x 1 , y 1) are two different points on the line. So, the fifth constructor accepts two PointD objects, calculates the slope and the y-intercept, and assigns the results to appropriate data members. The last constructor receives the x and y coordinates of two points and calculates the slope and y-intercept. Initial implementations for the six overloaded constructors are shown below: + +All six overloaded constructors have the same name as the class and they are differentiated by their parameter lists. The first constructor has no parameters, the second and third constructors have one parameter, the fourth and fifth constructors have two parameters, and the sixth constructor has four parameters. Although both the second and third constructors have one parameter, the types are different; the second has one of type double and the third has one of type LineSI. The fourth and fifth constructors have two parameters; the fourth has two parameters of type double and the fifth has two parameters of type PointD. + +The reserved word this in a constructor invokes the other constructor with the corresponding parameter list within the same class. So, calling the default constructor in the main method to create a LineSI object causes the fourth constructor to be called as well. The second, third, fifth, and sixth constructors also call the fourth constructor by using the reserved word this. As was discussed in Sect. 5.4, the advantage of using the word this is that if a change needs to be made to a common feature of all the constructors, only the fourth constructor needs to be modified. 
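As the initial implementations themselves are not reproduced here, the following is a minimal sketch of the six overloaded constructors, reconstructed from the description above; the slope calculations assume non-vertical lines, matching the simple treatment in the text:

public LineSI() {
    this(0.0, 0.0);
}

public LineSI(double slope) {
    this(slope, 0.0);  // a line through the origin
}

public LineSI(LineSI line) {
    this(line.getSlope(), line.getIntercept());  // copy constructor
}

public LineSI(double slope, double intercept) {
    this.slope = slope;
    this.intercept = intercept;
}

public LineSI(PointD p0, PointD p1) {
    // slope m = (y1 - y0) / (x1 - x0), intercept b = y0 - m * x0
    this((p1.getY() - p0.getY()) / (p1.getX() - p0.getX()),
         p0.getY() - (p1.getY() - p0.getY()) / (p1.getX() - p0.getX()) * p0.getX());
}

public LineSI(double x0, double y0, double x1, double y1) {
    this((y1 - y0) / (x1 - x0),
         y0 - (y1 - y0) / (x1 - x0) * x0);
}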
Also, notice that in the fourth constructor, the keyword this is used in order to distinguish between the data member and the parameter. This ensures that the values in the parameters are correctly copied into the data members.

There will be the two usual mutators to set each instance data member and two accessors to get the values of the two data members, as shown below:

In addition to the two mutators above, there will be three more mutators named setLine to set both instance data members at the same time. Like constructors, methods can also be overloaded. The setLine method is overloaded; one takes the values of the slope and the y-intercept, another takes two PointD objects, and the last takes the x and y coordinates of two points as parameters. Even though the first and the second setLine methods have the same number of parameters, the types are different; the first setLine method has two parameters of type double and the second has two parameters of type PointD. The detailed implementations of these three overloaded methods are shown below:

First, notice that the second and third setLine methods use the first setLine method. This is similar to the constructors, where all the other constructors invoked the fourth constructor.

If one looks carefully, it can be seen that the implementations of the fourth constructor and the first setLine method are the same. Also, notice that the code for the fifth constructor appears similar to the code for the second setLine method, except that the constructor is invoking the fourth constructor and the setLine method is calling the first setLine method with the corresponding parameter list defined within the class. The calculations for the slope and y-intercept used as the arguments in these calls are exactly the same. The same thing can be said for the sixth constructor and the third setLine method. How can one avoid having duplicate code in the program? The answer is to invoke the setLine method in the constructor instead of repeating the same code twice. This is especially worthwhile when more complex computations would otherwise be repeated in separate methods within the class, as in the second and third setLine methods. The modification to the fourth, fifth, and sixth constructors is illustrated below:

The first setLine method can be further modified to avoid duplicate code. Notice that the two statements this.slope = slope; and this.intercept = intercept; are also in the setSlope and setIntercept methods, respectively. Therefore, the original first setLine method can be rewritten as follows:

// First setLine method, modified:
public void setLine(double slope, double intercept) {
    // using setSlope and setIntercept methods
    setSlope(slope);
    setIntercept(intercept);
}

In order to understand the nesting of method calls in overloaded constructors and methods, consider what would happen when a LineSI object is created using the default constructor in the main method. Calling the default constructor would result in the fourth constructor being invoked. The fourth constructor will call the first setLine method, which calls the setSlope and setIntercept methods to set the values of slope and intercept. Although at first this might seem more complicated, the purpose is to eliminate duplicate code, making the program easier to maintain.

The last two methods are named compareLines and distance. The LineSI object that calls the method compareLines is compared to the LineSI object passed to the method, returning true when the two lines are the same and false when they are different, while the distance method calculates the distance from the calling object to the point passed as a parameter. A sketch of both methods follows.
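This sketch assumes straightforward implementations (the book's actual listing may differ, for instance in how double values are compared); the distance method uses the standard point-to-line formula |m*x - y + b| / sqrt(m^2 + 1):

public boolean compareLines(LineSI line) {
    // two lines in slope-intercept form are the same when
    // both the slope and the y-intercept match
    return slope == line.slope && intercept == line.intercept;
}

public double distance(PointD point) {
    // distance from the given point to the line y = mx + b
    return Math.abs(slope * point.getX() - point.getY() + intercept)
           / Math.sqrt(slope * slope + 1.0);
}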
All the pieces are put together in the following class:

Notice that along with the two private instance variables, the private class constant DEFAULT_VALUE was defined. It was declared as a class data member so that any method defined in the class can use it as a constant, because the value does not need to be changed during execution. Declaring it as a class constant also avoids allocating memory for the same constant twice when it is used in the first and second constructors.

The Lines class in Fig. 5.15 will test the methods defined in LineSI. It will create two points and six lines using the six different constructors. Then it will output the properties of the lines and the results from the compareLines and distance methods.

Fig. 5.15 A client program for the LineSI and PointD classes

The output from the above program is given below:

line1: slope = 0.5, intercept = 3.5
line2: slope = 0.5, intercept = 3.5
line3: slope = -1.0, intercept = 3.0
line4: slope = 0.5, intercept = 3.5
line5: slope = 0.0, intercept = 0.0
line6: slope = 2.0, intercept = 0.0
line1 and line2 are the same.
line4 and line5 are not the same.
The distance between line3 and pt1 is 1.41.
The distance between line6 and pt2 is 3.58.

### 5.6.2 Program Focusing on Class Data Members and Class Methods

In this section, the ChargeCard class defined in Sect. 5.5.3 will be modified. Assume that a cardholder travels to Europe and uses the card for shopping. The amount charged in Euros should be converted into US dollars and added to the balance of the card. Using the application, a user should be able to:

* Open an account to receive a card
* Make purchases in either US dollars or Euros
* Print the current balance of the card

The program should perform the conversion from Euros to US dollars. The calculation used in the conversion is the same for any purchase made in Euros; therefore, all the Card objects can share the code for the conversion. For this reason, the convertEurosToDollars method will be declared as a class method. The program also keeps track of the conversion rate, named rate in the program. Since rate is used in the class method and a class method does not have access to instance data members, rate should be declared as a class data member. Because the conversion rate changes frequently, it should be declared as a variable, not a constant. The mutator and accessor for rate will also be class methods, since they deal with a class data member. The code segment implementing the class data member and class methods discussed so far is sketched at the end of this discussion.

So far there is no instance data member or instance method implemented in the Card class; therefore, all the methods can be used without creating an object. The following main method will set the rate and output its value and the result of the conversion of 1.00 Euro to US dollars:

public class Purchases {
    public static void main(String[] args) {
        // output the information for Euros conversion
        Card.setRate(1.2128);
        System.out.println("rate = " + Card.getRate());
        System.out.printf("1.00 euro is equal to %.2f dollars.",
                          Card.convertEurosToDollars(1.00));
        System.out.println();
    }
}

Notice that the three class methods are invoked using the class name Card in the dot notation.
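Since the Card listing itself is not reproduced above, the class data member and the three class methods might look like the following sketch (the names follow the text; the rest is an assumption):

public class Card {
    // class data member shared by all Card objects
    private static double rate;

    public static void setRate(double newRate) {
        rate = newRate;
    }

    public static double getRate() {
        return rate;
    }

    // class method shared by all objects: Euros * rate = dollars
    public static double convertEurosToDollars(double euros) {
        return euros * rate;
    }
}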
The following is the output from the above program:

rate = 1.2128
1.00 euro is equal to 1.21 dollars.

Now the data members, constructors, and instance methods can be added to the Card class. The additional data members include two class constants, DEFAULT_ACCOUNT_NUMBER and DEFAULT_BALANCE, and two instance variables, accountNum and balance. There will be two constructors: a default constructor and another constructor that has two formal parameters to store values in the instance variables. The setAccountNum method is a mutator to set the value of the variable accountNum. Both the purchaseInDollars and purchaseInEuros methods receive a formal parameter and increment the balance by the amount in the parameter. In the purchaseInEuros method, the amount of Euros passed to the method is converted to US dollars by calling the convertEurosToDollars method. There will also be two accessors, getAccountNum and getBalance, to get the values of the two instance variables. The following program defines the Card class:

The complete main method in Fig. 5.16 includes the creation of a Card object, two purchases, one each in US dollars and Euros, and the output of the balance after each purchase.

Fig. 5.16 A client program for the Card class

The following is the output from the above program:

rate = 1.2128
1.00 euro is equal to 1.2128 dollars.
after spending 100.00 dollars
card: Account Number = 12345, balance = 100.00 dollars
after spending 100.00 euros
card: Account Number = 12345, balance = 221.28 dollars

## 5.7 Summary

* In addition to primitive data types, objects can be sent to and returned from methods.
* Constructors and methods can be overloaded by having the same name, but they must have a different number of parameters, different types of parameters, or parameters of different types in a different order.
* The reserved word this is used to refer to instance variables when there are parameters of the same name, and to constructors when one constructor is defined in terms of another.
* If a constant or variable is declared within a constructor or method, it is known as a local constant or variable.
* If a constant or variable is declared within an object, it is known as an instance constant or variable and can be accessed by any constructor or method within the object.
* The reserved word static causes a constant, variable, or method to be a class constant, variable, or method that can be accessed by all instances of the class.
* Take the time to determine where variables and methods should be declared to help balance readability, communication, debugging, maintainability, and memory allocation.

## 5.8 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Identify the valid and invalid overloaded constructors in the following code:

2. Identify the valid and invalid overloaded methods in the following code:

3. A hexahedron is a three-dimensional shape with six faces. In this problem, a class which represents a hexahedron with squares at the top and the bottom, as shown below, will be implemented.

Assume that hexahedrons are made of different materials; therefore, the weight needs to be kept along with the side and the height in order to describe a particular hexahedron. The following code implements the data members and a portion of the constructors of the Hexahedron class. Complete the first six constructors to call the last one by using the reserved word this.
4. Draw contour diagrams to show the state of execution right after the execution of the statement line1 = new LineSI(pt1, pt2); in Fig. 5.15 in Sect. 5.6.1.

5. Draw contour diagrams to show the state of execution right after the execution of the statement card.purchaseInEuros(100.00); in Fig. 5.16 in Sect. 5.6.2.

6. Implement a class Rectangle which represents a rectangle shape as described below:

*A. The Rectangle class has one private class constant, DEFAULT_VALUE, that should be initialized to 0.0.

*B. The Rectangle class has two private instance data members, sideX and sideY, of type double.

*C. The first constructor is a default constructor and calls the third constructor (described below) using the reserved word this to set the instance data members to the default value.

D. The second constructor calls the third constructor (described below) using the reserved word this. It receives a Rectangle object as a formal parameter and copies the sideX and sideY of that object to the new object.

E. The third constructor calls the setSides method (described below). Its two formal parameters are used as the parameters for the setSides method.

*F. The mutator methods, setSideX and setSideY, each have one formal parameter and store it in the corresponding instance data member.

G. Another mutator method, setSides, has two formal parameters and stores them in the instance data members by using the setSideX and setSideY methods (described above).

H. The accessor methods, getSideX and getSideY, return the value of the appropriate instance data member.

I. A method named calcArea computes the area of a rectangle and returns the computed area.

Next, write a client program to test the Rectangle class defined above. This class should be named Rectangles and should contain the main method, which performs the following tasks:

a. Declare three Rectangle objects.

b. Create three Rectangle objects using the three different constructors.

c. Output the contents of sideX and sideY of the three objects.

d. Output the area of the third rectangle.

Here is some sample output:

rectangle1: sideX = 0.0, sideY = 0.0
rectangle2: sideX = 3.0, sideY = 4.0
rectangle3: sideX = 3.0, sideY = 4.0
rectangle3: area = 12.0

7. Expand the PointD class discussed in this chapter to include the quadrant information of a point. The x-axis and y-axis divide the plane into four regions called quadrants. The quadrants are labeled starting at the positive x-axis and going around counterclockwise as shown below:

Write the new PointD class as described below. Points falling on the x-axis or y-axis are not considered to be in any quadrant and therefore return the default value, 0:

A. The PointD class has two private class constants, DEFAULT_VALUE of type double and DEFAULT_QUADRANT of type int, that should be initialized to 0.0 and 0, respectively.

B. The PointD class has two private instance data members, x and y, of type double.

C. The PointD class has one private instance data member, quadrant, of type int.

D. The first constructor is a default constructor and calls the third constructor (described below), by using the reserved word this, to set the instance data members to the default values.

E. The second constructor receives a PointD object as a formal parameter and stores the x, y, and quadrant of the object as the values of the instance data members.
F. The third constructor calls the setPoint method (described below). Its two formal parameters are used as the parameters for the setPoint method.

G. The mutator methods, setX and setY, each have one formal parameter and call the setPoint method (described below). The setX method changes the value of the data member x to the value of the parameter, and the setY method changes the value of the data member y to the value of the parameter.

H. Another mutator method, setPoint, has two formal parameters and stores these values in the instance data members, x and y. It also sets the correct value for the data member quadrant depending on the values of the two parameters.

I. The accessor methods, getX, getY, and getQuadrant, return the value of the appropriate instance data member.

Next, write a client program to test the PointD class defined above. Call this class Points. The main method should perform the following tasks:

J. Declare five PointD objects.

K. Create five PointD objects using the three different constructors. The points should be in three different quadrants and also at the origin.

L. Output the contents of x, y, and quadrant for the five objects.

M. Change the value of x or y for one of the points using a mutator so that the point will move to a different quadrant.

Here is some sample output:

point1: (0.0, 0.0) in quadrant 0
point2: (2.0, -5.0) in quadrant 4
point3: (2.0, -5.0) in quadrant 4
point4: (2.0, 5.0) in quadrant 1
point5: (-2.0, 5.0) in quadrant 2
after calling set method
point3: (-2.0, -5.0) in quadrant 3

# 6. Strings

Abstract

This chapter discusses string variables and the String class. In addition to the concatenation of strings, various methods defined in the String class, such as the length, indexOf, and substring methods, are examined. The toString method, which returns a string representation of the properties of an object, is also shown, along with a complete program implementing String objects.

## 6.1 Introduction

Up until now, this text has focused on numerical values such as integers and real numbers. In this chapter the focus is on text values. Characters are another fundamental type of data used on a computer, and a string in Java is a sequence of characters. Each programming language supports a particular character set, which is a list of characters in a particular order. The ASCII (American Standard Code for Information Interchange) character set is the most common one. The basic ASCII set uses seven bits per character to support 128 different characters, including letters, punctuation, digits, special symbols, and control characters. In order to support more characters and symbols from many different natural languages, Java uses the Unicode character set, which uses 16 bits per character, supporting 65,536 unique characters. ASCII is a subset of the Unicode character set.

Strings are not represented as a primitive data type such as int, double, or char, but as an object of the String class. Text values can also be passed as an argument to methods such as System.out.print as described in Chap. 1.
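As a small aside illustrating the character set discussion, each char in Java holds a 16-bit Unicode value, which can be seen by casting (a hypothetical snippet, not from the text):

char letter = 'A';
System.out.println((int) letter);           // prints 65, the Unicode (and ASCII) value of 'A'
System.out.println((char) (letter + 1));    // prints B, the next character in the set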
Similar to numbers, strings can be assigned to variables and manipulated using operators and methods defined in the String class.

## 6.2 String Class

The String class is a standard class, like the Math or Scanner classes, defined in the java.lang package. The following illustrates how a String variable is declared and a String object is created:

String fullName;
fullName = new String("Maya Plisetskaya");

After the variable fullName is declared as type String, the second statement creates an object with the value "Maya Plisetskaya", and then a reference to the new object is placed in the variable fullName. The contour diagram in Fig. 6.1 illustrates the state of execution after the above two statements.

Fig. 6.1 An object of the String class

Because the String class is a predefined class, a variable name is not in the contour diagram of the String object. Although the String class is not a primitive data type, a String object can be created by assigning a string within double quotes to a String variable, for example,

String fullName;
fullName = "Maya Plisetskaya";

Even though it looks like the text value is directly assigned to the variable, the variable fullName does not contain an actual value, as with a primitive data type, but rather the address of the object. The contour diagram after the above two statements will be exactly the same as the one shown in Fig. 6.1.

Further, notice that the following statements using the keyword new will assign a reference to an empty string to the variable:

String fullName;
fullName = new String();

In contrast, a simple assignment statement can be used to indicate that the variable does not reference any object at all:

String fullName;
fullName = null;

The differences between creating String objects using new statements and assignment statements will become more apparent in Sect. 6.4. Except for a few occasions, the new statement will be used to create a String object in order to reinforce the ideas of object creation. In either case, once a String object is created, the string value inside of the object cannot be modified, which means that none of the characters in the string can be changed, nor can the string be shortened or lengthened. This property is called being immutable. If a string needs to be modified, an object of type StringBuffer, which is a mutable sequence of characters, can be used, but this is beyond the scope of this text.

## 6.3 String Concatenation

Although strings cannot be modified, there are a number of operators that can be used with strings. A useful String operation is concatenation, accomplished by the use of a plus symbol, +, which was introduced briefly in Chap. 1 to support output. Two strings can be combined to create a new string. Consider the following example code segment:

String firstName, lastName, fullName;
firstName = new String("Maya");
lastName = new String("Plisetskaya");
fullName = firstName + " " + lastName;

A first name and a last name are assigned to separate variables, firstName and lastName, respectively, and then combined together using the string concatenation operator. A contour diagram for fullName is again exactly the same as the one in Fig. 6.1. Notice that a space is concatenated between firstName and lastName. Without it, fullName would have the first name and the last name combined together as in "MayaPlisetskaya".

A plus symbol was introduced as an arithmetic addition and as a concatenation in the output statements in Chap. 1.
When an operator represents more than one operation, it is called an overloaded operator. What happens if overloaded operators appear in an expression with mixed types? The Java compiler treats + as an arithmetic addition when both the left and right operands are numbers; otherwise it will treat it as a string concatenation. Remember that the plus symbol is evaluated from left to right, and the result of an expression with mixed types is of String type. For example, what would the output be for the following code segment?

int num1, num2;
String str1, str2;
num1 = 2;
num2 = 3;
str1 = new String("num1 + num2 = ");
str2 = new String(" = num1 + num2");
System.out.println(str1 + num1 + num2);
System.out.println(num1 + num2 + str2);
System.out.println(str1 + (num1 + num2));

The first print statement results in

num1 + num2 = 23

Since the left operand of the first plus symbol is a String and the right operand is an int, it will treat the contents of num1 as a String. Because the first plus sign was treated as concatenation, the left operand of the second plus sign is of String type. Further, since the right operand of the second plus symbol is an int, it will again treat the contents of num2 as a String.

How about the second print statement? The first plus sign is treated as an arithmetic addition, because the left and the right operands of the first plus sign are both int types. Then, the second plus symbol is treated as a string concatenation, since the last operand is of type String and the operands are of mixed types. The output will be

5 = num1 + num2

In the third print statement, parentheses will force (num1 + num2) to be evaluated first. Therefore, the second + is treated as an arithmetic addition. The result will be

num1 + num2 = 5

Another operator that can be used on String objects is the shortcut operator +=. It has the same effect as the shortcut for arithmetic addition discussed in Chap. 1 and is left as an exercise at the end of the chapter.

## 6.4 Methods in String Class

There are over 50 methods defined in the String class, which can be found in the Java API specification document on the Oracle website at

* http://docs.oracle.com/javase/7/docs/api/java/lang/String.html

In this section, six of the most commonly used ones will be discussed: length, indexOf, substring, equals, equalsIgnoreCase, and charAt.

### 6.4.1 The length Method

In order to find the number of characters in a String object, the length method is used. For example, if the variable fullName refers to the string "Maya Plisetskaya", then

fullName.length()

will return the value 16, because there are 16 characters in the string. Notice that the space between the first name and the last name is counted as a character. If the string is empty, applying the length method results in 0.

### 6.4.2 The indexOf Method

A character in a string can be referred to by its position, in other words its index, in the string. The index of the first character is 0, the second character is 1, and so on, as illustrated in Fig. 6.2.

Fig. 6.2 Index of characters in the string

To find the position of a substring within a string, the indexOf method can be used. The method will return the position of the first character of the substring in the string.
Here are some examples using fullName:

statement | return value
---|---
fullName.indexOf("Maya") | 0
fullName.indexOf("set") | 8
fullName.indexOf("Set") | -1
fullName.indexOf("ya") | 2
fullName.indexOf(" ") | 4

The first statement returns 0 because "Maya" occurs at the beginning of the string. The word "set" starts at position 8. The return value -1 from the third statement indicates that the substring does not exist in the string; since indexOf performs a case-sensitive search, it did not find "Set" starting with an uppercase letter. There are two occurrences of "ya", at positions 2 and 14. When there is more than one occurrence of the substring in the string, the position of the first character of the first matching substring is returned, so the fourth statement returns 2. As was mentioned before, a space is considered to be a character; therefore, the last statement returns 4, which is the position of the space in the string.

### 6.4.3 The substring Method

On some occasions, one's name needs to be printed in the format of a last name, a comma, a space, and a first name. How can it be formatted if the full name is given as a first name, a space, and a last name? The answer is that the first name and the last name can be extracted from the full name and rearranged. In order to extract a substring from a string, the substring method can be used. The substring method takes two integers as arguments: the position of the first letter of the substring and the position of the last letter of the substring + 1. Using the string in Fig. 6.2, this means that the statement fullName.substring(8, 11); will return "set". Here are some more examples:

statement | return value
---|---
fullName.substring(0, 4) | Maya
fullName.substring(2, 2) | an empty string
fullName.substring(10, 6) | runtime error
fullName.substring(18, 20) | runtime error

The second statement will create a String object with an empty string. The third example gives a runtime error, because the first argument should be the same as or smaller than the second. In the fourth example, the arguments should be in the range of 0–16; otherwise they are out of bounds and cause a runtime error.

Obtaining the first name, "Maya", from fullName is not very difficult. The statement fullName.substring(0, 4) would work. However, consider when fullName contains a different name, for example, "George Balanchine". Then fullName.substring(0, 4) would return "Geor", which is not the first name. How can this be changed so that the statement will extract the first name from any full name? Notice that the first name and the last name are separated by a space. So, using the position of the space, spacePos = fullName.indexOf(" "), the first name can be easily extracted from any full name as in fullName.substring(0, spacePos). Once the first name is obtained, how can the last name be extracted? Remember that the last name starts right after the space, so the position of the first letter of the last name is spacePos + 1. Where does it end? It ends at the end of the string. Since fullName.length() returns 16 for "Maya Plisetskaya", which is the position of the last letter of the last name + 1, this is perfect for the second argument of the substring method when extracting the last name. All the pieces are put together in the following program:

Alternatively, without declaring the variables spacePos and len, one could use the return values from the indexOf and length methods as arguments for the substring method.
firstName = fullName.substring(0, fullName.indexOf(" "));
lastName = fullName.substring(fullName.indexOf(" ") + 1,
                              fullName.length());

Which way is better? The first option allocates memory for two more variables, spacePos and len; however, it does not call the indexOf method twice as in the second option. For a small example like this, it does not matter which option one uses. For large programs, try to remember not to waste too much memory by declaring unnecessary variables, and also try not to invoke complex methods multiple times. One should always be aware of the trade-off between space and time and strike a good balance between them when developing a large application.

An example of the input and output from the above program is shown below:

Enter full name, first name followed by last name: Maya Plisetskaya
Plisetskaya, Maya

### 6.4.4 Comparison of Two String Objects

While a double equal sign, ==, was used to compare primitive data types, comparing two String objects takes extra care. Examine the following code segment:

String str1, str2;
str1 = new String("saddles");
str2 = new String("saddles");
System.out.println(str1 == str2);

Is the output true or false? As a matter of fact, it prints false. Why does the comparison of str1 and str2 return false? Both String variables seem to contain the same value, "saddles", but remember that a String variable contains a reference to the String object, not the string itself. Since str1 and str2 are two completely different objects, the two variables refer to different addresses, as shown below:

The correct way to compare the contents of String objects is to use the String method equals:

System.out.println(str1.equals(str2));

The above statement will output true, since both str1 and str2 have the same value. The equals method does not compare the references, but rather the contents of the strings being referenced. What about when a String object is created by assigning a string literal?

String str3, str4;
str3 = "halters";
str4 = "halters";
System.out.println(str3 == str4);
System.out.println(str3.equals(str4));

Interestingly, both print statements output true. This is because when the value is assigned to str4, the Java compiler will search the existing String objects for an exact match. If it finds one, which is the case here, a new String object is not created. Instead, the variable is assigned a reference to the existing String object, shown below:

Of course, if the contents of one String variable are copied to another String variable, both variables would point to the same object, as shown below, because what is copied is the address of the object:

String str5, str6;
str5 = new String("bridles");
str6 = str5;
System.out.println(str5 == str6);
System.out.println(str5.equals(str6));

As can be seen in the above contour diagram, both print statements output true. Recall that this is exactly the same situation discussed in Sect. 2.9, where the variables of Number objects, num1 and num2, reference the same object containing the integer 5 after the assignment statement num1 = num2, shown in Fig. 2.24 and repeated below:

The contour diagram showed that the intended task of copying the integer 5 from num1 to num2 was not accomplished. In general it is not a good idea to have two variables pointing to the same object, unless it is a String object.
If the contents of the object num1 is referring to were modified by using a mutator method, the contents of the object num2 is referring to would be automatically changed, because they are pointing to the same object. Is it the same way with String objects? If one were to execute the following statement to modify the contents of str5,

str5 = "reins";

the Java compiler would search the existing String objects for one containing "reins". So far, two objects with "saddles", one object with "halters", and one object with "bridles" have been created. Since it does not find an object with "reins", a new String object will be created. Therefore, str5 and str6 will be referencing different String objects, as shown below:

Now, the following statements will both output false:

System.out.println(str5 == str6);
System.out.println(str5.equals(str6));

Unlike with num1 and num2, because of the immutable nature of the String type, there is no danger of modifying the contents of one object when two String variables are referencing the same object.

### 6.4.5 The equalsIgnoreCase Method

Assume that a program to play a Tic Tac Toe game has been written. At the end of each game, a user will be asked if he or she would like to play another game. For example, consider the code segment in Fig. 6.3:

Fig. 6.3 Use of a method from the String class to compare strings

Because of the !, the condition of the if statement is true when the user does not enter yes. Then, the variable selection will be changed to false, and eventually the program stops. What happens if a user wanted to play another game and entered Yes instead of yes? Because the equals method checks for an exact match, the if condition again is true. In case the user types yes in different ways, the if condition can be modified to

if(!(response.equals("yes") || response.equals("Yes") ||
     response.equals("YES")))
    selection = false;

Then, the user can enter "yes", "Yes", or "YES" to continue. Actually, there is a way to include all the combinations of upper- or lowercase characters in the word "yes", such as "yEs", "yeS", and "yES". One can compare the contents of String objects ignoring the case of the characters in the string. The equalsIgnoreCase method compares the contents of a String object to those of another String object, ignoring case considerations. Two strings are considered to be equal if they are of the same length and the corresponding characters in the two strings are equal ignoring case. In other words, the comparison can be done in a case-insensitive way. One can rewrite the if condition as

if(!response.equalsIgnoreCase("yes"))
    selection = false;

Given the equalsIgnoreCase method, the user can enter "yes", "Yes", "YES", or any other combination of uppercase or lowercase characters in the word "yes" to continue.

### 6.4.6 The charAt Method

The charAt method returns the character stored at the specified position in the string. For example, if the variable fullname refers to the string "George Balanchine", then fullname.charAt(0) will return the value 'G', because the character 'G' is the first character. The statement fullname.charAt(2) will return the value 'o', because the index of the character 'o' is 2. Suppose one would like to know the number of occurrences of a certain character in a string, for instance, the character 'G' in fullname. Each character in fullname can be checked using the charAt method inside a loop, and a counter can be incremented.
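A sketch of such a loop might look like the following (a hedged reconstruction, since the book's own listing is not reproduced here; it assumes fullname as above and a counter named count):

int count = 0;
char letter;
for(int i = 0; i < fullname.length(); i++) {
    letter = fullname.charAt(i);   // examine each character in turn
    if(letter == 'G')              // count only the uppercase 'G'
        count++;
}
System.out.println("The name " + fullname + " contains "
    + count + " character \"G\".");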
A code segment along these lines counts the number of 'G' characters in "George Balanchine", and its output would be

The name George Balanchine contains 1 character "G".

Notice that it only counts the capital letter 'G' and ignores the lowercase letter 'g'. If both uppercase and lowercase letters need to be counted, the if condition would look like

if(letter == 'G' || letter == 'g')

and the code will output 2, because one uppercase 'G' and one lowercase 'g' exist in "George Balanchine". A summary of some of the methods in the String class can be found in Table 6.1.

Table 6.1 Various methods in the String class

Method | Function performed | Arguments | Value returned
---|---|---|---
charAt(pos) | Returns character at given index | int | char
equals(str) | Compares strings | String | boolean
equalsIgnoreCase(str) | Compares strings ignoring case | String | boolean
indexOf(str) | Returns index of first occurrence of substring | String | int
length() | Returns length of string | None | int
substring(pos,pos) | Returns substring of string | int, int | String

## 6.5 The toString Method

The overriding method toString receives no parameters and returns a String type. Although overriding methods will be discussed further in Chap. 9, it is introduced here because it is a useful method that helps output data stored in objects. Prior to demonstrating how toString works, the PointD class from Fig. 5.4 in Chap. 5 is relisted in Fig. 6.4.

Fig. 6.4 A client program and the PointD class

The main method in Fig. 6.4 creates objects of the PointD class and finds the midpoint of the two points. After executing the program, the output is

The mid-point between (4.0,4.0) and (8.0,7.0) is (6.0,5.5)

What would happen if the last five print statements of the main method were replaced by the following statement?

System.out.println(middle);

This statement is trying to output middle, which is a PointD object. Does it output the contents of x and y of middle? The answer is no. Instead, the output would look like the following:

PointD@ae3364

What is this? Is it garbage? The answer to the second question is no, it is not garbage. However, it is not very useful information at this level of programming. The System.out.println outputs the name of the class PointD, an @ symbol, and the memory address of the object in hexadecimal (base 16) representation. Since each time the program is run the object might be in a different location in memory, the output may be different every time the program is executed. In order to output the contents of x and y, one needs to use accessor methods, such as getX and getY, as done in Fig. 6.4. However, wouldn't it be nice if there was a method to return the contents of an object? A toString method could be written in the PointD class to return a string representation of the contents of the data members of an object. The method could return x and y as the location of a point in the format (x,y) and would be written as follows:

public String toString() {
    return "(" + x + "," + y + ")";
}

Since the values in x and y are concatenated with strings, they are converted to type String and would be returned as a String. Then, in the following statement, the object middle can call the toString method

System.out.println(middle.toString());

and the above statement will produce an output of

(6.0,5.5)

Now, if the last five print statements in the main method in Fig. 6.4 were replaced by the following code,
System.out.println("The mid-point between "
    + p1.toString() + " and " + p2.toString() + " is "
    + middle.toString());

it would produce the same output as the original code, as follows:

The mid-point between (4.0,4.0) and (8.0,7.0) is (6.0,5.5)

The usefulness of the toString method will be appreciated more when objects are discussed further in Chap. 9.

## 6.6 Complete Program: Implementing String Objects

In this section, an application which outputs course information will be developed. The program will:

* Ask the user for the name of a class. The input consists of a department code, a course number, and a course title, such as "CS 360 Theory of Computation".
* Process the input.
* Output the title of the class, the level of the class, and the department that offers the class.

An example of the input and output for the Theory of Computation course would be

Enter the course: CS 360 Theory of Computation
The class, "Theory of Computation", is a
junior level class offered by the
Computer Science department.

and the input and output for a Calculus course could be

Enter the course: MA 213 Calculus I
The class, "Calculus I", is a
sophomore level class offered by the
Mathematics department.

When the user provides input, the program will create an object and store the pieces of information inside of the object. The name of the department will be determined by the department code, which is the first piece of the input. The course number is the second piece of the input, and the course title is the rest of the input. The level of the course will be obtained by checking the course number. Figure 6.5 contains the code defining the class for a Course object.

Fig. 6.5 Course class

The Course class consists of four data members that are all instance variables, two constructors, and mutators and accessors for each data member. The setDepartment method accepts a department code as a parameter, then the if-then-else structure determines the department, and the value is assigned to the data member. The setLevel method uses the value of the data member number to figure out the level of the class. In order to use a case structure, the first character of number is extracted and converted to a character, since only char, byte, short, or int types can be used in the case statement. The charAt method is used to convert the string to a character: it takes the position of a character in a string and returns the character at that position. The main program which uses the Course class is shown in Fig. 6.6.

Fig. 6.6 A client program for the Course class

After the user enters the input, the pieces of information are extracted and used to create a Course object. Notice that in order to include a double quote in a string literal, a backslash is used, as in \", which was discussed in the output section of Chap. 1. This application can be extended to accommodate more departments and graduate level classes. Course objects can also be stored in an array for further manipulation, which will be discussed in Chap. 7.

## 6.7 Summary

* A String object can be created by using the new, =, or += operators.
* String objects are immutable, which means their contents cannot be changed.
* When a String object is created by assigning a string literal, the Java compiler will search the existing String objects for an exact match. If it finds one, the variable is assigned a reference to the existing String object.
* When a String object is created using the keyword new, a new object will be created even if there already exists an object with the same string value.
* Individual characters of a string are numbered starting from 0.
* When the equals method is applied to String objects, it compares the contents of the objects being referenced.
* To compare the contents of String objects, the == operator cannot be used, since it compares the references to the objects.
* Some String methods include indexOf, length, substring, equals, equalsIgnoreCase, and charAt.

## 6.8 Exercises (Items Marked with an * Have Solutions in Appendix E)

1. Identify the errors in the following code segments:

A.
String text1;
text1 = new String(girth);

*B.
String text2;
text2 = new Text("shedding blade");

C.
String text3;
text3 = new Sting("grazing muzzle");
text3.indexOf("muzz");
text3.length(5);

2. Determine the return value for each of these expressions, assuming the following declaration:

String org;
org = new String("American Quarter Horse Association");

A. org.substring(5, 8)

*B. org.length()

C. org.substring(9, 22)

*D. org.substring(17, 19) + org.substring(20, 22)

E. org.substring(15, 16) + org.substring(18, 19) + org.substring(13, 14) + org.substring(org.length() - 5, org.length())

F. org += org

3. Draw contour diagrams to show the state of execution after the execution of the following code segment:

String s1, s2, s3, s4;
s1 = new String("stirrup irons");
s2 = "stirrup irons";
s3 = new String("stirrup irons");
s4 = s2;

4. Determine the output from the following code segment:

String star;
star = "*";
int i;
for (i=0; i<5; i++) {
    System.out.println(star);
    star += star;
}

5. Write a program that asks the user for a positive integer, receives the input as a String, and outputs the string with commas in the appropriate places. For example, if the input is

1000000

then the output is

1,000,000

6. Write a program for a given word and string that will

a. Check if the word is in the string.

b. Count all occurrences of the word in the string.

c. Remove all occurrences of the word from the string.

*7. With a given String object called org containing the value "American Quarter Horse Association", write a program to output an abbreviation of the string, AQHA.

8. Modify the previous program to ask a user for the name of his or her organization and print an abbreviation of the name. Realize that the name of the organization can consist of any number of words.

# 7. Arrays

Abstract

Arrays and array processing are illustrated in this chapter, starting with declaration, access, input, and output. In addition to simple processing, the passing of an array to and from a method is demonstrated. Other processing includes reversing, searching (sequential and binary), and sorting an array using the bubble sort. Also, two-dimensional arrays and arrays of objects are introduced, along with a complete program.
## 7.1 Introduction

Similar to a string, which can store a group of characters, an array can be used to store numbers of type int or double. Not only can arrays store numbers, but they can also be used to store strings, objects, and even other arrays. Arrays are extremely useful for storing data that needs to be processed more than once, such as data that needs to be searched or sorted.

Related to arrays are the predefined Array and Vector classes, which are beyond the scope of this text, because before learning how to use these classes, it is good to understand how to input, process, and output data using arrays. This chapter will first introduce the reader to declaring an array, and as in the past, the best way to learn is to get started with an example.

## 7.2 Array Declaration

When declaring an array, the type of data that will be stored in the elements of the array must be specified. For example, to declare a memory location to store a reference to an array of type int called number, one would write the following:

int number[];

Alternatively, and used more often, the above could be declared as

int[] number;

This reserves a memory location called number, the square brackets indicate that it will be an array, and the word int indicates that each element of the array can contain an integer. Initially, the memory location number will contain a null reference, which means it does not initially reference anything.

In order to create an array of three elements, the following instruction is needed:

number = new int[3];

Although the word new has also been used to create a new object, here it is used to create a new array. The number in the square brackets indicates the length of the array, in this case three elements. In this example, the first element is number[0] and the last one is number[2]. As with simple variables, the contents of the array are initialized to 0, but as in Chap. 1, this text will assume that the contents are indeterminate. Lastly, a reference to the array is placed into the memory location number via the assignment symbol and is represented as an arrow in the following diagram:

Alternatively, the previous two lines could be combined as follows:

int[] number = new int[3];

Although this takes up less space, the other two statements will be used more frequently to reinforce the concepts of declaration and allocation. As another alternative, a constant can be declared and used in the new statement. The advantage of this technique is that when iterating in a loop to process or output an array, the same constant can be used both to declare the array and as the end value of a for loop, as will be seen in the next section:

final int ARRAYSIZE = 3;
int[] number;
number = new int[ARRAYSIZE];

As another alternative, an array can be declared and initialized using the following technique:

int[] number = {0,0,0};

While this is somewhat useful for small arrays, it would be impractical to initialize hundreds of elements. Though often smaller arrays will be initialized this way in order to save space, an alternative is presented in the next section.

## 7.3 Array Access

Assuming that an array has been created at the beginning of the program using the statements in the preceding section, the array can now be accessed. In order to access an individual element of an array, the name of the array is followed by the index of the element to be accessed.
For example,

number[0] = 5;

indicates that the 0th element of the array, the first element, takes on the value 5. This is illustrated in the following diagram:

Be sure not to confuse the index, 0, with the contents of the array, 5. Notice that the 0th element of the array now contains the number 5. Should the contents of the first element need to be copied into the third element, it could be accomplished as follows:

number[2] = number[0];

and would be represented as shown below:

When accessing various elements of an array, be careful not to try to access or alter any elements outside the range of the array. In the example above, do not try to access number[-1] or below, or number[3] or above, because an execution error will occur.

Although the accessing of individual elements can be useful in particular instances, it is often more practical to be able to access all of the array elements. As an example, what if the elements of the array need to be initialized to zero? If only three elements need to be initialized, the technique illustrated at the end of the previous section could be used, but what if instead of three elements, one hundred elements needed to be initialized? Clearly, listing out one hundred individual zeros would be impractical. Instead, as mentioned previously in Chap. 4, this can be accomplished by using an iteration structure. Though any of the loop structures can be used, under different circumstances some iteration structures are better choices than others.

For example, if each element of the above array needs to be initialized to zero, which loop would be the best choice? Since there is a fixed number of elements to be initialized, a fixed iteration loop structure could be used, specifically the for loop as shown below:

for(int i=0; i<3; i++)
    number[i] = 0;

Notice that the loop control variable is of type int and iterates from 0 to 2, corresponding to the three elements of the array. For each iteration of the loop, the number 0 is placed into the ith element of the array. As when accessing individual elements of an array, be careful not to have the loop try to access elements that are outside the range of the array, such as number[-1] or number[3], because again an execution error will occur.

Assuming the declaration of the constant ARRAYSIZE in the previous section, the above code segment could be rewritten as follows:

for(int i=0; i<ARRAYSIZE; i++)
    number[i] = 0;

## 7.4 Input and Output

### 7.4.1 Input

One way to fill an array from the keyboard is to use a sentinel controlled loop, where the user keeps entering non-negative integers until a negative sentinel value is entered to stop. Assuming a Scanner object named scanner, a first attempt might look like the following:

// *** Caution: Incorrectly implemented code ***
i = 0;
System.out.print("Enter a non-negative integer ");
System.out.print("or a negative integer to stop: ");
number[i] = scanner.nextInt();
while(number[i] >= 0) {
    i++;
    System.out.print("Enter a non-negative integer ");
    System.out.print("or a negative integer to stop: ");
    number[i] = scanner.nextInt();
}

As indicated by the comment prior to the code, the above code segment is implemented incorrectly. Although it appears to input all the valid data into the array, what is the problem? The problem is that the sentinel value is also input into the array. While this is not a major issue, the array would have to be declared to be one element larger to accommodate the sentinel value. Further, one would need to write all subsequent code to not process or output the sentinel value, which could be a potential source of logic errors.

The best solution is not to put the sentinel value in the array in the first place. How could this be done? The problem is that both input statements put the values directly into the array. As an alternative, the value could be input into a temporary variable and checked to see whether it is the sentinel value before putting it into the array.
However, instead of adding a couple of extra if statements, note that the while loop already checks for the sentinel value. If the value in the temporary variable is not the sentinel value, the body of the loop is entered and the value in the temporary variable can be copied into the array. On the other hand, if the value in the temporary variable is the sentinel value, the loop is not executed and the sentinel value is not placed in the array. A good name for the temporary variable is temp, as in the following segment:

i = 0;
System.out.print("Enter a non-negative integer ");
System.out.print("or a negative integer to stop: ");
temp = scanner.nextInt();
while(temp >= 0) {
    number[i] = temp;
    i++;
    System.out.print("Enter a non-negative integer ");
    System.out.print("or a negative integer to stop: ");
    temp = scanner.nextInt();
}

However, what is preventing the user from entering more data than there is space for in the array? Assume that the array is fixed at a particular size, as in the following declaration and allocation:

final int ARRAYSIZE = 10;
int[] number;
number = new int[ARRAYSIZE];

Note that a constant is being used for the allocation of the array. The while statement in the above code segment can now be altered using the constant to ensure that the user does not enter more data than was allocated for the array, as shown below:

while(temp >= 0 && i < ARRAYSIZE) {

or alternatively

while(temp >= 0 && i < number.length) {

Whereas the previous example using the for loop had the advantage that the array was the exact size the user wanted, the disadvantage was that the user might miscount the number of data items to be entered. However, the advantage of the sentinel controlled loop above is that it does the counting for the user, but the disadvantage is that it is still using a fixed-size array. Can't the user enter the size of the array? It is possible that they could, but then the same problem could occur as before, where the user might miscount the number of items to be input. Further, the code in the sentinel controlled loop is already doing the counting of the number of items, and the array has to be declared before the data is input.

In the field of computer science, there are always trade-offs, and it is up to the designers of the algorithms to determine the best possible solution to the problem at hand. As will be seen in subsequent courses in computer science, the concept of a linked list is helpful in solving the above problem, but that solution is not without its own set of limitations. Another possible solution to the current problem, when there are more data items to be entered into an array than space has been allocated, is to have the program allocate an array of a larger size, say twice as large, then copy the contents of the old array into the new one and allow the user to continue to enter data into the new array. Although this solution might slow down the processing, it does avoid the consequences of an array that is not large enough, and it is left as an exercise at the end of the chapter. However, in this text, when using the sentinel controlled loop, the emphasis will be on selecting the right size array in the first place.

### 7.4.2 Output

The output of an array could be done as the data is input, but then the output would be intermixed with the input. A better solution is to output the contents of the array after all the data has been input.
But how does one know how many data items have been input when using a sentinel controlled loop? The answer is with the variable i used in the previous code segment. Since a fixed number of values have been input, a for loop is the best choice for output. The for loop could be written to iterate i times, but since i is typically used as a loop control variable, it might be better to copy the value in i to another variable such as n, and then have the for loop reuse the variable i as a loop control variable and iterate n times. It is also helpful to add a column heading prior to the output of the contents of the array, as shown in the following code segment:

n = i;
System.out.println();
System.out.println("Integers");
System.out.println();
for(i=0; i<n; i++)
    System.out.println(number[i]);

Should the contents need to be output in reverse order, the loop can instead start at the end of the array and work backward:

for(i=n-1; i>=0; i--)
    System.out.println(number[i]);

Notice that the loop control variable starts at n-1, the loop continues while i is greater than or equal to 0, and i is decremented each time through the loop. Although this would output the array in reverse order to the user, have the values in the array changed? The answer is no. So what if instead of outputting the array in reverse order, one actually wanted to reverse the contents of the array? One way to accomplish this task is to declare another array and then copy the contents of the first array into the second array in reverse order. However, what is a possible drawback of this solution? The problem is that it takes two arrays, or twice as much memory. In this example, it would require two 10-element arrays for a total of 20 elements. For a small array this is not much of a problem, but for a very large array, this would entail a substantial amount of memory. Instead, the solution is to reverse the array in place, thus using only one array.

The algorithm takes the first data item and the last data item and swaps them. Then, the second data item and the second to the last data item are swapped, and so on, as shown in Fig. 7.1.

Fig. 7.1 Reversing an array

Again, one needs to be careful not to swap elements that do not contain values. When n equals 6, element 0 is swapped with the n-1 element, then element 1 is swapped with the n-2 element, and so on. The loop control variable can be used for elements 0, 1, and 2, but how does one access elements n-1, n-2, and n-3? One solution is to use a second variable such as j, so that when the loop control variable, say i, is incremented, the variable j is decremented. But are two variables really needed? If one thinks about it, one should be able to see a pattern in accessing both ends of the data. When i is zero, the contents of location 0 need to be swapped with location n-1. Although a little difficult to see here, in the first instance i is equal to 0, so n-1 could be thought of as n-i-1. However, sometimes a pattern is difficult to see in the first instance but can be seen a little better in subsequent instances. Consider the next case: when i is 1, it needs to be swapped with n-2. Since i would be equal to 1, n-2 could again be thought of as n-i-1. So instead of using two indexes, only one index is needed, which is a little more elegant.

Lastly, the matter of the swap needs to be considered. If the contents of two simple variables need to be swapped, how can this be accomplished? When the value of one variable is transferred to another variable, the previous contents of the variable being swapped into are destroyed, so the previous contents need to be stored in a temporary memory location, often called temp.
First the contents of the variable x need to be put aside in the temporary memory location temp using a temp = x; instruction. + +Once the contents of variable x have been moved into temp, the contents of variable y can be copied into the variable x using an x = y; instruction. + +Now that the contents of y have been copied into x, the contents of temp can be copied into the variable y using a y = temp; instruction. + +The whole sequence of instructions is as follows: + +temp = x; + +x = y; + +y = temp; + +So how can this be used with an array? Instead of using simple variables, the corresponding locations of the array can be substituted using the indexes i and n-i-1 as discussed above and shown below: + +temp = number[i]; + +number[i] = number[n-i-1]; + +number[n-i-1] = temp; + +Assuming i is equal to 0 and n is equal to 6, then going from left to right in Fig. 7.2 the execution of the three instructions is shown in the dashed boxes above each array. + +Fig. 7.2 + +Swapping items in an array + +Putting it all together with the loop results in the following code segment. However, one needs to be careful when writing the code to solve this problem. For example, can the error in the following code segment be spotted? + +// *** Caution: Incorrectly implemented code *** + +for(i=0; i<n; i++) { + +temp = number[i]; + +number[i] = number[n-i-1]; + +number[n-i-1] = temp; + +} + +The problem is that the loop iterates n times, so after the elements have been swapped in the first half of the array, they are swapped back again in the second half, leaving the array in its original order. The solution is to stop at the middle of the array by iterating only n/2 times as shown below: + +for(i=0; i<n/2; i++) { + +temp = number[i]; + +number[i] = number[n-i-1]; + +number[n-i-1] = temp; + +} + +## 7.7 Sorting Data + +### 7.7.1 Simplified Bubble Sort + +One of the simplest ways to sort the data in an array is the bubble sort. The idea is to compare adjacent elements of the array and swap them if they are out of order, so that on each pass through the array the smallest value slowly bubbles its way toward the top of the array. The outer loop controls the number of passes and the inner loop compares each pair of adjacent elements as shown in the following code segment, where as before n contains the number of values in the array: + +for(i=0; i<n-1; i++) + +for(j=0; j<n-1-i; j++) + +if(number[j] > number[j+1]) { + +temp = number[j]; + +number[j] = number[j+1]; + +number[j+1] = temp; + +} + +The reader is encouraged to walk through the code segment to see how the algorithm works. Again, notice how the smallest number slowly moves or bubbles its way to the top of the array during each pass, thus giving the name to the bubble sort. To analyze the speed of this algorithm, it should be noticed that the outer loop iterates n-1 times. However, when doing analysis like this, iterating one less time than n is not very significant for a very large number n, so it is said to be of order n. The inner loop iterates one less time on each pass going from n-1 to 1 times, where it could be said that it loops on average n/2 times. But again, for a very large n, the result of the division by two would still be a large number, so it is also said to be of order n. Recall from Chap. 4 that with two nested loops each iterating n times, the total number of iterations would be n*n, or n 2. Since in the current example one loop is nested inside the other and each loop is iterating approximately n times, this algorithm is of order n 2, or O(n 2). + +### 7.7.2 Modified Bubble Sort + +In the previous simplified sorting algorithm, does it make any difference whether the data in the array is in reverse order, random order, or already sorted? The answer is no, because the outer loop will still iterate n-1 times and the inner loop will still iterate n/2 times on average. Although this does not make a difference if the array is in reverse order, nor does it make a lot of difference if the array is totally random, what if the array is already sorted? Granted this might not happen very often, but if it was already sorted, it would still take O(n 2) to sort an already sorted array. Is there some way that this can be improved? During the first pass through the array, if there are no swaps between any of the pairs of elements, then it would be known that the array is already in order. Can the program be modified to take advantage of this scenario? Yes, a boolean flag can be used to indicate whether a swap has or has not occurred, and a good name for this flag is swap.
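Before adding the flag, it may help to run the simplified sort from start to finish. The following is a minimal sketch (the class name BubbleSortDemo and the sample data are hypothetical, not from the text): + +class BubbleSortDemo { + +public static void main(String[] args) { + +int[] number = {42, 7, 19, 3, 25}; + +int n = number.length; + +int temp; + +// outer loop controls the number of passes + +for(int i=0; i<n-1; i++) + +// inner loop compares each pair of adjacent elements + +for(int j=0; j<n-1-i; j++) + +if(number[j] > number[j+1]) { + +temp = number[j]; + +number[j] = number[j+1]; + +number[j+1] = temp; + +} + +// output the sorted array: 3 7 19 25 42 + +for(int i=0; i<n; i++) + +System.out.println(number[i]); + +} + +}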
+ +The first for loop could be replaced with a while loop that not only checks to see how many passes have occurred but also checks to see if a swap has occurred. If a swap has not occurred, then another pass is not necessary. Initially the swap flag could be set to true prior to any code to indicate that a swap has occurred. This would force the execution of the first pass through the outer loop. The first thing to be done inside the loop is to reset the swap flag to false, so that in case there are no swaps during the inner loop, no subsequent passes through the outer loop need to occur. Lastly, should a swap occur in the if statement, the swap flag is set to true, thus forcing another pass through the outer loop: + +swap = true; + +i = 0; + +while(i < n-1 && swap) { + +swap = false; + +for(j=0; j<n-1-i; j++) + +if(number[j] > number[j+1]) { + +swap = true; + +temp = number[j]; + +number[j] = number[j+1]; + +number[j+1] = temp; + +} + +i++; + +} + +As before, notice that swap is used in the while loop instead of swap == true or swap != false. Also notice the addition of the extra set of braces for the while loop, because now syntactically there are three statements in the body of the loop: the setting of swap to false, the for statement, and the increment of i. Lastly, notice that if there is more than one swap in the inner for loop, swap is set repetitively to true. Although this seems a little redundant, it is quicker and easier to just keep setting swap back to true than adding code to check to see if it is already set to true. + +The result is that if the data in the array is in reverse order, there is no increase in the speed of the algorithm. However, if the data is already in order, then there is only one pass through the outer loop, and the inner loop iterates n-1 times. So, this algorithm with data already sorted is O(n), and the bubble sort is one of the fastest sorting algorithms for data that is already in order. Although it might seem a little confusing to use a sorting algorithm on data that is already sorted, the algorithm also works fairly well for data that is close to being in order. If only a few items need to be swapped, then the outer loop will only iterate a few times, until there is a pass without any swaps, in which case the outer loop stops iterating. So in cases where data is possibly in order, or close to being in order, the bubble sort is a very good sort. However, for large amounts of data that is in reverse order, close to being in reverse order, or totally random, the bubble sort is not the best choice. As will be seen in later courses, there are a number of other sorting algorithms that can handle these situations much faster. Nonetheless, for this text, the bubble sort provides a good starting point for understanding how sorting algorithms work and can be used to sort small sets of data. + +## 7.8 Two-Dimensional Arrays + +The preceding sections introduced how to declare variables for one-dimensional arrays, how to create them, and how to access elements in them. One-dimensional arrays work well when dealing with a set of data such as a collection of grades for one student. However, what if there are multiple sets of data, such as grades for several students? Then, the data could be stored in a two-dimensional array, which is sometimes called a 2D array. + +### 7.8.1 Declaration, Creation, and Initialization + +Suppose that there are four students in a class and they each took three exams.
Instead of creating four separate one-dimensional arrays in order to record the exam scores for each student, one two-dimensional array can be used to store all the scores. Three exam scores for each student are kept in a row; therefore, there will be four rows and three columns in the table. Assume that the scores are of type int and the name of the array is scores. To declare a two-dimensional array, two sets of brackets are required. The first one is for the rows and the second one is for the columns as shown below: + +int scores[][]; + +which is equivalent to + +int[][] scores; + +The two sets of brackets can appear either after or prior to the name of the array, and the second form above is used more often. + +The following creates a two-dimensional array of four by three integer values: + +scores = new int[4][3]; + +The number 4 in the first set of brackets specifies the number of rows and the number 3 in the second set of brackets specifies the number of columns. The diagram in Fig. 7.8 illustrates the array after its creation. Notice that a two-dimensional array is actually an array of one-dimensional arrays: each element of the outer array is itself a one-dimensional array. + +Fig. 7.8 + +After creation of 2D array + +An array can be declared and created at the same time using the following statement: + +int[][] scores = new int[4][3]; + +The diagram for the above statement is the same as that in Fig. 7.8. Again, in order to reinforce the concepts of declaration and allocation, two separate instructions are used in this text. + +To access the data in a two-dimensional array, two subscripts or indices are used, one for the row number and the other for the column number. As in a one-dimensional array, each index is of type int and starts from 0. The first exam score of the first student is stored in scores[0][0], the second exam score is stored in scores[0][1], and the third exam score is stored in scores[0][2]. The scores for the second student are kept in scores[1][0], scores[1][1], and scores[1][2]. The scores for the third and fourth students are stored in a similar fashion. Suppose that the first student made a 72 on the first exam, an 85 on the second exam, and a 91 on the third exam. Then, the following statements store the scores for the first student in the appropriate positions in the array: + +scores[0][0] = 72; + +scores[0][1] = 85; + +scores[0][2] = 91; + +If the second student made 95, 89, and 90 on the three exams, the statements below will initialize the scores for the second student: + +scores[1][0] = 95; + +scores[1][1] = 89; + +scores[1][2] = 90; + +Scores for the third and fourth students can be entered in a similar manner. The diagram in Fig. 7.9 shows the two-dimensional array after the initialization. + +Fig. 7.9 + +After initialization of 2D array + +Alternatively, the following statement will declare, create, and initialize a two-dimensional array: + +int[][] scores = {{72, 85, 91}, + +{95, 89, 90}, + +{77, 65, 73}, + +{97, 92, 93}}; + +The size of the array is determined by the number of values provided in the sets of braces without explicitly specifying it inside the brackets. The diagram after the above statement is equivalent to the one in Fig. 7.9. + +### 7.8.2 Input and Output + +Although the techniques of assigning data used in the previous section are adequate for testing programs, how can the data be entered by the user?
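Before answering that, the declaration and initialization forms above can be verified with a short sketch (the class name ScoresDemo and the use of Arrays.deepToString are illustrative, not from the text): + +import java.util.Arrays; + +class ScoresDemo { + +public static void main(String[] args) { + +int[][] scores = {{72, 85, 91}, + +{95, 89, 90}, + +{77, 65, 73}, + +{97, 92, 93}}; + +// scores.length is the number of rows and scores[0].length the number of columns + +System.out.println(scores.length + " rows, " + scores[0].length + " columns"); + +// prints [[72, 85, 91], [95, 89, 90], [77, 65, 73], [97, 92, 93]] + +System.out.println(Arrays.deepToString(scores)); + +} + +} + +As for having the user enter the data from the keyboard: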
It is similar to a one-dimensional array, but instead of using a simple for loop, a nested for loop is used as shown below: + +int[][] scores; + +scores = new int[4][3]; + +for(int i=0; i<4; i++) { + +for(int j=0; j<3; j++) { + +System.out.print("Student " + (i+1) + ", exam " + (j+1) + ": "); + +scores[i][j] = scanner.nextInt(); + +} + +System.out.println(); + +} + +Notice that each position in the array can be accessed using two index variables, i and j, for the row number and the column number, respectively, inside the loop. A portion of the output with sample input is as follows: + +Student 1, exam 1: 72 + +Student 1, exam 2: 85 + +Student 1, exam 3: 91 + +Student 2, exam 1: 95 + +Student 2, exam 2: 89 + +Student 2, exam 3: 90 + +... + +Alternatively, the number of rows and columns could be entered by the user, and a two-dimensional array could then be created dynamically as discussed in Sect. 7.4. Once the scores are in the array, one can output them using a nested for loop. Suppose the three exam scores for each student are to be output in a row. The code segment below outputs the column labels first followed by the row labels and scores: + +System.out.println("exam 1 exam 2 exam 3"); + +for(int i=0; i<4; i++) { + +System.out.print("Student " + (i+1)); + +for(int j=0; j<3; j++) + +System.out.print(" " + scores[i][j]); + +System.out.println(); + +} + +Notice that the print statement for the column headings is outside the nested for loop, since they are only output once. The print statement for the row label is located prior to the inner for loop, which means it is output every time the control variable i of the outer for loop changes. Also notice that the three scores for each student are output on the same line using the print in the inner for loop. The println after the inner for loop moves the cursor to the next line for the next student. The output from the above code segment is as follows: + +exam 1 exam 2 exam 3 + +Student 1 72 85 91 + +Student 2 95 89 90 + +Student 3 77 65 73 + +Student 4 97 92 93 + +What if all the scores of the three exams need to be output line by line as shown below? + +Student 1 Student 2 Student 3 Student 4 + +exam 1 72 95 77 97 + +exam 2 85 89 65 92 + +exam 3 91 90 73 93 + +Again, a nested for loop can be used. In order to access all the scores in one column of the array before going to the next column, the column number has to remain the same in the outer for loop, while the row number changes in the inner for loop. This is left as an exercise at the end of the chapter. + +### 7.8.3 Processing Data + +Using the array scores, how can the average of the three exam scores for the first student be calculated? All the scores for the first student are stored in the first row of the two-dimensional array. In order to find the average, the values in the first row have to be added together and divided by the number of exams. The following formula will find the average for the first student: + +(scores[0][0] + scores[0][1] + scores[0][2])/3; + +The average exam scores of the other students can be found in a similar way. However, if the instructor would like to find the averages for a large class, it would not be efficient to list the formula for each student. + +To process arrays, the length field is useful as discussed earlier in this chapter. When an array is created, a reference to the array is stored in the variable. At the same time, the length of the array is stored in an instance constant named length.
For a one-dimensional array, the length holds the number of elements in the array. Since a two-dimensional array is an array of one-dimensional arrays, there are several length fields associated with it. They keep track of the number of rows and the number of columns for each row. With the array shown in Fig. 7.9, the length of the array scores can be obtained by scores.length, which is the size of the one-dimensional array that the variable scores is referring to. In this case, the value would be 4 indicating the number of rows. As shown in Fig. 7.9, the elements of the array, scores[0], scores[1], scores[2], and scores[3], are references to one-dimensional arrays. Therefore, their length can be obtained by scores[0].length, scores[1].length, scores[2].length, and scores[3].length. Since it is a four by three array, all of them have a value of 3 indicating that the number of columns of the array scores is 3. + +Returning to finding the average of all the exam scores for the first student, a for loop can be used as shown below: + +double total, average; + +total = 0.0; + +for(int j=0; j<3; j++) + +total = total + scores[0][j]; + +average = total/3; + +The variable total contains the total of the three exam scores and the variable average holds the average. The variable total is initialized to 0.0 at the beginning, and inside the for loop, the three test scores, scores[0][0], scores[0][1], and scores[0][2], are added together. The row number is fixed at 0 and the value of the index variable j changes from 0 to 2 accessing the scores of the first student. Since there are three exams, the total was divided by 3. Although the elements of the array scores are of type int, the value for average most likely requires more precision. Therefore, both total and average were declared as type double in order to avoid integer division. Using the length field, the above code can be rewritten as + +double total, average; + +total = 0.0; + +for(int j=0; j<scores[0].length; j++) + +total = total + scores[0][j]; + +average = total/scores[0].length; + +# 8. Recursion + +One problem that can be solved recursively is raising a number x to a nonnegative integer power n, which can be defined as follows: + +x n = {if n > 0, then x n−1 * x, otherwise 1} + +This forms the basis of the method which could be written as follows: + +public static int power(int x, int n) { + +int answer; + +if (n > 0) + +answer = power(x,n-1)*x; + +else + +answer = 1; + +return answer; + +} + +Notice the method is declared as static, so that a class does not need to be defined nor does an object need to be created as discussed in Chap. 5. Further, note that a local variable answer has been declared. As will be discussed later, this will waste memory in recursion, but for now using a memory location will be very helpful in tracing through the program using contour diagrams. After the code is understood using contours, the method can be rewritten to save memory as will be shown later. More importantly, notice that the power method is calling itself. Is that legal? Yes it is, but as discussed above, there needs to be a way to stop the recursion, and that is the purpose of the else section and the terminal case of answer=1. Of course, a main program will need to be written to drive the method as shown in Fig. 8.1 with line numbers to help facilitate seeing the code execute via contours. + +Fig. 8.1 + +main program and power method + +Before calling the power method, notice that the main program checks whether x is greater than or equal to 0, that n is greater than or equal to 0, and that x and n are not both 0. It is often best to first test the base case to ensure that it is working properly. So to start, assume that the user has entered a value of 2 for x and 0 for n.
Since n is not greater than 0, there should be no recursion, and answer is assigned a value of 1, which is returned to the main program and output. Because this is a simple instance, a contour will not be drawn for this case. + +However, what if x is equal to 3 and n is equal to 2? This is when things start to get interesting and contours are very helpful. Figure 8.2 shows the state of execution just prior to Line 22 in power. + +Fig. 8.2 + +Contour prior to the execution of Line 22 in the first call to power + +As discussed in Chap. 1, although typically the contour for Ch8Sample1 would not be drawn, it is helpful to see it in this case. Since the power method is static, notice that an object is not created nor is there a reference to an object. Instead, the contour for power is drawn in the class Ch8Sample1, just like the main method, which is also declared as static. As can be seen in the contour for power, there is a new cell called ret. This is not the value returned from a method, but rather indicates where the method will return upon completion. Whereas previously it was fairly clear where a method was returning, with recursion and its multiple calls, it might not be so obvious. The ret cell also lists a type of addr, which is an abbreviation for address. Although there is not a type associated with this cell as there is with other variables and parameters, the address is the place where the flow of control will be transferred when the method is finished. Lastly, note that the line number is abbreviated as L14 and the name of the method main is included in the cell. Although in this case it should be apparent that Line 14 is in main, indicating the name of the method will be important as will be seen shortly. + +Since n is greater than 0, once Line 23 has begun to execute, the first thing that needs to be done is to recursively call the power method. Figure 8.3 shows the state of execution just prior to Line 22 in the second call to the power method. + +Fig. 8.3 + +Contour prior to the execution of Line 22 in the second call to power + +As can be seen, there are now two contours depicting the power method. Similar to when there was more than one object of the same type in Chap. 5, notice that superscripts have again been employed to distinguish between the two contours. Also note that when calling power a second time, the value of n has been decremented by 1. Lastly, notice that the ret field points back to Line 23 in the first call to power. Of course, when Line 22 in the second call to power is executed, n is still greater than 0, and there is another call to power as shown in Fig. 8.4 illustrating the state of execution prior to Line 22. + +Fig. 8.4 + +Contour prior to the execution of Line 22 in the third call to power + +The third contour has now been added, where the return is to Line 23 in the second call to power and n is equal to 0. This time when Line 22 is executed, n is no longer greater than 0, but rather equal to 0, so instead of making the recursive call in the then section of the if statement, the else section is executed. This is the terminal case and no more recursive calls will occur. Instead 1 is assigned to answer, and Fig. 8.5 shows the state of execution prior to Line 26 in the third call to power. + +Fig. 8.5 + +Contour prior to the execution of Line 26 in the third call to power + +After the execution of Line 26, the value in answer is returned to Line 23 in the second call to power. Then the value 1 is multiplied by the value 3 in x.
The result is then placed into the variable answer, and Fig. 8.6 shows the state of execution prior to Line 26 in the second call to power. + +Fig. 8.6 + +Contour prior to the execution of Line 26 in the second call to power + +Of course, the first thing one notices is that the contour for the third call to power is now shaded light gray to indicate that it is deallocated. Also, the value 3 is in answer ready to be returned to the first call to power. As before, contours can simply be erased as done in Fig. 8.7 which shows the state of execution prior to Line 26 in the first call to power. + +Fig. 8.7 + +Contour prior to the execution of Line 26 in the first call to power + +Notice that the value 3 returned from the second call to power has been multiplied by the value 3 in x and the result 9 is placed in answer. The flow of control continues to Line 26, and the value 9 is returned to the calling program. The 9 is then placed into answer as illustrated in Fig. 8.8 which shows the state of execution just prior to Line 15 in main. + +Fig. 8.8 + +Contour prior to the execution of Line 15 in the first call to main + +Looking back at the base case in Fig. 8.5, notice that there were a lot of memory locations used to find answer in Fig. 8.8. If recursion takes up so much memory, why use it? Again, some problems are more naturally expressed using recursion than iteration. Further, with memory being much less expensive than it was in the past, the use of recursion is much less costly. Still, some larger problems can use quite a bit of memory, and there are some techniques to cut down on its usage. For example, the previous method used a variable answer each time a contour was created. Instead of assigning the result of the calculation to a variable, it can simply be returned to the calling method as shown in the following segment: + +public static int power(int x, int n) { + +if (n > 0) + +return power(x,n-1)*x; + +else + +return 1; + +} + +Of course, the method uses two return statements, which is considered unstructured programming. Again, if memory is a concern, this might be a justifiable trade-off. It is often helpful to initially write an algorithm with some built-in inefficiencies to ensure that it is working properly and then optimize the code, rather than initially trying to optimize the code and risk creating code that does not work correctly in the first place. + +## 8.3 Stack Frames + +Notice that each time a recursive call occurs, another contour is drawn, and each time a new contour is created, more memory is used. Contours are helpful in understanding the process of recursion. But how is this actually accomplished in the computer? It is done using a stack. A stack is known as a LIFO structure, which stands for Last In First Out. That means that the last item put on the stack is the first one taken off the stack, not unlike a stack of papers on a desk. The process of putting an item on a stack is known as a push operation, and the task of removing an item is known as a pop operation. + +When a method is called the first time, the values are stored in the variables, like when the first contour is drawn. However, in the program there is only one set of variables. What would happen when there is a recursive call to a method? What happens to the values in the variables? Instead of drawing a new contour, the variables in the contour need to be reused.
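As an aside, push and pop can be seen directly using java.util.ArrayDeque, one of the classes Java provides for stack-like behavior (this sketch is illustrative and not from the text): + +import java.util.ArrayDeque; + +class StackDemo { + +public static void main(String[] args) { + +ArrayDeque<Integer> stack = new ArrayDeque<>(); + +// push the values 1, 2, and 3 onto the stack + +stack.push(1); + +stack.push(2); + +stack.push(3); + +// pop returns the values in reverse order: 3, 2, 1 + +while(!stack.isEmpty()) + +System.out.println(stack.pop()); + +} + +} + +The last value pushed is the first value popped, which is exactly the behavior needed to save and restore the values in a method's variables across recursive calls.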
The result is that all the variables in the method, along with some other possible information associated with the method, form what is known as a stack frame, and it is pushed onto the stack. Once the values from the variables are stored on the stack, new values can now be stored in the variables. Each time there is another recursive call, the process is repeated. When there is a terminal case, the process reverses itself. As a simple example, assume there is only one recursive call. The values are pushed onto the stack and the variables reused. Then after the terminal case, the values can be popped off the stack and placed back into the variables, and the processing can complete. + +Using the same example from the previous section calculating 3 2 and using only a partial contour diagram, Fig. 8.9 is the state of execution just prior to Line 26 in the program in Fig. 8.1 in the third call to power. + +Fig. 8.9 + +Contour and stack prior to the execution of Line 26 in the third call to power + +Figure 8.9 corresponds to Fig. 8.5 in Sect. 8.1. Note first that there is only one contour for power. Even though it represents power 3 , it is just labeled power since the contour is used for all calls to power. As each call is made, the contents of the power contour are pushed onto the stack. When power 1 called power 2 , the variables in power 1 were pushed onto the stack so that power 2 could use the variables in the contour. Then when power 3 was called, the contents for power 2 were pushed onto the stack so that power 3 could use the contour. Once power 3 is ready to return to power 2 , the stack frame for power 2 is popped off the stack and put back into the contour, and so on. Simply stated, each new contour created after the first one means another stack frame needs to be pushed onto the stack, and each time a contour is deallocated, that means that a stack frame is popped off the stack. + +Note that the names of the cells and their types are not pushed onto the stack, only the contents. However, also notice that the order in which they are pushed is the same as they occur in the contour, so one can determine which cell is which. Although one could draw the stack with the other information, it gets a little cumbersome, and this is one of the reasons why contours are sometimes a little more convenient. + +But wasn't it said that each recursive call wastes memory? The answer is yes, because the stack is implemented in the computer's memory and each time a stack frame is pushed onto the stack, more memory is used. If infinite recursion occurs, oftentimes a message will be output saying something to the effect that there is a stack overflow, meaning that the stack is full and no memory is available to push more items onto the stack. + +Notice that contours and stack frames are just two ways of looking at the same process. Although the stack frame model is more accurate, it is a little more cumbersome to draw, whereas the contour model is easier to draw and makes it easier to keep track of previous values. The importance of keeping track of previous values will become even more apparent in the next section with a more involved use of recursion. + +## 8.4 Fibonacci Numbers + +Another example of the use of recursion is the calculation of Fibonacci numbers that one may have encountered in a mathematics course.
The Fibonacci numbers can be defined as follows: + + * Fibonacci(0) = 0 + + * Fibonacci(1) = 1 + + * Fibonacci(2) = 0 + 1 = 1 + + * Fibonacci(3) = 1 + 1 = 2 + + * Fibonacci(4) = 1 + 2 = 3 + + * Fibonacci(5) = 2 + 3 = 5 + + * Fibonacci(6) = 3 + 5 = 8 + +Although this is an iterative definition, it can help in finding a recursive definition. First, notice the base or terminal cases for 0 and 1. Then notice that any other given line is the addition of the two previous lines. For example, Fibonacci(6) is the sum of the numbers 3 and 5, which are the answers for the fourth and fifth Fibonacci numbers. In other words, couldn't Fibonacci(6) be defined in terms of adding Fibonacci(5) and Fibonacci(4)? The answer is yes, but what would the nth Fibonacci number look like? It would be as follows: + + * Fibonacci(n) = Fibonacci(n − 1) + Fibonacci(n − 2) + +Putting the base case and the nth case together, the definition of the Fibonacci numbers for nonnegative integers would be as follows: + + * Fibonacci(n) = { if n = 0 or n = 1, then n, + + * otherwise Fibonacci(n − 1) + Fibonacci(n − 2)} + +Given this definition, the code can then be written. As in the previous sections, it helps to use local variables to make the reading of contour diagrams easier. + +public static int fib(int n) { + +int answer1,answer2,answer; + +if (n > 1) { + +answer1 = fib(n-1); + +answer2 = fib(n-2); + +answer = answer1 + answer2; + +} + +else + +answer = n; + +return answer; + +} + +Again notice that the method is static and the name of the method is fib to save space in subsequent contour diagrams. Putting the above method together with a main program and adding line numbers results in the program in Fig. 8.10. + +Fig. 8.10 + +Fibonacci program + +The main program checks for a negative number before calling the fib method. In the case where the input of n is either a 0 or a 1, the result is just a simple call to the terminal case, and a corresponding value of 0 or 1 is returned to the main program and output. However, more interesting is a nonterminal case, such as when n is equal to 3. Figure 8.11 shows the state of execution just prior to Line 21 in the first call to fib. + +Fig. 8.11 + +Contour prior to the execution of Line 21 in the first call to fib + +As before, notice L12 main in the ret cell and the superscript for fib indicating the first call. Since 3 is greater than 1, the then portion of the if is taken. Then a recursive call is made as shown in Fig. 8.12 just prior to the execution of Line 21 in the second call to fib. + +Fig. 8.12 + +Contour prior to the execution of Line 21 in the second call to fib + +In the second call to fib, the parameter n has been decremented by 1. Since 2 is greater than 1, another call is made, and Fig. 8.13 shows the state of execution prior to Line 21 in the third call to fib. + +Fig. 8.13 + +Contour prior to the execution of Line 21 in the third call to fib + +At Line 21, since n is no longer greater than 1 and the condition for the if statement is false, the else portion is executed and answer is set to 1. This value is then returned to Line 22 in the second call to fib, and the value 1 is stored in the variable answer1 as shown in Fig. 8.14 just prior to the execution of Line 23. + +Fig. 8.14 + +Contour prior to the execution of Line 23 in the second call to fib + +Notice that the variable answer in the third call to fib is 1 and that the contour is shaded gray.
Further, note that there are no values in answer1 and answer2 in the third call to fib, because it was a terminal case and no recursive calls were made. Again, notice the value 1 has been returned to the second call to fib and stored in answer1. However, instead of the flow of control returning back to the first call to fib as it did in the power example, there is another call to fib to calculate answer2. So Fig. 8.15 shows the state of execution prior to Line 21 in the fourth call to fib. + +Fig. 8.15 + +Contour prior to the execution of Line 21 in the fourth call to fib + +At first glance, it might appear that the contour for the third call to fib is no longer shaded gray. However, look carefully and notice that it is not the third call but rather it is labeled the fourth call to the method fib, the value for n is 0, and ret references Line 23 in the second call to fib. This is the calculation for the second part of the second Fibonacci number. As before, n is not greater than 1, so the else section of the if statement is executed and answer is assigned a value of 0 that is returned to the second call. Figure 8.16 illustrates the state of execution prior to Line 24 in the second call to fib. + +Fig. 8.16 + +Contour prior to the execution of Line 24 in the second call to fib + +As before, the contour for the fourth call to fib has been shaded to indicate deallocation, and the value 0 is returned to answer2 in the second call to fib. When Line 24 is executed, the values in answer1 and answer2 are added together and stored in answer. Then answer in the second call to fib is returned to answer1 in the first call to fib as shown in Fig. 8.17 illustrating the state of execution just prior to Line 23. + +Fig. 8.17 + +Contour prior to the execution of Line 23 in the first call to fib + +Note now that the fourth call to fib has been erased so as not to cause confusion with the second call to fib which is now shaded to indicate it has been deallocated. Also, answer in the second call to fib now contains the sum of answer1 and answer2. Further, the value 1 in answer in the second call to fib has been returned to answer1 in the first call to fib. Even though there have been a number of calls, the second half of the calculation still needs to be determined. Figure 8.18 shows the state of execution prior to Line 21 in the fifth call to fib. + +Fig. 8.18 + +Contour prior to the execution of Line 21 in the fifth call to fib + +As before, notice this is not the second call to fib, but rather it is the fifth call to fib to calculate answer2 in the first call to fib. Since n is not greater than 1, the else portion of the if statement in the fifth call to fib is executed, and a 1 is placed in answer and returned back to the first call to fib. Figure 8.19 shows the state of execution prior to Line 24 in the first call to fib. + +Fig. 8.19 + +Contour prior to the execution of Line 24 in the first call to fib + +The fifth call to fib is now shaded indicating deallocation, and the value in answer is returned to answer2 in the first call to fib. The values in answer1 and answer2 in the first call to fib are then added together and stored in answer, which is returned and assigned to answer in main. Figure 8.20 shows the state of execution prior to answer being output in Line 13 in main. + +Fig. 8.20 + +Contour prior to the execution of Line 13 in main + +As can be seen, the first call to fib is shaded to indicate deallocation, and answer in main contains the value 2 that was returned. 
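The total number of calls can also be confirmed by the program itself; the following is a minimal sketch (the static counter calls is an illustrative addition, not part of the program in Fig. 8.10): + +class FibCount { + +static int calls = 0; // counts every invocation of fib + +public static int fib(int n) { + +calls++; + +if (n > 1) + +return fib(n-1) + fib(n-2); + +else + +return n; + +} + +public static void main(String[] args) { + +System.out.println("fib(3) = " + fib(3)); + +System.out.println("calls = " + calls); // prints 5 for n = 3 + +} + +}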
Granted, this seems like a lot of work to calculate a Fibonacci number, but it shows the amount of memory that would be involved. Although there were a total of five calls to fib, only three contours were activated at any given time. As with the power method previously, the number of memory cells can be decreased by eliminating the temporary variables answer1, answer2, and answer as shown in the following code segment: + +public static int fib(int n) { + +if (n > 1) + +return fib(n-1) + fib(n-2); + +else + +return n; + +} + +As before, this introduces the unstructured practice of two return statements, but if memory is an issue, then this is a possible alternative. An even more efficient solution is to use iteration, which was an exercise in Chap. 4. + +As with the power function, a stack could also be used to represent recursion, but with more complex algorithms, it can be a little confusing. Yet another way to represent recursion is to use a tree of calls. The tree is drawn from the top down with the first call at the top, which is called the root. Then each call after that represents a branch, and terminal calls are referred to as leaves. The tree of calls for the Fibonacci number problem is shown in Fig. 8.21. + +Fig. 8.21 + +Tree of calls for fib(3) + +Notice that main makes a call to fib 1 (3), which then calls fib 2 (2), which then calls fib 3 (1). Once it is calculated, fib 3 returns the value 1 back to fib 2 , which calls fib 4 to calculate fib(0). Then the sum of those two can be returned to fib 1 , which calls fib 5 to calculate fib(1). When that is completed, a 1 is returned to fib 1 , which then adds the two numbers and returns a 2 to main. + +Which is a better method to walk through recursion: stack frames, a tree of calls, or contours? It depends on the situation. As stated previously, stack frames are the most realistic, but it is harder to keep track of each call with them. A tree of calls is short and convenient but lacks much of the detail. Given the drawbacks of these two extremes, contours are used in this text. As one gets more proficient with recursion, one might gravitate to using a tree of calls for a simple problem, but still use contours when a problem gets more complicated or stack frames when an accurate picture is needed. + +## 8.5 Complete Program: Implementing Recursion + +A program which computes the greatest common divisor of two integers using recursion will be developed in this section. The program will + + * Ask the user to enter two integers + + * Compute the greatest common divisor + + * Display the result + +Of all the integers that divide the two numbers given, the largest is known as the greatest common divisor. For example, the positive divisors of 36 are 1, 2, 3, 4, 6, 9, 12, 18, and 36, and the positive divisors of 8 are 1, 2, 4, and 8. Thus, the common divisors of 36 and 8 are 1, 2, and 4. It follows that the greatest common divisor of 36 and 8 is 4. The Euclidean algorithm, which computes the greatest common divisor of two integers, starts with a pair of positive integers. It forms a new pair that consists of the smaller number of the two and the remainder, which is obtained by dividing the larger number by the smaller number. This process repeats until one number is zero, and then the other number is the greatest common divisor of the original pair. The following illustrates how the greatest common divisor of 36 and 8 is found. First, 36 divided by 8 is 4 with a remainder of 4 (4 = 36 − 4 × 8).
Then, 8 divided by 4 is 2 with a remainder of 0 (0 = 8 − 2 × 4). Since the last remainder is zero, the algorithm ends with 4 as the greatest common divisor of 36 and 8. + +A recursive method to find the greatest common divisor of two positive integers can be defined by the following: + + * gcd(num1, num2) = {if num2 ≥ 1, then gcd(num2, num1%num2), otherwise num1} + +Recall from Sect.​ 1.​7 that % is the mod operator and if num1 and num2 are integers, num1%num2 returns the remainder. For example, 36%8 is 4. The implementation of the method gcd is shown below: + +public static int gcd(int num1, int num2) { + +if(num2 >= 1) + +return gcd(num2, num1%num2); + +else + +return num1; + +} + +The above method can be invoked for the pair 36 and 8 by + +int result; + +result = gcd(36, 8); + +After the execution of the method, the variable result will contain 4. In order to compute the greatest common divisor of 36 and 8, how many method calls were made? The first method call was gcd(36, 8), the next call was gcd(8, 4), and then gcd(4, 0) which was the last method call, resulting in a total of three method calls. The complete program with a main method is shown below: + +import java.util.Scanner; + +public class Gcd { // the class name Gcd is assumed; the original listing was not preserved + +public static int gcd(int num1, int num2) { + +if(num2 >= 1) + +return gcd(num2, num1%num2); + +else + +return num1; + +} + +public static void main(String[] args) { + +Scanner scanner = new Scanner(System.in); + +int num1, num2; + +System.out.print("Enter first number: "); + +num1 = scanner.nextInt(); + +System.out.print("Enter second number: "); + +num2 = scanner.nextInt(); + +System.out.println("The greatest common divisor of " + num1 + " and " + num2 + " is " + gcd(num1, num2) + "."); + +} + +} + +When the above code is compiled and executed using the sample input of 36 and 8, the output of the program is as follows: + +Enter first number: 36 + +Enter second number: 8 + +The greatest common divisor of 36 and 8 is 4. + +## 8.6 Summary + + * It helps to hunt for patterns when trying to create a recursive definition. + + * Be sure to identify the base or terminal case. + + * Without a base case, "infinite" recursion will occur. + + * When using contours, it is helpful to use local variables to store information. + + * To optimize recursion, eliminate local variables. + + * Drawing a stack frame and creating a tree of calls are alternatives to contour diagrams. + +## 8.7 Exercises (Items Marked with an * Have Solutions in Appendix E) + +1. + +Draw a series of contour diagrams to show the state of execution of the program in Fig. 8.1 for x = 2 and n = 3. + +2. + +Draw a series of contour diagrams to show the state of execution of the program in Fig. 8.10 for n = 2. + +3. + +Given the complete program in Sect. 8.5, what would happen if the numbers 36 and 8 were input in reverse order? How many contours for gcd would need to be drawn? + +4. + +Consider the program in Fig. 8.10 where Lines 22 and 23 are swapped. Draw a series of contour diagrams to show the state of execution for n = 3. + +5. + +Trace the program in Fig. 8.1 for x = 2 and n = 5 and draw a tree similar to the one in Fig. 8.21. + +6. + +Trace the program in Fig. 8.10 for n = 5 and draw a tree similar to the one in Fig. 8.21. + +*7. + +Write a recursive method to reverse a given string. The method accepts a string as a parameter and returns the reverse of the string. For example, if the argument is Java, then the method returns avaJ. + +8. + +Write a recursive method to multiply two positive integers using repeated addition. + +*9. + +Write a recursive method to compute the factorial of a nonnegative integer using the definition shown below: + +n! = {if n > 0, then n * (n − 1)!, otherwise 1} + +10. + +Write a recursive method to compute the binomial coefficient using the definition shown below: + +C(n, k) = {if k = 0 or k = n, then 1, otherwise C(n − 1, k − 1) + C(n − 1, k)} + +11. + +Find a reference on how to convert a decimal number to a binary number [4] and then write a recursive method to perform the conversion. + +Reference + +4. + +Streib JT (2011) Guide to assembly language: a concise introduction. Springer, London
# 9. Objects: Inheritance and Polymorphism + +James T. Streib and Takako Soma + +Department of Computer Science, Illinois College, Jacksonville, IL, USA + +Abstract + +This chapter returns to objects and explores the concept of inheritance. Contours are used to explain how a subclass is extended and inherits data members and methods from a superclass. Further, protected variables and methods along with abstract classes are discussed. Another object-oriented programming concept, polymorphism, which is a useful tool for developing software, is introduced. A complete program implementing inheritance and polymorphism is included. + +Objects were introduced in Chap.​ 2, and topics such as passing objects, method overloading, and class methods were discussed in Chap.​ 5. In this chapter the concepts of inheritance, overriding methods, abstract classes, and polymorphism will be illustrated. At first these concepts might sound a little bit intimidating, but introducing them with simple programs and contour diagrams makes the concepts easier to understand. + +## 9.1 Inheritance + +An important concept in object-oriented programming is software reuse. Writing a program when the same code needs to be written and rewritten with minor variations can be time-consuming and can also waste memory. Further, if the code has already been written for one situation, rewriting it not only wastes time and memory, but the chance of making a logic error in subsequent versions also increases. Instead, it makes sense to reuse software that has already been written and tested. A further advantage of software reuse is in the maintenance of code. When a segment needs to be changed, it only needs to be changed in one place, and again the chance of introducing logic errors decreases. An important way of maximizing software reuse is through inheritance. + +When a new class is created using inheritance, the new class can inherit data members and methods from an already existing class. The existing class is known as the parent class and the new class is called the child class. Also, the parent class is sometimes called the base class and the child class is called the derived class. An even more common name for the base class is the superclass, and the derived class is then called the subclass. + +As an example, a regular polygon has sides of equal length. Further, a three-sided regular polygon is an equilateral triangle, a four-sided regular polygon is a square, a six-sided regular polygon is a hexagon, and an eight-sided regular polygon is an octagon. Although there exists a generic formula for the area of an n-sided regular polygon, this text will use the specific algebraic formulas for each of the regular polygons to help illustrate the concepts of inheritance, overriding methods, abstract classes, and polymorphism. + +The specific equations for the area of each of these polygons share a common part: the length of one of its sides squared, or s 2. One might recognize that this is also the equation for the area of a square, and because a square is such a simple example, it is not included in subsequent examples. Since this equation is shared by all the other equations, it can be made local to the class for a regular polygon.
As a result, a regular polygon can be thought of as the superclass, and the triangle, hexagon, and octagon can be thought of as subclasses. + +Using a simple example, consider the RegPolygon class as shown in Fig. 9.1. Given the previous chapters on classes, the RegPolygon class should look fairly familiar. Notice the local private variable lenSide which is for the length of a side. The constructor initializes the variable with the value sent via the parameter. Further, there is one method that squares the length of the side using the pow method from the Math class. Lastly, as before, there is a local variable in the method that helps when using contour diagrams, but if memory were an issue, it could be eliminated and the expression could be used in the return statement. + +Fig. 9.1 + +RegPolygon class + +A main program segment that tests this class is shown in Fig. 9.2. Again, the statements in this program should be fairly familiar. A value is input from the user and a new instance of the RegPolygon class is created using the value that was input. Then the method is invoked and the value returned is output. + +Fig. 9.2 + +Main program segment using the RegPolygon class + +However, what if one wanted to write a new class for a triangle with a method to calculate the area of a triangle? One could just write the necessary expression and be done with it. + +However, as mentioned previously, isn't a triangle a regular polygon? The equation for the area of an equilateral triangle is (√3/4)*s 2, which includes s 2. If the RegPolygon class already exists, then couldn't methods of that class be used? The answer, as one might suspect, is yes. The RegPolygon class would then be the superclass and the Triangle class would be a subclass, and the Triangle class could inherit methods from the RegPolygon class. Another way of saying this is that the Triangle class is an extension of the RegPolygon class. + +How is this accomplished in a program? The first line in the Triangle class would indicate that it extends the RegPolygon class as follows: + +class Triangle extends RegPolygon { + +By doing so, the Triangle class now has access to the data member, method, and constructor in the RegPolygon class. So instead of having to rewrite code segments, it can now reuse these code segments. How is this accomplished? + +First, it helps to look at the constructor for the Triangle class. Since the RegPolygon class already contains the variable lenSide and a Triangle is an extension of a RegPolygon, instead of declaring a local private variable, the variable in the RegPolygon class could be reused. And instead of initializing it in the Triangle class, the constructor in the RegPolygon class can also be reused. The constructor in the superclass RegPolygon is invoked by using super(lenSide) as shown in the following constructor: + +public Triangle(int lenSide) { + +super(lenSide); + +} + +Note that in order to invoke the constructor of the superclass, super(lenSide) must be the first line in the constructor as shown above. To calculate the area of a triangle, one would need to multiply the result returned from the method calcRegPolyArea in the RegPolygon class by √3/4 as shown below: + +public double calcArea() { + +double area; + +area = Math.sqrt(3.0) / 4.0 * calcRegPolyArea(); + +return area; + +} + +Unlike the constructor, the invoking of other methods can occur anywhere in a method. As before, there is a local variable area declared in the method which will help later when creating contour diagrams.
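Assembled from the pieces above, the following is a minimal runnable sketch of the two classes with a small driver (the driver class name InheritanceDemo and the fixed side length are hypothetical, not from the text): + +class RegPolygon { + +private int lenSide; + +public RegPolygon(int lenSide) { + +this.lenSide = lenSide; + +} + +public double calcRegPolyArea() { + +return Math.pow(lenSide, 2); // s squared + +} + +} + +class Triangle extends RegPolygon { + +public Triangle(int lenSide) { + +super(lenSide); // reuse the superclass constructor + +} + +public double calcArea() { + +return Math.sqrt(3.0) / 4.0 * calcRegPolyArea(); // (√3/4) * s squared + +} + +} + +class InheritanceDemo { + +public static void main(String[] args) { + +Triangle tri = new Triangle(2); + +System.out.println(tri.calcArea()); // prints about 1.73 + +} + +}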
Would the word super need to be used as it was in the constructor? The answer in this case is no, but it is optional as in super.calcRegPolyArea(). Are there cases where super is needed? Yes, it is required in the constructor and in some other special cases as will be shown shortly. However, as a general rule, if it is not needed, do not include it. Before proceeding, it is helpful to see the complete Triangle class as shown in Fig. 9.3. + +Fig. 9.3 + +Triangle class + +As always, it helps to see the main program segment that invokes the method in the Triangle class as shown in Fig. 9.4. The main program inputs lenSide for the triangle. It then creates a new instance of the Triangle class by invoking the constructor, which as seen in Fig. 9.3 invokes the constructor of the RegPolygon class. It then invokes the calcArea method of the Triangle class which subsequently invokes the calcRegPolyArea method of the RegPolygon class. Lastly, the area is output. But how does this look using contour diagrams? To do so requires putting Figs. 9.1, 9.3, and 9.4 together in a complete program with line numbers as shown in Fig. 9.5. + +Fig. 9.4 + +Main program segment using the Triangle class + +Fig. 9.5 + +Complete main program with the RegPolygon and Triangle classes + +As in previous chapters, not every step will be shown using contour diagrams, but steps will be shown only at critical points to illustrate how the code executes. Assuming that the user inputs 2 for lenSide, a good first stopping point in the execution of the program is just prior to Line 20 (abbreviated L 20 in Fig. 9.5) in the Triangle class as shown in Fig. 9.6. + +Fig. 9.6 + +Contour just prior to the execution of Line 20 + +Although the contour for a constructor is often not shown, it is shown here to help with understanding the flow of control of the program. First note that the parameter lenSide contains the value 2 passed from the main program, but it has not yet been assigned to the variable lenSide in the RegPolygon object. Further notice that the contour for Triangle is nested inside the contour for the RegPolygon class. As might be suspected, the reason for this is that RegPolygon is the superclass and Triangle is the subclass. As in the past, since Triangle is nested inside RegPolygon, it now has access to the non-private variables and methods in RegPolygon. In other words, it can inherit the non-private members of RegPolygon. As the execution of super(lenSide) occurs, the flow of control is transferred to the constructor in RegPolygon, and Fig. 9.7 shows the state of execution just prior to Line 32. + +Fig. 9.7 + +Contour just prior to the execution of the end of the constructor at Line 32 + +The value in the argument lenSide in the Triangle constructor is transferred to the parameter lenSide in the RegPolygon constructor, and from there it is assigned to the data member lenSide in RegPolygon. Notice in Fig. 9.7 that both the parameter lenSide in the RegPolygon constructor and the variable lenSide in RegPolygon now contain the value 2 from lenSide in Triangle. After the constructor in RegPolygon is done, it returns to the constructor for Triangle and control is returned to the main program. Figure 9.8 shows the state of execution just prior to Line 12. + +Fig. 9.8 + +Contour just prior to the execution of Line 12 in main + +Notice that the two contours for the constructors are gone and the variable lenSide in RegPolygon now contains a 2.
The method calcArea is then invoked, and the state of execution just prior to Line 24 is shown in Fig. 9.9. + +Fig. 9.9 + +Contour prior to the execution of Line 24 in calcArea + +Since Triangle is a subclass of RegPolygon, the contour for the method calcArea is created in Triangle as the constructor was previously. Then as Line 24 is executed, the method calcRegPolyArea is invoked, and the value for the variable a is calculated as shown just prior to Line 36 in Fig. 9.10. + +Fig. 9.10 + +Contour just prior to the execution of Line 36 in calcRegPolyArea + +Upon return from the method calcRegPolyArea, the state of execution just prior to Line 25 is shown in Fig. 9.11. Lastly, control is returned to the main program as shown just prior to output of the area on Line 14 in Fig. 9.12. + +Fig. 9.11 + +Contour prior to the execution of Line 25 in calcArea + +Fig. 9.12 + +Contour prior to the execution of Line 14 in the main program + +However, what if the name of the calcArea method in the Triangle class was changed to calcRegPolyArea? Would this cause a problem with the method calcRegPolyArea in the RegPolygon class? The answer is yes, because calcRegPolyArea in the Triangle class would have the same number and type of parameters as the calcRegPolyArea method in the RegPolygon class. A method in a subclass that has the same name, the same number of parameters, and the same type of parameters as another method in the superclass is known as an overriding method. Does this mean that there cannot be two methods of the same name, the same number of parameters, and same type of parameters, one in the superclass and one in the subclass? The answer is no, but if there is an overriding method, how does one access the method in the superclass? If calcRegPolyArea is invoked in the subclass, the method in the subclass would be used, and in this case it would recursively call itself which is not what is intended. As mentioned earlier, there are instances where the word super must be used and this is one of those instances. So, should one want to access the calcRegPolyArea method in the superclass, then the word super is no longer optional and must be used as shown in the segment in Fig. 9.13. + +Fig. 9.13 + +Overriding the calcRegPolyArea() method + +First, note that the name of the method has been changed from calcArea to calcRegPolyArea. Further, by including the word super prior to the call to calcRegPolyArea, the method in the superclass RegPolygon is invoked instead of recursively calling the calcRegPolyArea method in the subclass. Again, in this case the word super is not optional. Using the word super only when it is needed helps alert other programmers reading the code that there are two methods of the same name. For now, instead of changing the method name to calcRegPolyArea, the program in Fig. 9.5 will retain the method name calcArea. + +## 9.2 Protected Variables and Methods + +In the program in Fig. 9.5, what would happen if a method in the Triangle class tried to access the variable in the RegPolygon class? Specifically, what if the constructor in the Triangle class tried to access the variable lenSide in the RegPolygon class? The answer is the same as if trying to access the variable from the main program. If a variable is private, then it can only be accessed by methods in the RegPolygon class; thus the variable lenSide is initialized using the constructor. + +However, if a variable were made public, then the methods of the subclass could access it. 
Unfortunately, the variable would also be accessible from the main program as well. Is there a way that would allow only methods in the subclass to access a variable in the superclass, but still not allow the variable to be accessed from the main program? The answer is yes. Instead of private or public access, protected access can be used as shown in the following: + +protected int lenSide; + +Now instead of initializing the variable via the RegPolygon constructor, the variable can be accessed directly as in the following modified Triangle constructor: + +public Triangle(int lenSide) { + +super.lenSide = lenSide; + +} + +To access the variable lenSide in the RegPolygon class, notice the use of the word super. Also note that this could have been used instead of super, but the use of the word super is preferred because it alerts programmers who might subsequently read the code that the variable is not located in the current class but rather in the superclass. + +Since the RegPolygon constructor would no longer be invoked, it could be deleted. However, if it were retained, but not invoked, a default constructor would need to be added to the RegPolygon class as follows: + +public RegPolygon() { + +} + +Although accessing a variable in this manner works and is better than declaring a variable as public, it can still suffer from some of the same problems as being declared public when there are a large number of subclasses. As a result, given a choice between accessing a protected variable or accessing a private variable via a method, this text will generally choose the latter as shown previously in Fig. 9.5. + +However, notice in Fig. 9.5 that although the variable in the RegPolygon class is private, the methods are public. While this is acceptable when access to the method is needed by both the main program and a subclass, what if access is only needed via the subclass and not from the main program? Is there a way that this can be accomplished? Again, as might be suspected, just as variables can be made accessible only by a subclass, this can also be true for methods. This is accomplished again using protected instead of public as shown in the following headings: + +protected RegPolygon(int lenSide) { + +protected double calcRegPolyArea() { + +This corresponds to the previous suggestion that variables should remain private and only be accessed through methods. Further, these methods can only be accessed from other methods within the class or any subclasses, and not from the main program. + +## 9.3 Abstract Classes + +Given the program in Fig. 9.5, there is nothing preventing the main program from creating an instance of the RegPolygon class. Although not very useful, even if the variable lenSide is private and the methods are protected, an instance could be created. Is there a way to make it so that an instance of the class cannot be created? Yes, and it is known as an abstract class. The result is that subclasses can still be defined, yet an instance of the superclass cannot be created. The following first line of the RegPolygon class shows how this is accomplished: + +abstract class RegPolygon { + +If it is possible to create an abstract class, is it also possible to create an abstract method? The answer again is yes.
## 9.3 Abstract Classes

Given the program in Fig. 9.5, there is nothing preventing the main program from creating an instance of the RegPolygon class. Although it would not be very useful, even if the variable lenSide is private and the methods are protected, an instance could still be created. Is there a way to make it so that an instance of the class cannot be created? Yes, and it is known as an abstract class. The result is that subclasses can still be defined, yet an instance of the superclass cannot be created. The following first line of the RegPolygon class shows how this is accomplished:

abstract class RegPolygon {

If it is possible to create an abstract class, is it also possible to create an abstract method? The answer again is yes. When creating an abstract method, the heading is declared in the superclass, but the body of the method is not defined, as in the following:

public abstract double calcArea();

Again, note that there is no body to the method and the first line of the method ends in a semicolon. If the heading is in the superclass and there is no body to the method, where is the body defined? The complete method is defined in the subclass as it was before and as shown below:

public double calcArea() {
    double area;
    area = Math.sqrt(3.0) * calcRegPolyArea() / 4.0;
    return area;
}

If the above method is the same as before, what is the advantage of doing this? The advantage is that it allows different subclasses to have different methods using the same heading to meet the needs of each subclass. For example, instead of a triangle, consider an octagon.

The name for this new class could be Octagon. Further, since the equation for the area of a regular octagon is 2(1 + √2)s², it could also be a subclass of the RegPolygon class. Since the factor s² is the same, the calcRegPolyArea method of the RegPolygon class could be invoked, but unlike the calculation for the area of the triangle, the result would not need to be multiplied by √3/4 but rather by 2(1 + √2). There is no change to the Triangle class, and the new Octagon class is as follows:

class Octagon extends RegPolygon {
    public Octagon(int lenSide) {
        super(lenSide);
    }
    public double calcArea() {
        double area;
        area = 2.0 * (1.0 + Math.sqrt(2.0)) * calcRegPolyArea();
        return area;
    }
}

Note in the first line that the Octagon class extends the RegPolygon class. Next, notice in the calcArea method that calcRegPolyArea() is not multiplied by √3/4 but rather by 2.0 * (1.0 + Math.sqrt(2.0)), as mentioned above.

Note that an abstract class does not have to have any abstract methods, but if a class has abstract methods, the class needs to be declared as an abstract class. Using an abstract method in the superclass forces both subclasses to define their own calcArea methods, and if the methods were not defined, a syntax error would occur. This is a handy feature to have when there are some differences among various subclasses, yet it is desired to retain some commonality among them.

## 9.4 Polymorphism

Another important feature of object-oriented programming is polymorphism, where the type of an object that is referenced by a superclass variable is determined at runtime instead of at compile time. This concept will be illustrated with the help of the examples below.

In Java, a variable of a superclass type can reference an object of any of its subclasses. In other words, both an object of the superclass and an object of a subclass can be referenced by a variable of the superclass type. Consider the definition of the class RegPolygon shown in Fig. 9.1, which is repeated below for convenience:

class RegPolygon {
    private int lenSide;
    public RegPolygon(int lenSide) {
        this.lenSide = lenSide;
    }
    public double calcRegPolyArea() {
        double a;
        a = Math.pow(lenSide, 2);
        return a;
    }
}
Further, the class Triangle from Fig. 9.3, with the modification described in Fig. 9.13 where the method calcArea is renamed to calcRegPolyArea, is shown below:

class Triangle extends RegPolygon {
    public Triangle(int lenSide) {
        super(lenSide);
    }
    public double calcRegPolyArea() {
        double area;
        area = Math.sqrt(3.0) / 4.0 * super.calcRegPolyArea();
        return area;
    }
}

The class Triangle is a subclass of the class RegPolygon, and the method calcRegPolyArea in the Triangle class overrides the method calcRegPolyArea in the RegPolygon class. Suppose two variables of type RegPolygon are declared in the main method as follows:

RegPolygon shape1, shape2;

Naturally, a reference to an object of the class RegPolygon can be assigned to these variables. For example, the following statement assigns an object of the RegPolygon class to the variable shape1:

shape1 = new RegPolygon(5);

In addition, a reference to an object of the class Triangle can also be assigned to these variables. The following statement assigns an object of the Triangle class to the variable shape2:

shape2 = new Triangle(2);

Next, using the method calcRegPolyArea defined in both the class RegPolygon and the class Triangle, the square of the side and the area of the triangle will be calculated. For the object referenced by shape1, the code segment can be found in Fig. 9.14. With a side of 5, this code segment will output

Fig. 9.14

Code segment finding the square of the side of shape1

area of shape1: 25.00

Now, what would happen when the code segment in Fig. 9.15 is executed for the object referenced by shape2? Recall that the variable shape2 references an object of type Triangle. Will the method calcRegPolyArea defined in the class RegPolygon be invoked, returning the square of the side, 4.00? The answer is no. Instead it will output the following:

Fig. 9.15

Code segment finding the area of shape2

area of shape2: 1.73

This is the area of a triangle with a side of length 2. The reason is that the type of the object invoking the method calcRegPolyArea determines which calcRegPolyArea method is called, either the one in the class RegPolygon or the one in the class Triangle. Even though the variable shape2 is of type RegPolygon, it references a Triangle object because that is the type assigned to it at runtime by the shape2 = new Triangle(2); statement. This means that the Triangle object is invoking the method calcRegPolyArea defined in the class Triangle when it is executed.

This is an example of polymorphism. The variables shape1 and shape2 could reference either a RegPolygon object or a Triangle object. At compile time, it cannot be determined what type of object they will reference. However, at runtime, when the object invokes the method calcRegPolyArea, the type of the object is determined and the appropriate calcRegPolyArea method is called.
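The code segments referenced as Figs. 9.14 and 9.15 are not reproduced above; a minimal sketch consistent with the surrounding discussion (the variable names area1 and area2 reappear below) is:

double area1, area2;
area1 = shape1.calcRegPolyArea();   // invokes the RegPolygon version: 5 squared
System.out.printf("area of shape1: %.2f%n", area1);
area2 = shape2.calcRegPolyArea();   // invokes the Triangle version at runtime
System.out.printf("area of shape2: %.2f%n", area2);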
If a variable of a superclass type can reference an object of a subclass type, can a variable of a subclass type reference an object of a superclass type? The answer is no. Consider the following code segment:

Triangle shape3;
shape3 = new RegPolygon(6);

The second statement causes a compile-time error, because a reference variable of a subclass type is not allowed to reference an object of its superclass. As one might suspect, the following statement is also incorrect,

shape3 = shape1;

because the variable shape1 is referencing an object of type RegPolygon. What about the following statement?

shape3 = shape2;

At first it looks okay, since the variable shape3 is of type Triangle and the variable shape2 references an object of the Triangle class. But the answer is again no. It causes a compile-time error because, even though shape2 references a Triangle object, the variable shape2 is of type RegPolygon. However, the following statement is legal:

shape3 = (Triangle) shape2;

The above statement uses a typecast operator, discussed in Chap. 1, which allows shape3 of type Triangle to reference the Triangle object that shape2 of type RegPolygon references.

Suppose another subclass of the class RegPolygon named Hexagon is defined. Since the equation for the area of a regular hexagon is (3√3/2)s², the class can be written as shown below:

class Hexagon extends RegPolygon {
    public Hexagon(int lenSide) {
        super(lenSide);
    }
    public double calcRegPolyArea() {
        double area;
        area = 3.0 * Math.sqrt(3.0) / 2.0 * super.calcRegPolyArea();
        return area;
    }
}

As discussed above, a variable of the class RegPolygon can reference an object of the class Hexagon, but a variable of the Hexagon class cannot reference an object of the RegPolygon class. Also, a variable of the Hexagon class cannot reference an object of the Triangle class, and vice versa, since the Hexagon and Triangle classes are both subclasses of the RegPolygon class, also known as sibling classes.

Returning to the output of the code segments in Figs. 9.14 and 9.15, instead of displaying the words "shape1" and "shape2" as shown below, would it be better if the type of the polygon were output?

area of shape1: 25.00

area of shape2: 1.73

Is there a way to determine the type of an object at runtime and output it? The answer is yes. To determine the type of an object, Java provides the operator instanceof. This operator is especially useful because a variable of a superclass type can reference an object of either its own class or a subclass type. Consider the following expression:

shape1 instanceof Triangle

This expression evaluates to true if the variable shape1 refers to an object of the class Triangle; otherwise it evaluates to false. Using the operator instanceof, the printf statements in Figs. 9.14 and 9.15 can be rewritten as follows:

if(shape1 instanceof Triangle)
    System.out.printf("area of triangle: %.2f", area1);
else
    System.out.printf("square of side: %.2f", area1);
System.out.println();
if(shape2 instanceof Triangle)
    System.out.printf("area of triangle: %.2f", area2);
else
    System.out.printf("square of side: %.2f", area2);
System.out.println();

The output of the above code segment is

square of side: 25.00

area of triangle: 1.73

Since the variable shape1 references a RegPolygon object, the first if condition evaluates to false, so the printf statement in the else block is executed, stating that the square of the side was calculated. For shape2, the then portion of the second if statement is executed. However, what would happen if there were a large number of shapes whose areas need to be calculated? Instead of having each object call the calcRegPolyArea method separately and having if statements for the output, an array of objects can be used to simplify the program.

Consider the creation of an array with different types of regular polygons. If the array is declared to be of type RegPolygon, each element of the array can reference an object of any of its subclasses. The following code segment declares and creates an array named shapes of type RegPolygon with five elements, each of which can reference a Triangle or a Hexagon object:

RegPolygon[] shapes;
shapes = new RegPolygon[5];

The following statements create either a Triangle object or a Hexagon object and place them in the array:

shapes[0] = new Hexagon(3);
shapes[1] = new Triangle(2);
shapes[2] = new Triangle(5);
shapes[3] = new Hexagon(4);
shapes[4] = new Triangle(4);

Once all the objects are stored in the array, a for loop can be used to calculate the areas and output them along with the type of each shape.
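A minimal sketch of such a loop, using the instanceof operator to label each shape, follows; the exact output format is an assumption:

for(int i = 0; i < shapes.length; i++) {
    double area = shapes[i].calcRegPolyArea();
    // the instanceof operator determines the runtime type of each element
    if(shapes[i] instanceof Triangle)
        System.out.printf("area of triangle: %.2f%n", area);
    else
        System.out.printf("area of hexagon: %.2f%n", area);
}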
Turning to file input, suppose a data file of exam scores ends with a negative sentinel value. A loop that reads and totals the scores through a Scanner object named inFile could be written as follows:

numStudents = 0;
totalExam1 = 0;
score = inFile.nextInt();
while(score >= 0) {
    numStudents++;
    System.out.println("score " + numStudents + ": " + score);
    totalExam1 = totalExam1 + score;
    score = inFile.nextInt();
}
average1 = totalExam1/numStudents;
System.out.println();
System.out.printf("average: %.2f", average1);

The variable numStudents is used to store the number of scores and to calculate the average after the loop. However, what if one did not want to include a sentinel value in the data file? It would seem that the program should be able to keep reading integers using a loop until there are no more scores in the file. Fortunately, the hasNextInt method can be used to check whether another integer value exists in the file. If it does not find an integer, the method returns false, and execution can continue with the statement that follows the loop. The revised loop is shown below:

numStudents = 0;
totalExam1 = 0;
while(inFile.hasNextInt()) {
    score = inFile.nextInt();
    numStudents++;
    System.out.println("score " + numStudents + ": " + score);
    totalExam1 = totalExam1 + score;
}
average1 = totalExam1/numStudents;
System.out.println();
System.out.printf("average: %.2f", average1);

The advantage of this technique is that the file does not need to contain a sentinel value, nor does the loop need a priming read. In addition to the method hasNextInt, there are a number of similar methods in the Scanner class that can be used with different types of data, as listed in Table 10.1.

Table 10.1

Selected methods of the Scanner class

Methods | Return type | Description

---|---|---

hasNext() | boolean | Returns true if there is another token available for input

hasNextDouble() | boolean | Returns true if the next token is a double value

hasNextInt() | boolean | Returns true if the next token is an int value

hasNextLine() | boolean | Returns true if there is another line available for input

next() | String | Returns the next token

nextDouble() | double | Returns the next token as a double value

nextInt() | int | Returns the next token as an int value

nextLine() | String | Returns the next line of input as a string. It may contain several tokens and spaces. A newline character \n may end the line of input, but it is not included in the returned string

Next, consider the case where the input file grades2.txt contains two sets of exam scores per line and the column headings as shown below:

Exam1 Exam2

71 95

60 80

75 76

The task is to find the average score of both sets of exam scores. Since the first two items in the file are not scores, they have to be extracted using the next method instead of nextInt and assigned to String variables to be output later.
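A minimal sketch of a complete program for this task follows; the class name, and accumulating the totals as double values so that the averages print with decimals, are assumptions here:

import java.io.File;
import java.io.IOException;
import java.util.Scanner;

public class ExamAverages {
    public static void main(String[] args) throws IOException {
        Scanner inFile = new Scanner(new File("grades2.txt"));
        String heading1 = inFile.next();   // "Exam1"
        String heading2 = inFile.next();   // "Exam2"
        System.out.println(heading1 + " " + heading2);
        int numStudents = 0;
        double totalExam1 = 0, totalExam2 = 0;
        while(inFile.hasNextInt()) {
            int score1 = inFile.nextInt(); // first score on the line
            int score2 = inFile.nextInt(); // second score on the same line
            numStudents++;
            System.out.println("Student " + numStudents + ": " + score1 + " " + score2);
            totalExam1 = totalExam1 + score1;
            totalExam2 = totalExam2 + score2;
        }
        inFile.close();
        System.out.printf("%s average: %.2f%n", heading1, totalExam1 / numStudents);
        System.out.printf("%s average: %.2f%n", heading2, totalExam2 / numStudents);
    }
}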
Notice that both sets of scores are read and added to the appropriate variables during each iteration of the while loop before moving on to the next line. Further, since the number of students is not known in advance, it is necessary for the program to count the number of input lines using the variable numStudents, as shown in Fig. 10.2.

Fig. 10.2

A program that inputs data from a text file

The output from the program would look like the following:

Exam1 Exam2

Student 1: 71 95

Student 2: 60 80

Student 3: 75 76

Exam1 average: 68.67

Exam2 average: 83.67

Each individual score is output as it is read from the file inside the loop, and the last two lines are output after the calculation of the averages outside the loop.

## 10.3 File Output

To send output to a file, the classes PrintWriter and FileWriter are used. The PrintWriter class prints formatted text using methods like print, println, and printf. The FileWriter class is the counterpart of the FileReader class and is meant for writing streams of characters. As with the FileReader class, the PrintWriter and FileWriter classes are contained in the package java.io, which needs to be imported at the beginning of the program. For file output, a variable of type PrintWriter is declared and associated with the destination, the file where the output will be stored. Suppose the output is to be stored in the file outs.txt in the same directory as the source code. Again, the way to specify an output file in a different directory will be discussed in Sect. 10.5. Consider the following statement:

PrintWriter outFile
    = new PrintWriter(new FileWriter("outs.txt"));

This statement creates an object of type PrintWriter named outFile and associates it with the file outs.txt. An output file does not have to exist before it is opened for output. If it does not exist, the system creates an empty file in the current directory. If the designated output file already exists, a new empty file will be created, replacing the previous file of the same name. Sometimes, however, new data should be appended to the end of the data that already exists in the file. The FileWriter class has an overloaded constructor that takes two arguments, as in

PrintWriter outFile
    = new PrintWriter(new FileWriter("outs.txt", true));

The first argument is the name of the file, and the second argument is a boolean value. If it is true and the file already exists, the contents of the file will not be erased, and the new data will be appended to the end of the file. If the argument is false and the file already exists, the file will be replaced by a new one. If the boolean value is not included in the argument list, the value false is assumed and an existing file will be replaced. Finally, in any case, if the file does not exist, a new file is created.

Similar to the Scanner class, an object of the File class can be associated with the file. Using an overloaded constructor of the PrintWriter class with a File object as an argument to create a PrintWriter object is shown below:

PrintWriter outFile
    = new PrintWriter(new File("outs.txt"));

Another overloaded constructor of the PrintWriter class simply takes a filename as an argument, just like the Scanner class, as shown below:

PrintWriter outFile = new PrintWriter("outs.txt");

The advantage of using an object of the class FileWriter over the File class or a simple filename is the ability to append text, if that is desired.
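As a usage example, the following sketch (the class name and the message written are mine) appends one line to outs.txt each time it runs, rather than replacing the file:

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class AppendDemo {
    public static void main(String[] args) throws IOException {
        // true opens outs.txt in append mode, preserving existing contents
        PrintWriter outFile = new PrintWriter(new FileWriter("outs.txt", true));
        outFile.println("program run completed");
        outFile.close();
    }
}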
Once the object of type PrintWriter is created, methods such as print, println, and printf can be applied to the object outFile in the same way they have been used with System.out. When the output is completed, the output file should be closed by using the method close, as shown in the following statement:

outFile.close();

Data to be written to a file is stored in an output buffer in memory before it is written to the file. Closing a file ensures that any data remaining in the buffer is flushed to the file. Failing to close the file is not considered an error, but it is possible that not all the information generated by the program will reach the output file. Therefore, it is good practice to always close the output file. The program in Fig. 10.2 is modified to output the result to the file outs.txt as shown in Fig. 10.3.

Fig. 10.3

A program that outputs data to a text file

The program in Fig. 10.3 will have the same output as the program in Fig. 10.2, but this time it will be sent to the file outs.txt. To see the output, simply open the file in a text editor and examine the results.

## 10.4 File Input and Output Using an Array

Assuming the scores from different exams are kept in separate files, how can the scores in each file be processed using the same program? It would not be a good idea to have the input filename hardcoded into the program. Instead, the program should allow the user to enter the filename. Also, after the scores are processed, the results can be stored in a user-specified file. If variables are used for the names of both the input and output files, it is not necessary to change and recompile the code every time the program is executed for a different set of data.

If every course has a different number of students, the number of scores in the input file is not known in advance. If an array of the same size as the number of scores is to be created, the scores first need to be counted, and the count, stored in a variable, can then be used to allocate the array. In order to count the scores, every score is read without being stored or used for calculations. The code segment in Fig. 10.4 will count the scores in the file.

Fig. 10.4

A code segment that counts the data in an input file

Note that the user is prompted for and inputs the name of the file. Further, notice that inside the while loop, although the exam scores are read from the file using the statement inFile.nextInt();, the return values are not used for any calculations at this point, so they are not stored in memory. The statement inFile.nextInt(); is simply used to count the number of exam scores. At the end of the while loop, the variable numStudents will contain the number of scores in the file. The next step is to create an array of size numStudents, read the scores from the file again, and this time store them in the array. Consider the following code segment that could be added to the code in Fig. 10.4 to accomplish these tasks:
// create array of size numStudents
scores = new int[numStudents];

// read scores from input file and save them in array
for(i = 0; i < numStudents; i++)
    scores[i] = inFile.nextInt();

Turning from file I/O to exception handling, consider the following try-catch statement, which reads a score from a Scanner object named scanner and throws a RuntimeException if the score is not in the range 0 through 100:

try {
    score = scanner.nextInt();
    if(score < 0 || score > 100)
        throw new RuntimeException();
    flag = false;
}
catch(RuntimeException exception) {
    System.out.println("Error: Score must be in 0-100.");
}
catch(InputMismatchException exception) {
    scanner.next();
    System.out.println("Error: Score must be an integer.");
}

This results in a compiler error with the message:

exception java.util.InputMismatchException has already
been caught

Why? Recall that the InputMismatchException class is a subclass of the RuntimeException class as shown in Fig. B.1 and partially repeated below:

  * Exception

    * IOException

    *...

    * RuntimeException

      *...

      * NoSuchElementException

        * InputMismatchException

When an object of the InputMismatchException class is thrown, the first catch block is executed and all other catch blocks are ignored. This means that the second catch block would never be executed, because any exception object that is an instance of the RuntimeException class or its subclasses matches the first catch block.

When there are multiple catch blocks, each catch clause has to correspond to a specific type of exception. In the example above, since the InputMismatchException class is a subclass of the RuntimeException class, both exceptions could be caught by the catch clause with RuntimeException. Further, having two catch clauses for the same type of exception in the try-catch statement, as shown below, will cause the compiler to issue the error message "exception java.lang.RuntimeException has already been caught" for the second catch clause.

try {
    score = scanner.nextInt();
    if(score < 0 || score > 100)
        throw new RuntimeException();
    flag = false;
}
catch(RuntimeException exception) {
    scanner.next();
    System.out.println("Error: Score must be an integer.");
}
catch(RuntimeException exception) {
    System.out.println("Error: Score must be in 0-100.");
}

If there is a block of code that needs to be executed regardless of whether an exception is thrown, then the try-catch statement can include a finally block, which must appear after all of the catch blocks. Consider the while loop from Fig. B.4 (not reproduced here) with a finally block added at the end of the try-catch statement. The output using the same input values, 8o, 180, and 80, is shown below:

Enter the score: 8o

Error: Score must be an integer.

End of try-catch statement.

Enter the score: 180

Error: Score must be in 0-100.

End of try-catch statement.

Enter the score: 80

End of try-catch statement.

Your score is 80.

Since the first two inputs were invalid, both an error message from the catch block and a message from the finally block were output. The last input did not throw an exception, so all the catch blocks were skipped, but the message from the finally block was still displayed.

### B.4 Checked and Unchecked Exceptions

Among the exceptions, including the ones listed in Fig. B.1, there are two categories: checked and unchecked. Unchecked exceptions are those that inherit from the Error class or the RuntimeException class. They are also called runtime exceptions because they are detected during runtime. As mentioned before, the exceptions that inherit from the Error class are thrown when a critical error occurs, and therefore they should not be handled by the program.
Exceptions that were handled in the previous sections are all instances of the RuntimeException class or its subclasses. However, in general not all the possible exceptions from the RuntimeException class are handled in the program because handling each one of them in the program is not practical. As a result, exception handling should only be used when the problem can be corrected, and simply catching and ignoring any exception is a bad practice. + +A RuntimeException indicates programming errors, so it could possibly be avoided altogether by writing better code. However, large applications might never be entirely bug-free, and exception handling can be used to display an appropriate message instead of surprising the user by an abnormal termination of the program. If the application is running critical tasks and must not crash, exception handling can be used to log the problem and the execution can continue. + +All exceptions that are not inherited from the Error class or the RuntimeException class are called checked exceptions because they are checked during compile time. Consider a program which opens a file, reads numbers from the file, and outputs the total. Suppose the scores.txt file contains the following data and exists in the same directory as the .java file: + +70 + +80 + +90 + +The code in Fig. B.5 opens the scores.txt file, reads three numbers from the file, and outputs the total. What happens during the compilation of the program? The compiler will issue an error message "Unreported exception java.io.FileNotFoundException; must be caught or declared to be thrown" for the line inFile = new Scanner(new File("scores.txt")); because this statement can potentially throw a checked exception. If the file scores.txt does not exist as discussed in Chap.​ 10 , the checked exception of a FileNotFoundException has to be thrown. A simple solution to eliminate this error is to add a throws clause, throws IOException , in the method header. The throws clause informs the compiler of the exceptions that could be thrown from a program. If the exception actually occurs during runtime, because the system could not find the file scores.txt , the system will deal with the exception by halting execution. Consider the following modified version of the code from Fig. B.5 : + +Fig. B.5 + +A program with a checked exception + +Notice that throws IOException is added in the main method header. The FileNotFoundException could be used in the header instead of IOException since it is the class that the exception object is actually created from. However, because the IOException class is a superclass of the FileNotFoundException class as shown below from Fig. B.1 , the throws clause with IOException can catch the instance of the FileNotFoundException class. Including the more general exception class in the header is useful since it can catch exceptions of all the subclasses. + + * Exception + + * IOException + + * CharConversionException + + * EOFException + + * FileNotFoundException + + * RuntimeException + + *... + +The other way to handle a checked exception is to include the try-catch statement in the body of the program. Because the statement inFile = new Scanner(new File("scores.txt")); could possibly throw a checked exception, it should be included inside the try block. The statements that should be executed in response to the thrown exception are placed in the matching catch block. 
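For example, a minimal sketch of wrapping the file-opening statement in a try-catch, assuming the Scanner-based code of Fig. B.5, might look like this:

try {
    inFile = new Scanner(new File("scores.txt"));
}
catch(FileNotFoundException exception) {
    // executed only if scores.txt cannot be found at runtime
    System.out.println("Error: scores.txt was not found.");
}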
To simply display an error message and continue when the exception is thrown, such a try-catch statement is added to the code in Fig. B.5. If the designated file does not exist in the system, the program will stop whether a try-catch block exists or not. However, without a try-catch block, the execution stops abnormally, while with a try-catch block, the program terminates normally. If this were part of a larger application program, it would be convenient if the program did not crash just because it could not find one file, but instead continued with the execution of the next part of the program.

## Appendix C: Javadoc Comments

In Chap. 1, different ways of documenting a Java program were discussed. As was mentioned, comments are intended for programmers and are ignored during execution. However, documentation is an important aspect of developing applications. In the real world, once an application is released, programming bugs that were not detected during development need to be fixed, and new features may be added. Often those who modify a program are not the ones who developed it. The documentation then becomes very helpful for a programmer attempting to understand somebody else's program. This appendix explains more about the specialized comments called Javadoc.

### C.1 Javadoc

Java provides a standard form for writing comments and documenting classes. Javadoc comments in a program interact with the documentation tool, also named Javadoc, which comes with the Java Development Kit (JDK). The Javadoc tool reads the Javadoc comments from the source file and produces a collection of HyperText Markup Language (HTML) pages, which can be read and displayed by web browsers. These pages look just like the Java API specification document at the Oracle website at http://docs.oracle.com/javase/7/docs/api/index.html. The HTML pages created by the Javadoc tool contain only documentation and no actual Java code. The documentation allows programmers to understand and use the classes someone else has written without seeing how they are actually implemented.

Javadoc comments begin with a slash followed by two asterisks /** and end with an asterisk followed by a slash */. Many programmers also place a single asterisk * at the start of each line in the comment, as shown in the program in Fig. C.1. Although these have no significance and the Javadoc tool ignores them, they make it easy to see the entire extent of the comment in the program.

Fig. C.1

A program with Javadoc comments

The Javadoc comments for the class are placed between the import statements and the class header. After the description of the class, the rest of the comment consists of a series of Javadoc tags, which are special markers that begin with the @ symbol. Each tag tells the Javadoc tool certain information. The documentation for a class will usually contain an author tag. The Javadoc tag @author indicates the name of the programmer(s) who created the class. The Javadoc comments for the description of a method are placed above the method header. As an example, two Javadoc comments are added to the QuadEq class discussed in Sect. 1.10 of Chap. 1 and shown in Fig. C.1.

The use of Javadoc comments does not preclude the use of other types of comments in the program. In addition to the Javadoc comments in Fig. C.1, regular comments with two slashes // are used to describe the sections of the code.
Since Javadoc comments included in the HTML page are the only ones describing the class, its data members, and its methods, the comments describing the sections will not appear in the HTML page even if they are written as Javadoc comments. However, the comments in the middle of the code are still important when a programmer is reading to understand the code. Therefore, Javadoc comments are useful for a programmer who simply uses the classes without looking at the implementation, and other comments in the code are helpful for a programmer who is actually modifying the code. + +Once all the Javadoc comments are added to the class, the next step is to generate the corresponding HTML documentation file. Many Java editors and Integrated Development Environments (IDEs) include a menu option that can be used to generate a Javadoc documentation file quickly and easily. Part of the resulting HTML page for the QuadEq class is shown below: + +In the nicely formatted HTML page, the description of the class which has been added to the program as a Javadoc comment is shown. The author tag appears in boldface and the names of the authors are shown as well. Since there is no constructor defined in the class, a system-generated default constructor is listed in the Constructor Summary section. The Method Summary section contains only the main method along with the Javadoc comments added in the program because only one method exists in the class. + +### C.2 More Javadoc Tags + +The format of the Javadoc comments for a method is similar to the one for a class. In addition to a general description, a number of Javadoc tags can be included. The main purpose of the comments for a method is to record its purpose, a list of any parameters passed to the method, and any value returned from the method. If the method receives a parameter, the @param tag is used, and if the method returns a value, the @return tag is added. The Javadoc comments for the method convertEurosToDollars as defined in the Card class from Sect. 5.6.2 are shown below: + +/** + +* Convert the passed value to Dollars. + +* + +* @param euros the amount in Euros + +* @return the amount in Dollars + +*/ + +public static double convertEurosToDollars(double euros) { + +return euros*rate; + +} + +Notice that the Javadoc comments for the method need to be placed just above the method header. Each parameter of the method is documented by using a tag @param , followed by the name and the description of the parameter. A description of a return value is listed after the Javadoc tag @return . Notice the effect of the @param and @return tags in the following HTML document for the above method: + +The Javadoc comments for a constructor can be defined in a manner similar to the one for a method, except it does not have a @return tag. In addition to the above tags, if the method could throw exceptions, they can be listed using the @throws tag, just like the @param and the @return tags in the Javadoc comments. The topic of exceptions is discussed in Appendix B. + +More complex methods may need complete precondition and postcondition lists. Also an example of how the method is used may be useful information for other programmers. The tags such as @precondition , @postcondition , and @example that are not predefined in the Javadoc tool can be created by programmers. Since the convertEurosToDollars is a simple method, only the @example tag will be added to the Javadoc comments as shown below: + +/** + +* Convert the passed value to Dollars. 
*

* @param euros the amount in Euros

* @return the amount in Dollars

* @example conversion of 1.00 Euros to US dollars -

* Card.convertEurosToDollars(1.00);

*/

public static double convertEurosToDollars(double euros) {

return euros*rate;

}

Note that in order to include user-defined tags in the documentation, the HTML page may need to be generated from the command line if the Java editor does not have the capability of including the options, as will be discussed in the next section. The HTML document for the above method also appears in the next section.

Similar to the standard classes, programmer-defined classes and their HTML documentation can be shared with other programmers. First, .java files are written in the usual way but include the Javadoc comments described in this appendix. After they are compiled, the .class files can be moved to a location where other programmers have access to them. Then the Javadoc tool can be run on each .java file to create an HTML page, and all the Javadoc HTML files can be moved to a public place where a web browser can be used to read them. This way, by importing the classes at the beginning of a Java program, the programmer-defined classes are available to other programmers without compiling them, just like the standard classes.

### C.3 Generating Javadoc Documentation from a Command Line

An HTML page can also be generated from a command line. In the command prompt window, the commands javac and java are used to compile and run Java programs, respectively. Similarly, the javadoc command is used for generating Javadoc documentation files. For example, to generate a Javadoc documentation file for the QuadEq class, the following command is used:

javadoc QuadEq.java

After the command is executed, a collection of HTML files will be created. The documentation can be viewed by opening the file index.html and clicking the QuadEq link.

When a programmer-defined tag such as @example is included in the source code, options need to be included in the command line to generate the HTML. The following command can be used to create Javadoc documentation for the Card class, which implements the method convertEurosToDollars:

javadoc -private -author -tag param -tag return
-tag example:a:"Example:" Card.java

The -private option generates the documentation for the class, variables, and methods, including the public, protected, and private members of the class. The -author option puts the author tag in boldface followed by the author's name in the documentation. The other options starting with -tag indicate the order in which the tags appear in the HTML file: the parameter(s) first, then the return specification, and finally the example. Two of these options, param and return, are predefined in the Javadoc system, so only -tag param and -tag return are listed. However, because an example tag is not predefined in Javadoc, the extra information at the end, :a:"Example:", is needed and indicates how the tag is to appear in the documentation. The a: means that all occurrences of the @example tag should be put in the documentation along with a heading, which in this case is Example: as it appears in the quotation marks. Headings will always appear in boldface in the documentation created by the javadoc command. The HTML document for the method convertEurosToDollars, generated after the @example tag is added to the source code, shows the Example: heading followed by the sample call.
For more information about Javadoc, refer to the Java API specification document at the Oracle website at http://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html.

## Appendix D: Glossary

All of the terms in italics in the text can be found in the index, and some of these terms (including abbreviations) can be found here in the glossary. The descriptions of terms in this glossary should not be used in lieu of the complete descriptions in the text, but rather they serve as a quick review. Should a more complete description be needed, the index can guide the reader to the appropriate pages where the terms are discussed in more detail.

Algorithm

A step-by-step sequence of instructions, but not necessarily a program for a computer.

API

Application Programming Interface.

Array

A collection of contiguous memory locations that have the same name and are distinguished from one another by an index.

Assembly language

A low-level language that uses mnemonics and is converted to machine language by an assembler.

Bytecode

An intermediate language between Java and machine language.

Class

A definition or blueprint of a set of objects.

Compiler

A translator that converts a high-level language program to a low-level language for subsequent execution.

Contour diagram

A visual representation of the state of execution of a program.

CPU

Central Processing Unit.

Data members

The variables and constants that are part of an object.

EOD

End of Data.

Exception

An execution error, an error condition, or an unexpected event during execution of a program.

GUI

Graphical User Interface.

High-level language

A more English-like and math-like programming language, such as Java.

HTML

HyperText Markup Language.

IDE

Integrated Development Environment.

Inheritance

The ability of a subclass to reuse methods and data members of a superclass.

Interpreter

A translator that converts and executes a high-level language program one instruction at a time.

IPO

Input Process Output.

Iteration structures

Allow a program to repeat a section of code, often called a loop.

Javadoc

Specialized comments for documenting classes and methods.

LCV

Loop Control Variable.

LIFO

Last In First Out, as with a stack.

Low-level language

A language closer to a particular CPU, such as assembly language and machine language.

Machine language

The native language of the processor, coded in ones and zeros.

Method

A series of instructions that can be invoked to access and manipulate the data members of an object.

Object

An instance of a class.

OOP

Object-Oriented Programming.

Overloading

A method in the same class that has the same name but a different number of parameters, different types of parameters, or parameters of different types in a different order.

Overriding

A method in a subclass that has the same name and also the same number and type of parameters as one in the superclass.

Polymorphism

The determination at runtime of the type of an object referenced by a superclass variable.

Pseudocode

A design tool consisting of a combination of English and a programming language that helps one concentrate on logic instead of syntax when developing a program.

RAM

Random Access Memory.

Recursion

A definition that is defined in terms of itself and includes a base or terminal case.
Selection structures

Allow a program to follow one of several paths, sometimes called decision structures.

Semantics

The meaning of what each instruction does in a programming language.

Syntax

The grammar of a programming language.

UML

Unified Modeling Language.

Variables

Named memory locations used to store data in a program.

## Appendix E: Answers to Selected Exercises

### Chapter 1

1.B. Correct.

1.D. Incorrect, a double number cannot be assigned to a variable of integer type.

2.A. 0

3.B. 5.34

4.B. final double EULER_NUMBER = 2.7182;

6.

System.out.println("** **");
System.out.println("** **");
System.out.println(" ****");
System.out.println(" ****");
System.out.println(" ****");
System.out.println(" ****");
System.out.println("** **");
System.out.println("** **");

7. After execution, value1 is 9, value2 is 4, and value3 is 9.

8.B. s = r * Math.PI * Math.sqrt(Math.pow(r,2) + Math.pow(h,2));

### Chapter 2

1.A. Incorrect, it should be Circle circle = new Circle();

1.C. Correct.

4.A.

Circle innerCircle;
innerCircle = new Circle();

4.C.

System.out.println("The value of radius is "
    + innerCircle.getRadius());

6. Answers to A. and D. of the Cone class

### Chapter 3

1.A. 40

2.B. 50

3.C. 3

5.A. true || false → true

5.C. true || flag1 && flag2 → true || false → true

5.E. (true || false) && false → true && false → false

8.

9.

### Chapter 4

2. The , in the for statement

3.

sum = 1
count = 2
sum = 3
count = 3
sum = 6
count = 4
sum = 10
count = 5
sum = 10
count = 5

6.

**
****
******
********
**********

7.B.

int total, count;
total = 0;
count = 1;
do {
    total += count;
    count += 3;
} while (count <= 40);

8.A.

int total, count, n;
total = 0;
n = 5;
for(count = 0; count < n; count++) {
    total += count;
}

### Chapter 5

1.

constructor 1: valid

constructor 3: invalid

2.

method 2: invalid

method 6: valid

method 10: valid

6. Answers to A., B., C., and F. of the Cone class

### Chapter 6

1.B. The second line should be text2 = new String("Shedding blade");

2.B. 34

2.D. Hose_

7.

### Chapter 7

1.B. Incorrect, the size has to be specified.

1.C. Incorrect, the braces have to be used instead of the square brackets.

1.E. Incorrect, the size should not be specified.

2.

int total = 0;

for(int i=0; i

> "Those who don't understand Unix are condemned to reinvent it, poorly." —Henry Spencer

Some of the tools I'll show you are available online via a web browser, which will be the easiest for most readers to use. Others you'll use from a command or a shell prompt, and a few you'll run on the desktop. The tools, if you don't have them, will be easy to download. The majority are free or won't cost you much money.

This book also goes light on jargon. I'll share with you what the correct terms are when necessary, but in small doses. I use this approach because over the years, I've found that jargon can often create barriers. In other words, I'll try not to overwhelm you with the dry language that describes regular expressions. That is because the basic philosophy of this book is this: Doing useful things can come before knowing everything about a given subject.

There are lots of different implementations of regular expressions.
You will find regular expressions used in Unix command-line tools like _vi_ ( _vim_ ), _grep_ , and _sed_ , among others. You will find regular expressions in programming languages like Perl (of course), Java, JavaScript, C# or Ruby, and many more, and you will find them in declarative languages like XSLT 2.0. You will also find them in applications like Notepad++, Oxygen, or TextMate, among many others. + +Most of these implementations have similarities and differences. I won't cover all those differences in this book, but I will touch on a good number of them. If I attempted to document _all_ the differences between _all_ implementations, I'd have to be hospitalized. I won't get bogged down in these kinds of details in this book. You're expecting an introductory text, as advertised, and that is what you'll get. + +# Who Should Read This Book + +The audience for this book is people who haven't ever written a regular expression before. If you are new to regular expressions or programming, this book is a good place to start. In other words, I am writing for the reader who has heard of regular expressions and is interested in them but who doesn't really understand them yet. If that is you, then this book is a good fit. + +The order I'll go in to cover the features of regex is from the simple to the complex. In other words, we'll go step by simple step. + +Now, if you happen to already know something about regular expressions and how to use them, or if you are an experienced programmer, this book may not be where you want to start. This is a beginner's book, for rank beginners who need some hand-holding. If you have written some regular expressions before, and feel familiar with them, you can start here if you want, but I'm planning to take it slower than you will probably like. + +I recommend several books to read after this one. First, try Jeff Friedl's _Mastering Regular Expressions, Third Edition_ (see ). Friedl's book gives regular expressions a thorough going over, and I highly recommend it. I also recommend the _Regular_ Expressions _Cookbook_ (see ) by Jan Goyvaerts and Steven Levithan. Jan Goyvaerts is the creator of RegexBuddy, a powerful desktop application (see ). Steven Levithan created RegexPal, an online regular expression processor that you'll use in the first chapter of this book (see ). + +# What You Need to Use This Book + +To get the most out of this book, you'll need access to tools available on Unix or Linux operating systems, such as Darwin on the Mac, a variant of BSD (Berkeley Software Distribution) on the Mac, or Cygwin on a Windows PC, which offers many GNU tools in its distribution (see and ). + +There will be plenty of examples for you to try out here. You can just read them if you want, but to really learn, you'll need to follow as many of them as you can, as the most important kind of learning, I think, always comes from doing, not from standing on the sidelines. You'll be introduced to websites that will teach you what regular expressions are by highlighting matched results, workhorse command line tools from the Unix world, and desktop applications that analyze regular expressions or use them to perform text search. + +You will find examples from this book on Github at . You will also find an archive of all the examples and test files in this book for download from . It would be best if you create a working directory or folder on your computer and then download these files to that directory before you dive into the book. 
+ +# Conventions Used in This Book + +The following typographical conventions are used in this book: + + _Italic_ + +Indicates new terms, URLs, email addresses, filenames, file extensions, and so forth. + +`Constant width` + +Used for program listings, as well as within paragraphs, to refer to program elements such as expressions and command lines or any other programmatic elements. + +### Tip + +This icon signifies a tip, suggestion, or a general note. + +# Using Code Examples + +This book is here to help you get your job done. In general, you may use the code in this book in your programs and documentation. You do not need to contact us for permission unless you're reproducing a significant portion of the code. For example, writing a program that uses several chunks of code from this book does not require permission. Selling or distributing a CD-ROM of examples from O'Reilly books does require permission. Answering a question by citing this book and quoting example code does not require permission. Incorporating a significant amount of example code from this book into your product's documentation does require permission. + +We appreciate, but do not require, attribution. An attribution usually includes the title, author, publisher, and ISBN. For example: " _Introducing Regular Expressions_ by Michael Fitzgerald (O'Reilly). Copyright 2012 Michael Fitzgerald, 978-1-4493-9268-0." + +If you feel your use of code examples falls outside fair use or the permission given above, feel free to contact O'Reilly at _permissions@oreilly.com_. + +# Safari® Books Online + +### Note + +Safari Books Online (www.safaribooksonline.com) is an on-demand digital library that delivers expert content in both book and video form from the world's leading authors in technology and business. + +Technology professionals, software developers, web designers, and business and creative professionals use Safari Books Online as their primary resource for research, problem solving, learning, and certification training. + +Safari Books Online offers a range of product mixes and pricing programs for organizations, government agencies, and individuals. Subscribers have access to thousands of books, training videos, and prepublication manuscripts in one fully searchable database from publishers like O'Reilly Media, Prentice Hall Professional, Addison-Wesley Professional, Microsoft Press, Sams, Que, Peachpit Press, Focal Press, Cisco Press, John Wiley & Sons, Syngress, Morgan Kaufmann, IBM Redbooks, Packt, Adobe Press, FT Press, Apress, Manning, New Riders, McGraw-Hill, Jones & Bartlett, Course Technology, and dozens more. For more information about Safari Books Online, please visit us online. + +# How to Contact Us + +Please address comments and questions concerning this book to the publisher: + +O'Reilly Media, Inc. +--- +1005 Gravenstein Highway North +Sebastopol, CA 95472 +800-998-9938 (in the United States or Canada) +707-829-0515 (international or local) +707-829-0104 (fax) + +This book has a web page listing errata, examples, and any additional information. You can access this page at: + + +--- + +To comment or to ask technical questions about this book, send email to: + +bookquestions@oreilly.com +--- + +For more information about O'Reilly books, courses, conferences, and news, see its website at . + +Find O'Reilly on Facebook: + +Follow O'Reilly on Twitter: + +Watch O'Reilly on YouTube: + +# Acknowledgments + +Once again, I want to express appreciation to my editor at O'Reilly, Simon St. 
Laurent, a very patient man without whom this book would never have seen the light of day. Thank you to Seara Patterson Coburn and Roger Zauner for your helpful reviews. And, as always, I want to recognize the love of my life, Cristi, who is my _raison d'être_. + +# Chapter 1. What Is a Regular Expression? + +Regular expressions are specially encoded text strings used as patterns for matching sets of strings. They began to emerge in the 1940s as a way to describe regular languages, but they really began to show up in the programming world during the 1970s. The first place I could find them showing up was in the QED text editor written by Ken Thompson. + +> "A regular expression is a pattern which specifies a set of strings of characters; it is said to match certain strings." —Ken Thompson + +Regular expressions later became an important part of the tool suite that emerged from the Unix operating system—the _ed_ , _sed_ and _vi_ ( _vim_ ) editors, _grep_ , _AWK_ , among others. But the ways in which regular expressions were implemented were not always so regular. + +### Note + +This book takes an inductive approach; in other words, it moves from the specific to the general. So rather than an example after a treatise, you will often get the example first and then a short treatise following that. It's a learn-by-doing book. + +Regular expressions have a reputation for being gnarly, but that all depends on how you approach them. There is a natural progression from something as simple as this: + + \d + +a _character shorthand_ that matches any digit from 0 to 9, to something a bit more complicated, like: + + ^(\(\d{3}\)|^\d{3}[.-]?)?\d{3}[.-]?\d{4}$ + +which is where we'll wind up at the end of this chapter: a fairly robust regular expression that matches a 10-digit, North American telephone number, with or without parentheses around the area code, or with or without hyphens or dots (periods) to separate the numbers. (The parentheses must be balanced, too; in other words, you can't just have one.) + +### Note + +Chapter 10 shows you a slightly more sophisticated regular expression for a phone number, but the one above is sufficient for the purposes of this chapter. + +If you don't get how that all works yet, don't worry: I'll explain the whole expression a little at a time in this chapter. If you will just follow the examples (and those throughout the book, for that matter), writing regular expressions will soon become second nature to you. Ready to find out for yourself? + +I at times represent Unicode characters in this book using their code point—a four-digit, hexadecimal (base 16) number. These code points are shown in the form _U+0000_. U+002E, for example, represents the code point for a full stop or period (.). + +# Getting Started with Regexpal + +First let me introduce you to the Regexpal website at . Open the site up in a browser, such as Google Chrome or Mozilla Firefox. You can see what the site looks like in Figure 1-1. + +Figure 1-1. Regexpal in the Google Chrome browser + +You can see that there is a text area near the top, and a larger text area below that. The top text box is for entering regular expressions, and the bottom one holds the subject or target text. The target text is the text or set of strings that you want to match. + +### Note + +At the end of this chapter and each following chapter, you'll find a "Technical Notes" section. 
These notes provide additional information about the technology discussed in the chapter and tell you where to get more information about that technology. Placing these notes at the end of the chapters helps keep the flow of the main text moving forward rather than stopping to discuss each detail along the way. + +# Matching a North American Phone Number + +Now we'll match a North American phone number with a regular expression. Type the phone number shown here into the lower section of Regexpal: + + 707-827-7019 + +Do you recognize it? It's the number for O'Reilly Media. + +Let's match that number with a regular expression. There are lots of ways to do this, but to start out, simply enter the number itself in the upper section, exactly as it is written in the lower section (hold on now, don't sigh): + + 707-827-7019 + +What you should see is the phone number you entered in the lower box highlighted from beginning to end in yellow. If that is what you see (as shown in Figure 1-2), then you are in business. + +### Note + +When I mention colors in this book, in relation to something you might see in an image or a screenshot, such as the highlighting in Regexpal, those colors may appear online and in e-book versions of this book, but, alas, not in print. So if you are reading this book on paper, then when I mention a color, your world will be grayscale, with my apologies. + +Figure 1-2. Ten-digit phone number highlighted in Regexpal + +What you have done in this regular expression is use something called a _string literal_ to match a string in the target text. A string literal is a literal representation of a string. + +Now delete the number in the upper box and replace it with just the number _7_. Did you see what happened? Now only the sevens are highlighted. The literal character (number) _7_ in the regular expression matches the four instances of the number 7 in the text you are matching. + +# Matching Digits with a Character Class + +What if you wanted to match all the numbers in the phone number, all at once? Or match any number for that matter? + +Try the following, exactly as shown, once again in the upper text box: + + [0-9] + +All the numbers (more precisely _digits_ ) in the lower section are highlighted, in alternating yellow and blue. What the regular expression `[0-9]` is saying to the regex processor is, "Match any digit you find in the range 0 through 9." + +The square brackets are not literally matched because they are treated specially as _metacharacters_. A metacharacter has special meaning in regular expressions and is reserved. A regular expression in the form `[0-9]` is called a _character class_ , or sometimes a _character set_. + +You can limit the range of digits more precisely and get the same result using a more specific list of digits to match, such as the following: + + [012789] + +This will match only those digits listed, that is, 0, 1, 2, 7, 8, and 9. Try it in the upper box. Once again, every digit in the lower box will be highlighted in alternating colors. + +To match any 10-digit, North American phone number, whose parts are separated by hyphens, you could do the following: + + [0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9] + +This will work, but it's bombastic. There is a better way with something called a shorthand. + +# Using a Character Shorthand + +Yet another way to match digits, which you saw at the beginning of the chapter, is with `\d` which, by itself, will match all Arabic digits, just like `[0-9]`. 
Try that in the top section and, as with the previous regular expressions, the digits below will be highlighted. This kind of regular expression is called a _character shorthand_. (It is also called a _character escape_ , but this term can be a little misleading, so I avoid it. I'll explain later.) + +To match any digit in the phone number, you could also do this: + + \d\d\d-\d\d\d-\d\d\d\d + +Repeating the `\d` three and four times in sequence will exactly match three and four digits in sequence. The hyphen in the above regular expression is entered as a literal character and will be matched as such. + +What about those hyphens? How do you match them? You can use a literal hyphen (-) as already shown, or you could use an escaped uppercase _D_ (`\D`), which matches any character that is _not_ a digit. + +This sample uses `\D` in place of the literal hyphen. + + \d\d\d\D\d\d\d\D\d\d\d\d + +Once again, the entire phone number, including the hyphens, should be highlighted this time. + +# Matching Any Character + +You could also match those pesky hyphens with a dot (.): + + \d\d\d.\d\d\d.\d\d\d\d + +The dot or period essentially acts as a wildcard and will match any character (except, in certain situations, a line ending). In the example above, the regular expression matches the hyphen, but it could also match a percent sign (%): + + 707%827%7019 + +Or a vertical bar (|): + + 707|827|7019 + +Or any other character. + +### Note + +As I mentioned, the dot character (officially, the full stop) will not normally match a new line character, such as a line feed (U+000A). However, there are ways to make it possible to match a newline with a dot, which I will show you later. This is often called the _dotall_ option. + +# Capturing Groups and Back References + +You'll now match just a portion of the phone number using what is known as a _capturing group_. Then you'll refer to the content of the group with a _backreference_. To create a capturing group, enclose a `\d` in a pair of parentheses to place it in a group, and then follow it with a `\1` to backreference what was captured: + + (\d)\d\1 + +The `\1` refers back to what was captured in the group enclosed by parentheses. As a result, this regular expression matches the prefix `707`. Here is a breakdown of it: + + * `(\d)` matches the first digit and captures it (the number _7_ ) + + * `\d` matches the next digit (the number _0_ ) but does not capture it because it is not enclosed in parentheses + + * `\1` references the captured digit (the number _7_ ) + +This will match only the area code. Don't worry if you don't fully understand this right now. You'll see plenty of examples of groups later in the book. + +You could now match the whole phone number with one group and several backreferences: + + (\d)0\1\D\d\d\1\D\1\d\d\d + +But that's not quite as elegant as it could be. Let's try something that works even better. + +# Using Quantifiers + +Here is yet another way to match a phone number using a different syntax: + + \d{3}-?\d{3}-?\d{4} + +The numbers in the curly braces tell the regex processor _exactly_ how many occurrences of those digits you want it to look for. The braces with numbers are a kind of _quantifier_. The braces themselves are considered metacharacters. + +The question mark (`?`) is another kind of quantifier. It follows the hyphen in the regular expression above and means that the hyphen is optional—that is, that there can be zero or one occurrence of the hyphen (one or none). 
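If you want to test a pattern like this outside of Regexpal, Perl gives you a quick scratchpad. This is a minimal sketch of my own, not one of the chapter's examples; the phone number is supplied inline, and `^` and `$` (explained at the end of this chapter) anchor the match to the whole string:

    perl -e 'print "matched\n" if "707-827-7019" =~ /^\d{3}-?\d{3}-?\d{4}$/;'

Remove one of the hyphens from the test number and the optional `?` still allows a match; remove a digit and the match fails.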
There are other quantifiers such as the plus sign (`+`), which means "one or more," or the asterisk (`*`) which means "zero or more." + +Using quantifiers, you can make a regular expression even more concise: + + (\d{3,4}[.-]?)+ + +The plus sign again means that the quantity can occur one or more times. This regular expression will match either three or four digits, followed by an optional hyphen or dot, grouped together by parentheses, one or more times (`+`). + +Is your head spinning? I hope not. Here's a character-by-character analysis of the regular expression above: + + * `(` open a capturing group + + * `\` start character shorthand (escape the following character) + + * `d` end character shorthand (match any digit in the range 0 through 9 with `\d`) + + * `{` open quantifier + + * `3` minimum quantity to match + + * `,` separate quantities + + * `4` maximum quantity to match + + * `}` close quantifier + + * `[` open character class + + * `.` dot or period (matches literal dot) + + * `-` literal character to match hyphen + + * `]` close character class + + * `?` zero or one quantifier + + * `)` close capturing group + + * `+` one or more quantifier + +This all works, but it's not quite right because it will also match other groups of 3 or 4 digits, whether in the form of a phone number or not. Yes, we learn from our mistakes better than our successes. + +So let's improve it a little: + + (\d{3}[.-]?){2}\d{4} + +This will match two nonparenthesized sequences of three digits each, followed by an optional hyphen, and then followed by exactly four digits. + +# Quoting Literals + +Finally, here is a regular expression that allows literal parentheses to optionally wrap the first sequence of three digits, and makes the area code optional as well: + + ^(\(\d{3}\)|^\d{3}[.-]?)?\d{3}[.-]?\d{4}$ + +To ensure that it is easy to decipher, I'll look at this one character by character, too: + + * `^` (caret) at the beginning of the regular expression, or following the vertical bar (`|`), means that the phone number will be at the beginning of a line. + + * `(` opens a capturing group. + + * `\(` is a literal open parenthesis. + + * `\d` matches a digit. + + * `{3}` is a quantifier that, following `\d`, matches exactly three digits. + + * `\)` matches a literal close parenthesis. + + * `|` (the vertical bar) indicates _alternation_ , that is, a given choice of alternatives. In other words, this says "match an area code with parentheses or without them." + + * `^` matches the beginning of a line. + + * `\d` matches a digit. + + * `{3}` is a quantifier that matches exactly three digits. + + * `[.-]?` matches an optional dot or hyphen. + + * `)` close capturing group. + + * `?` make the group optional, that is, the prefix in the group is not required. + + * `\d` matches a digit. + + * `{3}` matches exactly three digits. + + * `[.-]?` matches another optional dot or hyphen. + + * `\d` matches a digit. + + * `{4}` matches exactly four digits. + + * `$` matches the end of a line. + +This final regular expression matches a 10-digit, North American telephone number, with or without parentheses, hyphens, or dots. Try different forms of the number to see what will match (and what won't). + +### Note + +The capturing group in the above regular expression is not necessary. The group is necessary, but the capturing part is not. There is a better way to do this: a non-capturing group. When we revisit this regular expression in the last chapter of the book, you'll understand why. 
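Before we move on to the applications, here is one more way to exercise that final expression: a small Perl sketch of my own that runs it over a few invented candidates and reports which ones pass:

    perl -e 'for ("(707)827-7019", "707.827.7019", "7078277019", "(707-827-7019") {
        printf "%-15s %s\n", $_, /^(\(\d{3}\)|^\d{3}[.-]?)?\d{3}[.-]?\d{4}$/ ? "match" : "no match";
    }'

The last candidate has an unbalanced parenthesis, so it fails both alternatives inside the group, and the match fails, just as the chapter promised.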
+ +# A Sample of Applications + +To conclude this chapter, I'll show you the regular expression for a phone number in several applications. + +TextMate is an editor that is available only on the Mac and uses the same regular expression library as the Ruby programming language. You can use regular expressions through the Find (search) feature, as shown in Figure 1-3. Check the box next to _Regular expression_. + +Figure 1-3. Phone number regex in TextMate + +Notepad++ is available on Windows and is a popular, free editor that uses the PCRE regular expression library. You can access them through search and replace (Figure 1-4) by clicking the radio button next to _Regular expression_. + +Figure 1-4. Phone number regex in Notepad++ + +Oxygen is also a popular and powerful XML editor that uses Perl 5 regular expression syntax. You can access regular expressions through the search and replace dialog, as shown in Figure 1-5, or through its regular expression builder for XML Schema. To use regular expressions with Find/Replace, check the box next to _Regular expression_. + +Figure 1-5. Phone number regex in Oxygen + +This is where the introduction ends. Congratulations. You've covered a lot of ground in this chapter. In the next chapter, we'll focus on simple pattern matching. + +# What You Learned in Chapter 1 + + * What a regular expression is + + * How to use Regexpal, a simple regular expression processor + + * How to match string literals + + * How to match digits with a character class + + * How to match a digit with a character shorthand + + * How to match a non-digit with a character shorthand + + * How to use a capturing group and a backreference + + * How to match an exact quantity of a set of strings + + * How to match a character optionally (zero or one) or one or more times + + * How to match strings at either the beginning or the end of a line + +# Technical Notes + + * Regexpal () is a web-based, JavaScript-powered regex implementation. It's not the most complete implementation, and it doesn't do everything that regular expressions can do; however, it's a clean, simple, and very easy-to-use learning tool, and it provides plenty of features for you to get started. + + * You can download the Chrome browser from or Firefox from . + + * Why are there so many ways of doing things with regular expressions? One reason is because regular expressions have a wonderful quality called _composability_. A language, whether a formal, programming or schema language, that has the quality of _composability_ (James Clark explains it well at ) is one that lets you take its atomic parts and composition methods and then recombine them easily in different ways. Once you learn the different parts of regular expressions, you will take off in your ability to match strings of any kind. + + * TextMate is available at . For more information on regular expressions in TextMate, see . + + * For more information on Notepad, see . For documentation on using regular expressions with Notepad, see . + + * Find out more about Oxygen at . For information on using regex through find and replace, see . For information on using its regular expression builder for XML Schema, see . + +# Chapter 2. Simple Pattern Matching + +Regular expressions are all about matching and finding patterns in text, from simple patterns to the very complex. 
This chapter takes you on a tour of some of the simpler ways to match patterns using: + + * String literals + + * Digits + + * Letters + + * Characters of any kind + +In the first chapter, we used Steven Levithan's RegexPal to demonstrate regular expressions. In this chapter, we'll use Grant Skinner's RegExr site, found at (see Figure 2-1). + +### Note + +Each page of this book will take you deeper into the regular expression jungle. Feel free, however, to stop and smell the syntax. What I mean is, start trying out new things as soon as you discover them. Try. Fail fast. Get a grip. Move on. Nothing makes learning sink in like _doing_ something with it. + +Figure 2-1. Grant Skinner's RegExr in Firefox + +Before we go any further, I want to point out the helps that RegExr provides. Over on the right side of RegExr, you'll see three tabs. Take note of the Samples and Community tabs. The Samples tab provides helps for a lot of regular expression syntax, and the Community tab shows you a large number of contributed regular expressions that have been rated. You'll find a lot of good information in these tabs that may be useful to you. In addition, pop-ups appear when you hover over the regular expression or target text in RegExr, giving you helpful information. These resources are one of the reasons why RegExr is among my favorite online regex checkers. + +This chapter introduces you to our main text, "The Rime of the Ancient Mariner," by Samuel Taylor Coleridge, first published in _Lyrical Ballads_ (London, J. & A. Arch, 1798). We'll work with this poem in chapters that follow, starting with a plain-text version of the original and winding up with a version marked up in HTML5. The text for the whole poem is stored in a file called _rime.txt_ ; this chapter uses the file _rime-intro.txt_ that contains only the first few lines. + +The following lines are from _rime-intro.txt_ : + + THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS. + + ARGUMENT. + + How a Ship having passed the Line was driven by Storms to the cold + Country towards the South Pole; and how from thence she made her course + to the tropical Latitude of the Great Pacific Ocean; and of the strange + things that befell; and in what manner the Ancyent Marinere came back to + his own Country. + + I. + + 1 It is an ancyent Marinere, + 2 And he stoppeth one of three: + 3 "By thy long grey beard and thy glittering eye + 4 "Now wherefore stoppest me? + +Copy and paste the lines shown here into the lower text box in RegExr. You'll find the file _rime-intro.txt_ at Github at . You'll also find the same file in the download archive found at . You can also find the text online at Project Gutenberg, but without the numbered lines (see ). + +# Matching String Literals + +The most outright, obvious feature of regular expressions is matching strings with one or more literal characters, called _string literals_ or just _literals_. + +The way to match literal strings is with normal, literal characters. Sounds familiar, doesn't it? This is similar to the way you might do a search in a word processing program or when submitting a keyword to a search engine. When you search for a string of text, character for character, you are searching with a string literal. + +If you want to match the word _Ship_ , for example, which is a word (string of characters) you'll find early in the poem, just type the word _Ship_ in the box at the top of Regexpal, and then the word will be highlighted in the lower text box. (Be sure to capitalize the word.) 
+ +Did light blue highlighting show up below? You should be able to see the highlighting in the lower box. If you can't see it, check what you typed again. + +### Note + +By default, string matching is case-sensitive in Regexpal. If you want to match both lower- and uppercase, click the checkbox next to the words _Case insensitive_ at the top left of Regexpal. If you click this box, both _Ship_ and _ship_ would match if either was present in the target text. + +# Matching Digits + +In the top-left text box in RegExr, enter this character shorthand to match the digits: + + \d + +This matches all the Arabic digits in the text area below because the _global_ checkbox is selected. Uncheck that checkbox, and `\d` will match only the first occurrence of a digit. (See Figure 2-2.) + +Figure 2-2. Matching all digits in RegExr with \d + +Now in place of `\d` use a character class that matches the same thing. Enter the following range of digits in the top text box of RegExr: + + [0-9] + +As you can see in Figure 2-3, though the syntax is different, using `\d` does the same thing as `[0-9]`. + +Figure 2-3. Matching all digits in RegExr with [0-9] + +### Note + +You'll learn more about character classes in Chapter 5. + +The character class `[0-9]` is a _range_ , meaning that it will match the range of digits 0 through 9. You could also match digits 0 through 9 by listing all the digits: + + [0123456789] + +If you want to match only the binary digits 0 and 1, you would use this character class: + + [01] + +Try `[12]` in RegExr and look at the result. With a character class, you can pick the exact digits you want to match. The character shorthand for digits (`\d`) is shorter and simpler, but it doesn't have the power or flexibility of the character class. I use character classes when I can't use `\d` (it's not always supported) and when I need to get very specific about what digits I need to match; otherwise, I use `\d` because it's a simpler, more convenient syntax. + +# Matching Non-Digits + +As is often the case with shorthands, you can flip-flop—that is, you can go the other way. For example, if you want to match characters that are not digits, use this shorthand with an uppercase _D_ : + + \D + +Try this shorthand in RegExr now. An uppercase _D_ , rather than a lowercase, matches non-digit characters (check Figure 2-4). This shorthand is the same as the following character class, a negated class (a negated class says in essence, "don't match these" or "match all but these"): + + [^0-9] + +which is the same as: + + [^\d] + +Figure 2-4. Matching non-digits in RegExr with \D + +# Matching Word and Non-Word Characters + +In RegExr, now swap `\D` with: + + \w + +This shorthand will match all word characters (if the _global_ option is still checked). The difference between `\D` and `\w` is that `\D` matches whitespace, punctuation, quotation marks, hyphens, forward slashes, square brackets, and other similar characters, while `\w` does not—it matches letters and numbers. + +In English, `\w` matches essentially the same thing as the character class: + + [a-zA-Z0-9] + +### Note + +You'll learn how to match characters beyond the set of English letters in Chapter 6. + +Now to match a non-word character, use an uppercase _W_ : + + \W + +This shorthand matches whitespace, punctuation, and other kinds of characters that aren't used in words in this example. 
It is the same as using the following character class: + + [^a-zA-Z0-9] + +Character classes, granted, allow you more control over what you match, but sometimes you don't want or need to type out all those characters. This is known as the "fewest keystrokes win" principle. But sometimes you must type all that stuff out to get precisely what you want. It is your choice. + +Just for fun, in RegExr try both: + + [^\w] + +and + + [^\W] + +Do you see the differences in what they match? + +Table 2-1 provides an extended list of character shorthands. Not all of these work in every regex processor. + +Table 2-1. Character shorthands + +Character Shorthand| Description +---|--- + +\a | Alert + +\b | Word boundary + +[\b] | Backspace character + +\B | Non-word boundary + +`\c` _`x`_ | Control character + +\d | Digit character + +\D | Non-digit character + +`\d` _`xxx`_ | Decimal value for a character + +\f | Form feed character + +\r | Carriage return + +\n | Newline character + +pass:[\o\ _xxx_ ] | Octal value for a character + +\s | Space character + +\S | Non-space character + +\t | Horizontal tab character + +\v | Vertical tab character + +\w | Word character + +\W | Non-word character + +\0 | Nul character + +`\` `x` _`xx`_ | Hexadecimal value for a character + +`\u` _`xxxx`_ | Unicode value for a character + +# Matching Whitespace + +To match whitespace, you can use this shorthand: + + \s + +Try this in RegExr and see what lights up (see Figure 2-5). The following character class matches the same thing as `\s`: + + [ \t\n\r] + +In other words, it matches: + + * Spaces + + * Tabs (`\t`) + + * Line feeds (`\n`) + + * Carriage returns (`\r`) + +### Note + +Spaces and tabs are highlighted in RegExr, but not line feeds or carriage returns. + +Figure 2-5. Matching whitespace in RegExr with \s + +As you can imagine, `\s` has its _compañero_. To match a non-whitespace character, use: + + \S + +This matches everything except whitespace. It matches the character class: + + [^ \t\n\r] + +Or: + + [^\s] + +Test these out in RegExr to see what happens. + +In addition to those characters matched by `\s`, there are other, less common whitespace characters. Table 2-2 lists character shorthands for common whitespace characters and a few that are more rare. + +Table 2-2. Character shorthands for whitespace characters + +Character Shorthand| Description +---|--- + +\f | Form feed + +\h | Horizontal whitespace + +\H | Not horizontal whitespace + +\n | Newline + +\r | Carriage return + +\t | Horizontal tab + +\v | Vertical tab (whitespace) + +\V | Not vertical whitespace + +### Note + +If you try `\h`, `\H`, or `\V` in RegExr, you will see results, but not with `\v`. Not all whitespace shorthands work with all regex processors. + +# Matching Any Character, Once Again + +There is a way to match _any_ character with regular expressions and that is with the dot, also known as a period or a full stop (U+002E). The dot matches all characters but line ending characters, except under certain circumstances. + +In RegExr, turn off the _global_ setting by clicking the checkbox next to it. Now any regular expression will match on the first match it finds in the target. + +Now to match a single character, any character, just enter a single dot in the top text box of RegExr. + +In Figure 2-6, you see that the dot matches the first character in the target, namely, the letter _T_. + +Figure 2-6. Matching a single character in RegExr with "." 
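These shorthands are easy to probe outside of RegExr, too. Here is a little Perl sketch of my own that tests a handful of characters against `\d`, `\w`, and `\s` in turn:

    perl -e 'for my $c ("7", "b", "_", " ", "-") {
        printf "%s  digit:%s word:%s space:%s\n",
            $c,
            $c =~ /\d/ ? "y" : "n",
            $c =~ /\w/ ? "y" : "n",
            $c =~ /\s/ ? "y" : "n";
    }'

Notice that the underscore counts as a word character, while the hyphen matches none of the three.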
If you wanted to match the entire phrase _THE RIME_, you could use eight dots:

    ........

But this isn't very practical, so I don't recommend using a series of dots like this often, if ever. Instead of eight dots, use a quantifier:

    .{8}

and it would match the first two words and the space in between, but crudely so. To see what I mean by _crudely_, click the checkbox next to _global_ and see how useless this really is. It matches sequences of eight characters, end on end, all but the last few characters of the target.

Let's try a different tack with word boundaries and starting and ending letters. Type the following in the upper text box of RegExr to see a slight difference:

    \bA.{5}T\b

This expression has a bit more specificity. (Try saying _specificity_ three times, out loud.) It matches the word _ANCYENT_, an archaic spelling of _ancient_. How?

  * The shorthand `\b` matches a word boundary, without consuming any characters.

  * The characters _A_ and _T_ also bound the sequence of characters.

  * `.{5}` matches any five characters.

  * Match another word boundary with `\b`.

This regular expression would actually match either _ANCYENT_ or _ANCIENT_.

Now try it with a shorthand:

    \b\w{7}\b

Finally, I'll talk about matching zero or more characters:

    .*

which is the same as:

    [^\n]*

or

    [^\n\r]*

Similar to this is the dot used with the one or more quantifier (`+`):

    .+

Try these in RegExr, and either of them will match the first line (uncheck _global_). The reason is that, normally, the dot does not match newline characters, such as a line feed (U+000A) or a carriage return (U+000D). Click the checkbox next to _dotall_ in RegExr, and then `.*` or `.+` will match _all_ the text in the lower box. (_dotall_ means a dot will match all characters, including newlines.)

It can do this because these quantifiers are _greedy_; in other words, they match all the characters they can. But don't worry about that quite yet. Chapter 7 explains quantifiers and greediness in more detail.

# Marking Up the Text

"The Rime of the Ancient Mariner" is just plain text. What if you wanted to display it on the Web? What if you wanted to mark it up as HTML5 using regular expressions, rather than by hand? How would you do that?

In some of the following chapters, I'll show you ways to do this. I'll start out small in this chapter and then add more and more markup as you go along.

In RegExr, click the Replace tab, check _multiline_, and then, in the first text box, enter:

    (^T.*$)

Beginning at the top of the file, this will match the first line of the poem and then capture that text in a group using parentheses. In the next box, enter:

    <h1>$1</h1>

The replacement regex surrounds the captured group, represented by `$1`, in an _h1_ element. You can see the result in the lowest text area. The `$1` is a backreference, in Perl style. In most implementations, including Perl, you use this style: `\1`; but RegExr supports only `$1`, `$2`, `$3`, and so forth. You'll learn more about groups and backreferences in Chapter 4.

## Using _sed_ to Mark Up Text

On a command line, you could also do this with _sed_. _sed_ is a Unix streaming editor that accepts regular expressions and allows you to transform text. It was first developed in the early 1970s by Lee McMahon at Bell Labs. If you are on the Mac or have a Linux box, you already have it.

Test out _sed_ at a shell prompt (such as in a Terminal window on a Mac) with this line:

    echo Hello | sed s/Hello/Goodbye/

This is what should have happened:

  * The _echo_ command prints the word _Hello_ to standard output (which is usually just your screen), but the vertical bar (|) pipes it to the _sed_ command that follows.

  * This pipe directs the output of _echo_ to the input of _sed_.

  * The _s_ (substitute) command of _sed_ then changes the word _Hello_ to _Goodbye_, and _Goodbye_ is displayed on your screen.

If you don't have _sed_ on your platform already, at the end of this chapter you'll find some technical notes with pointers to installation information. Two versions of _sed_ are discussed there: BSD and GNU.

Now try this: At a command or shell prompt, enter:

    sed -n 's/^/<h1>/;s/$/<\/h1>/p;q' rime.txt

And the output will be:

    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>

Here is what the regex did, broken down into parts:

  * The line starts by invoking the _sed_ program.

  * The `-n` option suppresses _sed_'s default behavior of echoing each line of input to the output. This is because you want to see only the line affected by the regex, that is, line 1.

  * `s/^/<h1>/` places an _h1_ start-tag at the beginning (`^`) of the line.

  * The semicolon (;) separates commands.

  * `s/$/<\/h1>/` places an _h1_ end-tag at the end (`$`) of the line.

  * The _p_ command prints the affected line (line 1). This is in contrast to `-n`, which suppresses the automatic printing of every line.

  * Lastly, the _q_ command quits the program so that _sed_ processes only the first line.

  * All these operations are performed against the file _rime.txt_.

Another way of writing this line is with the `-e` option. The `-e` option appends the editing commands, one after another. I prefer the method with semicolons, of course, because it's shorter.

    sed -ne 's/^/<h1>/' -e 's/$/<\/h1>/p' -e 'q' rime.txt

You could also collect these commands in a file, as with _h1.sed_ shown here (this file is in the code repository mentioned earlier):

    #!/usr/bin/sed

    s/^/<h1>/
    s/$/<\/h1>/
    q

To run it, type:

    sed -f h1.sed rime.txt

at a prompt in the same directory or folder as _rime.txt_.

## Using Perl to Mark Up Text

Finally, I'll show you how to do a similar process with Perl. Perl is a general-purpose programming language created by Larry Wall back in 1987. It's known for its strong support of regular expressions and its text processing capabilities.

Find out if Perl is already on your system by typing this at a command prompt, followed by Return or Enter:

    perl -v

This should return the version of Perl on your system or an error (see Technical Notes).

To accomplish the same output as shown in the _sed_ example, enter this line at a prompt:

    perl -ne 'if ($. == 1) { s/^/<h1>/; s/$/<\/h1>/m; print; }' rime.txt

and, as with the _sed_ example, you will get this result:

    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>

Here is what happened in the Perl command, broken down again into pieces:

  * _perl_ invokes the Perl program.

  * The `-n` option loops through the input (the file _rime.txt_).

  * The `-e` option allows you to submit program code on the command line, rather than from a file (like _sed_).

  * The _if_ statement checks to see if you are on line 1. `$.` is a special variable in Perl that holds the current line number.

  * The first substitute command _s_ finds the beginning of the first line (`^`) and inserts an _h1_ start-tag there.

  * The second substitute command searches for the end of the line (`$`), and then inserts an _h1_ end-tag.

  * The _m_ or _multiline_ modifier or flag at the end of the substitute command indicates that you are treating this line distinctly and separately; consequently, the `$` matches the end of line 1, not the end of the file.

  * At last, it prints the result to standard output (the screen).

  * All these operations are performed against the file _rime.txt_.

You could also hold all these commands in a program file, such as this file, _h1.pl_, found in the example archive.

    #!/usr/bin/perl -n

    if ($. == 1) {
        s/^/<h1>/;
        s/$/<\/h1>/m;
        print;
    }

And then, in the same directory as _rime.txt_, run the program like this:

    perl h1.pl rime.txt

There are a lot of ways you can do things in Perl. I am not saying this is the most efficient way to add these tags. It is simply one way. Chances are, by the time this book is in print, I'll think of other, more efficient ways to do things with Perl (and other tools). I hope you will, too.

In the next chapter, we'll talk about boundaries and what are known as _zero-width assertions_.

# What You Learned in Chapter 2

  * How to match string literals

  * How to match digits and non-digits

  * What the _global_ mode is

  * How character shorthands compare with character classes

  * How to match word and non-word characters

  * How to match whitespace

  * How to match any character with the dot

  * What the _dotall_ mode is

  * How to insert HTML markup into a line of text using RegExr, _sed_, and Perl

# Technical Notes

  * RegExr is found at and also has a desktop version (). RegExr was built in Flex 3 () and relies on the ActionScript regular expression engine (). Its regular expressions are similar to those used by JavaScript (see ).

  * Git is a fast version control system (). GitHub is a web-based repository for projects using Git (). I suggest using the GitHub repository for samples in this book only if you feel comfortable with Git or with other modern version control systems, like Subversion or Mercurial.

  * HTML5 () is the fifth major revision of the W3C's HTML, the markup language for publishing on the World Wide Web. It has been in draft for several years and changes regularly, but it is widely accepted as the heir apparent of HTML 4.01 and XHTML.

  * _sed_ is readily available on Unix/Linux systems, including the Mac (Darwin or BSD version). It is also available on Windows through distributions like Cygwin () or individually at (currently at version 4.2.1, see ).

  * To use the Perl examples in this chapter, you may have to install Perl on your system. It comes by default with Mac OS X Lion and often is on Linux systems. If you are on Windows, you can get Perl by installing the appropriate Cygwin packages (see ) or by downloading the latest package from the ActiveState website (go to ). For detailed information on installing Perl, visit or .

To find out if you already have Perl, enter the command below at a shell prompt. To do this, open a command or shell window on your system, such as a Terminal window (under Applications/Utilities) on the Mac or a Windows command-line window (open Start, and then enter _cmd_ in the text box at the bottom of the menu). At the prompt, type:

    perl -v

If Perl is alive and well on your system, then this command will return version information for Perl. On my Mac running Lion, I've installed the latest version of Perl (5.16.0 at the time of this writing) from source and compiled it (see ). I get the following information back when I enter the command above:

    This is perl 5, version 16, subversion 0 (v5.16.0) built for darwin-2level

    Copyright 1987-2012, Larry Wall

    Perl may be copied only under the terms of either the Artistic License or the
    GNU General Public License, which may be found in the Perl 5 source kit.

    Complete documentation for Perl, including FAQ lists, should be found on
    this system using "man perl" or "perldoc perl". If you have access to the
    Internet, point your browser at http://www.perl.org/, the Perl Home Page.
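As that message suggests, _perldoc_ is also the fastest route to Perl's own regular expression documentation. These pages ship with any standard Perl installation:

    perldoc perlrequick
    perldoc perlre

The first is a short tutorial; the second is the full reference for Perl regular expressions.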
+ +Both `perl` and `perldoc` are installed at `/usr/local/bin` when compiled and built from source, which you can add to your path. For information on setting your path variable, see . + +# Chapter 3. Boundaries + +This chapter focuses on assertions. Assertions mark boundaries, but they don't consume characters—that is, characters will not be returned in a result. They are also known as _zero-width assertions_. A zero-width assertion doesn't match a character, per se, but rather a location in a string. Some of these, such as `^` and `$`, are also called _anchors_. + +The boundaries I'll talk about in this chapter are: + + * The beginning and end of a line or string + + * Word boundaries (two kinds) + + * The beginning and end of a subject + + * Boundaries that quote string literals + +To start, I'll use RegExr again, but this time, for variety, I'll use the Safari browser (however, you can use any browser you like). I'll also use the same text I used last time: the first 12 lines of _rime.txt_. Open the Safari browser with and copy the first 12 lines of _rime.txt_ from the code archive into the lower box. + +# The Beginning and End of a Line + +As you have seen a number of times already, to match the beginning of a line or string, use the caret or circumflex (U+005E): + + ^ + +Depending on the context, a `^` will match the beginning of a line or string, sometimes a whole document. The context depends on your application and what options you are using with that application. + +To match the end of a line or string, as you know, use the dollar sign: + + $ + +In RegExr, make sure that _multiline_ is checked. _global_ is checked by default when you open RegExr, but you can leave it checked or unchecked for this example. When _multiline_ is not checked, the entire target is considered one string. + +In the upper text box, enter this regular expression: + + ^How.*Country\.$ + +This will match the entire line beginning with the word _How_. Notice that the period or dot at the end is preceded by a backslash. This escapes the dot so that it is interpreted as a literal. If it was not escaped, what would it match? Any character. If you want to match a literal dot, you have to either escape it or put it in a character class (see Chapter 5). + +Figure 3-1. RegExr in Safari + +If you uncheck _multiline_ , then what happens? The highlighting is turned off. With it unchecked and _dotall_ checked, enter: + + ^THE.*\?$ + +and you'll see that it matches all the text. + +The _dotall_ option means that the dot will match newlines in addition to all other characters. Uncheck _dotall_ , and the expression matches nothing. However, the following: + + ^THE.* + +will match the first line. Click _dotall_ again, and all text is matched again. The `\?$` is not required to match to the end of the text. + +# Word and Non-word Boundaries + +You have already seen `\b` used several times. It marks a word boundary. Try: + + \bTHE\b + +and it will match both occurrences of _THE_ in the first line (with _global_ checked). Like, `^` or `$`, `\b` is a zero-width assertion. It may appear to match things like a space or the beginning of a line, but in actuality, what it matches is a zero-width nothing. Did you notice that the spaces around the second _THE_ are not highlighted? That is because they are not part of the match. Not the easiest thing to grasp, but you'll get it by seeing what it does and does not do. + +You can also match non-word boundaries. 
A non-word boundary matches locations that are not equivalent to a word boundary, like a letter or a number within a word or string. To match a non-word boundary, give this a spin: + + \Be\B + +and watch what it matches (see Figure 3-2). You'll see that it matches a lowercase _e_ when it is surrounded by other letters or non-word characters. Being a zero-width assertion, it does not match the surrounding characters, but it recognizes when the literal _e_ is surrounded by non-word boundaries. + +Figure 3-2. Matching non-word boundaries with \B + +In some applications, another way for specifying a word boundary is with: + + \< + +for the beginning of a word, and with: + + \> + +for the end of the word. This is an older syntax, not available in most recent regex applications. It is useful in some instances because, unlike `\b`, which matches _any_ word boundary, this syntax allows you to match either the beginning or ending of a word. + +If you have _vi_ or _vim_ on your system, you can try this out with that editor. Just follow these steps. They're easy even if you have never used _vim_ before. In a command or shell window, change directories to where the poem is located and then open it with: + + vim rime.txt + +Then enter the following search command: + + /\> + +and press Enter or Return. The forward slash (`/`) is the way you begin a search in _vim_. Watch the cursor and you'll see that this search will find the ends of words. Press _n_ to repeat the search. Next enter: + + /\< + +followed by Enter or Return. This time the search will find the beginning of words. To exit _vim_ , just type `ZZ`. + +This syntax also works with _grep_. Since the early 1970s, _grep_ like _sed_ has been a Unix mainstay. (In the 1980s, I had a coworker who had a vanity license plate that said _GREP._ ) Try this command from a shell prompt: + + grep -Eoc '\<(THE|The|the)\>' rime.txt + +The - _E_ option indicates that you want to use extended regular expressions (EREs) rather than the basic regular expressions (BREs) which are used by _grep_ by default. The `-o` option means you want to show in the result only that part of the line that matches the pattern, and the `-c` option means only return a count of the result. The pattern in single quotes will match either _THE_ , _The_ , or _the_ as whole words. That's what the `\<` and `\>` help you find. + +This command will return: + + 259 + +which is the count of the words found. + +On the other hand, if you don't include the `\<` and `\>`, you get a different result. Do it this way: + + grep -Eoc '(THE|The|the)' rime.txt + +and you will get a different number: + + 327 + +Why? Because the pattern will match only whole words, plus _any_ sequence of characters that contain the word. So that is one reason why the `\<` and `\>` can come in handy. + +# Other Anchors + +Similar to the `^` anchor is the following, a shorthand that matches the start of a subject: + + \A + +This is not available with all regex implementations, but you can get it with Perl and PCRE (Perl Compatible Regular Expressions), for example. To match the end of a subject, you can use `\A`'s companion. + + \Z + +Also, in some contexts: + + \z + + _pcregrep_ is a version of _grep_ for the PCRE library. (See Technical Notes to find out where to get it.) 
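While you track down _pcregrep_, note that `\b` gives you the same whole-word effect in Perl, which doesn't support `\<` and `\>`. Here is a rough equivalent of the whole-word _grep_ count, as a sketch of my own; the empty-list assignment is a common Perl idiom for counting matches in list context:

    perl -ne '$count += () = /\b(THE|The|the)\b/g; END { print "$count\n" }' rime.txt

Since `\b` and the older `\<`...`\>` syntax mark the same boundaries, this should print the same count of 259.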
Once _pcregrep_ is installed, to try this syntax, you could do something like this:

    pcregrep -c '\A\s*(THE|The|the)' rime.txt

which will return a count (`-c`) of 108 occurrences of the word _the_ (in its three case variants) that occur near the beginning of a line, preceded by zero or more whitespace characters. Next enter this command:

    pcregrep -n '(MARINERE|Marinere)(.)?\Z' rime.txt

This matches either _MARINERE_ or _Marinere_ at the end of a line (subject), followed by an optional character, which in this case is either a punctuation mark or the letter _S_. (The parentheses around the dot are not essential.)

You'll see this output:

    1:THE RIME OF THE ANCYENT MARINERE,
    10:  It is an ancyent Marinere,
    38:  The bright-eyed Marinere.
    63:  The bright-eyed Marinere.
    105: "God save thee, ancyent Marinere!
    282: "I fear thee, ancyent Marinere!
    702: He loves to talk with Marineres

The `-n` option with _pcregrep_ gives you the line numbers at the beginning of each line of output. The command-line options of _pcregrep_ are very similar to those of _grep_. To see them, do:

    pcregrep --help

# Quoting a Group of Characters as Literals

You can use these sequences to quote a set of characters as literals:

    \Q

and

    \E

To show you how this works, enter the following metacharacters in the lower box of RegExr:

    .^$*+?|(){}[]\-

These 15 metacharacters are treated as special characters in regular expressions, used for encoding a pattern. (The hyphen is treated specially, as signifying a range, inside the square brackets of a character class. Otherwise, it's not special.)

If you try to match those characters in the upper text box of RegExr, nothing will happen. Why? Because RegExr thinks (if it can think) that you are entering a regular expression, not literal characters. Now try:

    \Q$\E

and it will match `$` because anything between `\Q` and `\E` is interpreted as a literal character (see Figure 3-3). (Remember, you can also precede a metacharacter with a `\` to make it literal.)

Figure 3-3. Quoting metacharacters as literals

# Adding Tags

In RegExr, uncheck _global_ and check _multiline_, click the Replace tab, and then, in the first text box (marked number 1 in Figure 3-4), enter:

    ^(.*)$

This will match and capture the first line of text. Then in the next box (marked number 2), enter this or something similar:

    <html>\n<head>\n<title>Rime</title>\n</head>\n<body>
    <h1>$1</h1>
As you enter the replacement text, you'll notice that the subject text (shown in the box marked number 3) is changed in the results text box (marked number 4) to include the markup you've added (see Figure 3-4).

Figure 3-4. Adding markup with RegExr

RegExr does well to demonstrate one way to do this, but it is limited in what it can do. For example, it can't save any results out to a file. We have to look beyond the browser for that.

## Adding Tags with _sed_

On a command line, you could also do something similar to what we just did in RegExr with _sed_, which you saw in the last chapter. The insert (`i`) command in _sed_ allows you to insert text above or before a location in a document or a string. By the way, the opposite of _i_ in _sed_ is _a_, which appends text below or after a location. We'll use the append command later.

The following command inserts the HTML5 doctype and several other tags, beginning at line 1:

    sed '1 i\
    <!DOCTYPE html>\
    <html lang=\"en\">\
    <head>\
    <title>Rime</title>\
    </head>\
    <body>

    s/^/<h1>/
    s/$/<\/h1>/
    q' rime.txt

The backslashes (`\`) at the end of the lines allow you to insert newlines into the stream and not execute the command prematurely. The backslashes in front of the quotation marks _escape_ the quotes so that they are seen as literal characters, not part of the command.

When you run this _sed_ command correctly, this is what your output will look like:

    <!DOCTYPE html>
    <html lang="en">
    <head>
    <title>Rime</title>
    </head>
    <body>
    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>
These same _sed_ commands are saved in the file _top.sed_ in the example archive. You can run this on the file using this command:

    sed -f top.sed rime.txt

You should get the same output as you saw in the previous command. If you want to save the output to a file, you can redirect the output, like so:

    sed -f top.sed rime.txt > temp

In addition to showing the result on the screen, the redirect part of the command (`> temp`) will save the output to the file _temp_.

## Adding Tags with Perl

Let's try to accomplish this same thing with Perl. Without explaining everything that's going on, just try this:

    perl -ne 'print "<!DOCTYPE html>\
    <html lang=\"en\">\
    <head>\
    <title>Rime</title>\
    </head>\
    <body>\
    " if $. == 1;
    s/^/<h1>/;s/$/<\/h1>/m;print;exit;' rime.txt

Compare this with the _sed_ command. How is it similar? How is it different? The _sed_ command is a little simpler, but Perl is a lot more powerful, in my opinion.

Here is how it works:

  * The `$.` variable, which is tested with the _if_ statement, holds the current line number. The _if_ statement returns _true_ when the current line is line 1.

  * When Perl finds line 1 with _if_, it prints the doctype and a few HTML tags. It is necessary to escape the quote marks, as in _sed_.

  * The first substitution inserts an _h1_ start-tag at the beginning of the line, and the second one inserts an _h1_ end-tag at the end of the line. The _m_ at the end of the second substitution means that it uses a _multiline_ modifier. This is done so that the command recognizes the end of the first line. Without _m_, the `$` would match to the end of the file.

  * The _print_ command prints the result of the substitutions.

  * The _exit_ command exits Perl immediately. Otherwise, because of the `-n` option, it would loop through every line of the file, which we don't want for this script.

That was a lot of typing, so I put all that Perl code in a file and called it _top.pl_, also found in the code archive.

    #!/usr/bin/perl -n

    if ($. == 1) {
        print "<!DOCTYPE html>\
    <html lang=\"en\">\
    <head>\
    <title>The Rime of the Ancyent Mariner (1798)</title>\
    </head>\
    <body>\
    ";
        s/^/<h1>
/; + s/$/<\/h1>/m; + print; + exit; + } + +Run this with: + + perl top.pl rime.txt + +You get a similar output as in the previous command, though it is formed a little differently. (You can redirect the output with >, as with _sed_.) + +The next chapter covers alternation, groups, and backreferences, among other things. See you over there. + +# What You Learned in Chapter 3 + + * How to use anchors at the beginning or end of a line with `^` or `$` + + * How to use word boundaries and non-word boundaries + + * How to match the beginning or end of a subject with `\A` and `\Z` (or `\z`) + + * How to quote strings as literals with `\Q` and `\E` + + * How to add tags to a document with RegExr, _sed_ , and Perl + +# Technical Notes + + * _vi_ is a Unix editor developed in 1976 by Sun cofounder Bill Joy that uses regular expressions. The _vim_ editor is a replacement for _vi_ , developed primarily by Bram Moolenaar (see ). An early paper on _vi_ by Bill Joy and Mark Horton is found here: . The first time I used _vi_ was in 1983, and I use it nearly every day. It lets me to do more things more quickly than with any other text editor. And it is so powerful that I am always discovering new features that I never knew about, even though I've been acquainted with it for nearly 30 years. + + * _grep_ is a Unix command-line utility for searching and printing strings with regular expressions. Invented by Ken Thompson in 1973, _grep_ is said to have grown out of the _ed_ editor command `g/re/p` (global/regular expression/print). It was superseded but not retired by _egrep_ (or _grep -E_ ), which uses extended regular expressions (EREs) and has additional metacharacters such as |, +, ?, (, and ). _fgrep_ ( _grep -F_ ) searches files using literal strings; metacharacters like $, *, and | don't have special meaning. _grep_ is available on Linux systems as well as the Mac OS X's Darwin. You can also get it as part of the Cygwin GNU distribution () or you can download it from . + + * PCRE () or Perl Compatible Regular Expressions is a C library of functions (8-bit and 16-bit) for regular expressions that are compatible with Perl 5, and include some features of other implementations. _pcregrep_ is an 8-bit, _grep_ -like tool that enables you to use the features of the PCRE library on the command line. You can get _pcregrep_ for the Mac through Macports () by running the command `sudo port install pcre`. (Xcode is a prerequisite; see . Login required.) + +# Chapter 4. Alternation, Groups, and Backreferences + +You have already seen groups in action. Groups surround text with parentheses to help perform some operation, such as the following: + + * Performing alternation, a choice between two or more optional patterns + + * Creating subpatterns + + * Capturing a group to later reference with a backreference + + * Applying an operation to a grouped pattern, such as a quantifer + + * Using non-capturing groups + + * Atomic grouping (advanced) + +We'll be using a few contrived examples, in addition to the text from "The Rime of the Ancyent Mariner" again, in _rime.txt_. This time, I'll use the desktop version of RegExr, as well as other tools like _sed_. You can download the desktop version of RegExr from , for Windows, Mac, or Linux (it was written with Adobe AIR). Click the Desktop Version link on the RegExr web page (lower-right corner) for more information. + +# Alternation + +Simply said, _alternation_ gives you a choice of alternate patterns to match. 
For example, let's say you wanted to find out how many occurrences of the article _the_ are in the "The Rime of the Ancient Mariner." The problem is, the word occurs as _THE_ , _The_ , and _the_ in the poem. You can use alternation to deal with this peculiarity. + +Open the RegExr desktop application by double-clicking on its icon. It looks very much like the online version but has the advantage of being local on your machine, so you won't suffer the network issues that sometimes occur when using web applications. I've copied and pasted the entire poem in RegExr desktop for the next exercise. I'm using it on a Mac running OS X Lion. + +In the top text box, enter the pattern: + + (the|The|THE) + +and you'll see all occurrences of _the_ in the poem highlighted in the lower box (see Figure 4-1). Use the scroll bar to view more of the result. + +Figure 4-1. Using alternation in RegExr desktop version + +We can make this group shorter by applying an option. Options let you specify the way you would like to search for a pattern. For example, the option: + + (?i) + +makes your pattern case-insensitive, so instead of using the original pattern with alternation, you can do this instead: + + (?i)the + +Try this in RegExr to see how it works. You can also specify case-insensitivity by checking _ignoreCase_ in RegExr, but both will work. This and other options or modifiers are listed in Table 4-1. + +Table 4-1. Options in regular expressions + +Option| Description| Supported by +---|---|--- + +`(?d)` | Unix lines | Java + +`(?i)` | Case insensitive | PCRE, Perl, Java + +`(?J)` | Allow duplicate names | PCRE[a] + +`(?m)` | Multiline | PCRE, Perl, Java + +`(?s)` | Single line (dotall) | PCRE, Perl, Java + +`(?u)` | Unicode case | Java + +`(?U)` | Default match lazy | PCRE + +`(?x)` | Ignore whitespace, comments | PCRE, Perl, Java + +`(?-...)` | Unset or turn off options | PCRE + +[a] See "Named Subpatterns" in . + +Let's now use alternation with _grep_. The options in Table 4-1, by the way, don't work with _grep_ , so you are going to use the original alternation pattern. To count the number of lines where the word _the_ occurs, regardless of case, one or more times, use: + + grep -Ec "(the|The|THE)" rime.txt + +and get this answer: + + 327 + +This result does not tell the whole story. Stay tuned. + +Here is an analysis of the _grep_ command: + + * The _-E_ option means that you want to use extended regular expressions (EREs) rather than basic regular expressions (BREs). This, for example, saves you from having to escape the parentheses and the vertical bar, like `\(THE\|The\|the\)`, as you must with BREs. + + * The _-c_ option returns a count of the matched lines (not matched words). + + * The parentheses group the choice or alternation of _the_ , _The_ , or _THE_. + + * The vertical bar separates possible choices, which are evaluated left to right. + +To get a count of actual words used, this approach will return each occurrence of the word, one per line: + + grep -Eo "(the|The|THE)" rime.txt | wc -l + +This returns: + + 412 + +And here is a bit more analysis: + + * The `-o` option means to show only that part of the line that matches the pattern, though this is not apparent due to the pipe (`|`) to _wc_. + + * The vertical bar, in this context, pipes the output of the _grep_ command to the input of the _wc_ command. _wc_ is a word count command, and `-l` counts the number of lines of the input. + +Why the big difference between 327 and 412? 
Because _-c_ gives you a count of matching lines, but there can be more than one match on each line. If you use _-o_ with _wc -l_ , then each occurrence of the various forms of the word will appear on a separate line and be counted, giving the higher number. + +To perform this same match with Perl, write your command this way: + + perl -ne 'print if /(the|The|THE)/' rime.txt + +Or better yet, you can do it with the `(?i)` option mentioned earlier, but without alternation: + + perl -ne 'print if /(?i)the/' rime.txt + +Or even better yet, append the _i_ modifier after the last pattern delimiter: + + perl -ne 'print if /the/i' rime.txt + +and you will get the same outcome. The simpler the better. For a list of additional modifiers (also called _flags_ ), see Table 4-2"). Also, compare options (similar but with a different syntax) in Table 4-1. + +Table 4-2. Perl modifiers (flags)[1] + +Modifier| Description +---|--- + +a | Match `\d`, `\s`, `\w`, and POSIX in ASCII range only + +c | Keep current position after match fails + +d | Use default, native rules of the platform + +g | Global matching + +i | Case-insensitive matching + +l | Use current locale's rules + +m | Multiline strings + +p | Preserve the matched string + +s | Treat strings as a single line + +u | Use Unicode rules when matching + +x | Ignore whitespace and comments + +[1] See . + +# Subpatterns + +Most often, when you refer to _subpatterns_ in regular expressions, you are referring to a group or groups within groups. A subpattern is a pattern within a pattern. Often, a condition in a subpattern is matchable when a preceding pattern is matched, but not always. Subpatterns can be designed in a variety of ways, but we're concerned primarily with those defined within parentheses here. + +In one sense, the pattern you saw earlier: + + (the|The|THE) + +has three subpatterns: _the_ is the first subpattern, _The_ is the second, and _THE_ the third, but matching the second subpattern, in this instance, is not dependent on matching the first. (The leftmost pattern is matched first.) + +Now here is one where the subpattern(s) depend on the previous pattern: + + (t|T)h(e|eir) + +In plain language, this will match the literal characters _t_ or _T_ followed by an _h_ followed by either an _e_ or the letters _eir_. Accordingly, this pattern will match any of: + + * _the_ + + * _The_ + + * _their_ + + * _Their_ + +In this case, the second subpattern `(e|eir)` is dependent on the first `(tT)`. + +Subpatterns don't require parentheses. Here is an example of subpatterns done with character classes: + + \b[tT]h[ceinry]*\b + +This pattern can match, in addition to _the_ or _The_ , words such as _thee_ , _thy_ and _thence_. The two word boundaries (`\b`) mean the pattern will match whole words, not letters embedded in other words. + +Here is a complete analysis of this pattern: + + * `\b` matches a beginning word boundary. + + * `[tT]` is a character class that matches either an lowercase _t_ or an uppercase _T_. We can consider this the first subpattern. + + * Then the pattern matches (or attempts to match) a lowercase _h_. + + * The second or last subpattern is also expressed as a character class `[ceinry]` followed by a quantifier `*` for zero or more. + + * Finally, another word boundary `\b` ends the pattern. + +### Note + +One interesting aspect of the state of regular expressions is that terminology, while usually close in meaning, can also range far. 
In defining _subpattern_ and other terms in this book, I've examined a variety of sources and have tried to bring them together under one roof. But I suspect that there are some who would argue that a character class is not a subpattern. My take is they can function as subpatterns, so I lump them in. + +# Capturing Groups and Backreferences + +When a pattern groups all or part of its content into a pair of parentheses, it captures that content and stores it temporarily in memory. You can reuse that content if you wish by using a backreference, in the form: + + \1 + +or: + + $1 + +where `\1` or `$1` reference the first captured group, `\2` or `$2` reference the second captured group, and so on. _sed_ will only accept the `\1` form, but Perl accepts both. + +### Note + +Originally, _sed_ supported backreferences in the range `\1` through `\9`, but that limitation does not appear to exist any longer. + +You have already seen this in action, but I'll demonstrate it here again. We'll use it to rearrange the wording of a line of the poem, with apologies to Samuel Taylor Coleridge. In the top text box in RegExr, after clicking the Replace tab, enter this pattern: + + (It is) (an ancyent Marinere) + +Scroll the subject text (third text area) down until you can see the highlighted line, and then in the second box, enter: + + $2 $1 + +and you'll see in the lowest box the line rearranged as: + + an ancyent Marinere It is, + +(See Figure 4-2.) + +Figure 4-2. Referencing backreferences with $1 and $2 + +Here is how to accomplish the same result with _sed_ : + + sed -En 's/(It is) (an ancyent Marinere)/\2 \1/p' rime.txt + +and the output will be: + + an ancyent Marinere It is, + +just as in RegExr. Let's analyze the _sed_ command to help you understand everything that is going on: + + * The _-E_ option once again invokes EREs, so you don't have to quote the parentheses, for example. + + * The _-n_ option suppresses the default behavior of printing every line. + + * The substitute command searches for a match for the text "It is an ancyent Marinere," capturing it into two groups. + + * The substitute command also replaces the match by rearranging the captured text in the output, with the backreference `\2` first, then `\1`. + + * The _p_ at the end of the substitute command means you want to print the line. + +A similar command in Perl will do the same thing: + + perl -ne 'print if s/(It is) (an ancyent Marinere)/\2 \1/' rime.txt + +Notice that this uses the `\1` style syntax. You can, of course, use the `$1` syntax, too: + + perl -ne 'print if s/(It is) (an ancyent Marinere)/$2 $1/' rime.txt + +I like how Perl lets you print a selected line without jumping through hoops. + +I'd like to point out something about the output: + + an ancyent Marinere It is, + +The capitalization got mixed up in the transformation. Perl can fix that with `\u` and `\l`. Here's how: + + perl -ne 'print if s/(It is) (an ancyent Marinere)/\u$2 \l$1/' rime.txt + +Now the result looks much better: + + An ancyent Marinere it is, + +And here is why: + + * The `\l` syntax does not match anything, but it changes the character that follows to lowercase. + + * The `\u` syntax capitalizes the character that follows it. + + * The `\U` directive (not shown) turns the text string that follows into all uppercase. + + * The `\L` directive (not shown) turns the text string that follows into all lowercase. + +These directives remain in effect until another is found (like `\l` or `\E`, the end of a quoted string). 
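You can watch the directives work without the poem file; here is the same substitution run on an inline string, plus a second sketch of my own using `\U`:

    perl -e '$_ = "it is an ancyent Marinere"; s/(it is) (an ancyent Marinere)/\u$2 \l$1/; print "$_\n";'
    perl -e '$_ = "rime"; s/(rime)/\U$1/; print "$_\n";'

The first prints _An ancyent Marinere it is_; the second prints _RIME_.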
Experiment with these to see how they work.

## Named Groups

_Named groups_ are captured groups with names. You can access those groups by name later, rather than by integer. I'll show you how here in Perl:

    perl -ne 'print if s/(?<one>It is) (?<two>an ancyent Marinere)/\u$+{two} \l$+{one}/' rime.txt

Let's look at it:

  * Adding `?<one>` and `?<two>` inside the parentheses names the groups _one_ and _two_, respectively.
  * `$+{one}` references the group named _one_, and `$+{two}`, the group named _two_.

You can also reuse named groups within the pattern where the group was named. I'll show you what I mean. Let's say you were searching for a string that contained six zeros all together:

    000000

It's a shallow example, but it serves to show you how this works. So name a group of three zeros with this pattern (the _z_ is arbitrary):

    (?<z>0{3})

You can then use the group again like this:

    (?<z>0{3})\k<z>

Or this:

    (?<z>0{3})\k'z'

Or this:

    (?<z>0{3})\g{z}

Try this in RegExr for quick results. All these examples will work. Table 4-3 shows many of the possibilities with named group syntax.

Table 4-3. Named group syntax

Syntax| Description
---|---
(?<_name_>...) | A named group
(?'_name_'...) | Another named group
(?P<_name_>...) | A named group in Python
\k<_name_> | Reference by name in Perl
\k'_name_' | Reference by name in Perl
\g{_name_} | Reference by name in Perl
\k{_name_} | Reference by name in .NET
(?P=_name_) | Reference by name in Python

# Non-Capturing Groups

There are also groups that are non-capturing groups—that is, they don't store their content in memory. Sometimes this is an advantage, especially if you never intend to reference the group. Because it doesn't store its content, it may yield better performance, though performance issues are hardly perceptible when running the simple examples in this book.

Remember the first group discussed in this chapter? Here it is again:

    (the|The|THE)

You don't need to backreference anything, so you could write a non-capturing group this way:

    (?:the|The|THE)

Going back to the beginning of this chapter, you could add an option to make the pattern case-insensitive, like this (though the option obviates the need for a group):

    (?i)(?:the)

Or you could do it this way:

    (?:(?i)the)

Or, better yet, the _pièce de résistance_:

    (?i:the)

The option letter _i_ can be inserted between the question mark and the colon.

## Atomic Groups

Another kind of non-capturing group is the _atomic group_. If you are using a regex engine that does backtracking, this group will turn backtracking off, not for the entire regular expression, but just for the part enclosed in the atomic group. The syntax looks like this:

    (?>the)

When would you want to use atomic groups? One of the things that can really slow regex processing is backtracking. The reason is that trying all the possibilities takes time and computing resources. Sometimes it can gobble up a lot of time. When it gets really bad, it's called _catastrophic backtracking_.

You can turn off backtracking altogether by using a non-backtracking engine like RE2, or by turning it off for parts of your regular expression with atomic grouping.

### Note

My focus in this book is to introduce syntax. I talk very little about performance tuning here. Atomic groups are mainly a performance consideration in my view.
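To see the effect of switching backtracking off, here is a small sketch in Perl (the strings are contrived for illustration):

    perl -e 'print "plain:  match\n" if "aaaa" =~ /a+a/;
             print "atomic: match\n" if "aaaa" =~ /(?>a+)a/'

Only the first line prints. With `/a+a/`, the `a+` grabs all four _a_'s, fails to find one more, backtracks to give one up, and the match succeeds. With `(?>a+)`, the group swallows all four _a_'s and refuses to give any back, so the trailing `a` can never match.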
In Chapter 5, you'll learn about character classes.

# What You Learned in Chapter 4

  * That alternation allows a choice between two or more patterns
  * What options and modifiers are and how to use them in a pattern
  * Different kinds of subpatterns
  * How to use capturing groups and backreferences
  * How to use named groups and how to reference them
  * How to use non-capturing groups
  * A little about atomic grouping

# Technical Notes

  * The Adobe AIR runtime lets you use HTML, JavaScript, Flash, and ActionScript to build web applications that run as standalone client applications without having to use a browser.
  * Python is an easy-to-understand, high-level programming language with its own regular expression implementation.
  * .NET is a programming framework for the Windows platform. It, too, has a regular expression implementation.
  * More advanced explanations of atomic grouping are available online.

# Chapter 5. Character Classes

I'll now talk more about character classes, or what are sometimes called _bracketed expressions_. Character classes help you match specific characters, or sequences of specific characters. They can be just as broad or far-reaching as character shorthands—for example, the character shorthand `\d` will match the same characters as:

    [0-9]

But you can use character classes to be even more specific than that. In this way, they are more versatile than shorthands.

Try these examples in whatever regex processor you prefer. I'll use Rubular in Opera and Reggy on the desktop.

To do this testing, enter this string in the subject or target area of the web page:

    ! " # $ % & ' ( ) * + , - . /
    0 1 2 3 4 5 6 7 8 9
    : ; < = > ? @
    A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
    [ \ ] ^ _ `
    a b c d e f g h i j k l m n o p q r s t u v w x y z
    { | } ~

You don't have to type all that in. You'll find this text stored in the file _ascii-graphic.txt_ in the code archive that comes with this book.

To start out, use a character class to match a set of English characters—in this case, the English vowels:

    [aeiou]

The lowercase vowels should be highlighted in the lower text area (see Figure 5-1). How would you highlight the uppercase vowels? How would you highlight or match both?

Figure 5-1. Character class with Rubular in the Opera browser

With character classes, you can also match a range of characters:

    [a-z]

This matches the lowercase letters _a_ through _z_. Try matching a smaller range of those characters, something like _a_ through _f_:

    [a-f]

Of course, you can also specify a range of digits:

    [0-9]

Or an even smaller range such as 3, 4, 5, and 6:

    [3-6]

Now expand your horizon. If you wanted to match even numbers in the range 10 through 19, you could combine two character classes side by side, like this:

    \b[1][24680]\b

Or you could push things further and look for even numbers in the range 0 through 99 with this (yes, as we learned in high school, zero by itself is even):

    \b[24680]\b|\b[1-9][24680]\b

If you want to create a character class that matches hexadecimal digits, how would you do it? Here is a hint:

    [a-fA-F0-9]

You can also use shorthands inside of a character class. For example, to match whitespace and word characters, you could create a character class like this:

    [\w\s]

which is the same as:

    [_a-zA-Z0-9 \t\n\r]

but easier to type.
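If you want to check that even-number pattern outside a browser tool, here is a quick sketch; because _seq_ prints one number per line, I swap the word boundaries for line anchors:

    seq 0 99 | grep -E '^([24680]|[1-9][24680])$'

You should get all fifty even numbers from 0 through 98, one per line, and nothing else.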
# Negated Character Classes

You have already seen this syntax a number of times, so I'll be brief. A negated character class matches characters that do not match the content of the class. For example, if you didn't want to match vowels, you could write (try it in your browser, then see Figure 5-2):

    [^aeiou]

In essence, the caret (`^`) at the beginning of the class means "No, I don't want these characters." (The caret _must_ appear at the beginning.)

Figure 5-2. Negated character class with Regexpal in Opera

# Union and Difference

Character classes can act like sets. In fact, one other name for a character class is a _character set_. This functionality is not supported by all implementations, but Java supports it.

I'll now show you a Mac desktop application called Reggy (see Technical Notes). Under Preferences (Figure 5-3), I changed the Regular Expression Syntax to _Java_, and in Font (under Format), I changed the point size to 24 points for readability.

Figure 5-3. Reggy preferences

If you wanted a union of two character sets, you could do it like this:

    [0-3[6-9]]

The regex would match 0 through 3 or 6 through 9. Figure 5-4 shows you how this looks in Reggy.

Figure 5-4. Union of two character sets in Reggy

To match a difference (in essence, subtraction):

    [a-z&&[^m-r]]

which matches all the letters from _a_ to _z_, except _m_ through _r_ (see Figure 5-5).

Figure 5-5. Difference of two character sets in Reggy

# POSIX Character Classes

POSIX, or Portable Operating System Interface, is a family of standards maintained by IEEE. It includes a regular expression standard (ISO/IEC/IEEE 9945:2009), which provides a set of named character classes that have the form:

    [[:xxxx:]]

where _xxxx_ is a name, such as _digit_ or _word_.

To match alphanumeric characters (letters and digits), try:

    [[:alnum:]]

Figure 5-6 shows the alphanumeric class in Reggy.

Figure 5-6. POSIX alphanumeric character class in Reggy

An alternative for this is simply the shorthand `\w`. Which is easier to type, the POSIX character class or the shorthand? You know where I'm going: The least amount of typing wins. I admit I don't use POSIX classes very often. But they're still worth knowing about.

For alphabetic characters in either upper- or lowercase, use:

    [[:alpha:]]

If you want to match characters in the ASCII range, choose:

    [[:ascii:]]

Of course, there are negated POSIX character classes as well, in the form:

    [[:^xxxx:]]

So if you wanted to match non-alphabetic characters, you could use:

    [[:^alpha:]]

To match space and tab characters, do:

    [[:blank:]]

Or to match all whitespace characters, there's:

    [[:space:]]

There are a number of these POSIX character classes, which are shown in Table 5-1.

Table 5-1. POSIX character classes

Character Class| Description
---|---
[[:alnum:]] | Alphanumeric characters (letters and digits)
[[:alpha:]] | Alphabetic characters (letters)
[[:ascii:]] | ASCII characters (all 128)
[[:blank:]] | Blank characters (space and tab)
[[:cntrl:]] | Control characters
[[:digit:]] | Digits
[[:graph:]] | Graphic characters
[[:lower:]] | Lowercase letters
[[:print:]] | Printable characters
[[:punct:]] | Punctuation characters
[[:space:]] | Whitespace characters
[[:upper:]] | Uppercase letters
[[:word:]] | Word characters
[[:xdigit:]] | Hexadecimal digits
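POSIX classes also work on the command line. As a quick sketch with the _ascii-graphic.txt_ file from earlier (the exact output depends on how your copy of the file is laid out), this pulls out the hexadecimal digits:

    grep -Eo '[[:xdigit:]]+' ascii-graphic.txt

Because the characters in that file are space-separated, each digit (0 through 9, plus the letters _a_ through _f_ in both cases) prints on its own line, which makes it easy to eyeball what a class really matches.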
The next chapter is dedicated to matching Unicode and other characters.

# What You Learned in Chapter 5

  * How to create a character class or set with a bracketed expression
  * How to create one or more ranges within a character class
  * How to match even numbers in the range 0 through 99
  * How to match a hexadecimal number
  * How to use character shorthands within a character class
  * How to negate a character class
  * How to perform union and difference with character classes
  * What POSIX character classes are

# Technical Notes

  * The Mac desktop application Reggy can be downloaded for free. Reggy shows you what it has matched by changing the color of the matched text. The default is blue, but you can change this color in Preferences under the Reggy menu. Under Preferences, choose Java under Regular Expression Syntax.
  * The Opera Next browser, currently in beta, can be downloaded from the Opera website.
  * Rubular is an online Ruby regular expression editor created by Michael Lovitt that supports both versions 1.8.7 and 1.9.2 of Ruby.
  * Zero, as noted in this chapter, really is an even number.
  * The Java (1.6) implementation of regular expressions is documented with the java.util.regex package.
  * You can find out more about IEEE and its family of POSIX standards on the IEEE website.

# Chapter 6. Matching Unicode and Other Characters

You will have occasion to match characters or ranges of characters that are outside the scope of ASCII. ASCII, or the American Standard Code for Information Interchange, defines an English character set—the letters A through Z in upper- and lowercase, plus control and other characters. It's been around for a long time: The 128-character Latin-based set was standardized in 1968. That was back before there was such a thing as a personal computer, before VisiCalc, before the mouse, before the Web, but I still look up ASCII charts online regularly.

I remember when I started my career many years ago, I worked with an engineer who kept an ASCII code chart in his wallet. Just in case. The ASCII Code Chart: Don't leave home without it.

So I won't gainsay the importance of ASCII, but now it is dated, especially in light of the Unicode standard, which currently represents over 100,000 characters. Unicode, however, does not leave ASCII in the dust; it incorporates ASCII into its Basic Latin code table.

In this chapter, you will step out of the province of ASCII into the not-so-new world of Unicode.

The first text is _voltaire.txt_ from the code archive, a quote from Voltaire (1694–1778), the French Enlightenment philosopher.

> Qu'est-ce que la tolérance? c'est l'apanage de l'humanité. Nous sommes tous pétris de faiblesses et d'erreurs; pardonnons-nous réciproquement nos sottises, c'est la première loi de la nature.

Here is an English translation:

> What is tolerance? It is the consequence of humanity. We are all formed of frailty and error; let us pardon reciprocally each other's folly—that is the first law of nature.

# Matching a Unicode Character

There are a variety of ways you can specify a Unicode character, also known as a code point. (For the purposes of this book, a Unicode character is one that is outside of the range of ASCII, though that is not strictly accurate.)

Start out by placing the Voltaire quote in Regexpal, and then enter this regular expression:

    \u00e9

The `\u` is followed by the hexadecimal value 00e9 (this is case insensitive—that is, 00E9 works, too). The value 00e9 is equivalent to the decimal value 233, well out of the ASCII range (0–127).
Notice that the letter _é_ (small letter e with an acute accent) is highlighted in Regexpal (see Figure 6-1). That's because _é_ is the code point U+00E9 in Unicode, which was matched by `\u00e9`.

Figure 6-1. Matching U+00E9 in Regexpal

Regexpal uses the JavaScript implementation of regular expressions. JavaScript also allows you to use this syntax:

    \xe9

Try this in Regexpal and see how it matches the same character as `\u00e9`.

Let's try it with a different regex engine. Open Regex Hero in a browser. Regex Hero is written in .NET and has a slightly different syntax. Drop the contents of the file _basho.txt_ into the text area labeled Target String. This contains a famous haiku written by the Japanese poet Matsuo Basho (who, coincidentally, died just one week before Voltaire was born).

Here is the poem in Japanese:

    古池
    蛙飛び込む
    水の音
    —芭蕉 (1644–1694)

And here is a translation in English:

    At the ancient pond
    a frog plunges into
    the sound of water.
    —Basho (1644–1694)

To match part of the Japanese text, in the text area marked Regular Expression, type the following:

    \u6c60

This is the code point for the Japanese (Chinese) character for _pond_. It will be highlighted below (see Figure 6-2).

Figure 6-2. Matching U+6C60 in Regex Hero

While you are here, try matching the em dash (—) with:

    \u2014

Or the en dash (–) with:

    \u2013

Now look at these characters in an editor.

## Using _vim_

If you have _vim_ on your system, you can open _basho.txt_ with it, as shown:

    vim basho.txt

Now, starting with a forward slash (/), enter a search with this line:

    /\%u6c60

followed by Enter or Return. The cursor moves to the beginning of the match, as you can see in Figure 6-3. Table 6-1 shows you your options. You can use _x_ or _X_ following the `\%` to match values in the range 0–255 (0–FF), _u_ to match up to four hexadecimal digits in the range 256–65,535 (100–FFFF), or _U_ to match up to eight hexadecimal digits in the range 65,536–2,147,483,647 (10000–7FFFFFFF). That takes in a lot of code points—a lot more than currently exist in Unicode.

Table 6-1. Matching Unicode in Vim

First Character| Maximum Characters| Maximum Value
---|---|---
x or X | 2 | 255 (FF)
u | 4 | 65,535 (FFFF)
U | 8 | 2,147,483,647 (7FFFFFFF)

Figure 6-3. Matching U+6C60 in Vim

# Matching Characters with Octal Numbers

You can also match characters using an octal (base 8) number, which uses the digits 0 to 7. In regex, this is done with three digits, preceded by a backslash (\).

For example, the following octal number:

    \351

is the same as:

    \u00e9

Experiment with it in Regexpal with the Voltaire text. `\351` matches _é_, with a little less typing.

# Matching Unicode Character Properties

In some implementations, such as Perl, you can match on Unicode character properties. The properties include characteristics like whether the character is a letter, number, or punctuation mark.

I'll now introduce you to _ack_, a command-line tool written in Perl that acts a lot like _grep_. It probably won't already be on your system; you have to download and install it yourself (see Technical Notes).

We'll use _ack_ on an excerpt from Friedrich Schiller's "An die Freude," composed in 1785 (German, if you can't tell):

    An die Freude.

    Freude, schöner Götterfunken,
    Tochter aus Elisium,
    Wir betreten feuertrunken
    Himmlische, dein Heiligthum.
    Deine Zauber binden wieder,
    was der Mode Schwerd getheilt;
    Bettler werden Fürstenbrüder,
    wo dein sanfter Flügel weilt.

    Seid umschlungen, Millionen!
    Diesen Kuß der ganzen Welt!
    Brüder, überm Sternenzelt
    muß ein lieber Vater wohnen.

There are a few interesting characters in this excerpt, beyond ASCII's small realm. We'll look at the text of this poem through properties. (If you would like a translation of this poem fragment, you can drop it into Google Translate.)

Using _ack_ on a command line, you can specify that you want to see all the characters whose property is Letter (L):

    ack '\pL' schiller.txt

This will show you all the letters highlighted. For lowercase letters, use _Ll_, surrounded by braces:

    ack '\p{Ll}' schiller.txt

You must add the braces. For uppercase, it's _Lu_:

    ack '\p{Lu}' schiller.txt

To specify characters that do _not_ match a property, we use uppercase _P_:

    ack '\PL' schiller.txt

This highlights characters that are not letters.

The following finds those that are not lowercase letters:

    ack '\P{Ll}' schiller.txt

And this highlights the ones that are not uppercase:

    ack '\P{Lu}' schiller.txt

You can also do this in yet another browser-based regex tester. Figure 6-4 shows the Schiller text with its lowercase letters highlighted using the lowercase property (`\p{Ll}`).

Figure 6-4. Characters with the lowercase letter property

Table 6-2 lists character property names for use with `\p{`_`property`_`}` or `\P{`_`property`_`}` (see the pcresyntax(3) man page). You can also match human languages with properties; see Table A-8.

Table 6-2. Character properties

Property| Description
---|---
C | Other
Cc | Control
Cf | Format
Cn | Unassigned
Co | Private use
Cs | Surrogate
L | Letter
Ll | Lowercase letter
Lm | Modifier letter
Lo | Other letter
Lt | Title case letter
Lu | Uppercase letter
L& | Ll, Lu, or Lt
M | Mark
Mc | Spacing mark
Me | Enclosing mark
Mn | Non-spacing mark
N | Number
Nd | Decimal number
Nl | Letter number
No | Other number
P | Punctuation
Pc | Connector punctuation
Pd | Dash punctuation
Pe | Close punctuation
Pf | Final punctuation
Pi | Initial punctuation
Po | Other punctuation
Ps | Open punctuation
S | Symbol
Sc | Currency symbol
Sk | Modifier symbol
Sm | Mathematical symbol
So | Other symbol
Z | Separator
Zl | Line separator
Zp | Paragraph separator
Zs | Space separator

# Matching Control Characters

How do you match control characters? It's not all that common that you will search for control characters in text, but it's a good thing to know. In the example repository or archive, you'll find the file _ascii.txt_, which is a 128-line file that contains all the ASCII characters in it, each on a separate line (hence the 128 lines). When you perform a search on the file, it will usually return a single line if it finds a match. This file is good for testing and general fun.

### Note

If you search for strings or control characters in _ascii.txt_ with _grep_ or _ack_, they may interpret the file as a binary file. If so, when you run a script on it, either tool may simply report "Binary file ascii.txt matches" when it finds a match. That's all.

In regular expressions, you can specify a control character like this:

    \cx

where _x_ is the control character you want to match.
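For instance, `\cI` names the tab character (Ctrl-I), so a sketch like this (the input strings are made up) prints only the line that contains a tab:

    printf 'col1\tcol2\nno tabs here\n' | perl -ne 'print if /\cI/'

Only the first line comes back, because the second has no tab in it.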
Let's say, for example, you wanted to find a null character in a file. You can use Perl to do that with the following command:

    perl -n -e 'print if /\c@/' ascii.txt

Provided that you've got Perl on your system and it's running properly, you will get this result:

    0. Null

That line matches because it contains a null character, even though you can't see the character in the result.

### Note

If you open _ascii.txt_ with an editor other than _vim_, it will likely remove the control characters from the file, so I suggest you don't do it.

You can also use `\0` to find a null character. Try this, too:

    perl -n -e 'print if /\0/' ascii.txt

Pressing on, you can find the bell (BEL) character using:

    perl -n -e 'print if /\cG/' ascii.txt

It will return the line:

    7. Bell

Or you can use the shorthand:

    perl -n -e 'print if /\a/' ascii.txt

To find the escape character, use:

    perl -n -e 'print if /\c[/' ascii.txt

which gives you:

    27. Escape

Or do it with a shorthand:

    perl -n -e 'print if /\e/' ascii.txt

How about a backspace character? Try:

    perl -n -e 'print if /\cH/' ascii.txt

which spits back:

    8. Backspace

You can also find a backspace using a bracketed expression:

    perl -n -e 'print if /[\b]/' ascii.txt

Without the brackets, how would `\b` be interpreted? That's right, as a word boundary, as you learned in Chapter 2. The brackets change the way the `\b` is understood by the processor. In this case, Perl sees it as a backspace character.

Table 6-3 lists the ways we matched characters in this chapter.

Table 6-3. Matching Unicode and other characters

Code| Description
---|---
`\u`_`xxxx`_ | Unicode (four places)
`\x`_`xx`_ | Unicode (two places)
`\x{`_`xxxx`_`}` | Unicode (four places)
`\x{`_`xx`_`}` | Unicode (two places)
`\000` | Octal (base 8)
`\c`_`x`_ | Control character
`\0` | Null
`\a` | Bell
`\e` | Escape
`[\b]` | Backspace

That wraps things up for this chapter. In the next, you'll learn more about quantifiers.

# What You Learned in Chapter 6

  * How to match any Unicode character with `\u`_`xxxx`_ or `\x`_`xx`_
  * How to match any Unicode character inside of _vim_ using `\%x`_`xx`_, `\%X`_`xx`_, `\%u`_`xxxx`_, or `\%U`_`xxxxxxxx`_
  * How to match characters in the range 0–255 using octal format with `\000`
  * How to use Unicode character properties with `\p{`_`x`_`}`
  * How to match control characters with `\e` or `\cH`
  * More on how to use Perl on the command line (more Perl one-liners)

# Technical Notes

  * I entered control characters in _ascii.txt_ using _vim_. In _vim_, you can use Ctrl+V followed by the appropriate control sequence for the character, such as Ctrl+C for the end-of-text character. I also used Ctrl+V followed by _x_ and the two-digit hexadecimal code for the character. You can also use digraphs to enter control codes; in _vim_ enter `:digraph` to see the possible codes. To enter a digraph, use Ctrl+K while in Insert mode, followed by a two-character digraph (for example, _NU_ for null).
  * RegexHero is a .NET regex implementation in a browser written by Steve Wortham. This one is for pay, but you can test it out for free, and if you like it, the prices are reasonable (you can buy it at a standard or a professional level).
  * _vim_ is an evolution of the _vi_ editor that was created by Bill Joy in 1976. The _vim_ editor was developed primarily by Bram Moolenaar. It seems archaic to the uninitiated, but as I've mentioned, it is incredibly powerful.
  * The _ack_ tool is written in Perl. It acts like _grep_ and has many of its command-line options, but it outperforms _grep_ in many ways. For example, it uses Perl regular expressions instead of basic regular expressions like _grep_ (without _-E_). For installation instructions, see the ack website; I used the specific instructions under "Install the ack executable." I didn't use _curl_ but just downloaded _ack_ with the link provided and then copied the script into _/usr/bin_ on both my Mac and a PC running Cygwin in Windows 7.

# Chapter 7. Quantifiers

You have already seen some quantifiers at work earlier in this book, but here I'll talk about them in more detail.

For our example this time, we'll use a Mac desktop application called Reggy (Figure 7-1), as we did in Chapter 5. Uncheck _Match All_ at the bottom to start.

If you are not on a Mac, you can try these examples in one of the applications you've seen earlier in the book. Paste in the right triangle of digits from the file _triangle.txt_, which is in the archive of examples.

Figure 7-1. Reggy application

# Greedy, Lazy, and Possessive

I'm not talking about your teenager here. I'm talking about quantifiers. These adjectives may not sound like good character qualities, but they are interesting features of quantifiers that you need to understand if you want to use regular expressions with skill.

Quantifiers are, by themselves, greedy. A greedy quantifier first tries to match the whole string. It grabs as much as it can, the whole input, trying to make a match. If the first attempt to match the whole string goes awry, it backs up one character and tries again. This is called _backtracking_. It keeps backing up one character at a time until it finds a match or runs out of characters to try. It also keeps track of what it is doing, so it puts the most load on resources compared with the next two approaches. It takes a mouthful, then spits back a little at a time, chewing on what it just ate. You get the idea.

A lazy (sometimes called _reluctant_) quantifier takes a different tack. It starts at the beginning of the target, trying to find a match, looking at the string one character at a time. Only at last will it attempt to match the whole string. To get a quantifier to be lazy, you append a question mark (`?`) to the regular quantifier. It chews one nibble at a time.

A possessive quantifier grabs the whole target and then tries to find a match, but it makes only one attempt. It does not do any backtracking. A possessive quantifier appends a plus sign (`+`) to the regular quantifier. It doesn't chew; it just swallows, then wonders what it just ate. I'll demonstrate each of these in the pages that follow.

# Matching with *, +, and ?

If you have the triangle of digits in Reggy, you can now begin testing. First we'll use the Kleene star, named for the man credited as the inventor of regular expressions, Stephen Kleene. If you use the star or asterisk following a dot like this:

    .*

it would match, being greedy, all the characters (digits) in the subject text. As you know from earlier reading, `.*` matches any character zero or more times. All the digits in the lower box should be highlighted by changing color. Of the Kleene star, an early manual said:

> A regular expression followed by "*" [Kleene star] is a regular expression which matches any number (including zero) of adjacent occurrences of the text matched by the regular expression.
Now try:

    9*

and the row of nines near the bottom should be highlighted. Now:

    9.*

lights up the row of nines and the row of zeros below it. Because _Multiline_ is checked (at the bottom of the application window), the dot will match the newline character between the rows; normally, it would not.

To match one or more 9s, try:

    9+

How is that different? You can't really tell because there are nine 9s in the subject text. The main difference is that `+` is looking for at least one 9, but `*` is looking for zero or more.

To match zero or one time (optional), use:

    9?

This will match the first occurrence of 9 only. That 9 is considered optional, so because it does exist in the subject text, it is matched and highlighted. If you do this:

    99?

then both the first and second 9 are matched.

Table 7-1 lists the basic quantifiers and some of the possibilities that they have. These quantifiers are by default _greedy_, meaning that they match as many characters as they possibly can on the first attempt.

Table 7-1. Basic quantifiers

Syntax| Description
---|---
`?` | Zero or one (optional)
`+` | One or more
`*` | Zero or more

# Matching a Specific Number of Times

When you use braces or squiggly brackets, you can match a pattern a specific number of times in a range. Unmodified, these are greedy quantifiers. For example:

    7{1}

will match the first occurrence of 7. If you wanted to match one _or more_ occurrences of the number 7, all you have to do is add a comma:

    7{1,}

You've probably realized that both:

    7+

and

    7{1,}

are essentially the same thing, and that:

    7*

and

    7{0,}

are likewise the same. In addition:

    7?

is the same as:

    7{0,1}

To find a range of matches, that is, to match _m_ to _n_ times:

    7{3,5}

This will match three, four, or five occurrences of 7.

So to review, the squiggly bracket or range syntax is the most flexible and precise quantifier. Table 7-2 summarizes these features.

Table 7-2. Summary of range syntax

Syntax| Description
---|---
{ _n_ } | Match _n_ times exactly
{ _n_ ,} | Match _n_ or more times
{ _m,n_ } | Match _m_ to _n_ times
{0,1} | Same as `?` (zero or one)
{1,} | Same as `+` (one or more)
{0,} | Same as `*` (zero or more)

# Lazy Quantifiers

Now let's set aside greediness and get lazy. The easiest way for you to understand this is by seeing it in action. In Reggy (making sure _Match All_ is unchecked), try to match zero or one 5 using a single question mark (`?`):

    5?

The first 5 is highlighted. Add an additional `?` to make the quantifier lazy:

    5??

Now it doesn't appear to match anything. The reason is that the pattern is being lazy; it's not even forced to match that first 5. By nature, the _lazy_ match matches as few characters as it can get away with. It's a slacker.

Try this zero or more times:

    5*?

and it won't match anything either, because you gave it the option to match a minimum of zero times, and that's what it does.

Try it again matching one or more times, à la lazy:

    5+?

And there you go. Lazy just got off the couch and matched one 5. That's all it had to do to keep its day job.

Things get a bit more interesting as you apply _m,n_ matching. Try this:

    5{2,5}?

Only two 5s are matched, not all five of them, as a greedy match would.
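Outside Reggy, a Perl one-liner makes the greedy/lazy contrast easy to see. In this sketch (the tagged string is invented for illustration), the greedy pattern swallows everything from the first `<` to the last `>`, while the lazy version stops at the first `>` it can:

    perl -e '$_ = "<b>bold</b> and <i>italic</i>";
             print "greedy: $&\n" if /<.+>/;
             print "lazy:   $&\n" if /<.+?>/'

The greedy line prints the entire string, `<b>bold</b> and <i>italic</i>`; the lazy line prints just `<b>`.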
Table 7-3 lists the lazy quantifiers. When is lazy matching useful? You can use lazy matching when you want to match the bare minimum of characters, not the maximum possible.

Table 7-3. Lazy quantifiers

Syntax| Description
---|---
?? | Lazy zero or one (optional)
+? | Lazy one or more
*? | Lazy zero or more
{ _n_ }? | Lazy _n_
{ _n_ ,}? | Lazy _n_ or more
{ _m,n_ }? | Lazy _m,n_

# Possessive Quantifiers

A possessive match is like a greedy match in that it grabs as much as it can get away with. But unlike a greedy match, it does not backtrack. It does not give up anything it finds. It is selfish. That is why it is called _possessive_. Arms folded firmly, it doesn't give up any ground. But the good thing about possessive quantifiers is that they are faster, because they don't do any backtracking, and they also fail in a hurry.

### Note

The truth is, you can hardly tell the difference between greedy, lazy, and possessive matches with the examples in this book. But as you gain more experience, and performance tuning becomes important, you'll want to be aware of these differences.

To make sense of this, first we'll try matching the zeroes with a leading zero, then with a trailing zero. In Reggy, make sure _Match All_ is checked, and enter this expression with a leading zero:

    0.*+

What happened? All the zeroes are highlighted. There was a match. The possessive match appears to do the same thing as a greedy match, with one subtle difference: There is no backtracking. You can now prove it. Enter this with a trailing zero:

    .*+0

No match. The reason is that there was no backtracking. It gobbled up the entire input and never looked back. It wasted its inheritance with riotous living. It can't find the trailing zero. It doesn't know where to look. If you remove the plus sign, it finds all the zeroes, as it goes back to being a greedy match:

    .*0

You might want to use a possessive quantifier when you are aware of what is in your text and you know where you will find matches. You don't care if it grabs with gusto. A possessive match can help you match with improved performance. Table 7-4 shows the possessive quantifiers.

Table 7-4. Possessive quantifiers

Syntax| Description
---|---
?+ | Possessive zero or one (optional)
++ | Possessive one or more
*+ | Possessive zero or more
{ _n_ }+ | Possessive _n_
{ _n_ ,}+ | Possessive _n_ or more
{ _m,n_ }+ | Possessive _m,n_

You'll be introduced to lookarounds in the next chapter.

# What You Learned in Chapter 7

  * The differences between greedy, lazy, and possessive matching
  * How to match one or more (`+`)
  * How to match optionally (zero or one, `?`)
  * How to match zero or more (`*`)
  * How to use { _m,n_ } quantifiers
  * How to use greedy, lazy (reluctant), and possessive quantifiers

# Technical Notes

The quote comes from Dennis Ritchie and Ken Thompson, _QED Text Editor_ (Murray Hill, NJ: Bell Labs, 1970), p. 3.

# Chapter 8. Lookarounds

Lookarounds are non-capturing groups that match patterns based on what they find either in front of or behind a pattern. Lookarounds are also considered _zero-width assertions_.

Lookarounds include:

  * Positive lookaheads
  * Negative lookaheads
  * Positive lookbehinds
  * Negative lookbehinds

In this chapter, I'll show you how each of these works. We'll start out using RegExr on the desktop and then move on to Perl and _ack_ (_grep_ doesn't know about lookarounds). Our text is still Coleridge's well-worn poem.
# Positive Lookaheads

Suppose you want to find every occurrence of the word _ancyent_ that is followed by _marinere_ (I use the archaic spellings because that is what is found in the file). To do this, we could use a positive lookahead.

First let's try it in RegExr desktop. The following case-insensitive pattern goes in the text box at the top:

    (?i)ancyent (?=marinere)

### Note

You can also specify case-insensitivity with RegExr by simply checking the box next to _ignoreCase_; both methods work.

Because you use the case-insensitive option (`?i`), you don't need to worry about what case you use in your pattern. You are looking for every line that has the word _ancyent_ followed hard by _marinere_. The results will be highlighted in the text area below the pattern area (see Figure 8-1); however, only the first part of the pattern will be highlighted (_ancyent_), not the lookahead pattern (_Marinere_).

Figure 8-1. Positive lookahead in RegExr

Let's now use Perl to do a positive lookahead. You can form the command like so:

    perl -ne 'print if /(?i)ancyent (?=marinere)/' rime.txt

and the output should look like this:

    THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.
    How a Ship having passed the Line was driven by Storms to the cold Country towards
    the South Pole; and how from thence she made her course to the tropical Latitude of
    the Great Pacific Ocean; and of the strange things that befell; and in what manner the
    Ancyent Marinere came back to his own Country.
    It is an ancyent Marinere,
    "God save thee, ancyent Marinere!
    "I fear thee, ancyent Marinere!

There are five lines in the poem where the word _ancyent_ shows up right before the word _marinere_. What if we just wanted to check whether the word following _ancyent_ started with the letter _m_, either in upper- or lowercase? We could do it this way:

    perl -ne 'print if /(?i)ancyent (?=m)/' rime.txt

In addition to `Marinere`, you would get `man` and `Man`:

    And thus spake on that ancyent man,
    And thus spake on that ancyent Man,

_ack_ can also do lookarounds, as it is written in Perl. The command-line interface for _ack_ is very similar to _grep_'s.

Try this:

    ack '(?i)ancyent (?=ma)' rime.txt

and you'll see highlighted results, as shown in Figure 8-2.

Figure 8-2. Positive lookahead with ack in Terminal

With _ack_, you can specify case-insensitivity with the command-line option _-i_, rather than with the embedded option `(?i)`:

    ack -i 'ancyent (?=ma)' rime.txt

I'll throw something in here for good measure. If you want to add line numbers to _ack_'s output, you can do several things. You can add the _-H_ option:

    ack -Hi 'ancyent (?=ma)' rime.txt

Or you could add this code with the _--output_ option:

    ack -i --output '$.:$_' 'ancyent (?=ma)' rime.txt

This is a bit of a hack, and it turns off highlighting, but it works.

# Negative Lookaheads

The flip side of a positive lookahead is a negative lookahead. This means that as you try to match a pattern, you _won't_ find a given lookahead pattern. A negative lookahead is formed like this:

    (?i)ancyent (?!marinere)

Only one character changed: The equals sign (`=`) in the positive lookahead became an exclamation point (`!`) in the negative lookahead. Figure 8-3 shows you this negative lookahead in Opera.

Figure 8-3. Negative lookahead with RegExr in Opera
In Perl, we could do a negative lookahead this way:

    perl -ne 'print if /(?i)ancyent (?!marinere)/' rime.txt

and this is what we would get back:

    And thus spake on that ancyent man,
    And thus spake on that ancyent Man,

In _ack_, the same results could be produced with:

    ack -i 'ancyent (?!marinere)' rime.txt

# Positive Lookbehinds

A positive lookbehind looks to the left, in the opposite direction from a lookahead. The syntax is:

    (?i)(?<=ancyent) marinere

The positive lookbehind throws in a less-than sign (`<`), reminding you which direction the lookbehind faces. Try this in RegExr and see what the difference is. Instead of _ancyent_ being highlighted, _marinere_ is. Why? Because the positive lookbehind is a condition of the match and is not included or consumed in the match results.

Do it like so in Perl:

    perl -ne 'print if /(?i)(?<=ancyent) marinere/' rime.txt

And like this with _ack_:

    ack -i '(?<=ancyent) marinere' rime.txt

# Negative Lookbehinds

Finally, there is the negative lookbehind. And how do you think this one works? It looks to see whether a pattern does _not_ show up behind, in the left-to-right stream of text. Again, it adds a less-than sign (`<`), reminding you which direction the lookbehind faces.

Do this in RegExr and see the results:

    (?i)(?<!ancyent) marinere

This matches each occurrence of _marinere_, in any case, that is _not_ preceded by the word _ancyent_. In Perl and _ack_, the same pattern drops in exactly where the positive lookbehind went.

# What You Learned in Chapter 8

  * How to use positive and negative lookaheads
  * How to use positive and negative lookbehinds

# Chapter 9. Marking Up a Document with HTML

In this chapter, we'll use what you've learned so far to mark up a plain-text file with HTML, first with _sed_ and then with Perl.

# Matching Tags

Many patterns that claim to match HTML or XML start-tags (e.g., `<html>`) or end-tags (e.g., `</html>`) are unreliable, but I have found the one that follows to be reliable. It will match start-tags, with or without attributes:

    <[_a-zA-Z][^>]*>

Here is what it does:

  * The first character is a left angle bracket (`<`).
  * Elements can begin with an underscore character (`_`) in XML or a letter in the ASCII range, in either upper- or lowercase (see Technical Notes).
  * Following the start character, the name can be followed by zero or more characters, any character other than a right angle bracket (`>`).
  * The expression ends with a right angle bracket.

Try this with _grep_. Match it against a sample DITA file in the archive, _lorem.dita_:

    grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita

yields this answer:

    <topic id="...">
    <title>
    <body>
    <p>
    <p>
    <ul>
    <li>
    <li>
    <li>
    <p>
    <p>

To match both start- and end-tags, simply add a forward slash followed by a question mark. The question mark makes the forward slash optional:

    </?[_a-zA-Z][^>]*>

I'm sticking with start-tags only here. To refine the output, I often pipe in a few other tools to make it prettier:

    grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita | sort | uniq | sed 's/^<//;s/ id=\".*\"//;s/>$//'

This gives you a list of sorted XML tag names:

    body
    li
    p
    p
    title
    topic
    ul

I'll take this a step further in the next and final chapter. The following sections will take you through some of the steps you have learned before, but with a few new twists.

# Transforming Plain Text with _sed_

Let's add some markup to the top of the text in _rime.txt_. We can do this with the insert command (`i\`). In the directory where the _rime.txt_ file is located, enter the following at a shell prompt:

    sed '1 i\
    <!DOCTYPE html>\
    <html lang="en">\
    <head>\
    <title>The Rime of the Ancyent Marinere (1798)</title>\
    </head>\
    <body>
    q' rime.txt

After you press Enter or Return, your output should look like the following, with the tags at the top:

    <!DOCTYPE html>
    <html lang="en">
    <head>
    <title>The Rime of the Ancyent Marinere (1798)</title>
    </head>
    <body>
    THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.

The command you just entered did not actually change the file—it only produced an output to your screen.
I'll show you how to write your changes to a file later.

## Substitution with _sed_

In the next example, _sed_ finds the first line of the file and captures the entire line in a capturing group using escaped parentheses `\(` and `\)`. _sed_ needs to escape the parentheses used to capture a group unless you use the _-E_ option (more on this in a moment). The beginning of the line is demarcated with `^`, and the end of the line with `$`. The backreference `\1` pulls the captured text into the content of the _title_ element, indented with one space.

Run the command that follows:

    sed '1s/^\(.*\)$/ <title>\1<\/title>/;q' rime.txt

The resulting line looks like this:

     <title>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</title>

Now try it this way:

    sed -E '1s/^(.*)$/<!DOCTYPE html>\
    <html lang="en">\
    <head>\
     <title>\1<\/title>\
    <\/head>\
    <body>\
     <h1>\1<\/h1>\
    /;q' rime.txt

Let's talk about it:

  * The _-E_ option tells _sed_ to use extended regular expressions or EREs (so you don't have to escape the parentheses, etc.).
  * Using a substitute (_s_) command, grab line 1 in a capturing group (`^(.*)$`) so you can reuse the text with `\1`.
  * Create HTML tags and escape the newlines with `\`.
  * Insert the captured text in the _title_ and _h1_ tags using `\1`.
  * Quit at this point (`q`) to stop printing the rest of the poem to the screen.

The correct result is:

    <!DOCTYPE html>
    <html lang="en">
    <head>
     <title>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</title>
    </head>
    <body>
     <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>
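A small portability aside, not something these examples require: if your _sed_ doesn't recognize _-E_ (older GNU sed releases, for instance, spell the extended-regex flag _-r_ instead), the same commands work with that flag swapped in:

    sed -r '1s/^(.*)$/ <title>\1<\/title>/;q' rime.txt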
## Handling Roman Numerals with _sed_

The poem is divided into seven sections, with each section introduced with a Roman numeral. There is also an "ARGUMENT" heading. The following line will use _sed_ to capture that heading and those Roman numerals and surround them in _h2_ tags:

    sed -En 's/^(ARGUMENT\.|I{0,3}V?I{0,2}\.)$/<h2>\1<\/h2>/p' rime.txt

and here is what you'll see:

    <h2>ARGUMENT.</h2>

    <h2>I.</h2>

    <h2>II.</h2>

    <h2>III.</h2>

    <h2>IV.</h2>

    <h2>V.</h2>

    <h2>VI.</h2>

    <h2>VII.</h2>
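Curious exactly which numerals `I{0,3}V?I{0,2}` accepts? Here is a quick sketch that feeds it candidate strings (the list is made up for the test):

    printf 'I\nII\nIII\nIV\nV\nVI\nVII\nVIII\nIX\n' | grep -E '^I{0,3}V?I{0,2}$'

It prints I through VII and rejects VIII and IX, which is exactly the range the poem uses. (It would also accept a malformed string like IIV, but nothing of that shape appears in the file, so the looseness is harmless here.)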

## Handling a Specific Paragraph with _sed_

The following command finds the introductory paragraph on line 5:

    sed -En '5s/^([A-Z].*)$/<p>\1<\/p>/p' rime.txt

and places that paragraph in a _p_ tag:

    <p>How a Ship having passed the Line was driven by Storms to the cold Country towards
    the South Pole; and how from thence she made her course to the tropical Latitude
    of the Great Pacific Ocean; and of the strange things that befell; and in what
    manner the Ancyent Marinere came back to his own Country.</p>

I know this looks like we are moving inchmeal at the moment, but hang on and I'll bring it all together in a page or two.

## Handling the Lines of the Poem with _sed_

Next we'll mark up the lines of the poem with:

    sed -E '9s/^[ ]*(.*)/ <p>\1<br\/>/;10,832s/^([ ]{5,7}.*)/\1<br\/>/;833s/^(.*)/\1<\/p>/' rime.txt

These _sed_ substitutions depend on line numbers to get their little jobs done. This wouldn't work in a generalized case, but it works quite well when you know exactly what you are dealing with.

  * On line 9, the first line of verse, the _s_ command grabs the line and, after prepending a few spaces, inserts a _p_ start-tag and appends a _br_ (break) tag at the end of the line.
  * Between lines 10 and 832, every line that begins with 5 to 7 spaces gets a _br_ appended to it.
  * On line 833, the last line of the poem, instead of a _br_, the _s_ appends a _p_ end-tag.

A sample of the resulting markup is here:

     <p>It is an ancyent Marinere,<br/>
     And he stoppeth one of three:<br/>
     "By thy long grey beard and thy glittering eye<br/>
     "Now wherefore stoppest me?<br/>

     "The Bridegroom's doors are open'd wide<br/>
     "And I am next of kin;<br/>
     "The Guests are met, the Feast is set,--<br/>
     "May'st hear the merry din.--<br/>
You should also replace the blank lines with a _br_, to keep the verses separated:

    sed -E 's/^$/<br\/>/' rime.txt

See what you just did:

     He prayeth best who loveth best,
     All things both great and small:
     For the dear God, who loveth us,
     He made and loveth all.
    <br/>
     The Marinere, whose eye is bright,
     Whose beard with age is hoar,
     Is gone; and now the wedding-guest
     Turn'd from the bridegroom's door.
    <br/>
     He went, like one that hath been stunn'd
     And is of sense forlorn:
     A sadder and a wiser man
     He rose the morrow morn.

I have found that I can play with this kind of thing endlessly, getting the tags and space just right. I encourage you to do so yourself.

# Appending Tags

Now we'll append some tags to the end of the poem. With the append command (`a\`), the `$` finds the end (the last line) of the file, and appends (`a\`) the _body_ and _html_ end-tags after the last line:

    sed '$ a\
    <\/body>\
    <\/html>\
    ' rime.txt

Here's how the end of the file will look now:

     He went, like one that hath been stunn'd
     And is of sense forlorn:
     A sadder and a wiser man
     He rose the morrow morn.

    </body>
    </html>

Enough _sed_.

What if you wanted to do all of these changes at the same time? You know what to do. You've already done it. You just have to put all these commands in a file and use the _-f_ option with _sed_.

## Using a Command File with _sed_

This example shows the file _html.sed_, which collects all the previous _sed_ commands into one file, plus a command or two more. We'll use this file of commands to transform _rime.txt_ to HTML using _sed_. The numbered callouts in the example will guide you through what is happening in the _sed_ script.

    #!/usr/bin/sed ![1](callouts/1.png)

    1s/^(.*)$/\ ![2](callouts/2.png)
    <!DOCTYPE html>\
    <html lang="en">\
    <head>\
     <title>\1<\/title>\
    <\/head>\
    <body>\
     <h1>\1<\/h1>\
    /

    s/^(ARGUMENT|I{0,3}V?I{0,2})\.$/<h2>\1<\/h2>/ ![3](callouts/3.png)
    5s/^([A-Z].*)$/<p>\1<\/p>/ ![4](callouts/4.png)
    9s/^[ ]*(.*)/ <p>\1<br\/>/ ![5](callouts/5.png)
    10,832s/^([ ]{5,7}.*)/\1<br\/>/ ![6](callouts/6.png)
    833s/^(.*)/\1<\/p>/ ![7](callouts/7.png)
    13,$s/^$/<br\/>/ ![8](callouts/8.png)
    $ a\ ![9](callouts/9.png)
    <\/body>\
    <\/html>

The first line is called the _shebang_ line, a hint to the shell of where the executable (_sed_) is located.

At line 1, substitute (_s_) the line with the tags that follow. The backslash (`\`) indicates that the text you want to add continues on the next line, so a newline is inserted. Insert the title of the poem from line 1, with `\1`, as the content of the _title_ and _h1_ elements.

Surround headings and Roman numerals with _h2_ tags.

On line 5, enclose the introductory paragraph in a _p_ element.

On line 9, prepend a _p_ start-tag and add a _br_ at the end of the line.

Between lines 10 and 832, add a _br_ at the end of each line that begins with a certain number of spaces.

At the end of the poem, append a _p_ end-tag.

After line 13, replace each blank line with a break (_br_).

Append a few tags at the end (`$`) of the document.

To apply this command file to _rime.txt_, enter this line, followed by Enter or Return:

    sed -E -f html.sed rime.txt

To redirect the output to a file:

    sed -E -f html.sed rime.txt > rime.html

Open _rime.html_ in a browser to see what you have created (see Figure 9-1).

Figure 9-1. rime.html in Firefox

# Transforming Plain Text with Perl

I'll now show you how to mark up a file with HTML using Perl. First, like with _sed_, I'll give you a series of one-liners; then I'll show those same commands in a file.

### Note

This book introduces you to only the rudiments of the Perl language, and how to get started using it. It is not a Perl tutorial or manual, but I hope to pique your interest in Perl and show you a few possibilities.
A good place to get started with Perl is the Learning Perl website found at <http://learn.perl.org/>, which also includes instructions on how to install it.

If the current line (`$.`) is line 1, assign the whole line (`$_`) to the _$title_ variable and print _$title_:

    perl -ne 'if ($. == 1) {chomp($title = $_); print "<h1>" . $title . "</h1>" . "\n";};' rime.txt

If all goes well, the result should be:

    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>

Here is an explanation for the Perl command:

  * Test if you are on line 1 with `$.`
  * Chomp the line (`$_`) and assign the string to the `$title` variable. When you chomp the line with the _chomp_ function, it removes the trailing newline from the string.
  * Print `$title` in an _h1_ element, followed by a newline (`\n`).

### Note

For more information on Perl's built-in variables, such as `$.`, enter the command `perldoc -v $.` at a prompt (_perldoc_ normally is installed when you install Perl). If this doesn't work, see Technical Notes.

To prepend some markup to the top of the file, including that _h1_ tag, use this:

    perl -ne 'if ($. == 1) {chomp($title = $_)};
    print "<!DOCTYPE html>\
    <html xmlns=\"http://www.w3.org/1999/xhtml\">\
    <head>\
    <title>$title</title>\
    </head>\
    <body>\
    <h1>$title</h1>\
    \n" if $. == 1; exit' rime.txt

and you'll get the following output:

    <!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <title>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</title>
    </head>
    <body>
    <h1>THE RIME OF THE ANCYENT MARINERE, IN SEVEN PARTS.</h1>
The _print_ function prints the tags that follow, and each line (except the last) is followed by a `\`, which enters a newline into the output. The `$title` variable is expanded within the _title_ and _h1_ elements.

## Handling Roman Numerals with Perl

To tag up the heading and those Roman numeral section breaks, use:

    perl -ne 'print if s/^(ARGUMENT\.|I{0,3}V?I{0,2}\.)$/<h2>\1<\/h2>/;' rime.txt

This is the output:

    <h2>ARGUMENT.</h2>

    <h2>I.</h2>

    <h2>II.</h2>

    <h2>III.</h2>

    <h2>IV.</h2>

    <h2>V.</h2>

    <h2>VI.</h2>

    <h2>VII.</h2>

The substitute (_s_) command captures the _ARGUMENT_ heading and those seven uppercase Roman numerals, each on a separate line and followed by a period, in the range I through VII. Then it encloses the captured text in an _h2_ tag.

## Handling a Specific Paragraph with Perl

Use this code to enclose the introductory paragraph in a _p_ element, if the line number is equal to 5:

    perl -ne 'if ($. == 5) {s/^([A-Z].*)$/<p>$1<\/p>/;print;}' rime.txt

You should see this:

    <p>How a Ship having passed the Line was driven by Storms to the cold Country towards
    the South Pole; and how from thence she made her course to the tropical Latitude
    of the Great Pacific Ocean; and of the strange things that befell; and in what
    manner the Ancyent Marinere came back to his own Country.</p>

## Handling the Lines of the Poem with Perl

The following command places a _p_ start-tag at the beginning of the first line of the poem, and a _br_ tag after the end of that line:

    perl -ne 'if ($. == 9) {s/^[ ]*(.*)/ <p>$1<br\/>/;print;}' rime.txt

It gives you:

     <p>It is an ancyent Marinere,<br/>
Next, between lines 10 and 832, this bit of Perl puts a _br_ at the end of each line of the poem:

    perl -ne 'if (10..832) { s/^([ ]{5,7}.*)/$1<br\/>/; print;}' rime.txt

A sample of what you will see:

     Farewell, farewell! but this I tell<br/>
     To thee, thou wedding-guest!<br/>
     He prayeth well who loveth well<br/>
     Both man and bird and beast.<br/>

Add a _p_ end-tag to the end of the last line of the poem:

    perl -ne 'if ($. == 833) {s/^(.*)/$1<\/p>/; print;}' rime.txt

It shows:

     He rose the morrow morn.</p>

Replace each blank line with a _br_ tag:

    perl -ne 'if (9..eof) {s/^$/<br\/>/; print;}' rime.txt

to yield this:

    <br/>
     He prayeth best who loveth best,
     All things both great and small:
     For the dear God, who loveth us,
     He made and loveth all.
    <br/>
     The Marinere, whose eye is bright,
     Whose beard with age is hoar,
     Is gone; and now the wedding-guest
     Turn'd from the bridegroom's door.
    <br/>

And finally, when the end of the file is discovered, print a couple of end-tags:

    perl -ne 'if (eof) {print "</body>\n</html>\n"};' rime.txt

All this code works together more easily when it's in a file. You'll see that next.

## Using a File of Commands with Perl

The following lists _html.pl_, which transforms _rime.txt_ to HTML using Perl. The numbered callouts in the example guide you through what is happening in the script.

    #!/usr/bin/perl -p ![1](callouts/1.png)

    if ($. == 1) { ![2](callouts/2.png)
     chomp($title = $_);
    }
    print "\ ![3](callouts/3.png)
    <!DOCTYPE html>\
    <html xmlns=\"http://www.w3.org/1999/xhtml\">\
    <head>\
    <title>$title</title>\
    </head>\
    <body>\
    <h1>$title</h1>\
    \n" if $. == 1;
    s/^(ARGUMENT|I{0,3}V?I{0,2})\.$/<h2>$1<\/h2>/; ![4](callouts/4.png)
    if ($. == 5) { ![5](callouts/5.png)
     s/^([A-Z].*)$/<p>$1<\/p>/;
    }
    if ($. == 9) { ![6](callouts/6.png)
     s/^[ ]*(.*)/ <p>$1<br\/>/;
    }
    if (10..832) { ![7](callouts/7.png)
     s/^([ ]{5,7}.*)/$1<br\/>/;
    }
    if (9..eof) { ![8](callouts/8.png)
     s/^$/<br\/>/;
    }
    if ($. == 833) { ![9](callouts/9.png)
     s/^(.*)$/$1<\/p>\n <\/body>\n<\/html>\n/;
    }

This is called the _shebang_ directive, which gives a hint to the shell of where the program you are running is located.

If the current line (`$.`) is line 1, then assign the whole line (`$_`) to the _$title_ variable, chomping off (with `chomp`) the last character in the string (a newline) in the process.

Print a doctype and several HTML tags at the top of the document at line 1, and reuse the value of the `$title` variable in several places.

Give the ARGUMENT heading and the Roman numerals _h2_ tags.

Surround the introductory paragraph with _p_ tags.

Prepend a _p_ start-tag to the beginning of the first line of verse, and append a _br_ to that line.

Append a _br_ tag to the end of each line of verse, except the last line.

Replace each blank line, after line 9, with a _br_ tag.

Append _p_, _body_, and _html_ end-tags to the last line.

To run this, simply do the following:

    perl html.pl rime.txt

You can also redirect the output with a `>` to save it to a file. In the next and final chapter, I'll conclude our regex tutorial.

# What You Learned in Chapter 9

  * How to use _sed_ on the command line
  * How to prepend (insert), substitute, and append text (tags) with _sed_
  * How to use Perl to do the same

# Technical Notes

  * AsciiDoc, by Stuart Rackham, is a text format that can be converted, using a Python processor, into HTML, PDF, ePub, DocBook, and man pages. The syntax for the text files is similar to Wiki or Markdown and much quicker than hand-coding HTML or XML tags.
  * The underscore applies to XML tag names only, not HTML. In addition, XML tags can of course have a much wider range of characters in their names than what is represented in the ASCII set.
  * If the command `perldoc` doesn't work, you have some alternatives. First, you can easily read about Perl online at perldoc.perl.org. (To learn more about `$.`, for example, see the perlvar page there.) If you are on a Mac, try `perldoc5.12`. If you installed Perl from ActiveState, you will find it at `/usr/local/ActivePerl-5.XX/bin`. Both `perl` and `perldoc` are installed at `/usr/local/bin` when compiled and built from source. You can add `/usr/local/bin` to your path so `perl` and `perldoc` will run.

# Chapter 10. The End of the Beginning

> "Unix was not designed to stop you from doing stupid things, because that would also stop you from doing clever things." —Doug Gwyn

Congratulations for making it this far. You're not a regular expression novice anymore. You have been introduced to the most commonly used regular expression syntax, and it will open a lot of possibilities up to you in your work as a programmer.

Learning regular expressions has saved me a lot of time. Let me give you an example.

I use a lot of XSLT at work, and often I have to analyze the tags that exist in a group of XML files.
I showed you part of this in the last chapter, but here is a long one-liner that takes a list of tag names from _lorem.dita_ and converts it into a simple XSLT stylesheet:

 grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita | sort | uniq | sed '1 i\
 <?xml version="1.0" encoding="UTF-8"?>\
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">
 s/^</<xsl:template match="/; s/>$/">\
  <xsl:apply-templates\/>\
 <\/xsl:template>/;$ a\
 \
 </xsl:stylesheet>
 '

I know this script may appear a bit acrobatic, but after you work with this stuff for a long time, you start thinking like this. I am not even going to explain what I've done here, because I am sure you can figure it out on your own now.

Here is what the output looks like:

 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">
 <xsl:template match="body">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="li">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="p">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="title">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="topic">
  <xsl:apply-templates/>
 </xsl:template>
 <xsl:template match="ul">
  <xsl:apply-templates/>
 </xsl:template>

 </xsl:stylesheet>

That's only a start. Of course, this simple stylesheet will need a lot of editing before it can do anything useful, but this is the kind of thing that can save you a lot of keystrokes.

I'll admit, it would be easier if I put these _sed_ commands in a file. As a matter of fact, I did. You'll find _xslt.sed_ in the sample archive. This is the file:

 #!/usr/bin/sed

 1 i\
 <?xml version="1.0" encoding="UTF-8"?>\
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0">

 s/^</<xsl:template match="/; s/>$/">\
  <xsl:apply-templates\/>\
 <\/xsl:template>/;$ a\
 \
 </xsl:stylesheet>

And here is how to run it:

 grep -Eo '<[_a-zA-Z][^>]*>' lorem.dita | sort | uniq | sed -f xslt.sed

# Learning More

Even though you have a good, strong grip on regex now, there is still lots to learn. I have a couple of suggestions for where to go next.

I pass these recommendations along out of experience and observation, not from any sense of obligation or to be "salesy." I won't get any kickbacks for mentioning them. I talk about them because these resources will actually benefit you.

Jeffrey E. F. Friedl's _Mastering Regular Expressions, Third Edition_ is the source many programmers look to for a definitive treatment of the regular expression. It is both expansive and well written; if you are going to do any significant work with regex, you need to have this book on your shelf or in your e-reader. Period.

Jan Goyvaerts and Steven Levithan's _Regular Expressions Cookbook_ is another great piece of work, especially if you are comparing different implementations. I'd get this one, too.

The _Regular Expression Pocket Reference: Regular Expressions for Perl, Ruby, PHP, Python, C, Java and .NET_ by Tony Stubblebine is a 128-page guide that, though it is several years old, remains popular.

Andrew Watt's book _Beginning Regular Expressions_ (Wrox, 2005) is highly rated. I have found Bruce Barnett's online _sed_ tutorial particularly useful (see ). He demonstrates a number of _sed_'s less understood features, features I have not explained here.

# Notable Tools, Implementations, and Libraries

I've mentioned a number of tools, implementations, and libraries in this book. I'll recap those here and mention several others.

## Perl

Perl is a popular, general-purpose programming language. A lot of people prefer Perl for text processing with regular expressions over other languages. You likely already have it, but for information on how to install Perl on your system, go to . Read about Perl's regular expressions at . Don't get me wrong. There are plenty of other languages that do a great job with regex, but it pays to have Perl in your toolbox. To learn more, I'd get a copy of the latest edition of _Learning Perl_, by Randal Schwartz, brian d foy, and Tom Phoenix, also published by O'Reilly.
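To give you a taste of why, here is a small sketch of my own (not from _Learning Perl_); it prints only the lines of a file that contain something shaped like a four-digit year, using nothing more than the syntax you already know (`datebook.txt` is a hypothetical input file):

 perl -ne 'print if /\b(19|20)\d\d\b/' datebook.txt

The `-n` switch wraps the pattern in a read loop over the input, so one short expression does the work of a small program.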
## PCRE

Perl Compatible Regular Expressions or PCRE (see ) is a regular expression library written in C (both 8-bit and 16-bit). This library mainly consists of functions that may be called from within any C framework or from any other language that can use C libraries. It is compatible with Perl 5 regular expressions, as its name suggests, and includes some features from other regex implementations. The Notepad++ editor uses the PCRE library.

_pcregrep_ is an 8-bit, _grep_-like tool that enables you to use the features of the PCRE library on the command line. You used it in Chapter 3. See  for download information (from ). You can get _pcregrep_ for the Mac through MacPorts () by running the command `sudo port install pcre` (Xcode is a prerequisite; see , where a login is required). To install it on the Windows platform (binaries), go to .

## Ruby (Oniguruma)

Oniguruma is a regular expression library that is standard with Ruby 1.9; see . It is written in C and was developed specifically to support Ruby. You can try out Ruby's regular expressions using Rubular, an online app that supports both 1.8.7 and 1.9.2 (see  and Figure 10-1). TextMate, by the way, uses the Oniguruma library.

Figure 10-1. Phone number regex in Rubular

## Python

Python is a general-purpose programming language that supports regular expressions (see ). It was created by Guido van Rossum and first released in 1991. You can read about Python 3's regular expression syntax here: .

## RE2

RE2 is a non-backtracking C++ regular expression library (see ). While RE2 is quite fast, it does not do backtracking or backreferences. It is available as a CPAN package for Perl and can fall back on Perl's native regex engine when backreferences are needed. For instructions on making API calls, see . For an interesting discussion of RE2, see "Regular Expression Matching in the Wild" at .

# Matching a North American Phone Number

You remember the North American phone number example from the first chapter? You've come a long way since then.

Here is a more robust regular expression for matching phone numbers than the one we used there. It is adapted from Goyvaerts and Levithan's example on page 235 of their _Regular Expressions Cookbook_ (first edition).

 ^\(?(?:\d{3})\)?[-.]?(?:\d{3})[-.]?(?:\d{4})$

Play with it in the tool of your choice (see it in Reggy in Figure 10-2). By now, you should be able to pick this regex apart with hardly any hand-holding. I'm proud of you for that. But I'll go over it for good measure.

 * `^` is the zero-width assertion for the beginning of a line or subject.

 * `\(?` is a literal left parenthesis, but it is optional (`?`).

 * `(?:\d{3})` is a non-capturing group matching three consecutive digits.

 * `\)?` is an optional right parenthesis.

 * `[-.]?` allows for an optional hyphen or period (dot).

 * `(?:\d{3})` is another non-capturing group matching three more consecutive digits.

 * `[-.]?` allows for an optional hyphen or dot again.

 * `(?:\d{4})` is yet another non-capturing group matching exactly four consecutive digits.

 * `$` matches the end of a line or subject.

This expression could be even more refined, but I leave that to you because you can now do it on your own.

Figure 10-2. Phone number regex in Reggy

# Matching an Email Address

Lastly, I'll throw one more regular expression at you, an email address:

 ^([\w-.!#$%&'*+-/=?^_`{|}~]+)@((?:\w+\.)+)(?:[a-zA-Z]{2,4})$

This is an adaptation of one provided by Grant Skinner with RegExr.
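If you'd like to test both of these patterns side by side without reaching for an online tool, here is a minimal Perl sketch of my own (the sample strings are just assumptions; substitute your own):

 #!/usr/bin/perl
 use strict;
 use warnings;

 # The phone and email patterns discussed above. The email pattern is kept
 # in a single-quoted string so Perl does not try to interpolate the $ and
 # @ characters it contains; backslashes were added before the two hyphens
 # so they cannot be read as ranges inside the character class.
 my $phone = qr/^\(?(?:\d{3})\)?[-.]?(?:\d{3})[-.]?(?:\d{4})$/;
 my $email = q{^([\w\-.!#$%&'*+\-/=?^_`{|}~]+)@((?:\w+\.)+)(?:[a-zA-Z]{2,4})$};

 for my $s ('707-827-7000', '(707) 827-7000', 'regex@example.com') {
     printf "%-20s phone? %-3s email? %s\n", $s,
         ($s =~ $phone   ? 'yes' : 'no'),
         ($s =~ /$email/ ? 'yes' : 'no');
 }

Note that the second sample fails the phone pattern, because the expression makes no allowance for a space after the area code; that is one of the refinements I left to you.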
I'd like to challenge you to do your best to explain what each character means in the context of a regular expression, and to see if you can improve on it. I am sure you can.

Thank you for your time. I've enjoyed spending it with you. You should now have a good grasp of the fundamental concepts of regular expressions. You are no longer a member of the beginners' club. I hope you've made friends with regular expressions and learned something worthwhile along the way.

# What You Learned in Chapter 10

 * How to extract a list of XML elements from a document and convert the list into an XSLT stylesheet.

 * Where to find additional resources for learning about regular expressions.

 * Some notable regex tools, implementations, and libraries.

 * A slightly more robust pattern for matching a North American phone number.

# Appendix A. Regular Expression Reference

This appendix is a reference for regular expressions.

# Regular Expressions in QED

QED (short for Quick Editor) was originally written for the Berkeley Time-Sharing System, which ran on the Scientific Data Systems SDS 940. A rewrite of the original QED editor by Ken Thompson for MIT's Compatible Time-Sharing System yielded one of the earliest (if not the first) practical implementations of regular expressions in computing. Table A-1, taken from pages 3 and 4 of a 1970 Bell Labs memo, outlines the regex features in QED. It amazes me that most of this syntax has remained in use to this day, over 40 years later.

Table A-1. QED regular expressions

Feature| Description
---|---

_literal_ | "a) An ordinary character [literal] is a regular expression which matches that character."

^ | "b) _^_ is a regular expression which matches the null character at the beginning of a line."

$ | "c) _$_ is a regular expression which matches the null character before the character [newline] (usually at the end of a line)."

. | "d) _._ is a regular expression which matches any character except [newline]."

[] | "e) "[<list>]" is a regular expression which matches any of the characters in the <list> and no others."

[^] | "f) "[^<list>]" is a regular expression which matches any character but [newline] and the characters of the <list>."

* | "g) A regular expression followed by "*" is a regular expression which matches any number (including zero) of adjacent occurrences of the text matched by the regular expression."

|

"h) Two adjacent regular expressions form a regular expression which matches adjacent occurrences of the text matched by the regular expressions."

| | "i) Two regular expressions separated by "|" form a regular expression which matches the text matched by either of the regular expressions."

( ) | "j) A regular expression in parentheses is a regular expression which matches the same text as the original regular expression. Parentheses are used to alter the order of evaluation implied by g), h), and i): _a(b|c)d_ will match _abd_ or _acd_ , while _ab|cd_ matches _ab_ or _cd_."

{ } | "k) If "<regexp>" is a regular expression, "{<regexp>}x" is a regular expression, where _x_ is any character. This regular expression matches the same things as <regexp>; it has certain side effects as explained under the Substitute command." [The Substitute command took the form _(.,.)S/<regexp>/<text>/_ (see page 13 of the memo), similar to the way it is still used in programs like _sed_ and Perl.]
\E | "l) If <name> is the name of a regular expression named by the E command (below), then "\E<name>" is a regular expression which matches the same things as the regular expression specified in the E command. More discussion is presented under the E command." [The \E command allowed you to name a regular expression and repeat its use by name.]

|

"m) The null regular expression standing alone is equivalent to the last regular expression encountered. Initially the null regular expression is undefined; it also becomes undefined after an erroneous regular expression and after use of the E command."

|

"n) Nothing else is a regular expression."

|

"o) No regular expression will match text spread across more than one line."

# Metacharacters

There are 14 metacharacters used in regular expressions, each with special meaning, as described in Table A-2. If you want to use one of these characters as a literal, you must precede it with a backslash to escape it. For example, you would escape the dollar sign like this `\$`, or a backslash like this `\\`.

Table A-2. Metacharacters in regular expressions

Metacharacter| Name| Code Point| Purpose
---|---|---|---

. | Full Stop | U+002E | Match any character

\ | Backslash | U+005C | Escape a character

| | Vertical Bar | U+007C | Alternation (or)

^ | Circumflex | U+005E | Beginning of a line anchor

$ | Dollar Sign | U+0024 | End of a line anchor

? | Question Mark | U+003F | Zero or one quantifier

* | Asterisk | U+002A | Zero or more quantifier

+ | Plus Sign | U+002B | One or more quantifier

[ | Left Square Bracket | U+005B | Open character class

] | Right Square Bracket | U+005D | Close character class

{ | Left Curly Brace | U+007B | Open quantifier or block

} | Right Curly Brace | U+007D | Close quantifier or block

( | Left Parenthesis | U+0028 | Open group

) | Right Parenthesis | U+0029 | Close group
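As a quick illustration of escaping, this one-liner (my own example; `prices.txt` is a hypothetical file) prints only the lines that contain a dollar amount, escaping both the dollar sign and the dot so each is matched literally:

 perl -ne 'print if /\$\d+\.\d\d/' prices.txt

Unescaped, the `$` would anchor to the end of the line and the `.` would match any character at all.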
# Character Shorthands

Table A-3 lists character shorthands used in regular expressions.

Table A-3. Character shorthands

Character Shorthand| Description
---|---

\a | Alert

\b | Word boundary

[\b] | Backspace character

\B | Non-word boundary

\cx | Control character

\d | Digit character

\D | Non-digit character

\d _xxx_ | Decimal value for a character

\f | Form feed character

\r | Carriage return

\n | Newline character

\o _xxx_ | Octal value for a character

\s | Space character

\S | Non-space character

\t | Horizontal tab character

\v | Vertical tab character

\w | Word character

\W | Non-word character

\0 | Null character

\x _xx_ | Hexadecimal value for a character

\u _xxxx_ | Unicode value for a character

# Whitespace

Table A-4 is a list of character shorthands for whitespace.

Table A-4. Whitespace characters

Character Shorthand| Description
---|---

\f | Form feed

\h | Horizontal whitespace

\H | Not horizontal whitespace

\n | Newline

\r | Carriage return

\t | Horizontal tab

\v | Vertical whitespace

\V | Not vertical whitespace

# Unicode Whitespace Characters

Whitespace characters in Unicode are listed in Table A-5.

Table A-5. Whitespace characters in Unicode

Abbreviation or Nickname| Name| Unicode Code Point| Regex
---|---|---|---

HT | Horizontal tab | U+0009 | \u0009 or \t

LF | Line feed | U+000A | \u000A or \n

VT | Vertical tab | U+000B | \u000B or \v

FF | Form feed | U+000C | \u000C or \f

CR | Carriage return | U+000D | \u000D or \r

SP | Space | U+0020 | \u0020 or \s[a]

NEL | Next line | U+0085 | \u0085

NBSP | No-break space | U+00A0 | \u00A0

— | Ogham space mark | U+1680 | \u1680

MVS | Mongolian vowel separator | U+180E | \u180E

BOM | Byte order mark | U+FEFF | \uFEFF

NQSP | En quad | U+2000 | \u2000

MQSP, Mutton Quad | Em quad | U+2001 | \u2001

ENSP, Nut | En space | U+2002 | \u2002

EMSP, Mutton | Em space | U+2003 | \u2003

3MSP, Thick space | Three-per-em space | U+2004 | \u2004

4MSP, Mid space | Four-per-em space | U+2005 | \u2005

6/MSP | Six-per-em space | U+2006 | \u2006

FSP | Figure space | U+2007 | \u2007

PSP | Punctuation space | U+2008 | \u2008

THSP | Thin space | U+2009 | \u2009

HSP | Hair space | U+200A | \u200A

ZWSP | Zero width space | U+200B | \u200B

LSEP | Line separator | U+2028 | \u2028

PSEP | Paragraph separator | U+2029 | \u2029

NNBSP | Narrow no-break space | U+202F | \u202F

MMSP | Medium mathematical space | U+205F | \u205F

IDSP | Ideographic space | U+3000 | \u3000

[a] Also matches other whitespace.

# Control Characters

Table A-6 shows a way to match control characters in regular expressions.

Table A-6. Matching control characters

Control Character| Unicode Value| Abbreviation| Name
---|---|---|---

\c@[a] | U+0000 | NUL | Null

\cA | U+0001 | SOH | Start of heading

\cB | U+0002 | STX | Start of text

\cC | U+0003 | ETX | End of text

\cD | U+0004 | EOT | End of transmission

\cE | U+0005 | ENQ | Enquiry

\cF | U+0006 | ACK | Acknowledge

\cG | U+0007 | BEL | Bell

\cH | U+0008 | BS | Backspace

\cI | U+0009 | HT | Character tabulation or horizontal tab

\cJ | U+000A | LF | Line feed (newline, end of line)

\cK | U+000B | VT | Line tabulation or vertical tab

\cL | U+000C | FF | Form feed

\cM | U+000D | CR | Carriage return

\cN | U+000E | SO | Shift out

\cO | U+000F | SI | Shift in

\cP | U+0010 | DLE | Data link escape

\cQ | U+0011 | DC1 | Device control one

\cR | U+0012 | DC2 | Device control two

\cS | U+0013 | DC3 | Device control three

\cT | U+0014 | DC4 | Device control four

\cU | U+0015 | NAK | Negative acknowledge

\cV | U+0016 | SYN | Synchronous idle

\cW | U+0017 | ETB | End of transmission block

\cX | U+0018 | CAN | Cancel

\cY | U+0019 | EM | End of medium

\cZ | U+001A | SUB | Substitute

\c[ | U+001B | ESC | Escape

\c\ | U+001C | FS | Information separator four

\c] | U+001D | GS | Information separator three

\c^ | U+001E | RS | Information separator two

\c_ | U+001F | US | Information separator one

[a] Can use upper- or lowercase. For example, `\cA` and `\ca` are equivalent; however, Java implementations require uppercase.

# Character Properties

Table A-7 lists character property names for use with `\p{`_property_`}` or `\P{`_property_`}`.
Table A-7. Character properties[2]

Property| Description
---|---

C | Other

Cc | Control

Cf | Format

Cn | Unassigned

Co | Private use

Cs | Surrogate

L | Letter

Ll | Lowercase letter

Lm | Modifier letter

Lo | Other letter

Lt | Title case letter

Lu | Uppercase letter

L& | Ll, Lu, or Lt

M | Mark

Mc | Spacing mark

Me | Enclosing mark

Mn | Non-spacing mark

N | Number

Nd | Decimal number

Nl | Letter number

No | Other number

P | Punctuation

Pc | Connector punctuation

Pd | Dash punctuation

Pe | Close punctuation

Pf | Final punctuation

Pi | Initial punctuation

Po | Other punctuation

Ps | Open punctuation

S | Symbol

Sc | Currency symbol

Sk | Modifier symbol

Sm | Mathematical symbol

So | Other symbol

Z | Separator

Zl | Line separator

Zp | Paragraph separator

Zs | Space separator

[2] See pcresyntax(3) at .

# Script Names for Character Properties

Table A-8 shows the language script names for use with `\p{`_property_`}` or `\P{`_property_`}`.

Table A-8. Script names[3]

Arabic (Arab)| Glagolitic (Glag)| Lepcha (Lepc)| Samaritan (Samr)
---|---|---|---

Armenian (Armn) | Gothic (Goth) | Limbu (Limb) | Saurashtra (Saur)

Avestan (Avst) | Greek (Grek) | Linear B (Linb) | Shavian (Shaw)

Balinese (Bali) | Gujarati (Gujr) | Lisu (Lisu) | Sinhala (Sinh)

Bamum (Bamu) | Gurmukhi (Guru) | Lycian (Lyci) | Sundanese (Sund)

Bengali (Beng) | Han (Hani) | Lydian (Lydi) | Syloti Nagri (Sylo)

Bopomofo (Bopo) | Hangul (Hang) | Malayalam (Mlym) | Syriac (Syrc)

Braille (Brai) | Hanunoo (Hano) | Meetei Mayek (Mtei) | Tagalog (Tglg)

Buginese (Bugi) | Hebrew (Hebr) | Mongolian (Mong) | Tagbanwa (Tagb)

Buhid (Buhd) | Hiragana (Hira) | Myanmar (Mymr) | Tai Le (Tale)

Canadian Aboriginal (Cans) | Hrkt (Katakana or Hiragana) | New Tai Lue (Talu) | Tai Tham (Lana)

Carian (Cari) | Imperial Aramaic (Armi) | Nko (Nkoo) | Tai Viet (Tavt)

Cham (None) | Inherited (Zinh/Qaai) | Ogham (Ogam) | Tamil (Taml)

Cherokee (Cher) | Inscriptional Pahlavi (Phli) | Ol Chiki (Olck) | Telugu (Telu)

Common (Zyyy) | Inscriptional Parthian (Prti) | Old Italic (Ital) | Thaana (Thaa)

Coptic (Copt/Qaac) | Javanese (Java) | Old Persian (Xpeo) | Thai (None)

Cuneiform (Xsux) | Kaithi (Kthi) | Old South Arabian (Sarb) | Tibetan (Tibt)

Cypriot (Cprt) | Kannada (Knda) | Old Turkic (Orkh) | Tifinagh (Tfng)

Cyrillic (Cyrl) | Katakana (Kana) | Oriya (Orya) | Ugaritic (Ugar)

Deseret (Dsrt) | Kayah Li (Kali) | Osmanya (Osma) | Unknown (Zzzz)

Devanagari (Deva) | Kharoshthi (Khar) | Phags Pa (Phag) | Vai (Vaii)

Egyptian Hieroglyphs (Egyp) | Khmer (Khmr) | Phoenician (Phnx) | Yi (Yiii)

Ethiopic (Ethi) | Lao (Laoo) | Rejang (Rjng) |

Georgian (Geor) | Latin (Latn) | Runic (Runr) |

[3] See pcresyntax(3) at or .

# POSIX Character Classes

Table A-9 shows a list of POSIX character classes.
Table A-9. POSIX character classes

Character Class| Description
---|---

[[:alnum:]] | Alphanumeric characters (letters and digits)

[[:alpha:]] | Alphabetic characters (letters)

[[:ascii:]] | ASCII characters (all 128)

[[:blank:]] | Blank characters

[[:cntrl:]] | Control characters

[[:digit:]] | Digits

[[:graph:]] | Graphic characters

[[:lower:]] | Lowercase letters

[[:print:]] | Printable characters

[[:punct:]] | Punctuation characters

[[:space:]] | Whitespace characters

[[:upper:]] | Uppercase letters

[[:word:]] | Word characters

[[:xdigit:]] | Hexadecimal digits

# Options/Modifiers

Tables A-10 and A-11 list options and modifiers.

Table A-10. Options in regular expressions

Option| Description| Supported by
---|---|---

`(?d)` | Unix lines | Java

`(?i)` | Case insensitive | PCRE, Perl, Java

`(?J)` | Allow duplicate names | PCRE[a]

`(?m)` | Multiline | PCRE, Perl, Java

`(?s)` | Single line (dotall) | PCRE, Perl, Java

`(?u)` | Unicode case | Java

`(?U)` | Default match lazy | PCRE

`(?x)` | Ignore whitespace, comments | PCRE, Perl, Java

`(?-...)` | Unset or turn off options | PCRE

[a] See "Named Subpatterns" in .

Table A-11. Perl modifiers (flags)[4]

Modifier| Description
---|---

a | Match `\d`, `\s`, `\w`, and POSIX classes in the ASCII range only

c | Keep current position after match fails

d | Use default, native rules of the platform

g | Global matching

i | Case-insensitive matching

l | Use current locale's rules

m | Multiline strings

p | Preserve the matched string

s | Treat strings as a single line

u | Use Unicode rules when matching

x | Ignore whitespace and comments

[4] See .
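To see a couple of the Perl modifiers from Table A-11 at work, here is a small sketch of my own that uses `g` and `i` together to replace every occurrence of a word in _rime.txt_, regardless of case:

 perl -pe 's/marinere/MARINERE/gi' rime.txt

Without `g`, only the first occurrence on each line would change; without `i`, only the lowercase spellings would match.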
# ASCII Code Chart with Regex

Table A-12 is an ASCII code chart with regex cross-references.

Table A-12. ASCII code chart

Binary| Oct| Dec| Hex| Char| Kybd| Regex| Name
---|---|---|---|---|---|---|---

00000000 | 0 | 0 | 0 | NUL | ^@ | \c@ | Null character

00000001 | 1 | 1 | 1 | SOH | ^A | \cA | Start of header

00000010 | 2 | 2 | 2 | STX | ^B | \cB | Start of text

00000011 | 3 | 3 | 3 | ETX | ^C | \cC | End of text

00000100 | 4 | 4 | 4 | EOT | ^D | \cD | End of transmission

00000101 | 5 | 5 | 5 | ENQ | ^E | \cE | Enquiry

00000110 | 6 | 6 | 6 | ACK | ^F | \cF | Acknowledgment

00000111 | 7 | 7 | 7 | BEL | ^G | \a, \cG | Bell

00001000 | 10 | 8 | 8 | BS | ^H | [\b], \cH | Backspace

00001001 | 11 | 9 | 9 | HT | ^I | \t, \cI | Horizontal tab

00001010 | 12 | 10 | 0A | LF | ^J | \n, \cJ | Line feed

00001011 | 13 | 11 | 0B | VT | ^K | \v, \cK | Vertical tab

00001100 | 14 | 12 | 0C | FF | ^L | \f, \cL | Form feed

00001101 | 15 | 13 | 0D | CR | ^M | \r, \cM | Carriage return

00001110 | 16 | 14 | 0E | SO | ^N | \cN | Shift out

00001111 | 17 | 15 | 0F | SI | ^O | \cO | Shift in

00010000 | 20 | 16 | 10 | DLE | ^P | \cP | Data link escape

00010001 | 21 | 17 | 11 | DC1 | ^Q | \cQ | Device control 1 (XON)

00010010 | 22 | 18 | 12 | DC2 | ^R | \cR | Device control 2

00010011 | 23 | 19 | 13 | DC3 | ^S | \cS | Device control 3 (XOFF)

00010100 | 24 | 20 | 14 | DC4 | ^T | \cT | Device control 4

00010101 | 25 | 21 | 15 | NAK | ^U | \cU | Negative acknowledgement

00010110 | 26 | 22 | 16 | SYN | ^V | \cV | Synchronous idle

00010111 | 27 | 23 | 17 | ETB | ^W | \cW | End of transmission block

00011000 | 30 | 24 | 18 | CAN | ^X | \cX | Cancel

00011001 | 31 | 25 | 19 | EM | ^Y | \cY | End of medium

00011010 | 32 | 26 | 1A | SUB | ^Z | \cZ | Substitute

00011011 | 33 | 27 | 1B | ESC | ^[ | \e, \c[ | Escape

00011100 | 34 | 28 | 1C | FS | ^\ | \c\ | File separator

00011101 | 35 | 29 | 1D | GS | ^] | \c] | Group separator

00011110 | 36 | 30 | 1E | RS | ^^ | \c^ | Record separator

00011111 | 37 | 31 | 1F | US | ^_ | \c_ | Unit separator

00100000 | 40 | 32 | 20 | SP | SP | \s, [ ] | Space

00100001 | 41 | 33 | 21 | ! | ! | ! | Exclamation mark

00100010 | 42 | 34 | 22 | " | " | " | Quotation mark

00100011 | 43 | 35 | 23 | # | # | # | Number sign

00100100 | 44 | 36 | 24 | $ | $ | \$ | Dollar sign

00100101 | 45 | 37 | 25 | % | % | % | Percent sign

00100110 | 46 | 38 | 26 | & | & | & | Ampersand

00100111 | 47 | 39 | 27 | ' | ' | ' | Apostrophe

00101000 | 50 | 40 | 28 | ( | ( | (, \( | Left parenthesis

00101001 | 51 | 41 | 29 | ) | ) | ), \) | Right parenthesis

00101010 | 52 | 42 | 2A | * | * | * | Asterisk

00101011 | 53 | 43 | 2B | + | + | + | Plus sign

00101100 | 54 | 44 | 2C | , | , | , | Comma

00101101 | 55 | 45 | 2D | - | - | - | Hyphen-minus

00101110 | 56 | 46 | 2E | . | . | \., [.] | Full stop
00101111 | 57 | 47 | 2F | / | / | / | Solidus

00110000 | 60 | 48 | 30 | 0 | 0 | \d, [0] | Digit zero

00110001 | 61 | 49 | 31 | 1 | 1 | \d, [1] | Digit one

00110010 | 62 | 50 | 32 | 2 | 2 | \d, [2] | Digit two

00110011 | 63 | 51 | 33 | 3 | 3 | \d, [3] | Digit three

00110100 | 64 | 52 | 34 | 4 | 4 | \d, [4] | Digit four

00110101 | 65 | 53 | 35 | 5 | 5 | \d, [5] | Digit five

00110110 | 66 | 54 | 36 | 6 | 6 | \d, [6] | Digit six

00110111 | 67 | 55 | 37 | 7 | 7 | \d, [7] | Digit seven

00111000 | 70 | 56 | 38 | 8 | 8 | \d, [8] | Digit eight

00111001 | 71 | 57 | 39 | 9 | 9 | \d, [9] | Digit nine

00111010 | 72 | 58 | 3A | : | : | : | Colon

00111011 | 73 | 59 | 3B | ; | ; | ; | Semicolon

00111100 | 74 | 60 | 3C | < | < | < | Less-than sign

00111101 | 75 | 61 | 3D | = | = | = | Equals sign

00111110 | 76 | 62 | 3E | > | > | > | Greater-than sign

00111111 | 77 | 63 | 3F | ? | ? | ? | Question mark

01000000 | 100 | 64 | 40 | @ | @ | @ | Commercial at

01000001 | 101 | 65 | 41 | A | A | \w, [A] | Latin capital letter A

01000010 | 102 | 66 | 42 | B | B | \w, [B] | Latin capital letter B

01000011 | 103 | 67 | 43 | C | C | \w, [C] | Latin capital letter C

01000100 | 104 | 68 | 44 | D | D | \w, [D] | Latin capital letter D

01000101 | 105 | 69 | 45 | E | E | \w, [E] | Latin capital letter E

01000110 | 106 | 70 | 46 | F | F | \w, [F] | Latin capital letter F

01000111 | 107 | 71 | 47 | G | G | \w, [G] | Latin capital letter G

01001000 | 110 | 72 | 48 | H | H | \w, [H] | Latin capital letter H

01001001 | 111 | 73 | 49 | I | I | \w, [I] | Latin capital letter I

01001010 | 112 | 74 | 4A | J | J | \w, [J] | Latin capital letter J

01001011 | 113 | 75 | 4B | K | K | \w, [K] | Latin capital letter K

01001100 | 114 | 76 | 4C | L | L | \w, [L] | Latin capital letter L

01001101 | 115 | 77 | 4D | M | M | \w, [M] | Latin capital letter M

01001110 | 116 | 78 | 4E | N | N | \w, [N] | Latin capital letter N

01001111 | 117 | 79 | 4F | O | O | \w, [O] | Latin capital letter O

01010000 | 120 | 80 | 50 | P | P | \w, [P] | Latin capital letter P

01010001 | 121 | 81 | 51 | Q | Q | \w, [Q] | Latin capital letter Q

01010010 | 122 | 82 | 52 | R | R | \w, [R] | Latin capital letter R

01010011 | 123 | 83 | 53 | S | S | \w, [S] | Latin capital letter S

01010100 | 124 | 84 | 54 | T | T | \w, [T] | Latin capital letter T

01010101 | 125 | 85 | 55 | U | U | \w, [U] | Latin capital letter U

01010110 | 126 | 86 | 56 | V | V | \w, [V] | Latin capital letter V

01010111 | 127 | 87 | 57 | W | W | \w, [W] | Latin capital letter W

01011000 | 130 | 88 | 58 | X | X | \w, [X] | Latin capital letter X

01011001 | 131 | 89 | 59 | Y | Y | \w, [Y] | Latin capital letter Y

01011010 | 132 | 90 | 5A | Z | Z | \w, [Z] | Latin capital letter Z

01011011 | 133 | 91 | 5B | [ | [ | \[ | Left square bracket

01011100 | 134 | 92 | 5C | \ | \ | \\ | Reverse solidus

01011101 | 135 | 93 | 5D | ] | ] | \] | Right square bracket

01011110 | 136 | 94 | 5E | ^ | ^ | ^, [^] | Circumflex accent

01011111 | 137 | 95 | 5F | _ | _ | _, [_] | Low line

01100000 | 140 | 96 | 60 | ` | ` | \` | Grave accent

01100001 | 141 | 97 | 61 | a | a | \w, [a] | Latin small letter A

01100010 | 142 | 98 | 62 | b | b | \w, [b] | Latin small letter B

01100011 | 143 | 99 | 63 | c | c | \w, [c] | Latin small letter C

01100100 | 144 | 100 | 64 | d | d | \w, [d] | Latin small letter D

01100101 | 145 | 101 | 65 | e | e | \w, [e] | Latin small letter E
01100110 | 146 | 102 | 66 | f | f | \w, [f] | Latin small letter F

01100111 | 147 | 103 | 67 | g | g | \w, [g] | Latin small letter G

01101000 | 150 | 104 | 68 | h | h | \w, [h] | Latin small letter H

01101001 | 151 | 105 | 69 | i | i | \w, [i] | Latin small letter I

01101010 | 152 | 106 | 6A | j | j | \w, [j] | Latin small letter J

01101011 | 153 | 107 | 6B | k | k | \w, [k] | Latin small letter K

01101100 | 154 | 108 | 6C | l | l | \w, [l] | Latin small letter L

01101101 | 155 | 109 | 6D | m | m | \w, [m] | Latin small letter M

01101110 | 156 | 110 | 6E | n | n | \w, [n] | Latin small letter N

01101111 | 157 | 111 | 6F | o | o | \w, [o] | Latin small letter O

01110000 | 160 | 112 | 70 | p | p | \w, [p] | Latin small letter P

01110001 | 161 | 113 | 71 | q | q | \w, [q] | Latin small letter Q

01110010 | 162 | 114 | 72 | r | r | \w, [r] | Latin small letter R

01110011 | 163 | 115 | 73 | s | s | \w, [s] | Latin small letter S

01110100 | 164 | 116 | 74 | t | t | \w, [t] | Latin small letter T

01110101 | 165 | 117 | 75 | u | u | \w, [u] | Latin small letter U

01110110 | 166 | 118 | 76 | v | v | \w, [v] | Latin small letter V

01110111 | 167 | 119 | 77 | w | w | \w, [w] | Latin small letter W

01111000 | 170 | 120 | 78 | x | x | \w, [x] | Latin small letter X

01111001 | 171 | 121 | 79 | y | y | \w, [y] | Latin small letter Y

01111010 | 172 | 122 | 7A | z | z | \w, [z] | Latin small letter Z

01111011 | 173 | 123 | 7B | { | { | { | Left curly brace

01111100 | 174 | 124 | 7C | | | | | \| | Vertical line (bar)

01111101 | 175 | 125 | 7D | } | } | } | Right curly brace

01111110 | 176 | 126 | 7E | ~ | ~ | \~ | Tilde

01111111 | 177 | 127 | 7F | DEL | ^? | \c? | Delete

# Technical Notes

You can find Ken Thompson and Dennis Ritchie's QED memo-cum-manual at .

# Regular Expression Glossary

anchor

Specifies a location in a line or string. For example, the caret or circumflex character (`^`) signifies the beginning of a line or string of characters, and the dollar sign character (`$`), the end of a line or string.

alternation

Separating a list of regular expressions with a vertical bar (`|`) character, indicating _or_. In other words, match any of the regular expressions separated by one or more | characters. In some applications, such as _grep_ or _sed_ that use basic regular expressions (BREs), the `|` is preceded by a backslash, as in `\|`. _See also_ basic regular expressions.

ASCII

American Standard Code for Information Interchange. A 128-character encoding scheme for English (Latin) characters developed in the 1960s. _See also_ Unicode.

assertions

 _See_ zero-width assertions.

atom

 _See_ metacharacter.

atomic group

A grouping that turns off backtracking when a regular expression inside `(?>...)` fails to match. _See also_ backtracking, groups.

backreference

Refers to a previous regular expression captured with parentheses, using a reference in the form of \1, \2, and so forth.

backtracking

Stepping back, character by character, through an attempted match to find a successful match. Used with a greedy match, but not a lazy or possessive match. Catastrophic backtracking occurs when a regex processor makes perhaps thousands of attempts to make a match and consumes a vast amount (read _most_) of the computing resources available. One way to avoid catastrophic backtracking is with atomic grouping. _See also_ atomic group, greedy match, lazy match, possessive match.
basic regular expressions

An early implementation of regular expressions that is less advanced and considered obsolete by most. Also called _BREs_. BREs required you to escape certain characters in order for them to function as metacharacters, such as braces (`\{` and `\}`). _See also_ extended regular expressions.

bound

 _See_ quantifier.

bracketed expression

A regular expression given in square brackets; for example, _[a-f]_, that is, the range of lowercase letters a through f. _See also_ character class.

branch

A concatenation of pieces in a regular expression in POSIX.1 terminology. _See also_ POSIX.

BREs

 _See_ basic regular expressions.

capturing group

 _See_ groups.

catastrophic backtracking

 _See_ backtracking.

character class

Usually, a set of characters enclosed in square brackets; for example, _[a-zA-Z0-9]_ is a character class for all upper- and lowercase characters plus digits in the ASCII or Low Basic Latin character set.

character escape

A character preceded by a backward slash. Examples are \t (horizontal tab), \v (vertical tab), and \f (form feed).

character set

 _See_ character class.

code point

 _See_ Unicode.

composability

"A schema language (or indeed a programming language) provides a number of atomic objects and a number of methods of composition. The methods of composition can be used to combine atomic objects into compound objects which can in turn be composed into further compound objects. The composability of the language is the degree to which the various methods of composition can be applied uniformly to all the various objects of the language, both atomic and compound...Composability improves ease of learning and ease of use. Composability also tends to improve the ratio between complexity and power: for a given amount of complexity, a more composable language will be more powerful than a less composable one." From James Clark, "The Design of RELAX NG," .

ed

The Unix line editor created by Ken Thompson in 1971, which implemented regular expressions. It was a precursor to _sed_ and _vi_.

EREs

 _See_ extended regular expressions.

extended regular expressions

Extended regular expressions or EREs added additional functionality to basic regular expressions or BREs, such as alternation (`|`) and quantifiers such as ? and +, which work with _egrep_ (extended grep). These new features were delineated in IEEE POSIX standard 1003.2-1992. You can use the _-E_ option with _grep_ (same as using _egrep_), which means that you want to use extended regular expressions rather than basic regular expressions. _See also_ alternation, basic regular expressions, grep.

flag

 _See_ modifier.

greedy match

A greedy match consumes as much of a target string as possible, and then backtracks through the string to attempt to find a match. _See_ backtracking, lazy match, possessive match.

grep

A Unix command-line utility for searching strings with regular expressions. Invented by Ken Thompson in 1973, _grep_ is said to have grown out of the _ed_ editor command `g/re/p` (global/regular expression/print). It is superseded but not retired by _egrep_ (or _grep -E_), which has additional metacharacters such as |, +, ?, (, and ). _grep_ uses basic regular expressions, whereas _grep -E_ or _egrep_ use extended regular expressions. _fgrep_ (_grep -F_) searches files using literal strings, and metacharacters like $, *, and | don't have special meaning. _See also_ basic regular expressions, extended regular expressions.
groups

Groups combine regular expression atoms within a pair of parentheses, `( )`. In some applications, such as _grep_ or _sed_ (without _-E_), you must precede the parenthesis with a backslash, as in `\(` or `\)`. There are capturing groups and non-capturing groups. A capturing group stores the captured group in memory so that it can be reused, while a non-capturing group does not. Atomic groups do not backtrack. _See also_ atomic group.

hexadecimal

A base 16 numbering system represented by the digits 0–9 and the letters A–F or a–f. For example, the base 10 number 15 is represented as F in hexadecimal, and 16 is 10.

hold buffer

 _See_ hold space.

hold space

Used by _sed_ to store one or more lines for further processing. Also called the _hold buffer_. _See also_ pattern space, _sed_.

lazy match

A lazy match consumes a subject string one character at a time, attempting to find a match. It does not backtrack. _See also_ backtracking, greedy match, possessive match.

literal

 _See_ string literal.

lookaround

 _See_ lookahead, lookbehind.

lookahead

A regular expression that matches only if another specified regular expression follows it. A positive lookahead uses the syntax `regex(?=regex)`. A negative lookahead matches only if the first regular expression is _not_ followed by the second. It uses the syntax `regex(?!regex)`.

lookbehind

A regular expression that matches only if another specified regular expression precedes it. A positive lookbehind uses the syntax `(?<=regex)regex`. A negative lookbehind matches only if the first regular expression is _not_ preceded by the second. It uses the syntax `(?<!regex)regex`.

metacharacter

A character with special meaning in a regular expression, such as the dot (.), the asterisk (*), or the dollar sign ($). The 14 regex metacharacters are listed in Table A-2. Also called an _atom_.

modifier

A character that changes the behavior of a regular expression, such as Perl's `g` (global) or `i` (case-insensitive) modifiers listed in Table A-11. Also called a _flag_. _See also_ quantifier.

pattern space

The buffer used by _sed_ to hold the current line being processed. Also called the _work buffer_. _See also_ hold space, _sed_.

piece

A portion of a regular expression, usually concatenated, in POSIX.1 terminology. _See also_ POSIX.

positive lookahead

 _See_ lookahead.

positive lookbehind

 _See_ lookbehind.

POSIX

Portable Operating System Interface for Unix. A family of Unix-related standards by the Institute of Electrical and Electronics Engineers (IEEE). The most recent POSIX standard for regular expressions is POSIX.1-2008 (see ).

possessive match

A possessive match consumes an entire subject string in one fell swoop, attempting to find a match. It does not backtrack. _See also_ backtracking, greedy match, lazy match.

quantifier

Defines the number of times a regular expression may occur in an attempted match. An integer or pair of integers separated by a comma, surrounded by braces, is one form; for example, `{3}` indicates that the expression may occur exactly three times (with older tools that use basic regular expressions, you must escape the braces, as in `\{3\}`).

Other quantifiers include `?` (zero or one times), `+` (one or more), and `*` (zero or more). A quantifier is also called a _bound_ or a _modifier_. By themselves, quantifiers are greedy. There are also lazy quantifiers (e.g., `{3}?`) and possessive quantifiers (e.g., `{3}+`). _See also_ basic regular expressions, greedy match, lazy match, possessive match.

regular expression

A specially encoded string of characters that, when used within an application or utility, may match other strings or sets of strings. First described in the early 1950s by the mathematician Stephen Kleene (1909–1994) in his work with formal language theory in his book _Introduction to Metamathematics_, published in 1952. It began to gain momentum in computer science with the work of Ken Thompson, _et al._, on the QED editor (under the General Electric Time Sharing System [GE-TSS] on a GE-635 computer) and, later, other tools under AT&T Bell Labs' Unix operating system in the early 1970s.
sed

A Unix streaming editor that accepts regular expressions and transforms text. It was developed in the early 1970s by Lee McMahon at Bell Labs. An example of _sed_: `sed -n 's/this/that/gp' file.ext > new.ext`. Use _sed -E_ to indicate that you want to use extended regular expressions. _See also_ extended regular expressions.

string literal

A string of characters interpreted literally—for example, the literal string "It is an ancyent Marinere" as opposed to something like "[Ii]t[ ]is[ ].*nere."

Unicode

Unicode is a system for encoding characters for writing systems of the world. Each character in Unicode is assigned a numeric code point. There are over 100,000 characters represented in Unicode. In regular expressions, a Unicode character can be specified as `\u`_xxxx_ or `\x{`_xxxx_`}`, where _x_ represents a hexadecimal digit in the range 0–9, A–F (or a–f), using one to four places. For example, `\u00E9` represents the character _é_, the Latin small letter _e_ with an acute accent.

vi

A Unix editor that was first developed in 1976 by Bill Joy and that uses regular expressions. The _vim_ editor is an improved replacement for _vi_, developed primarily by Bram Moolenaar (see ). I currently use six or seven different editors during a regular work day, but the one I use most often is _vim_. In fact, if I were shipwrecked on a desert island and could have only one text editor, I would choose _vim_. No question.

vim

 _See_ vi.

work buffer

 _See_ pattern space.

zero-width assertions

Boundaries that do not consume any characters in a match. `^` and `$`, which match the beginning and end of a line, respectively, are examples.

# Index

### A note on the digital index

A link in an index entry is displayed as the section title in which that entry appears. Because some sections have multiple index markers, it is not unusual for an entry to have several links to the same section. Clicking on any link will take you directly to the place in the text in which the marker appears.
### Symbols

$ (dollar sign), Quoting Literals, The Beginning and End of a Line, Regular Expressions in QED, Metacharacters

matching end of line with, Regular Expressions in QED
as metacharacter, Metacharacters
usage examples, Quoting Literals, The Beginning and End of a Line
() (parentheses), Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences, Subpatterns, Regular Expressions in QED, Metacharacters

as metacharacters, Metacharacters
QED regex feature, Regular Expressions in QED
subpatterns and, Subpatterns
usage examples, Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences
* (asterisk), Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expressions in QED, Metacharacters, Regular Expression Glossary

as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
as quantifier, Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expression Glossary
+ (plus sign), Using Quantifiers, Matching with *, +, and ?, Metacharacters, Regular Expression Glossary

as metacharacter, Metacharacters
as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary
- (hyphen) metacharacter, Quoting a Group of Characters as Literals
. (dot) character, Matching Any Character, Matching Any Character, Once Again, Regular Expressions in QED, Metacharacters

described, Matching Any Character, Once Again
matching any character, Matching Any Character
as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
/ (forward slash), Word and Non-word Boundaries, Matching Tags
\0 (Null) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
; (semicolon), Using sed to Mark Up Text
<> (angle brackets), Matching Tags
? (question mark), Using Quantifiers, Quoting Literals, Matching with *, +, and ?, Matching Tags, Metacharacters, Regular Expression Glossary

matching tags, Matching Tags
as metacharacter, Metacharacters
as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary
usage examples, Quoting Literals
[] (square brackets), Quoting Literals, Character Classes, Metacharacters

as metacharacters, Metacharacters
usage examples, Quoting Literals, Character Classes
\ (backslash) metacharacter, Quoting Literals, The Beginning and End of a Line, Adding Tags with sed, Metacharacters, Metacharacters

described, Metacharacters
escaping metacharacters, The Beginning and End of a Line, Metacharacters
inserting newlines, Adding Tags with sed
usage example, Quoting Literals
^ (caret), Quoting Literals, The Beginning and End of a Line–The Beginning and End of a Line, Negated Character Classes, Regular Expressions in QED, Metacharacters

matching beginning or end of lines, The Beginning and End of a Line–The Beginning and End of a Line
as metacharacter, Metacharacters
negated character classes, Negated Character Classes
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals
_ (underscore), Matching Tags, Technical Notes
{} (curly braces), Using Quantifiers, Quoting Literals, Matching a Specific Number of Times, Regular Expressions in QED, Metacharacters

as metacharacters, Using Quantifiers, Metacharacters
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals, Matching a Specific Number of Times
| (vertical bar), Quoting Literals, Regular Expressions in QED, Metacharacters

as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals

### A

a (append) command (sed), Appending Tags
\a (alert) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
\A (start of subject) character shorthand, Other Anchors
a modifier (Perl), Alternation, Options/Modifiers
ack tool, Matching Unicode Character Properties, Technical Notes
Adobe AIR runtime, Technical Notes
alert (\a) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
[[:alnum:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
[[:alpha:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
alternation, Quoting Literals, Alternation, Alternation, Alternation, Alternation, Regular Expression Glossary

described, Quoting Literals, Alternation, Regular Expression Glossary
with grep, Alternation
with Perl, Alternation
with RegExr, Alternation
American Standard Code for Information Interchange (ASCII), ASCII Code Chart with Regex–ASCII Code Chart with Regex, Regular Expression Glossary

described, Regular Expression Glossary
regex cross-references, ASCII Code Chart with Regex–ASCII Code Chart with Regex
"An die Freude" (Schiller), Matching Unicode Character Properties
anchors, Boundaries, Regular Expression Glossary
angle brackets (<>), Matching Tags
append (a) command (sed), Appending Tags
ASCII (American Standard Code for Information Interchange), ASCII Code Chart with Regex–ASCII Code Chart with Regex, Regular Expression Glossary

described, Regular Expression Glossary
regex cross-references, ASCII Code Chart with Regex–ASCII Code Chart with Regex
[[:ascii:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
AsciiDoc text format, Technical Notes
assertions, Boundaries, Boundaries, Boundaries, Regular Expression Glossary

as boundaries, Boundaries
described, Boundaries
zero-width, Boundaries, Regular Expression Glossary
asterisk (*), Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expressions in QED, Metacharacters, Regular Expression Glossary

as metacharacter, Metacharacters
QED regex feature, Regular Expressions in QED
as quantifier, Using Quantifiers, Matching Any Character, Once Again, Subpatterns, Matching with *, +, and ?, Regular Expression Glossary
atom, Regular Expression Glossary (see metacharacters)
atomic groups, Atomic Groups, Technical Notes, Regular Expression Glossary

### B

[\b] (backspace) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
\b (word boundary) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries–Word and Non-word Boundaries, Character Shorthands
\B (non-word boundary) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries, Character Shorthands
backreferences, capturing groups and, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups
backslash (\) metacharacter, Quoting Literals, The Beginning and End of a Line, Adding Tags with sed, Metacharacters, Metacharacters

described, Metacharacters
escaping metacharacters, The Beginning and End of a Line, Metacharacters
inserting newlines, Adding Tags with sed
usage example, Quoting Literals
backspace [\b] character shorthand, Matching Word and Non-Word Characters, Character Shorthands
backtracking, Atomic Groups, Atomic Groups, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive, Lazy Quantifiers, Possessive Quantifiers, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary

catastrophic, Atomic Groups, Regular Expression Glossary
described, Greedy, Lazy, and Possessive, Regular Expression Glossary
greedy match and, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Regular Expression Glossary
lazy match and, Greedy, Lazy, and Possessive, Lazy Quantifiers, Regular Expression Glossary
possessive match and, Greedy, Lazy, and Possessive, Possessive Quantifiers, Regular Expression Glossary
turning off, Atomic Groups
Barnett, Bruce, Learning More
Basho (poet), Matching a Unicode Character
basic regular expressions (BREs), Word and Non-word Boundaries, Alternation, Regular Expression Glossary

described, Regular Expression Glossary
grep and, Word and Non-word Boundaries, Alternation
Berkeley Time-Sharing System (BTSS), Preface, Regular Expressions in QED
[[:blank:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
bound, Regular Expression Glossary (see quantifiers)
boundaries, Matching Word and Non-Word Characters, Boundaries, The Beginning and End of a Line–The Beginning and End of a Line, Word and Non-word Boundaries–Word and Non-word Boundaries, Word and Non-word Boundaries, Other Anchors, Quoting a Group of Characters as Literals, Adding Tags–Adding Tags with Perl, Character Shorthands

adding tags, Adding Tags–Adding Tags with Perl
assertions as, Boundaries
matching beginning and end of lines, The Beginning and End of a Line–The Beginning and End of a Line
matching start and end of subject, Other Anchors
non-word, Word and Non-word Boundaries
quoting groups of characters as literals, Quoting a Group of Characters as Literals
word, Matching Word and Non-Word Characters, Word and Non-word Boundaries–Word and Non-word Boundaries, Character Shorthands
bracketed expressions, Character Classes, Character Classes, Regular Expression Glossary

(see also character classes)
branches, Regular Expression Glossary
BREs (basic regular expressions), Word and Non-word Boundaries, Alternation, Regular Expression Glossary

described, Regular Expression Glossary
grep and, Word and Non-word Boundaries, Alternation
BTSS (Berkeley Time-Sharing System), Preface, Regular Expressions in QED

### C

\cx (control) character shorthand, Matching Word and Non-Word Characters, Matching Control Characters, Character Shorthands
c modifier (Perl), Alternation, Options/Modifiers
capturing groups, Capturing Groups and Back References, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups, Named Groups, Regular Expression Glossary

backreferences and, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups
described, Capturing Groups and Back References, Regular Expression Glossary
named groups, Named Groups
caret (^), Quoting Literals, The Beginning and End of a Line–The Beginning and End of a Line, Negated Character Classes, Regular Expressions in QED, Metacharacters

matching beginning or end of lines, The Beginning and End of a Line–The Beginning and End of a Line
as metacharacter, Metacharacters
negated character classes, Negated Character Classes
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals
carriage return (\r) character shorthand, Matching Word and Non-Word Characters, Character Shorthands
case sensitivity, Matching String Literals, Alternation, Alternation, Positive Lookaheads

in Regexpal, Matching String Literals
in RegExr, Alternation, Positive Lookaheads
in regular expressions, Alternation
catastrophic backtracking, Atomic Groups, Regular Expression Glossary
character classes, Matching Digits with a Character Class, Matching Digits with a Character Class, Matching Word and Non-Word Characters, Character Classes–Character Classes, Character Classes, Character Classes, Character Classes, Negated Character Classes, Union and Difference, Union and Difference, POSIX Character Classes–POSIX Character Classes, POSIX Character Classes, Regular Expression Glossary, Regular Expression Glossary

creating, Character Classes
described, Matching Digits with a Character Class, Character Classes–Character Classes, Regular Expression Glossary
difference of, Union and Difference
fewest keystrokes win principle and, Matching Word and Non-Word Characters
matching digits with, Matching Digits with a Character Class
matching range of characters, Character Classes
matching range of digits, Character Classes
negated, Negated Character Classes, Regular Expression Glossary
POSIX, POSIX Character Classes–POSIX Character Classes, POSIX Character Classes
union of, Union and Difference
character escape, Using a Character Shorthand, The Beginning and End of a Line, Metacharacters, Regular Expression Glossary

described, Using a Character Shorthand, Regular Expression Glossary
metacharacters and, The Beginning and End of a Line, Metacharacters
character properties, Matching Unicode Character Properties–Matching Unicode Character Properties, Matching Unicode Character Properties, Character Properties, Script Names for Character Properties
described, Matching Unicode Character Properties, Character Properties
matching, Matching Unicode Character Properties–Matching Unicode Character Properties
script names for, Script Names for Character Properties
character sets, Matching Digits with a Character Class (see character classes)
character shorthand, What Is a Regular Expression?, Using a Character Shorthand, Using a Character Shorthand, Matching Word and Non-Word Characters, Matching Whitespace, Other Anchors, Quoting a Group of Characters as Literals, Character Classes, Character Shorthands, Whitespace

character class and, Character Classes
described, What Is a Regular Expression?, Using a Character Shorthand, Matching Word and Non-Word Characters, Character Shorthands
matching digits with, Using a Character Shorthand
quoting group of characters as literals, Quoting a Group of Characters as Literals
start and end of subject, Other Anchors
for whitespace, Matching Whitespace, Whitespace
characters, Matching Any Character, Matching Any Character, Once Again–Matching Any Character, Once Again, Quoting a Group of Characters as Literals, Character Classes, Matching Unicode and Other Characters

matching any, Matching Any Character, Matching Any Character, Once Again–Matching Any Character, Once Again
matching range of, Character Classes, Matching Unicode and Other Characters
quoting groups of characters as literals, Quoting a Group of Characters as Literals
Chrome browser, Technical Notes
circumflex, The Beginning and End of a Line (see caret (^))
Clark, James, Technical Notes, Regular Expression Glossary
code points, Regular Expression Glossary (see Unicode)
Coleridge, Samuel Taylor, Simple Pattern Matching
command files, Using sed to Mark Up Text, Using Perl to Mark Up Text, Adding Tags with sed, Adding Tags with Perl, Using a Command File with sed, Using a File of Commands with Perl, The End of the Beginning

using with Perl, Using Perl to Mark Up Text, Adding Tags with Perl, Using a File of Commands with Perl
using with sed, Using sed to Mark Up Text, Adding Tags with sed, Using a Command File with sed, The End of the Beginning
composability, Technical Notes, Regular Expression Glossary
control characters, Matching Word and Non-Word Characters, Matching Control Characters, Matching Control Characters, Technical Notes, Character Shorthands, Control Characters

additional information, Technical Notes
character shorthand, Matching Word and Non-Word Characters, Matching Control Characters, Character Shorthands
matching, Matching Control Characters
in regular expressions, Control Characters
[[:cntrl:]] POSIX character class, POSIX Character Classes, POSIX Character Classes
curly braces {}, Using Quantifiers, Quoting Literals, Matching a Specific Number of Times, Regular Expressions in QED, Metacharacters

as metacharacters, Using Quantifiers, Metacharacters
QED regex feature, Regular Expressions in QED
usage example, Quoting Literals, Matching a Specific Number of Times

### D

\d (digit) character shorthand, What Is a Regular Expression?, Using a Character Shorthand, Quoting Literals, Matching Digits–Matching Digits, Matching Word and Non-Word Characters, Character Shorthands

described, Matching Word and Non-Word Characters, Character Shorthands
matching digits, What Is a Regular Expression?, Matching Digits–Matching Digits
usage example, Using a Character Shorthand, Quoting Literals
Matching Word and Non-Word Characters, Character Shorthands +\D (non-digit) character shorthand, Using a Character Shorthand, Matching Non-Digits, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching non-digits, Matching Non-Digits +usage example, Using a Character Shorthand +d modifier (Perl), Alternation, Options/Modifiers +decimal value (\d xxx) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +difference of character sets, Union and Difference +[:digit:] POSIX character class, POSIX Character Classes, POSIX Character Classes +digits, What Is a Regular Expression?, Matching Digits with a Character Class, Using a Character Shorthand, Using a Character Shorthand, Matching Any Character, Capturing Groups and Back References, Quoting Literals–A Sample of Applications, Quoting Literals, Matching Digits–Matching Digits, Matching Digits–Matching Digits, Matching Word and Non-Word Characters, Character Classes, Character Shorthands + +capturing groups and backreferences, Capturing Groups and Back References +character shorthand, What Is a Regular Expression?, Using a Character Shorthand, Quoting Literals, Matching Digits–Matching Digits, Matching Word and Non-Word Characters, Character Shorthands +matching any characters, Matching Any Character +matching range of, Character Classes +matching with character classes, Matching Digits with a Character Class +matching with character shorthand, Using a Character Shorthand +matching with shorthand, Matching Digits–Matching Digits +quoting literals, Quoting Literals–A Sample of Applications +documents, marking up with HTML, Marking Up a Document with HTML (see marking up documents with HTML5) +dollar sign ($), Quoting Literals, The Beginning and End of a Line, Regular Expressions in QED, Metacharacters + +matching end of line with, Regular Expressions in QED +as metacharacter, Metacharacters +usage examples, Quoting Literals, The Beginning and End of a Line +dot (.) 
character, Matching Any Character, Matching Any Character, Once Again, Regular Expressions in QED, Metacharacters + +described, Matching Any Character, Once Again +matching any character, Matching Any Character +as metacharacter, Metacharacters +QED regex feature, Regular Expressions in QED +dotall option, Matching Any Character, Matching Any Character, Once Again, The Beginning and End of a Line + +### E + +\E (quoting literal characters) character shorthand, Quoting a Group of Characters as Literals +E command-line option, Regular Expressions in QED +echo command, Using sed to Mark Up Text +ed editor, Regular Expression Glossary +egrep utility, Technical Notes, Regular Expression Glossary +email address example, Matching an Email Address +EREs (extended regular expressions), Word and Non-word Boundaries, Alternation, Regular Expression Glossary + +described, Regular Expression Glossary +grep -E option for, Word and Non-word Boundaries, Alternation + +### F + +\f (form feed) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +fewest keystrokes win principle, Matching Word and Non-Word Characters +fgrep utility, Technical Notes, Regular Expression Glossary +flags, Regular Expression Glossary (see modifiers (flags)) +form feed (\f) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +forward slash (/), Word and Non-word Boundaries, Matching Tags +Friedl, Jeff, Who Should Read This Book, Learning More +full stop, Matching Any Character, Once Again (see dot character) + +### G + +g modifier (Perl), Alternation, Options/Modifiers +GE-TSS (General Electric Time Sharing System), Regular Expression Glossary +Git version control system, Technical Notes +Goyvaerts, Jan, Who Should Read This Book, Learning More, Matching a North American Phone Number +[:graph:] POSIX character class, POSIX Character Classes, POSIX Character Classes +greedy match, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Regular Expression Glossary +grep utility, Word and Non-word Boundaries, Word and Non-word Boundaries, Word and Non-word Boundaries, Word and Non-word Boundaries, Word and Non-word Boundaries, Technical Notes, Alternation, Alternation, Alternation, Alternation, Alternation, Regular Expression Glossary + +alternation with, Alternation +BREs and, Word and Non-word Boundaries, Alternation +-c option, Word and Non-word Boundaries, Alternation +described, Technical Notes, Regular Expression Glossary +-E option, Word and Non-word Boundaries, Alternation +-o option, Word and Non-word Boundaries, Alternation +search syntax, Word and Non-word Boundaries +groups and grouping, Capturing Groups and Back References, Quoting a Group of Characters as Literals, Subpatterns, Capturing Groups and Backreferences–Named Groups, Named Groups, Non-Capturing Groups, Atomic Groups, Technical Notes, Lookarounds–What You Learned in Chapter 8, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +atomic, Atomic Groups, Technical Notes, Regular Expression Glossary +capturing, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups, Regular Expression Glossary +described, Regular Expression Glossary +lookarounds, Lookarounds–What You Learned in Chapter 8 +named, Named Groups +non-capturing, Non-Capturing Groups, Regular Expression Glossary, Regular Expression Glossary +quoting groups of characters as literals, Quoting a Group of Characters 
as Literals +subpatterns, Subpatterns +Gwyn, Doug, The End of the Beginning + +### H + +\h (horizontal) whitespace character, Matching Whitespace, Whitespace +\H (non-horizontal) whitespace character, Matching Whitespace, Whitespace +hexadecimal numbering system, Matching Word and Non-Word Characters, Character Classes, Matching a Unicode Character, Character Shorthands, Regular Expression Glossary + +character shorthand, Matching Word and Non-Word Characters, Character Shorthands +described, Regular Expression Glossary +matching character classes, Character Classes +matching Unicode characters, Matching a Unicode Character +hold space, Regular Expression Glossary +horizontal (\h) whitespace character, Matching Whitespace, Whitespace +horizontal tab (\t) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +Horton, Mark, Technical Notes +HTML5, Marking Up the Text, Technical Notes, Marking Up a Document with HTML–Technical Notes + +additional information, Technical Notes +marking up documents with, Marking Up a Document with HTML–Technical Notes +marking up text as, Marking Up the Text +hyphen (-) metacharacter, Quoting a Group of Characters as Literals + +### I + +i (insert) command (sed), Adding Tags with sed, Transforming Plain Text with sed +i modifier (Perl), Alternation, Options/Modifiers +IEEE (Institute of Electrical and Electronics Engineers), POSIX Character Classes, Technical Notes, Regular Expression Glossary +insert (i) command (sed), Adding Tags with sed, Transforming Plain Text with sed + +### J + +Java programming language, Union and Difference, Technical Notes +Joy, Bill, Technical Notes, Technical Notes, Regular Expression Glossary + +### K + +Kernighan, Brian, Preface +Kleene star, Matching with *, +, and ? 
+Kleene, Stephen, Preface, Matching with *, +, and ?, Regular Expression Glossary + +### L + +l modifier (Perl), Alternation, Options/Modifiers +lazy match, Greedy, Lazy, and Possessive, Lazy Quantifiers, Regular Expression Glossary +Levithan, Steven, Who Should Read This Book, Learning More, Matching a North American Phone Number +lines, matching beginning and end of, The Beginning and End of a Line–The Beginning and End of a Line +literals, Regular Expression Glossary (see string literals) +lookaheads, Positive Lookaheads–Positive Lookaheads, Negative Lookaheads, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +described, Regular Expression Glossary +negative, Negative Lookaheads, Regular Expression Glossary +positive, Positive Lookaheads–Positive Lookaheads, Regular Expression Glossary +lookarounds, Regular Expression Glossary (see lookaheads; lookbehinds) +lookbehinds, Positive Lookbehinds, Negative Lookbehinds, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +described, Regular Expression Glossary +negative, Negative Lookbehinds, Regular Expression Glossary +positive, Positive Lookbehinds, Regular Expression Glossary +[:lower:] POSIX character class, POSIX Character Classes, POSIX Character Classes + +### M + +m modifier (Perl), Alternation, Options/Modifiers +marking up documents with HTML5, Using sed to Mark Up Text–Using sed to Mark Up Text, Using Perl to Mark Up Text–Using Perl to Mark Up Text, Adding Tags with sed–Adding Tags with sed, Adding Tags with Perl–Adding Tags with Perl, Marking Up a Document with HTML, Matching Tags, Transforming Plain Text with sed–Handling the Lines of the Poem with sed, Appending Tags–Using a Command File with sed, Transforming Plain Text with Perl–Using a File of Commands with Perl + +adding tags with Perl, Adding Tags with Perl–Adding Tags with Perl +adding tags with sed, Adding Tags with sed–Adding Tags with sed +appending tags, Appending Tags–Using a Command File with sed +described, Marking Up a Document with HTML +marking up with Perl, Using Perl to Mark Up Text–Using Perl to Mark Up Text +marking up with sed, Using sed to Mark Up Text–Using sed to Mark Up Text +matching tags, Matching Tags +transforming plain text with Perl, Transforming Plain Text with Perl–Using a File of Commands with Perl +transforming plain text with sed, Transforming Plain Text with sed–Handling the Lines of the Poem with sed +marking up text, Using sed to Mark Up Text, Using Perl to Mark Up Text–Using Perl to Mark Up Text + +using Perl, Using Perl to Mark Up Text–Using Perl to Mark Up Text +using sed, Using sed to Mark Up Text +McMahon, Lee, Using sed to Mark Up Text, Regular Expression Glossary +metacharacters, Matching Digits with a Character Class, Quoting a Group of Characters as Literals, Metacharacters, Metacharacters, Regular Expression Glossary + +described, Matching Digits with a Character Class, Quoting a Group of Characters as Literals, Regular Expression Glossary +escaping, Metacharacters +in regular expressions, Metacharacters +modifiers (flags), Alternation, Alternation, Options/Modifiers, Regular Expression Glossary + +described, Regular Expression Glossary +in regular expressions, Alternation, Alternation, Options/Modifiers +Moolenaar, Bram, Technical Notes, Technical Notes, Regular Expression Glossary + +### N + +\n (newline) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +named groups, Named Groups +negated character classes, Negated 
Character Classes, Regular Expression Glossary +negative lookaheads, Negative Lookaheads, Regular Expression Glossary +negative lookbehinds, Negative Lookbehinds, Regular Expression Glossary +.NET programming framework, Technical Notes +newlines, Matching Any Character, Matching Word and Non-Word Characters, The Beginning and End of a Line, Adding Tags with sed, Character Shorthands + +character shorthand, Matching Word and Non-Word Characters, Character Shorthands +inserting, Adding Tags with sed +matching with dotall option, Matching Any Character, The Beginning and End of a Line +non-capturing groups, Non-Capturing Groups, Lookarounds–What You Learned in Chapter 8, Regular Expression Glossary, Regular Expression Glossary + +described, Non-Capturing Groups, Regular Expression Glossary, Regular Expression Glossary +lookarounds, Lookarounds–What You Learned in Chapter 8 +non-digit (\D) character shorthand, Using a Character Shorthand, Matching Non-Digits, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching non-digits, Matching Non-Digits +usage example, Using a Character Shorthand +non-horizontal (\H) whitespace character, Matching Whitespace, Whitespace +non-space (\S) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +non-vertical (\V) whitespace character, Matching Whitespace, Whitespace +non-word (\W) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +non-word boundary (\B) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries, Character Shorthands +Notepad++ editor, A Sample of Applications, Technical Notes +null (\0) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +numbers, Matching Digits with a Character Class (see digits) + +### O + +\o (octal value) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +occurrence constraints, Regular Expression Glossary (see quantifiers) +octal characters, Matching Word and Non-Word Characters, Matching Characters with Octal Numbers, Character Shorthands, Regular Expression Glossary + +character shorthand, Matching Word and Non-Word Characters, Character Shorthands +described, Regular Expression Glossary +matching Unicode with, Matching Characters with Octal Numbers +Oniguruma library (Ruby), Ruby (Oniguruma) +Opera Next browser, Technical Notes +options, Alternation, Options/Modifiers, Regular Expression Glossary + +described, Regular Expression Glossary +in regular expressions, Alternation, Options/Modifiers +Oxygen XML editor, A Sample of Applications, Technical Notes + +### P + +p modifier (Perl), Alternation, Options/Modifiers +parentheses (), Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences, Subpatterns, Regular Expressions in QED, Metacharacters + +as metacharacters, Metacharacters +QED regex feature, Regular Expressions in QED +subpatterns and, Subpatterns +usage examples, Capturing Groups and Back References, Quoting Literals, Alternation, Groups, and Backreferences +pattern matching, Simple Pattern Matching–Simple Pattern Matching, Matching String Literals, Matching Digits–Matching Digits, Matching Non-Digits, 
Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Whitespace, Matching Any Character, Once Again–Matching Any Character, Once Again, Marking Up the Text–Using Perl to Mark Up Text, Subpatterns, Regular Expression Glossary + +described, Simple Pattern Matching–Simple Pattern Matching, Regular Expression Glossary +marking up text, Marking Up the Text–Using Perl to Mark Up Text +matching any character, Matching Any Character, Once Again–Matching Any Character, Once Again +matching digits, Matching Digits–Matching Digits +matching non-digits, Matching Non-Digits +matching non-word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +matching string literals, Matching String Literals +matching whitespace, Matching Whitespace +matching word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +subpatterns and, Subpatterns +pattern space, Regular Expression Glossary +PCRE (Perl Compatible Regular Expressions), Other Anchors, Technical Notes, PCRE +pcregrep utility, Other Anchors, Other Anchors, Other Anchors, Technical Notes, PCRE + +-c option, Other Anchors +described, Other Anchors, Technical Notes, PCRE +-n option, Other Anchors +period, Matching Any Character, Once Again (see dot character) +Perl Compatible Regular Expressions (PCRE), Other Anchors, Technical Notes, PCRE +Perl programming language, Using Perl to Mark Up Text–Using Perl to Mark Up Text, Using Perl to Mark Up Text, Using Perl to Mark Up Text, Technical Notes, Other Anchors, Adding Tags with Perl, Adding Tags with Perl, Alternation, Alternation, Named Groups, Transforming Plain Text with Perl–Using a File of Commands with Perl, Handling Roman Numerals with Perl, Using a File of Commands with Perl, Perl, Options/Modifiers, Regular Expression Glossary + +accessing named groups, Named Groups +adding tags, Adding Tags with Perl +additional information, Technical Notes +alternation and, Alternation +command files and, Using Perl to Mark Up Text, Adding Tags with Perl, Using a File of Commands with Perl +described, Using Perl to Mark Up Text, Perl, Regular Expression Glossary +handling Roman numerals, Handling Roman Numerals with Perl +marking up text, Using Perl to Mark Up Text–Using Perl to Mark Up Text +modifiers in regular expressions, Alternation, Options/Modifiers +start and end of subjects, Other Anchors +transforming plain text with, Transforming Plain Text with Perl–Using a File of Commands with Perl +perldoc command, Technical Notes +phone numbers, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number–Matching a North American Phone Number, Matching Digits with a Character Class, Matching Digits with a Character Class, Using a Character Shorthand, Matching Any Character, Capturing Groups and Back References, Quoting Literals–A Sample of Applications, Matching a North American Phone Number + +capturing groups and backreferences, Capturing Groups and Back References +matching any characters, Matching Any Character +matching digits with character classes, Matching Digits with a Character Class +matching in regular expressions, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number +matching with character classes, Matching Digits with a 
Character Class +matching with character shorthand, Using a Character Shorthand +quoting literals, Quoting Literals–A Sample of Applications +piece (regular expressions), Regular Expression Glossary +plain text, Transforming Plain Text with sed (see strings and string literals) +plus sign (+), Using Quantifiers, Matching with *, +, and ?, Metacharacters, Regular Expression Glossary + +as metacharacter, Metacharacters +as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary +Portable Operating System Interface for Unix (POSIX), POSIX Character Classes, Regular Expression Glossary +positive lookaheads, Positive Lookaheads–Positive Lookaheads, Regular Expression Glossary +positive lookbehinds, Positive Lookbehinds, Regular Expression Glossary +POSIX (Portable Operating System Interface for Unix), POSIX Character Classes, Regular Expression Glossary +POSIX character classes, POSIX Character Classes–POSIX Character Classes, POSIX Character Classes +POSIX.1-2008 standard, Regular Expression Glossary +possessive match, Greedy, Lazy, and Possessive, Possessive Quantifiers, Regular Expression Glossary +[:print:] POSIX character class, POSIX Character Classes, POSIX Character Classes +Project Gutenberg, Simple Pattern Matching +[:punct:] POSIX character class, POSIX Character Classes, POSIX Character Classes +Python programming language, Technical Notes, Python + +### Q + +q (quit) command (sed), Using sed to Mark Up Text +\Q (quoting literal characters) character shorthand, Quoting a Group of Characters as Literals +QED editor, What Is a Regular Expression?, Technical Notes, Regular Expressions in QED–Regular Expressions in QED, Regular Expressions in QED, Technical Notes, Technical Notes, Regular Expression Glossary + +additional information, Technical Notes +Ken Thompson and, What Is a Regular Expression?, Technical Notes, Regular Expressions in QED, Technical Notes, Regular Expression Glossary +regular expressions in, Regular Expressions in QED–Regular Expressions in QED +quantifiers, Using Quantifiers, Using Quantifiers, Quoting Literals, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Greedy, Lazy, and Possessive, Greedy, Lazy, and Possessive, Matching with *, +, and ?, Matching a Specific Number of Times, Lazy Quantifiers, Possessive Quantifiers, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +described, Using Quantifiers, Greedy, Lazy, and Possessive, Regular Expression Glossary +greedy match and, Greedy, Lazy, and Possessive–Matching a Specific Number of Times, Regular Expression Glossary +lazy match and, Greedy, Lazy, and Possessive, Lazy Quantifiers, Regular Expression Glossary +matching specific number of times, Matching a Specific Number of Times +matching with *, +, and ?, Matching with *, +, and ? 
+possessive match and, Greedy, Lazy, and Possessive, Possessive Quantifiers, Regular Expression Glossary +usage examples, Using Quantifiers, Quoting Literals +question mark (?), Using Quantifiers, Quoting Literals, Matching with *, +, and ?, Matching Tags, Metacharacters, Regular Expression Glossary + +matching tags, Matching Tags +as metacharacter, Metacharacters +as quantifier, Using Quantifiers, Matching with *, +, and ?, Regular Expression Glossary +usage examples, Quoting Literals +quit (q) command (sed), Using sed to Mark Up Text +quoting literals, Quoting Literals–A Sample of Applications, Quoting a Group of Characters as Literals + +quoting groups of characters as, Quoting a Group of Characters as Literals +usage example, Quoting Literals–A Sample of Applications + +### R + +\r (carriage return) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +Rackham, Stuart, Technical Notes +range of characters, matching, Character Classes, Matching Unicode and Other Characters +range of digits, matching, Character Classes +Re2 library, RE2 +Regex Hero, Matching a Unicode Character, Technical Notes +RegexBuddy application, Who Should Read This Book +Regexpal regex processor, Who Should Read This Book, Getting Started with Regexpal, Matching a North American Phone Number–Matching a North American Phone Number, Technical Notes, Matching String Literals, Negated Character Classes, Matching a Unicode Character + +additional information, Technical Notes +described, Who Should Read This Book, Getting Started with Regexpal +matching phone numbers, Matching a North American Phone Number–Matching a North American Phone Number +matching Unicode characters, Matching a Unicode Character +negated character classes and, Negated Character Classes +string matching in, Matching String Literals +RegExr regex processor, Simple Pattern Matching, Simple Pattern Matching, Simple Pattern Matching, Matching Digits–Matching Digits, Matching Non-Digits, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Whitespace, Matching Any Character, Once Again–Matching Any Character, Once Again, Marking Up the Text, Marking Up the Text, Technical Notes, The Beginning and End of a Line–The Beginning and End of a Line, Quoting a Group of Characters as Literals, Adding Tags, Alternation, Groups, and Backreferences, Alternation, Alternation, Capturing Groups and Backreferences, Named Groups, Positive Lookaheads + +adding tags, Adding Tags +additional information, Technical Notes +alternation with, Alternation +backreference support, Capturing Groups and Backreferences +case-insensitivity, Alternation, Positive Lookaheads +Community tab, Simple Pattern Matching +described, Simple Pattern Matching +downloading, Alternation, Groups, and Backreferences +marking up text, Marking Up the Text +matching any characters, Matching Any Character, Once Again–Matching Any Character, Once Again +matching beginning and end of lines, The Beginning and End of a Line–The Beginning and End of a Line +matching digits, Matching Digits–Matching Digits +matching non-digits, Matching Non-Digits +matching non-word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +matching whitespace, Matching Whitespace +matching word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +metacharacters and, Quoting a Group of Characters as Literals +named groups, 
Named Groups +Replace tab, Marking Up the Text +Samples tab, Simple Pattern Matching +Reggy application, Technical Notes, Quantifiers +regular expressions, Preface, What Is a Regular Expression?, Matching a North American Phone Number–Matching a North American Phone Number, Matching Digits with a Character Class, Using a Character Shorthand, Matching Any Character, Capturing Groups and Back References, Using Quantifiers, Quoting Literals–A Sample of Applications, Matching String Literals, Matching Digits–Matching Digits, Matching Non-Digits, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Matching Whitespace, Matching Any Character, Once Again–Matching Any Character, Once Again, Marking Up the Text, Quoting a Group of Characters as Literals, Alternation, Alternation, Subpatterns, Capturing Groups and Backreferences–Named Groups, Character Classes–Technical Notes, Learning More–What You Learned in Chapter 10, Matching a North American Phone Number, Matching an Email Address, Regular Expressions in QED–Regular Expressions in QED, Metacharacters, Character Shorthands, Control Characters, Options/Modifiers, Options/Modifiers, ASCII Code Chart with Regex–ASCII Code Chart with Regex, Regular Expression Glossary, Regular Expression Glossary + +additional information, Learning More–What You Learned in Chapter 10 +ASCII code chart, ASCII Code Chart with Regex–ASCII Code Chart with Regex +capturing groups and backreferences, Capturing Groups and Back References, Capturing Groups and Backreferences–Named Groups +character shorthand in, Matching Word and Non-Word Characters, Character Shorthands +control characters in, Control Characters +described, Preface, What Is a Regular Expression?, Regular Expression Glossary +marking up text, Marking Up the Text +matching any character, Matching Any Character, Matching Any Character, Once Again–Matching Any Character, Once Again +matching digits, Matching Digits–Matching Digits +matching digits with character classes, Matching Digits with a Character Class +matching email addresses, Matching an Email Address +matching non-digits, Matching Non-Digits +matching non-word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +matching phone numbers, Matching a North American Phone Number–Matching a North American Phone Number, Matching a North American Phone Number +matching string literals, Matching String Literals +matching whitespace, Matching Whitespace +matching with character classes, Character Classes–Technical Notes +matching with character shorthand, Using a Character Shorthand +matching word characters, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +metacharacters in, Quoting a Group of Characters as Literals, Metacharacters +modifiers in, Alternation, Options/Modifiers +options in, Alternation, Options/Modifiers +pieces of, Regular Expression Glossary +in QED editor, Regular Expressions in QED–Regular Expressions in QED +quantifiers in, Using Quantifiers +quoting literals in, Quoting Literals–A Sample of Applications +subpatterns and, Subpatterns +reluctant (lazy) quantifiers, Greedy, Lazy, and Possessive +"The Rime of the Ancient Mariner" (Coleridge), Simple Pattern Matching +Ritchie, Dennis, Preface, Technical Notes, Technical Notes +Roman numerals, Handling Roman Numerals with sed, Handling Roman Numerals with Perl + +handling with Perl, 
Handling Roman Numerals with Perl +handling with sed, Handling Roman Numerals with sed +Rubular Ruby regex processor, Technical Notes, Ruby (Oniguruma) + +### S + +s (substitute) command, Using sed to Mark Up Text, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed, Handling Roman Numerals with Perl + +with Perl, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Handling Roman Numerals with Perl +with sed, Using sed to Mark Up Text, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed +\s (space) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +\S (non-space) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +s modifier (Perl), Alternation, Options/Modifiers +Schiller, Friedrich, Matching Unicode Character Properties +script names for character properties, Script Names for Character Properties +search command (vim), Word and Non-word Boundaries +sed editor, Using sed to Mark Up Text, Using sed to Mark Up Text, Using sed to Mark Up Text, Using sed to Mark Up Text, Using sed to Mark Up Text, Technical Notes, Adding Tags with sed–Adding Tags with sed, Adding Tags with sed, Adding Tags with sed, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Transforming Plain Text with sed–Handling the Lines of the Poem with sed, Transforming Plain Text with sed, Substitution with sed, Handling Roman Numerals with sed, Appending Tags, Using a Command File with sed, Using a Command File with sed, The End of the Beginning, Regular Expression Glossary + +a (append) command, Appending Tags +adding tags with, Adding Tags with sed–Adding Tags with sed +additional information, Technical Notes +backreference support, Capturing Groups and Backreferences, Capturing Groups and Backreferences +command files and, Using sed to Mark Up Text, Adding Tags with sed, Using a Command File with sed, The End of the Beginning +described, Using sed to Mark Up Text, Regular Expression Glossary +-E option, Capturing Groups and Backreferences +handling Roman numerals, Handling Roman Numerals with sed +i (insert) command, Adding Tags with sed, Transforming Plain Text with sed +marking up text, Using sed to Mark Up Text +-n option, Capturing Groups and Backreferences +q (quit) command, Using sed to Mark Up Text +s (substitute) command, Using sed to Mark Up Text, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed +transforming plain text with, Transforming Plain Text with sed–Handling the Lines of the Poem with sed +semicolon (;), Using sed to Mark Up Text +shebang directive, Using a File of Commands with Perl +Skinner, Grant, Simple Pattern Matching +space character (\s) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +[:space:] POSIX character class, POSIX Character Classes, POSIX Character Classes +special characters in regular expressions, Quoting a Group of Characters as Literals (see metacharacters) +Spencer, Henry, Preface +square brackets [], Quoting Literals, Character Classes, Metacharacters + +as metacharacters, Metacharacters +usage examples, Quoting Literals, Character Classes +strings and string literals, Matching a North American Phone Number, Matching a North American Phone Number, Quoting 
Literals–A Sample of Applications, Matching String Literals, The Beginning and End of a Line–The Beginning and End of a Line, Quoting a Group of Characters as Literals, Transforming Plain Text with sed–Handling the Lines of the Poem with sed, Transforming Plain Text with Perl–Using a File of Commands with Perl, Regular Expression Glossary + +described, Matching a North American Phone Number, Regular Expression Glossary +matching, Matching String Literals +matching beginning and end of lines, The Beginning and End of a Line–The Beginning and End of a Line +matching phone numbers, Matching a North American Phone Number +quoting, Quoting Literals–A Sample of Applications, Quoting a Group of Characters as Literals +transforming with Perl, Transforming Plain Text with Perl–Using a File of Commands with Perl +transforming with sed, Transforming Plain Text with sed–Handling the Lines of the Poem with sed +Stubblebine, Tony, Learning More +subpatterns, Subpatterns +substitute (s) command, Using sed to Mark Up Text, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed, Handling Roman Numerals with Perl + +with Perl, Using Perl to Mark Up Text, Capturing Groups and Backreferences, Handling Roman Numerals with Perl +with sed, Using sed to Mark Up Text, Capturing Groups and Backreferences, Substitution with sed, Using a Command File with sed + +### T + +\t (horizontal tab) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +tab characters, Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands, Character Shorthands + +horizontal tab shorthand, Matching Word and Non-Word Characters, Character Shorthands +vertical tab shorthand, Matching Word and Non-Word Characters, Character Shorthands +tags, Adding Tags, Adding Tags with sed–Adding Tags with sed, Adding Tags with Perl, Matching Tags, Appending Tags–Using a Command File with sed + +adding with Perl, Adding Tags with Perl +adding with sed, Adding Tags with sed–Adding Tags with sed +appending, Appending Tags–Using a Command File with sed +described, Adding Tags +matching, Matching Tags +text, Transforming Plain Text with sed (see strings and string literals) +TextMate editor, A Sample of Applications, Technical Notes +Thompson, Ken, Preface, What Is a Regular Expression?, Technical Notes, Technical Notes, Regular Expressions in QED, Technical Notes, Regular Expression Glossary, Regular Expression Glossary, Regular Expression Glossary + +ed editor and, Regular Expression Glossary +grep and, Technical Notes, Regular Expression Glossary +QED editor and, What Is a Regular Expression?, Technical Notes, Regular Expressions in QED, Technical Notes, Regular Expression Glossary +regular expressions and, Preface + +### U + +\u (Unicode) character shorthand, Matching Word and Non-Word Characters, Matching a Unicode Character, Character Shorthands +u modifier (Perl), Alternation, Options/Modifiers +underscore (_), Matching Tags, Technical Notes +Unicode, What Is a Regular Expression?, Matching Word and Non-Word Characters, Matching Unicode and Other Characters, Matching a Unicode Character–Matching Characters with Octal Numbers, Matching a Unicode Character, Matching Unicode Character Properties–Matching Unicode Character Properties, Character Shorthands, Unicode Whitespace Characters, Regular Expression Glossary, Regular Expression Glossary + +character shorthand, Matching Word and Non-Word 
Characters, Matching a Unicode Character, Character Shorthands +code point assignments, What Is a Regular Expression?, Regular Expression Glossary +described, Matching Unicode and Other Characters, Regular Expression Glossary +matching character properties, Matching Unicode Character Properties–Matching Unicode Character Properties +matching characters, Matching a Unicode Character–Matching Characters with Octal Numbers +whitespace characters in, Unicode Whitespace Characters +union of character sets, Union and Difference +[:upper:] POSIX character class, POSIX Character Classes, POSIX Character Classes + +### V + +\v (vertical tab) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +\v (vertical) whitespace character, Matching Whitespace, Whitespace +\V (non-vertical) whitespace character, Matching Whitespace, Whitespace +van Rossum, Guido, Python +vertical (\v) whitespace character, Matching Whitespace, Whitespace +vertical bar (|), Quoting Literals, Regular Expressions in QED, Metacharacters + +as metacharacter, Metacharacters +QED regex feature, Regular Expressions in QED +usage example, Quoting Literals +vertical tab (\v) character shorthand, Matching Word and Non-Word Characters, Character Shorthands +vi editor, Technical Notes, Regular Expression Glossary +vim editor, Word and Non-word Boundaries, Using vim, Technical Notes, Regular Expression Glossary + +additional information, Technical Notes +described, Regular Expression Glossary +matching Unicode characters, Using vim +search command in, Word and Non-word Boundaries +Voltaire (philosopher), Matching Unicode and Other Characters + +### W + +\w (word) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +\W (non-word) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +Wall, Larry, Using Perl to Mark Up Text, Regular Expression Glossary +Watt, Andrew, Learning More +wc command, Alternation +whitespace, Matching Whitespace, Matching Whitespace, Whitespace, Unicode Whitespace Characters + +character shorthand for, Matching Whitespace, Whitespace +matching with RegExr, Matching Whitespace +in Unicode, Unicode Whitespace Characters +wildcards, matching any character, Matching Any Character +word (\w) character shorthand, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters, Matching Word and Non-Word Characters, Character Shorthands + +described, Matching Word and Non-Word Characters, Character Shorthands +matching, Matching Word and Non-Word Characters–Matching Word and Non-Word Characters +word boundary (\b) character shorthand, Matching Word and Non-Word Characters, Word and Non-word Boundaries–Word and Non-word Boundaries, Character Shorthands +[:word:] POSIX character class, POSIX Character Classes, POSIX Character Classes +work buffer, Regular Expression Glossary +Wortham, Steve, Technical Notes + +### X + +x modifier (Perl), Alternation, Options/Modifiers +[:xdigit:] POSIX character class, POSIX Character Classes, POSIX Character Classes +XML 
tags, Matching Tags, Technical Notes +XSLT stylesheet, The End of the Beginning + +### Z + +\Z (end of subject) character shorthand, Other Anchors +zero-width assertions, Boundaries, Regular Expression Glossary + +# About the Author + +Michael Fitzgerald describes Ruby as "my favorite language so far" and is working regularly with Ruby and the Rails framework. He has written over 150 Ruby programs for testing and demonstration, and has been developing a library of sample Ruby code. He is the author of _Learning XSLT_ and _XML Hacks_, and co-author of the _XML Pocket Reference_. + +# Colophon + +The animal on the cover of _Introducing Regular Expressions_ is a fruit bat. + +Members of the suborder _Megachiroptera_ and family _Pteropodidae_ are known as fruit bats, flying foxes, old world fruit bats, or megabats. Despite the latter nickname, members of the Pteropodidae family vary greatly in size—the smallest measure six centimeters, while others weigh in at two pounds, with wingspans up to approximately five feet. + +True to their name, fruit bats are frugivorous, or nectarivorous, meaning they eat fruit or lick nectar from flowers. Some use their teeth to bite through fruit skin and actually eat the fruit, while others lick juices from crushed fruit. Because many of them dine on flower nectar, fruit bats are excellent pollinators and seed-spreaders—in fact, the World Bat Sanctuary estimates that approximately 95% of all new rainforest growth can be attributed to fruit bats' distribution of seeds. This relationship between the bats and plants is a form of mutualism—the way organisms of different species interact biologically for a mutual fitness benefit—known as chiropterophily. + +Fruit bats can be found all over the world, though they prefer warm, tropical climates, due in part to the availability of fruit and flowers. While they're excellent flyers, fruit bats are known for their clumsy landings; they often crash land into trees or try to grab limbs with their feet in order to stop themselves. This perpetuates the misconception that they're blind, when in fact, fruit bats are said to have the best vision of all the bat species, most of which rely on echolocation to get around. Fruit bats use vision—along with their advanced senses of smell—to locate food and navigate. + +The cover image is from Cassell's _Natural History_. The cover font is Adobe ITC Garamond. The text font is Linotype Birka; the heading font is Adobe Myriad Condensed; and the code font is LucasFont's TheSansMonoCondensed. + +# Introducing Regular Expressions + +### Michael Fitzgerald + +#### Editor + +### Simon St. Laurent + +Revision History + +2012-07-10: First release + +Copyright © 2012 Michael Fitzgerald + +O'Reilly books may be purchased for educational, business, or sales promotional use. Online editions are also available for most titles (http://my.safaribooksonline.com). For more information, contact our corporate/institutional sales department: 800-998-9938 or corporate@oreilly.com. + +Nutshell Handbook, the Nutshell Handbook logo, and the O'Reilly logo are registered trademarks of O'Reilly Media, Inc. _Introducing Regular Expressions_, the image of a fruit bat, and related trade dress are trademarks of O'Reilly Media, Inc. 
+ +Many of the designations used by manufacturers and sellers to distinguish their products are claimed as trademarks. Where those designations appear in this book, and O'Reilly Media, Inc., was aware of a trademark claim, the designations have been printed in caps or initial caps. + +While every precaution has been taken in the preparation of this book, the publisher and authors assume no responsibility for errors or omissions, or for damages resulting from the use of the information contained herein. + +O'Reilly Media + +1005 Gravenstein Highway North + +Sebastopol, CA 95472 + +2012-07-10T09:13:05-07:00 + diff --git a/kag/examples/csqa/builder/data/introduction_to_the_theory_of_programming_languages.txt b/kag/examples/csqa/builder/data/introduction_to_the_theory_of_programming_languages.txt new file mode 100644 index 00000000..1c28aec0 --- /dev/null +++ b/kag/examples/csqa/builder/data/introduction_to_the_theory_of_programming_languages.txt @@ -0,0 +1,2866 @@ +Introduction to the Theory of Programming Languages + +Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2, © Springer-Verlag London Limited 2011 + +Undergraduate Topics in Computer Science + +Series Editor: Ian Mackie + +Advisory Editors: Samson Abramsky, Chris Hankin, Dexter Kozen, Andrew Pitts, Hanne Riis Nielson, Steven Skiena and Iain Stewart + +Undergraduate Topics in Computer Science (UTiCS) delivers high-quality instructional content for undergraduates studying in all areas of computing and information science. From core foundational and theoretical material to final-year topics and applications, UTiCS books take a fresh, concise, and modern approach and are ideal for self-study or for a one- or two-semester course. The texts are all authored by established experts in their fields, reviewed by an international advisory board, and contain numerous examples and problems. Many include fully worked solutions. + +For other volumes: http://www.springer.com/series/7592 + +Gilles Dowek and Jean-Jacques Lévy + +Introduction to the Theory of Programming Languages + +Gilles Dowek + +Labo. d'Informatique, École polytechnique, Palaiseau, France + +Jean-Jacques Lévy + +Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, Orsay Cedex, France + +ISSN 1863-7310 + +ISBN 978-0-85729-075-5, e-ISBN 978-0-85729-076-2 + +Springer London Dordrecht Heidelberg New York + +British Library Cataloguing in Publication Data A catalogue record for this book is available from the British Library + +© Springer-Verlag London Limited 2011 + +The work was first published in 2006 by Les éditions de l'École polytechnique with the following title: 'Introduction à la théorie des langages de programmation'. The translator of the work is Maribel Fernandez. + +Apart from any fair dealing for the purposes of research or private study, or criticism or review, as permitted under the Copyright, Designs and Patents Act 1988, this publication may only be reproduced, stored or transmitted, in any form or by any means, with the prior permission in writing of the publishers, or in the case of reprographic reproduction in accordance with the terms of licenses issued by the Copyright Licensing Agency. Enquiries concerning reproduction outside those terms should be sent to the publishers. 
+ +The use of registered names, trademarks, etc., in this publication does not imply, even in the absence of a specific statement, that such names are exempt from the relevant laws and regulations and therefore free for general use. + +The publisher makes no representation, express or implied, with regard to the accuracy of the information contained in this book and cannot accept any legal responsibility or liability for any errors or omissions that may be made. + +Printed on acid-free paper + +Springer is part of Springer Science+Business Media (www.springer.com) + +What Is the Theory of Programming Languages? + +The ultimate, definitive programming language has not been created yet, far from it. Almost every day a new language is created, and new functionalities are added to existing languages. Improvements in programming languages contribute to making programs more reliable, shorten the development time, and make programs easier to maintain. Improvements are also needed to satisfy new requirements, such as the development of parallel, distributed or mobile programs. + +The first thing that we need to describe, when defining a programming language, is its syntax. Should we write x := 1 or x = 1? Should we put brackets after an if or not? More generally, what are the strings of symbols that can be used as a program? There is a useful tool for this: the notion of a formal grammar. Using a grammar, we can describe the syntax of the language in a precise way, and this makes it possible to build programs to check the syntactical correctness of programs. + +But it is not sufficient to know what a syntactically correct program is in order to know what is going to happen when we run the program. When defining a programming language, it is also necessary to describe its semantics, that is, the expected behaviour of the program when it is executed. Two languages may have the same syntax but different semantics. + +The following is an example of what is meant (informally) by semantics. Function evaluation is often explained as follows. "The result V of the evaluation of an expression of the form f e₁ ... eₙ, where the symbol f is a function defined by the expression f x₁ ... xₙ = e', is obtained in the following way. First, the arguments e₁, ..., eₙ are evaluated, returning values W₁, ..., Wₙ. Then, these values are associated to the variables x₁, ..., xₙ, and finally the expression e' is evaluated. The value V is the result of this evaluation." + +This explanation of the semantics of the language, expressed in a natural language (English), allows us to understand what happens when a program is executed, but is it precise? Consider, for example, the program + +Depending on the way we interpret the explanation given above, we can deduce that the program will result in the value 2 or in the value 9. This is because the natural language explanation does not indicate whether we have to evaluate g 2 before or after g 7, and the order in which we evaluate these expressions is important in this case. Instead, the explanation should have said: "the arguments e₁, ..., eₙ are evaluated starting from e₁" or else "starting from eₙ". + +If two different programmers read an ambiguous explanation, they might understand different things. Even worse, the designers of the compilers for the language might choose different conventions. Then the same program will give different results depending on the compiler used. 
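+
+As a concrete illustration of this ambiguity, here is a small OCaml sketch (our own illustration, not necessarily the program referred to above, although it does produce the values 2 and 9): the function g has a side effect, so the value of f (g 2) (g 7) depends on the order in which the arguments are evaluated.
+
+let r = ref 0
+let g x = r := !r + x; !r   (* g adds x to a shared counter and returns it *)
+let f x y = x               (* f returns its first argument *)
+let () = print_int (f (g 2) (g 7))
+(* Evaluating g 2 first prints 2; evaluating g 7 first prints 9.
+   OCaml itself leaves the argument evaluation order unspecified,
+   which is exactly the kind of convention a semantics must pin down. *)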
+ +It is well known that natural languages are too imprecise to express the syntax of a programming language; a formal language should be used instead. Similarly, natural languages are too imprecise to express the semantics of a programming language, and we need to use a formal language for this. + +What is the semantics of a program? Let us take for instance a program p that requests an integer, computes its square, and displays the result of this operation. To describe the behaviour of this program, we need to describe a relation R between the input value and the associated output. + +The semantics of this program is, thus, a relation R between elements of the set E of input values and elements of the set S of output values, that is, a subset of E × S. + +The semantics of a program is then a binary relation. The semantics of a programming language is, in turn, a ternary relation: "the program p with input value e returns the output value s". We denote this relation by p, e ↪ s. The program p and the input e are available before the execution of the program starts. Often, these two elements are paired in a term p e, and the semantics of the language assigns a value to this term. The semantics of the language is then a binary relation t ↪ s. + +To express the semantics of a programming language we need a language that can express relations. + +When the semantics of a program is a functional relation, that is, for each input value there is at most one output value, we say that the program is deterministic. Video games are examples of non-deterministic programs, since some randomness is necessary to make the game enjoyable. A language is deterministic if all the programs that can be written in the language are deterministic, or equivalently, if the semantics is a functional relation. In this case, it is possible to define its semantics using a language to define functions instead of a language to define relations. + +Acknowledgements + +The authors would like to thank Gérard Assayag, Antonio Bucciarelli, Roberto Di Cosmo, Xavier Leroy, Dave MacQueen, Luc Maranget, Michel Mauny, François Pottier, Didier Rémy, Alan Schmitt, Élodie-Jane Sims and Véronique Viguié Donzeau-Gouge. 
+ +Contents + +1 Terms and Relations + +1.1 Inductive Definitions + +1.1.1 The Fixed Point Theorem + +1.1.2 Inductive Definitions + +1.1.3 Structural Induction + +1.1.4 The Reflexive-Transitive Closure of a Relation + +1.2 Languages + +1.2.1 Languages Without Variables + +1.2.2 Variables + +1.2.3 Many-Sorted Languages + +1.2.4 Free and Bound Variables + +1.2.5 Substitution + +1.3 Three Ways to Define the Semantics of a Language + +1.3.1 Denotational Semantics + +1.3.2 Big-Step Operational Semantics + +1.3.3 Small-Step Operational Semantics + +1.3.4 Non-termination + +2 The Language PCF + +2.1 A Functional Language: PCF + +2.1.1 Programs Are Functions + +2.1.2 Functions Are First-Class Objects + +2.1.3 Functions with Several Arguments + +2.1.4 No Assignments + +2.1.5 Recursive Definitions + +2.1.6 Definitions + +2.1.7 The Language PCF + +2.2 Small-Step Operational Semantics for PCF + +2.2.1 Rules + +2.2.2 Numbers + +2.2.3 A Congruence + +2.2.4 An Example + +2.2.5 Irreducible Closed Terms + +2.2.6 Non-termination + +2.2.7 Confluence + +2.3 Reduction Strategies + +2.3.1 The Notion of a Strategy + +2.3.2 Weak Reduction + +2.3.3 Call by Name + +2.3.4 Call by Value + +2.3.5 A Bit of Laziness Is Needed + +2.4 Big-Step Operational Semantics for PCF + +2.4.1 Call by Name + +2.4.2 Call by Value + +2.5 Evaluation of PCF Programs + +3 From Evaluation to Interpretation + +3.1 Call by Name + +3.2 Call by Value + +3.3 An Optimisation: de Bruijn Indices + +3.4 Construction of Functions via Fixed Points + +3.4.1 First Variation: Recursive Closures + +3.4.2 Second Variation: Rational Values + +4 Compilation + +4.1 An Interpreter Written in a Language Without Functions + +4.2 From Interpretation to Compilation + +4.3 An Abstract Machine for PCF + +4.3.1 The Environment + +4.3.2 Closures + +4.3.3 PCF Constructs + +4.3.4 Using de Bruijn Indices + +4.3.5 Small-Step Operational Semantics + +4.4 Compilation of PCF + +5 PCF with Types + +5.1 Types + +5.1.1 PCF with Types + +5.1.2 The Typing Relation + +5.2 No Errors at Run Time + +5.2.1 Using Small-Step Operational Semantics + +5.2.2 Using Big-Step Operational Semantics + +5.3 Denotational Semantics for Typed PCF + +5.3.1 A Trivial Semantics + +5.3.2 Termination + +5.3.3 Scott's Ordering Relation + +5.3.4 Semantics of Fixed Points + +6 Type Inference + +6.1 Inferring Monomorphic Types + +6.1.1 Assigning Types to Untyped Terms + +6.1.2 Hindley's Algorithm + +6.1.3 Hindley's Algorithm with Immediate Resolution + +6.2 Polymorphism + +6.2.1 PCF with Polymorphic Types + +6.2.2 The Algorithm of Damas and Milner + +7 References and Assignment + +7.1 An Extension of PCF + +7.2 Semantics of PCF with References + +8 Records and Objects + +8.1 Records + +8.1.1 Labelled Fields + +8.1.2 An Extension of PCF with Records + +8.2 Objects + +8.2.1 Methods and Functional Fields + +8.2.2 What Is "Self"? + +8.2.3 Objects and References + +9 Epilogue + +References + +Index +Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2_1, © Springer-Verlag London Limited 2011 + +# 1. Terms and Relations + +Gilles Dowek (1) and Jean-Jacques Lévy (2) + +(1) + +Labo. 
d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France + +(2) + +Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France + +Gilles Dowek (Corresponding author) + +Email: gilles.dowek@polytechnique.edu + +Jean-Jacques Lévy + +Email: jean-jacques.levy@inria.fr + +Abstract + +For the book to be really self-contained, this chapter introduces all the basic notions about inductive definitions and formal languages in general (variables, expressions, substitution, bound and free variables, sorts, ...). Then it introduces three ways to define the semantics of a programming language: denotational semantics, big-step and small-step operational semantics. This chapter starts from scratch and gives many examples. + +## 1.1 Inductive Definitions + +Since the semantics of a programming language is a relation, we will start by introducing some tools to define sets and relations. + +The most basic tool is the notion of an explicit definition. We can, for example, define explicitly the function that multiplies its argument by 2: x ↦ 2 * x, the set of even numbers: {n ∈ ℕ | ∃p ∈ ℕ n = 2 * p}, or the divisibility relation: {(n,m) ∈ ℕ² | ∃p ∈ ℕ n = m * p}. However, these explicit definitions are not sufficient to define all the objects we need. A second tool to define sets and relations is the notion of an inductive definition. This notion is based on a simple theorem: the fixed point theorem. + +### 1.1.1 The Fixed Point Theorem + +Let ≤ be an ordering relation—that is, a reflexive, antisymmetric and transitive relation—over a set E, and let u₀, u₁, u₂, ... be an increasing sequence, that is, a sequence such that u₀ ≤ u₁ ≤ u₂ ≤ ... The element l of E is called the limit of the sequence u₀, u₁, u₂, ... if it is a least upper bound of the set {u₀, u₁, u₂, ...}, that is, if + + * for all i, uᵢ ≤ l + + * if, for all i, uᵢ ≤ l', then l ≤ l'. + +If it exists, the limit of a sequence (uᵢ)ᵢ is unique, and we denote it by limᵢ uᵢ. + +The ordering relation ≤ is said to be weakly complete if all the increasing sequences have a limit. + +The standard ordering relation over the real numbers interval [0, 1] is an example of a weakly complete ordering. In addition, this relation has a least element 0. However, the standard ordering relation over ℝ⁺ is not weakly complete since the increasing sequence 0, 1, 2, 3, ... does not have a limit. + +Let A be an arbitrary set. The inclusion relation ⊆ over the set ℘(A) of all the subsets of A is another example of a weakly complete ordering. The limit of an increasing sequence U₀, U₁, U₂, ... is the set ⋃ᵢ Uᵢ. In addition, this relation has a least element ∅. + +Let f be a function from E to E. The function f is increasing if x ≤ y implies f x ≤ f y. It is continuous if, in addition, for any increasing sequence, f (limᵢ uᵢ) = limᵢ (f uᵢ). + +First Fixed Point Theorem + +Let ≤ be a weakly complete ordering relation over a set E that has a least element m. Let f be a function from E to E. If f is continuous then p = limᵢ (fⁱ m) is the least fixed point of f. + +Proof + +First, since m is the smallest element in E, m ≤ f m. The function f is increasing, therefore fⁱ m ≤ fⁱ⁺¹ m. Since the sequence fⁱ m is increasing, it has a limit. The sequence fⁱ⁺¹ m also has p as limit, thus, p = limᵢ (f (fⁱ m)) = f (limᵢ (fⁱ m)) = f p. Moreover, p is the least fixed point, because if q is another fixed point, then m ≤ q and fⁱ m ≤ fⁱ q = q (since f is increasing). Hence p = limᵢ (fⁱ m) ≤ q. 
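+
+When E is the set of subsets of a finite set, ordered by inclusion, the construction used in this proof can be executed directly: start from the least element ∅ and iterate f until the sequence of iterates fⁱ ∅ stabilises. A minimal OCaml sketch (the names lfp and step and the bound 20 are our own choices) computing the even numbers up to 20 as the least fixed point of the function C ↦ {0} ∪ {n + 2 | n ∈ C, n + 2 ≤ 20}:
+
+module IS = Set.Make (Int)
+
+(* Iterate f from x until a fixed point is reached; this terminates here
+   because f is increasing and the chain of iterates stabilises. *)
+let rec lfp f x =
+  let y = f x in
+  if IS.equal y x then x else lfp f y
+
+(* step c = {0} ∪ {n + 2 | n ∈ c, n + 2 <= 20} *)
+let step c =
+  IS.add 0 (IS.map (fun n -> n + 2) (IS.filter (fun n -> n <= 18) c))
+
+let () = IS.iter (Printf.printf "%d ") (lfp step IS.empty)
+(* prints: 0 2 4 6 8 10 12 14 16 18 20 *)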
The second fixed point theorem states the existence of a fixed point for increasing functions, even if they are not continuous, provided the ordering satisfies a stronger property.

An ordering ≤ over a set E is strongly complete if every subset A of E has a least upper bound sup A.

The standard ordering relation over the interval [0, 1] is an example of a strongly complete ordering relation. The standard ordering over ℝ+ is not strongly complete because the set ℝ+ itself has no upper bound.

Let A be an arbitrary set. The inclusion relation ⊆ over the set ℘(A) of all the subsets of A is another example of a strongly complete ordering. The least upper bound of a set B of subsets of A is the union ⋃ {C | C ∈ B}.

Exercise 1.1

Show that any strongly complete ordering is also weakly complete.

Is the ordering

weakly complete? Is it strongly complete?

Note that if the ordering ≤ over the set E is strongly complete, then any subset A of E has a greatest lower bound inf A. Indeed, let A be a subset of E, let B be the set {y ∈ E | ∀x ∈ A, y ≤ x} of lower bounds of A, and let l be the least upper bound of B. By definition, l is an upper bound of the set B

  * ∀y ∈ B, y ≤ l

and it is the least one

  * (∀y ∈ B, y ≤ l') ⇒ l ≤ l'.

It is easy to show that l is the greatest lower bound of A. Indeed, if x is an element of A, it is an upper bound of B, and since l is the least upper bound, l ≤ x. Thus, l is a lower bound of A. To show that it is the greatest one, it is sufficient to note that if m is another lower bound of A, it is an element of B and therefore m ≤ l.

The greatest lower bound of a set B of subsets of A is, of course, the intersection ⋂ {C | C ∈ B}.

Second Fixed Point Theorem

Let ≤ be a strongly complete ordering over a set E. Let f be a function from E to E. If f is increasing, then p = inf {c | f c ≤ c} is the least fixed point of f.

Proof

Let C be the set {c | f c ≤ c} and let c be an element of C. Then p ≤ c because p is a lower bound of C. Since the function f is increasing, we deduce that f p ≤ f c. Also, f c ≤ c because c is an element of C, so by transitivity f p ≤ c.

The element f p is smaller than all the elements of C; it is therefore also smaller than or equal to their greatest lower bound: f p ≤ p.

Since the function f is increasing, f (f p) ≤ f p, thus f p is an element of C, and since p is a lower bound of C, we deduce p ≤ f p. By antisymmetry, p = f p.

Finally, by definition, all the fixed points of f belong to C, and they are therefore greater than p. □

### 1.1.2 Inductive Definitions

We will now see how these fixed point theorems can be used to define sets and relations.

Let A be a set, f a function from A^n to A and E a subset of A. The set E is closed under the function f if for all a1, ..., an in E, f a1 ... an is also in E. For example, the set of even numbers is closed under the function n ↦ n + 2.

Let A be a set. An inductive definition of a subset E of A is a family of partial functions f1 from A^{n1} to A, f2 from A^{n2} to A, .... The set E is defined as the smallest subset of A that is closed under the functions f1, f2, ....

For example, the subset of ℕ containing all the even numbers is inductively defined by the number 0—that is, the function from ℕ^0 to ℕ that returns the value 0—and the function n ↦ n + 2 from ℕ to ℕ. The subset of {a, b, c}* containing all the words of the form aⁿbcⁿ is inductively defined by the word b and the function m ↦ a m c. In general, a context-free grammar can always be specified as an inductively defined set.
In logic, the set of theorems is defined as the subset of the propositions that is inductively defined by the axioms and the deduction rules.

The functions f1, f2, ... are called rules. Instead of writing a rule as x1 ... xn ↦ t, we will use the notation

$$\frac{x_1 \quad \ldots \quad x_n}{t}$$

For example, the set of even numbers is defined by the rules

$$\frac{}{0} \qquad \frac{n}{n + 2}$$

Let P be the set of even numbers. We will sometimes write the rules as follows:

$$\frac{}{0 \in P} \qquad \frac{n \in P}{n + 2 \in P}$$

In order to define a language inductively, we will sometimes use a notation borrowed from language theory, where, for example, the set of words of the form aⁿbcⁿ is defined as follows

m ::= b | a m c

To show that there is indeed a smallest subset of A that is closed under the functions f1, f2, ..., we define a function F from ℘(A) to ℘(A)

F C = {x ∈ A | there exist a rule fi and elements y1, ..., y_{ni} of C such that x = fi y1 ... y_{ni}}.

A subset C of A is closed under the functions f1, f2, ... if and only if F C ⊆ C.

The function F is trivially increasing: if C ⊆ C' then F C ⊆ F C'. In addition, it is continuous, that is, if C0 ⊆ C1 ⊆ C2 ⊆ ⋅⋅⋅ then F (⋃j Cj) = ⋃j (F Cj). Indeed, if an element x of A is in F (⋃j Cj), then there exist a number i and elements y1, ..., y_{ni} in ⋃j Cj such that x = fi y1 ... y_{ni}. Each of these elements is in one of the Cj. Since the sequence Cj is increasing, they are all in Ck, the largest of these sets. Therefore, the element x belongs to F Ck and also to ⋃j (F Cj). Conversely, if x is in ⋃j (F Cj), then it belongs to some F Ck, and there are therefore a number i and elements y1, ..., y_{ni} of Ck such that x = fi y1 ... y_{ni}. The elements y1, ..., y_{ni} are in ⋃j Cj, and therefore x is in F (⋃j Cj).

The set E is defined as the least fixed point of the function F. This is the smallest set that satisfies the property F E = E and, according to the second fixed point theorem, it is also the smallest set that satisfies the property F E ⊆ E. Thus, it is the smallest set that is closed under the functions f1, f2, ....

The set of even numbers is not the only subset of ℕ that contains 0 and is closed under the function n ↦ n + 2—the set ℕ, for example, also satisfies these properties—but it is the smallest one. It can be defined as the intersection of all those sets. The second fixed point theorem allows us to generalise this observation and define E as the intersection of all the sets that are closed under the functions f1, f2, ....

The first fixed point theorem shows that an element x is in E if and only if there is some number k such that x is in the set F^k ∅, that is, if there is a rule fi such that x = fi y1 ... y_{ni} where y1, ..., y_{ni} are in F^(k−1) ∅. Iterating, that is, by induction on k, we can show that an element x of A is in E if and only if there exists a tree where the nodes are labelled by elements of A, the root is labelled by x, and whenever a node is labelled by c, its children are labelled by d1, ..., dn such that for some rule f we have c = f d1 ... dn. Such a tree is called a derivation of x. This notion of a derivation generalises the notion of proof in logic. We can then define the set E as the set of elements x of A for which there is a derivation.
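Derivations can be represented directly as trees in a program. The following OCaml sketch is our own illustration (the type and names are not from the book): it checks that a tree is a correct derivation for the two rules defining the even numbers.

```ocaml
(* A node carries the element it derives and the subderivations of its
   premises; `check` verifies that every node is an instance of one of the
   two rules: the axiom 0, and "from n, derive n + 2". *)
type derivation = Node of int * derivation list

let rec check (Node (c, premises)) =
  match premises with
  | [] -> c = 0                                 (* axiom: 0 is even *)
  | [ (Node (d, _) as p) ] -> c = d + 2 && check p
  | _ -> false

(* A derivation of 8, built from derivations of 6, 4, 2 and 0. *)
let d8 = Node (8, [Node (6, [Node (4, [Node (2, [Node (0, [])])])])])
let () = assert (check d8)
```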
We will use a specific notation for derivations. First, the root of the tree will be written at the bottom and the leaves at the top. Then, we will write a line over each node of the tree, and we will write its children over the line.

The number 8, for example, is in the set of even numbers, as the following derivation shows

$$\dfrac{\dfrac{\dfrac{\dfrac{\dfrac{}{0}}{2}}{4}}{6}}{8}$$

If we call P the set of even numbers, we can write the derivation as follows

$$\dfrac{\dfrac{\dfrac{\dfrac{\dfrac{}{0 \in P}}{2 \in P}}{4 \in P}}{6 \in P}}{8 \in P}$$

### 1.1.3 Structural Induction

Inductive definitions suggest a method to write proofs. If a property is hereditary, that is, if whenever it holds for y1, ..., y_{ni} it also holds for fi y1 ... y_{ni}, then we can deduce that it holds for all the elements of E.

One way to show this is to use the second fixed point theorem and to observe that the subset P of A containing all the elements that satisfy the property is closed under the functions fi, and thus it includes E. Another way is to use the first fixed point theorem and to show, by induction on k, that all the elements of F^k ∅ satisfy the property.

### 1.1.4 The Reflexive-Transitive Closure of a Relation

The reflexive-transitive closure of a relation is an example of an inductive definition. If R is a binary relation on a set A, we can inductively define another relation R*, called the reflexive-transitive closure of R

$$\frac{}{x \mathrel{R^*} y}\ (\text{if } x \mathrel{R} y) \qquad \frac{}{x \mathrel{R^*} x} \qquad \frac{x \mathrel{R^*} y \quad y \mathrel{R^*} z}{x \mathrel{R^*} z}$$

If we see R as a directed graph, then R* is the relation that links two nodes whenever there is a path from one to the other.

## 1.2 Languages

### 1.2.1 Languages Without Variables

Now that we have introduced inductive definitions, we will use this technique to define the notion of a language. The notion of language that we will define does not take into account superficial syntactic conventions: for instance, it does not matter whether we write 3 + 4, +(3,4), or 3 4 +. This term will be represented in an abstract way by a tree.

Each node in the tree will be labelled by a symbol. The number of children of a node depends on the node's label—2 children if the label is +, 0 if it is 3 or 4, ....

A language is thus a set of symbols, each with an associated number called the arity, or simply the number of arguments, of the symbol. The symbols without arguments are called constants.

The set of terms of the language is the set of trees inductively defined by

  * if f is a symbol with n arguments and t1, ..., tn are terms, then f(t1, ..., tn)—that is, the tree that has a root labelled by f and subtrees t1, ..., tn—is a term.

### 1.2.2 Variables

Imagine that we want to design a language to define functions. One possibility would be to use constants sin, cos, ... and a symbol ○ with two arguments. We could, for instance, build the term sin ○ (cos ○ sin) in this language.

However, we know that, to specify functions, it is easier to use a notion invented by F. Viète (1540–1603): the notion of a variable. Thus, the function described above can be written sin (cos (sin x)).

Since the 1930s, we write this function x ↦ sin (cos (sin x)) or λx sin (cos (sin x)), using the symbol ↦ or λ to bind the variable x. By indicating explicitly which variables are bound, we can distinguish the arguments of the function from potential parameters, and we also fix the order of the arguments.
The symbol ↦ appears to have been introduced by N. Bourbaki around 1930, and the symbol λ by A. Church around the same time. The notation λ is a simplified version of an earlier notation, x̂ sin (cos (sin x)), used by A.N. Whitehead and B. Russell since the 1900s.

The definition f = x ↦ sin (cos (sin x)) is sometimes written f x = sin (cos (sin x)). The advantage of writing f = x ↦ sin (cos (sin x)) is that this way we can distinguish two different operations: the construction of the function x ↦ sin (cos (sin x)), and the definition itself, which gives a name to a previously constructed object. It is often important, in computer science, to have notations that allow us to build objects without necessarily giving them a name.

In this book, we use the notation fun x -> sin (cos (sin x)) to specify this function.

The term fun x -> sin (cos (sin x)) specifies a function. However, its subterm sin x does not specify anything: it is not a real number and it is not a function, because it contains a free variable whose value we do not know.

To bind variables in terms, we need to extend the notion of term to include free variables, which will be bound later. This also requires new symbols, such as fun, which act as binders for the variables in some of their arguments. Other examples of binders are the symbol { | }, the symbol ∂/∂, the symbol ∫ d, the symbols ∑ and ∏, the quantifiers ∀ and ∃, .... In this book we will use several binders: the symbol fun above, and the symbols fix, let, fixfun, ....

The arity of a symbol f will no longer be a number n; instead, we will use a finite sequence of numbers (k1, ..., kn) indicating that f binds k1 variables in its first argument, k2 variables in its second, ..., and kn variables in its nth.

In this way, when a language is defined—that is, a set of symbols with their arities—and an infinite set of variables is given, we can define the set of terms inductively as follows

  * variables are terms,

  * if f is a symbol with arity (k1, ..., kn), t1, ..., tn are terms and $x^1_1, \ldots, x^1_{k_1}, \ldots, x^n_1, \ldots, x^n_{k_n}$ are variables, then $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)$ is a term.

The notation $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)$ denotes the tree whose root is labelled by f and whose ith subtree is ti, in which the variables $x^i_1, \ldots, x^i_{k_i}$ are bound.

This definition is better understood with an example. We build a language in which terms specify real numbers and functions over the reals, and which includes two constants sin and cos to represent the functions sine and cosine, a symbol α, called application, such that α(f,x) is the object obtained by applying the function f to the object x, and a symbol fun to build functions. This language includes four symbols: the constants sin and cos, α with arity (0,0), and fun with arity (1); the set of terms is inductively defined by

  * variables are terms,

  * sin is a term,

  * cos is a term,

  * if t and u are terms then α(t,u) is a term,

  * if t is a term and x is a variable then fun(x t) is a term.

We will adopt a simplified notation, writing t u for the term α(t,u) and fun x -> t for the term fun(x t).

For example, fun x -> sin (cos (sin x)) is a term in this language.

### 1.2.3 Many-Sorted Languages

In this book, we will sometimes use more general languages, called many-sorted languages. Consider, for instance, the language used to describe vectors, with a finite number of constants, addition, and scalar multiplication. In this language, there are two sorts of terms: terms describing a vector and terms describing a scalar.
In the definition of the language, we indicate that the symbol + has two arguments, which are both vectors, and that the symbol . has two arguments, which are a scalar and a vector.

For this, we introduce a set with two elements {vect, scal}, called sorts, and we associate to the symbol . the arity (scal, vect, vect). This arity indicates that in a term of the form λ.v, the term λ must be of sort scal, the term v of sort vect, and the term λ.v is itself of sort vect.

When, in addition, there are bound variables, the arity of a symbol f is a finite sequence $((s^1_1, \ldots, s^1_{k_1}, s'_1), \ldots, (s^n_1, \ldots, s^n_{k_n}, s'_n), s'')$ indicating that the symbol has n arguments, the ith one of sort $s'_i$ and binding $k_i$ variables of sorts $s^i_1, \ldots, s^i_{k_i}$, and that the resulting term is itself of sort s''.

Given a language—that is, a set of sorts and a set of symbols, each with an associated arity—and a family, indexed by sorts, of infinite, pairwise disjoint sets of variables, we can inductively define terms as follows:

  * variables of sort s are terms of sort s,

  * if f is a symbol of arity $((s^1_1, \ldots, s^1_{k_1}, s'_1), \ldots, (s^n_1, \ldots, s^n_{k_n}, s'_n), s'')$, the $x^i_j$ are variables of sort $s^i_j$, and t1, ..., tn are terms of sorts $s'_1, \ldots, s'_n$, then $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)$ is a term of sort s''.

### 1.2.4 Free and Bound Variables

The set of variables of a term is defined by structural induction:

  * Var(x) = {x},

  * $Var(f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)) = Var(t_1) \cup \{x^1_1, \ldots, x^1_{k_1}\} \cup \cdots \cup Var(t_n) \cup \{x^n_1, \ldots, x^n_{k_n}\}$.

We can also define the set of free variables of a term:

  * FV(x) = {x},

  * $FV(f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)) = (FV(t_1) \setminus \{x^1_1, \ldots, x^1_{k_1}\}) \cup \cdots \cup (FV(t_n) \setminus \{x^n_1, \ldots, x^n_{k_n}\})$.

For example, Var(fun x -> sin (cos (sin x))) = {x} and FV(fun x -> sin (cos (sin x))) = ∅.

A term without free variables is said to be closed.

The height of a term is also defined by structural induction:

  * Height(x) = 0,

  * $Height(f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots, x^n_1 \ldots x^n_{k_n}\ t_n)) = 1 + \max(Height(t_1), \ldots, Height(t_n))$.
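These definitions translate directly into a program. Here is a minimal OCaml sketch of Var and FV, ours rather than the book's, restricted to the sin/cos language of Sect. 1.2.2 (the type and constructor names are our own):

```ocaml
module S = Set.Make (String)

type term =
  | Var of string
  | Sin
  | Cos
  | App of term * term
  | Fun of string * term        (* fun binds one variable in its argument *)

let rec vars = function
  | Var x -> S.singleton x
  | Sin | Cos -> S.empty
  | App (t, u) -> S.union (vars t) (vars u)
  | Fun (x, t) -> S.add x (vars t)

let rec fv = function
  | Var x -> S.singleton x
  | Sin | Cos -> S.empty
  | App (t, u) -> S.union (fv t) (fv u)
  | Fun (x, t) -> S.remove x (fv t)

(* fun x -> sin (cos (sin x)): Var = {x}, FV = ∅. *)
let t = Fun ("x", App (Sin, App (Cos, App (Sin, Var "x"))))
let () = assert (S.is_empty (fv t) && S.elements (vars t) = ["x"])
```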
### 1.2.5 Substitution

The first operation that we need to define is substitution: indeed, the rôle of variables is not only to be bound, but also to be substituted. For example, when we apply the function fun x -> sin (cos (sin x)) to the term 2 * π, at some point we will need to substitute the variable x by the term 2 * π in the term sin (cos (sin x)).

A substitution is simply a mapping from variables to terms, with a finite domain. In other words, a substitution is a finite set of pairs where the first element is a variable and the second a term, such that each variable occurs at most once as the first element of a pair. We can also write a substitution as an association list θ = t1/x1 ... tn/xn.

When a substitution is applied to a term, each occurrence of a variable x1, ..., xn in the term is replaced by t1, ..., tn, respectively.

Of course, this replacement only affects the free variables. For example, if we substitute the variable x by the term 2 in the term x + 3, we should obtain the term 2 + 3. However, if we substitute the variable x by the term 2 in the term fun x -> x, which represents the identity function, we should obtain the term fun x -> x and not fun x -> 2.

A first attempt to define the application of a substitution θ to a term is as follows:

  * 〈θ〉xi = ti,

  * 〈θ〉x = x if x is not in the domain of θ,

  * $\langle\theta\rangle f(y^1_1 \ldots y^1_{k_1}\ u_1, \ldots) = f(y^1_1 \ldots y^1_{k_1}\ \langle\theta_{|V \setminus \{y^1_1, \ldots, y^1_{k_1}\}}\rangle u_1, \ldots)$

where we use the notation $\theta_{|V \setminus \{y_1, \ldots, y_k\}}$ for the restriction of the substitution θ to the set V ∖ {y1, ..., yk}, that is, the substitution in which we have omitted all the pairs whose first element is one of the variables y1, ..., yk.

This definition is problematic because substitutions could capture variables. For example, the term fun x -> (x + y) represents the function that adds y to its argument. If we substitute y by 4 in this term, we obtain the term fun x -> (x + 4), representing the function that adds 4 to its argument. If we substitute y by z, we get the term fun x -> (x + z), representing the function that adds z to its argument. But if we substitute y by x, we obtain the function fun x -> (x + x), which doubles its argument, instead of the function that adds x to its argument, as expected. We can avoid this problem if we change the name of the bound variable: bound variables are dummies, their name does not matter. In other words, in the term fun x -> (x + y), we can replace the bound variable x by any other variable, except of course y. Similarly, when we substitute in the term u the variables x1, ..., xn by the terms t1, ..., tn, we can change the names of the bound variables in u to make sure that they do not occur among x1, ..., xn, in the variables of t1, ..., tn, or in the variables of u, to avoid capture.

We start by defining an equivalence relation on terms, by induction on the height of terms. This relation is called alphabetic equivalence—or α-equivalence—and it corresponds to variable renaming.

  * x ∼ x,

  * $f(x^1_1 \ldots x^1_{k_1}\ t_1, \ldots) \sim f(x'^1_1 \ldots x'^1_{k_1}\ t'_1, \ldots)$ if, for all i and for any sequence $z_1, \ldots, z_{k_i}$ of fresh variables (that is, variables occurring neither in ti nor in t'i), we have $(z_1/x^i_1 \ldots z_{k_i}/x^i_{k_i})\, t_i \sim (z_1/x'^i_1 \ldots z_{k_i}/x'^i_{k_i})\, t'_i$.

For example, the terms fun x -> x + z and fun y -> y + z are α-equivalent.

In the rest of the book we will work with terms modulo α-equivalence, that is, we will implicitly consider α-equivalence classes of terms.

We can now define the operation of substitution by induction on the height of terms:

  * θxi = ti,

  * θx = x if x is not in the domain of θ,

  * $\theta f(y^1_1 \ldots y^1_{k_1}\ u_1, \ldots) = f(z^1_1 \ldots z^1_{k_1}\ \theta((z^1_1/y^1_1 \ldots z^1_{k_1}/y^1_{k_1})\, u_1), \ldots)$, where the $z^i_j$ are variables occurring neither in $f(y^1_1 \ldots y^1_{k_1}\ u_1, \ldots)$ nor in θ.

For example, if we substitute the variable y by the term 2 * x in the term fun x -> x + y, we obtain the term fun z -> z + (2 * x). The choice of the variable z is arbitrary: we could have chosen v or w, and we would have obtained the same term modulo α-equivalence.

The composition of the substitutions θ = t1/x1 ... tn/xn and σ = u1/y1 ... up/yp is the substitution

θ ∘ σ = (θu1)/y1 ... (θup)/yp t1/x1 ... tn/xn

where we keep only those pairs ti/xi such that xi does not occur among y1, ..., yp. We can prove, by induction on the height of t, that for any term t

(θ ∘ σ) t = θ(σ t).
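Continuing the previous sketch, here is one possible implementation of substitution modulo renaming, again for the small sin/cos language. This is our illustration, not the book's code: it systematically renames every bound variable with a machine-generated fresh name, which is a simple way to avoid capture, under the assumption that the generated names z1, z2, ... are never used as source variables.

```ocaml
(* A substitution is an association list of (variable, term) pairs. *)
let counter = ref 0
let fresh () = incr counter; Printf.sprintf "z%d" !counter

let rec subst theta = function
  | Var x -> (try List.assoc x theta with Not_found -> Var x)
  | (Sin | Cos) as t -> t
  | App (t, u) -> App (subst theta t, subst theta u)
  | Fun (x, t) ->
      let z = fresh () in                         (* rename the bound variable *)
      Fun (z, subst ((x, Var z) :: theta) t)      (* then substitute underneath *)

(* Substituting y by x under fun x -> ... does not capture x:
   subst [("y", Var "x")] (Fun ("x", App (Var "x", Var "y")))
   = Fun ("z1", App (Var "z1", Var "x")). *)
```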
## 1.3 Three Ways to Define the Semantics of a Language

The semantics of a programming language is a binary relation over the set of terms of the language. Since we have already defined the notion of a language and introduced tools to define relations, we are ready to describe the three main techniques used for semantic definitions. The semantics of a language is usually given as a function, as an inductive definition, or as the reflexive-transitive closure of an explicitly defined relation. These are called denotational semantics, big-step operational semantics and small-step operational semantics, respectively.

### 1.3.1 Denotational Semantics

Denotational semantics is useful for deterministic languages. In this case, for each program p, the input-output relation defined by the program is a function, written 〚p〛. The relation ↪ is then defined by

p e1 ... en ↪ s  if and only if  〚p〛(e1, ..., en) = s.

Of course, this simply moves the problem further down: we now need to define the function 〚p〛. For this, we will use two tools: explicit definitions of functions, and the fixed point theorem... but we will leave this for later.

### 1.3.2 Big-Step Operational Semantics

Big-step operational semantics is also called structural operational semantics (S.O.S.) or natural semantics. It gives an inductive definition of the relation ↪.

### 1.3.3 Small-Step Operational Semantics

Small-step operational semantics is also called reduction semantics. It defines the relation ↪ by means of another relation, ⟶, that describes the basic steps transforming the initial term t into the final term s.

For example, when we run the program fun x -> (x * x) + x with input 4, we obtain the result 20. But the term (fun x -> (x * x) + x) 4 does not become 20 in one step: it is first transformed into (4 * 4) + 4, then 16 + 4, and finally 20.

The most important relation is thus not the one that links (fun x -> (x * x) + x) 4 with 20, but the relation ⟶, which relates the term (fun x -> (x * x) + x) 4 with (4 * 4) + 4, then the term (4 * 4) + 4 with 16 + 4, and finally the term 16 + 4 with the term 20.

Once the relation ⟶ is given, ↪ can be derived from its reflexive-transitive closure:

t ↪ s  if and only if  t ⟶* s and s is irreducible.

The fact that the term s is irreducible guarantees that there is nothing left to compute in s. For example, the term 20 is irreducible, but the term 16 + 4 is not. A term s is irreducible if there is no term s' such that s ⟶ s'.

### 1.3.4 Non-termination

The execution of a program may produce a result, produce an error, or never terminate. Errors can be seen as particular kinds of results. For non-terminating programs, there are several ways to define a semantics. A first alternative is to consider that if the term t does not terminate, then there is no pair (t, s) in the relation ↪. Another alternative is to add a specific element ⊥ to the set of output values, and to state that the relation ↪ contains the pair (t, ⊥) when the term t does not terminate.

The difference may seem superficial: it is easy to delete all the pairs of the form (t, ⊥), or to add such a pair when there is no pair of the form (t, s) in the relation. However, readers familiar with computability will notice that, if we add the pairs (t, ⊥), the relation ↪ is no longer recursively enumerable.
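The way small-step semantics induces the relation ↪ is itself a one-line program. A small sketch of ours: given any one-step function `step` returning None on irreducible terms, `eval` iterates it; on a non-terminating term, `eval` loops forever, which corresponds to the first alternative above (no result at all).

```ocaml
let rec eval (step : 'a -> 'a option) (t : 'a) : 'a =
  match step t with
  | None -> t                 (* t is irreducible: it is the result *)
  | Some t' -> eval step t'
```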
# 2. The Language PCF

Gilles Dowek and Jean-Jacques Lévy

Abstract

This chapter introduces a specific programming language called PCF (sometimes also called Mini-ML). This language is one of the backbones of the book: it will be evaluated, interpreted, compiled, and extended (with types, references, records and objects) in the rest of the book. The chapter gives an informal description of the language, defines its small-step and big-step operational semantics, and culminates with the implementation of an evaluator for the language.

We will illustrate the various styles of semantics of programming languages with an example: the language PCF—Programming language for computable functions—also called Mini-ML.

## 2.1 A Functional Language: PCF

### 2.1.1 Programs Are Functions

We observed in the previous chapter that a deterministic program computes a function, and from this observation we derived the principles of denotational semantics. This remark is also the basis of a class of programming languages: functional languages, such as Caml, Haskell or Lisp, which are traditionally used to begin the study of programming languages.

In these languages, the goal is to shorten the distance between the notion of a program and the notion of a mathematical function. In other words, the idea is to bring programs closer to their denotational semantics.

The basic constructions of the language PCF are the explicit construction of a function, written fun x -> t, and the application of a function to an argument, written t u.

PCF also includes a constant for each natural number, the operations +, -, * and /, and a test for zero, ifz t then u else v. Addition and multiplication are defined for all natural numbers, and so is subtraction, using the convention n - m = 0 if n < m. Division is standard Euclidean division; division by 0 produces an error.

### 2.1.2 Functions Are First-Class Objects

In many programming languages it is possible to define a function that takes another function as argument, or that returns another function, but this often requires a syntax different from the one used for an ordinary argument such as an integer or a string. In a functional language, functions are defined in the same way whether they take numbers or functions as arguments.

For example, the composition of a function with itself is defined by fun f -> fun x -> f (f x).

To highlight the fact that functions are not treated differently from other objects—they can be passed as arguments to, and returned as results by, other functions—we say that functions are first-class objects.

### 2.1.3 Functions with Several Arguments

In PCF, there is no symbol to build a function with several arguments. Such functions are built as functions of one argument, using the isomorphism (A × B) -> C = A -> (B -> C). For instance, the function that associates to x and y the number x * x + y * y is defined as the function associating to x a function, which in turn associates to y the number x * x + y * y, that is, fun x -> fun y -> x * x + y * y.

Then, to apply a function f to the numbers 3 and 4, we first apply it to 3, obtaining the term f 3, which represents the function that associates 3 * 3 + y * y to y, and then to 4, obtaining the term (f 3) 4. Since, by convention, application associates to the left, we write this term simply f 3 4.

### 2.1.4 No Assignments

In contrast with languages such as Caml or Java, a distinctive feature of PCF is its total lack of assignments. There is no construction of the form x := t or x = t to assign a value to a "variable". We will describe, in Chap. 7, an extension of PCF with assignments.

### 2.1.5 Recursive Definitions

In mathematics, some functions cannot be defined explicitly. For example, in a high-school textbook, the power function is often defined by

x^n = x * ⋯ * x  (n times)

or through a definition by induction. In programming languages, we use similar constructs: iterations and recursive definitions.
PCF includes a special construct to define recursive functions.

It is often said that a function is recursive if the function is used in its own definition. This is absurd: in programming languages, as everywhere else, circular definitions are meaningless. We cannot "define" the function fact by fun n -> ifz n then 1 else n * (fact (n - 1)). In general, we cannot define a function f by a term G that contains an occurrence of f. However, we can define the function f as the fixed point of the function fun f -> G. For example, we can define the function fact as the fixed point of the function fun f -> fun n -> ifz n then 1 else n * (f (n - 1)).

Does this function have a fixed point? And if it does, is this fixed point unique? Otherwise, which fixed point are we referring to? We will leave these questions aside for a moment, and simply state that a recursive function is defined as a fixed point.

In PCF, the symbol fix binds a variable in its argument, and the term fix f G denotes the fixed point of the function fun f -> G. The function fact can then be defined as fix f fun n -> ifz n then 1 else n * (f (n - 1)).

Note, again, that using the symbol fix we can build the factorial function without necessarily giving it a name.

### 2.1.6 Definitions

We could, in theory, do without definitions and replace everywhere the defined symbols by their definitions. However, programs are simpler and clearer if we use definitions.

We therefore add a final construct to PCF, written let x = t in u. The occurrences of the variable x in u are bound, but those in t are not. The symbol let is thus a symbol with two arguments, which binds a variable in its second argument.

### 2.1.7 The Language PCF

The language PCF contains

  * a symbol fun with one argument, which binds a variable in its argument,

  * a symbol α with two arguments, which does not bind any variable in its arguments,

  * an infinite number of constants to represent the natural numbers,

  * four symbols +, -, * and / with two arguments, which do not bind any variables in their arguments,

  * a symbol ifz with three arguments, which does not bind any variables in its arguments,

  * a symbol fix with one argument, which binds a variable in its argument,

  * a symbol let with two arguments, which binds a variable in its second argument.

In other words, the syntax of PCF is inductively defined by

t ::= x | fun x -> t | t t | n | t + t | t - t | t * t | t / t
    | ifz t then t else t | fix x t | let x = t in t

Despite its small size, PCF is Turing complete, that is, all computable functions can be programmed in PCF.
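This inductive definition of the syntax can be written down directly as a datatype. The following OCaml sketch is one natural representation, ours rather than the book's (the constructor names are our own); variables are represented by their names.

```ocaml
type term =
  | Var of string
  | Fun of string * term            (* fun x -> t : binds x in t *)
  | App of term * term              (* t u        : the symbol α *)
  | Nat of int                      (* constants 0, 1, 2, ... *)
  | Op of char * term * term        (* '+', '-', '*', '/' *)
  | Ifz of term * term * term       (* ifz t then u else v *)
  | Fix of string * term            (* fix x t    : binds x in t *)
  | Let of string * term * term     (* let x = t in u : binds x in u *)

(* fact = fix f fun n -> ifz n then 1 else n * (f (n - 1)) *)
let fact =
  Fix ("f", Fun ("n",
    Ifz (Var "n", Nat 1,
         Op ('*', Var "n",
             App (Var "f", Op ('-', Var "n", Nat 1))))))
```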
Exercise 2.1

Write a PCF program that takes two natural numbers n and p as input and returns n^p.

Exercise 2.2

Write a PCF program that takes a natural number n as input and returns the number 1 if the input is a prime number, and 0 otherwise.

Exercise 2.3

(Polynomials in PCF) Write a PCF program that takes a natural number q as input and returns the greatest natural number u such that u (u + 1) / 2 ≤ q.

Cantor's function K is the function from ℕ² to ℕ defined by fun n -> fun p -> (n + p) (n + p + 1) / 2 + n. Let K' be the function from ℕ to ℕ² defined by fun q -> (q - (u (u + 1) / 2), u - q + u (u + 1) / 2), where u is the greatest natural number such that u (u + 1) / 2 ≤ q.

Show that K ∘ K' = id. Let n and p be two natural numbers; show that the greatest natural number u such that u (u + 1) / 2 ≤ (n + p) (n + p + 1) / 2 + n is n + p. Deduce that K' ∘ K = id, and from this fact deduce that K is a bijection from ℕ² to ℕ.

Let L be the function fun n -> fun p -> (K n p) + 1. A polynomial with integer coefficients a0 + a1 X + ⋯ + ai X^i + ⋯ + an X^n can be represented by the integer L a0 (L a1 (L a2 ... (L an 0) ...)).

Write a PCF program that takes two natural numbers as input and returns the value of the polynomial represented by the first number at the second.

## 2.2 Small-Step Operational Semantics for PCF

### 2.2.1 Rules

Let us apply the program fun x -> 2 * x to the constant 3. We obtain the term (fun x -> 2 * x) 3. According to the principles of small-step operational semantics, let us try to evaluate this term step by step, hoping to obtain a result: 6 if all goes well. The first step in this simplification process is parameter passing, that is, the replacement of the formal argument x by the actual argument 3. The initial term becomes, after this first small transformation step, the term 2 * 3. In a second step, the term 2 * 3 is evaluated, yielding the number 6. The first small step, parameter passing, can be performed whenever we have a term of the form (fun x -> t) u, where a function fun x -> t is applied to an argument u. As a consequence, we define the following rule, called the β-reduction rule

(fun x -> t) u ⟶ (u/x)t

The relation t ⟶ u should be read "t reduces—or rewrites—to u". The second step mentioned above can be generalised as follows

p ⊗ q ⟶ n  (if n = p ⊗ q)

where ⊗ is any of the four arithmetic operators included in PCF. We add similar rules for conditionals

ifz 0 then t else u ⟶ t

ifz p then t else u ⟶ u  (if p is a number different from 0)

a rule for fixed points

fix x t ⟶ ((fix x t)/x)t

and a rule for let

let x = t in u ⟶ (t/x)u

A redex is a term t that can be reduced. In other words, a term t is a redex if there exists a term u such that t ⟶ u.

### 2.2.2 Numbers

It could be said, quite rightly, that the rule p ⊗ q ⟶ n (if n = p ⊗ q), of which 2 * 3 ⟶ 6 is an instance, does not really explain the semantics of the arithmetic operators, since it just replaces multiplication in PCF by multiplication in mathematics. This choice is deliberate, however: we are not really interested in the semantics of the arithmetic operators; our goal is to highlight the semantics of the other constructs of the language.

To define the semantics of the arithmetic operators of PCF without referring to the mathematical operators, we should consider a variant of PCF without numeric constants, with just one constant for the number 0 and a symbol S—"successor"—with one argument. The number 3, for instance, is represented by the term S(S(S(0))). We then add the small-step rules

0 + u ⟶ u
S(t) + u ⟶ S(t + u)
0 - u ⟶ 0
t - 0 ⟶ t
S(t) - S(u) ⟶ t - u
0 * u ⟶ 0
S(t) * u ⟶ t * u + u
t / S(u) ⟶ ifz t - u then 0 else S((t - S(u)) / S(u))

Note that, to be precise, we should also add a rule for division by 0, which should raise an error.

Exercise 2.4

(Church numerals) Instead of introducing the symbols 0 and S, we can represent the number n by the term fun z -> fun s -> s (s (... (s z) ...)) rather than S(S(...(0)...)). Show that addition and multiplication can be programmed on these representations. Show that the function that checks whether a number is 0 can also be programmed.

Exercise 2.5

(Position numerals) It could be said that the representations of numbers using the symbols 0 and S, or using Church numerals, are not efficient, since the size of the term representing a number grows linearly with the number—as in unary notation, where writing the number n requires n symbols—and not logarithmically, as is the case with the usual position-based notation.
An alternative could be to use a symbol z for the number 0 and two functions O and I to represent the functions n ↦ 2 * n and n ↦ 2 * n + 1. The number 26 would then be represented by the term O(I(O(I(I(z))))); reading it backwards, we obtain IIOIO, the binary representation of this number.

Write a small-step operational semantics for the arithmetic operators in this language.

### 2.2.3 A Congruence

Using the rules of the small-step semantics, we obtain

(fun x -> 2 * x) 3 ⟶ 2 * 3 ⟶ 6

Thus, denoting by ⟶* the reflexive-transitive closure of ⟶, we can write (fun x -> 2 * x) 3 ⟶* 6.

However, with this definition, the term (2 + 3) + 4 does not reduce to the term 9 according to ⟶*. Indeed, to reduce a term of the form t + u, the terms t and u must be numeric constants, but our first term 2 + 3 is a sum, not a constant. The first step should then be the evaluation of 2 + 3, which produces the number 5; a second step then reduces 5 + 4 to 9. The problem is that, with our definition, the term 2 + 3 reduces to 5, but (2 + 3) + 4 does not reduce to 5 + 4.

We need to define another relation, in which rules can be applied to any subterm of the term to be reduced. Let us define inductively the relation ▷ as follows

$$\frac{}{t \mathrel{\triangleright} u}\ (\text{if } t \longrightarrow u) \qquad \frac{t \mathrel{\triangleright} u}{t\ v \mathrel{\triangleright} u\ v} \qquad \frac{t \mathrel{\triangleright} u}{v\ t \mathrel{\triangleright} v\ u} \qquad \frac{t \mathrel{\triangleright} u}{\textsf{fun}\ x \to t \mathrel{\triangleright} \textsf{fun}\ x \to u} \qquad \frac{t \mathrel{\triangleright} u}{t + v \mathrel{\triangleright} u + v}$$

It is possible to show that a term is a redex with respect to the relation ▷ if and only if one of its subterms is a redex with respect to ⟶.
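The relation ▷ can also be sketched in OCaml, reusing the `term` type above. This is our illustration, under two stated simplifications: `subst` is a naive substitution, adequate as long as the substituted term is closed (which is the case when reducing closed programs with the weak strategies of Sect. 2.3; the capture-avoiding substitution of Sect. 1.2.5 lifts this restriction), and `step` picks one particular ▷-step, trying the head redex first and then the subterms from left to right.

```ocaml
let apply_op o p q = match o with
  | '+' -> p + q
  | '-' -> max 0 (p - q)        (* n - m = 0 if n < m *)
  | '*' -> p * q
  | _ -> p / q                  (* raises Division_by_zero on q = 0 *)

let rec subst x v = function
  | Var y -> if y = x then v else Var y
  | Nat n -> Nat n
  | Fun (y, t) -> if y = x then Fun (y, t) else Fun (y, subst x v t)
  | App (t, u) -> App (subst x v t, subst x v u)
  | Op (o, t, u) -> Op (o, subst x v t, subst x v u)
  | Ifz (t, u, w) -> Ifz (subst x v t, subst x v u, subst x v w)
  | Fix (y, t) -> if y = x then Fix (y, t) else Fix (y, subst x v t)
  | Let (y, t, u) -> Let (y, subst x v t, if y = x then u else subst x v u)

(* The rules of ⟶ applied at the root of the term. *)
let head_step = function
  | App (Fun (x, t), u) -> Some (subst x u t)            (* β-reduction *)
  | Op (o, Nat p, Nat q) -> Some (Nat (apply_op o p q))
  | Ifz (Nat 0, t, _) -> Some t
  | Ifz (Nat _, _, u) -> Some u
  | Fix (x, t) as f -> Some (subst x f t)
  | Let (x, t, u) -> Some (subst x t u)
  | _ -> None

(* Close ⟶ under the congruence: head first, then leftmost subterm. *)
let rec step t =
  match head_step t with
  | Some _ as r -> r
  | None ->
      let pair mk t1 t2 =
        match step t1 with
        | Some t1' -> Some (mk t1' t2)
        | None -> Option.map (fun t2' -> mk t1 t2') (step t2)
      in
      (match t with
       | App (t1, t2) -> pair (fun a b -> App (a, b)) t1 t2
       | Op (o, t1, t2) -> pair (fun a b -> Op (o, a, b)) t1 t2
       | Fun (x, u) -> Option.map (fun u' -> Fun (x, u')) (step u)
       | Ifz (t1, t2, t3) ->
           (match step t1 with
            | Some t1' -> Some (Ifz (t1', t2, t3))
            | None ->
                (match step t2 with
                 | Some t2' -> Some (Ifz (t1, t2', t3))
                 | None -> Option.map (fun t3' -> Ifz (t1, t2, t3')) (step t3)))
       | _ -> None)

(* Iterating step, as in Sect. 1.3.3, reduces fact applied to 3 to Nat 6:
   this head-first order never reduces inside the discarded branch of an
   ifz whose condition is already a number. *)
let rec normalize t = match step t with None -> t | Some t' -> normalize t'
```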
### 2.2.4 An Example

To illustrate PCF's small-step semantic rules, let us compute the factorial of 3.

(fix f fun n -> ifz n then 1 else n * (f (n - 1))) 3

▷ (fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 3

▷ ifz 3 then 1 else 3 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (3 - 1))

▷ 3 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (3 - 1))

▷ 3 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) 2)

▷ 3 * ((fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 2)

▷ 3 * (ifz 2 then 1 else 2 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (2 - 1)))

▷ 3 * (2 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (2 - 1)))

▷ 3 * (2 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) 1))

▷ 3 * (2 * ((fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 1))

▷ 3 * (2 * (ifz 1 then 1 else 1 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (1 - 1))))

▷ 3 * (2 * (1 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (1 - 1))))

▷ 3 * (2 * (1 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) 0)))

▷ 3 * (2 * (1 * ((fun n -> ifz n then 1 else n * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (n - 1))) 0)))

▷ 3 * (2 * (1 * (ifz 0 then 1 else 0 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (0 - 1)))))

▷ 3 * (2 * (1 * 1)) ▷ 3 * (2 * 1) ▷ 3 * 2 ▷ 6

### 2.2.5 Irreducible Closed Terms

A term t is irreducible if it cannot be reduced by ▷, that is, if there is no term u such that t ▷ u.

We can now define the relation "the term u is the result of the evaluation of the term t", where t is a closed term, by: t ↪ u if and only if t ▷* u and u is irreducible. In this case, the term u must be closed. Finally, the relation "the program p with inputs e1, ..., en produces the output s" is simply written p e1 ... en ↪ s.

Exercise 2.6

(Classification of irreducible closed terms) Show that a term is irreducible and closed if and only if it is of one of the following forms

  * fun x -> t, where t is irreducible and does not contain any free variables except possibly x,

  * n, where n is a number,

  * V1 V2, where V1 and V2 are irreducible closed terms and V1 is not of the form fun x -> t,

  * V1 ⊗ V2, where V1 and V2 are irreducible closed terms and are not both numeric constants,

  * ifz V1 then V2 else V3, where V1, V2 and V3 are irreducible closed terms and V1 is not a number.

Numbers and irreducible closed terms of the form fun x -> t are called values. When the result of a computation is a value, we associate the value to the initial term, and we say that the term evaluates to this value.

Unfortunately, values are not the only possible results. For example, the term (fun x -> x) 1 2 can be reduced to the term 1 2, which is irreducible and closed, and thus the term 1 2 is the result of the computation of (fun x -> x) 1 2. This result is meaningless, because we cannot apply the object 1, which is not a function, to 2. An irreducible closed term that is not a value is said to be stuck. Stuck terms have the form V1 V2, where V1 and V2 are irreducible closed terms and V1 is not a function fun x -> t (for example 1 2); V1 ⊗ V2, where V1 and V2 are irreducible and closed and are not both numbers (for example 1 + (fun x -> x)); and ifz V1 then V2 else V3, where V1, V2 and V3 are irreducible and closed and V1 is not a number (for example, ifz (fun x -> x) then 1 else 2).

Exercise 2.7

Which are the values associated to the terms

and

according to the small-step operational semantics of PCF?
Exercise 2.8

(Static binding) Does the small-step operational semantics of PCF associate the value 10 or the value 11 to the term

The first versions of the language Lisp produced the value 11 instead of 10 for this term. In this case, we say that the binding is dynamic.

### 2.2.6 Non-termination

It is easy to see that the relation ↪ is not total, that is, there are terms t for which there is no term u such that t ↪ u. For example, the term b = fix x x reduces to itself, and only to itself. It therefore never reduces to an irreducible term.

Exercise 2.9

Let b1 = (fix f (fun x -> (f x))) 0. Show all the terms obtained by reducing this term. Does the computation produce a result in this case?

Exercise 2.10

(Curry's fixed point operator) Let t be a term and u be the term (fun y -> (t (y y))) (fun y -> (t (y y))). Show that u reduces to t u.

Let t be a term and v be the term (fun y -> ((fun x -> t) (y y))) (fun y -> ((fun x -> t) (y y))). Show that v reduces to (v/x)t.

Thus, we can deduce that the symbol fix is superfluous in PCF. However, it will no longer be superfluous later, when we add types to PCF.

Write a term u without using the symbol fix and equivalent to b = fix x x. Describe the terms that can be obtained by reduction. Does the computation produce a result in this case?

### 2.2.7 Confluence

Is it possible for a closed term to produce several results? And, in general, can a term reduce to several different irreducible terms? The answer to these questions is negative. In fact, every PCF program is deterministic, but this is not a trivial property. Let us see why.

The term (3 + 4) + (5 + 6) has two subterms which are both redexes. We could then start by reducing 3 + 4 to 7, or 5 + 6 to 11. Indeed, the term (3 + 4) + (5 + 6) reduces to both 7 + (5 + 6) and (3 + 4) + 11. Fortunately, neither of these terms is irreducible, and if we continue the computation we reach, in both cases, the term 18.

To prove that any term can be reduced to at most one irreducible term, we need to prove that whenever two computations originating in the same term produce different terms, these terms can be further reduced to a common term.

This property is a consequence of another property of the relation ▷: confluence. A relation R is confluent if, whenever a R* b1 and a R* b2, there exists some c such that b1 R* c and b2 R* c.

It is not difficult to show that confluence implies that each term has at most one irreducible result. If the term t can be reduced to two irreducible terms u1 and u2, then we have t ▷* u1 and t ▷* u2. Since ▷ is confluent, there exists a term v such that u1 ▷* v and u2 ▷* v. Since u1 is irreducible, the only term v such that u1 ▷* v is u1 itself. Therefore, u1 = v and, similarly, u2 = v. We conclude that u1 = u2. In other words, t reduces to at most one irreducible term.

We will not give here the proof of confluence for the relation ▷. The idea is that when a term t contains two redexes r1 and r2, and t1 is obtained by reducing r1 and t2 is obtained by reducing r2, then we can find the residuals of r2 in t1 and reduce them. Similarly, we can reduce the residuals of r1 in t2, obtaining the same term. For example, by reducing 5 + 6 in 7 + (5 + 6) and reducing 3 + 4 in (3 + 4) + 11, we obtain the same term: 7 + 11.
## 2.3 Reduction Strategies

### 2.3.1 The Notion of a Strategy

Since in PCF each term has at most one result (by the unicity property mentioned above), it does not matter in which order we reduce the redexes of a term: if we reach an irreducible term, it will always be the same. However, it may be the case that one reduction sequence reaches an irreducible term whereas another one does not. For example, let C be the term fun x -> 0 and let b1 be the term (fix f (fun x -> (f x))) 0. The term b1 reduces to b2 = (fun x -> ((fix f (fun x -> (f x))) x)) 0, and then again to b1. The term C b1 contains several redexes, and it can be reduced to 0 and to C b2, which in turn contains several redexes and can be reduced to 0 and C b1 (amongst other terms). By always reducing the innermost redex, we can build an infinite reduction sequence C b1 ▷ C b2 ▷ C b1 ▷ ⋯, whereas reducing the outermost redex produces the result 0.

This example may seem an exception, because it contains a function C that does not use its argument; but note that the ifz construct is similar, and in the example of the factorial function, when computing the factorial of 3 for instance, we can observe the same behaviour: the term ifz 0 then 1 else 0 * ((fix f fun n -> ifz n then 1 else n * (f (n - 1))) (0 - 1)) has several redexes. Outermost reduction produces the result 1 (the other redexes disappear), whereas reducing the redex fix f fun n -> ifz n then 1 else n * (f (n - 1)) produces an infinite reduction sequence. In other words, the term fact 3 can be reduced to 6, but it can also generate reductions that go on forever.

Both C b1 and fact 3 produce a unique result, but not all reduction sequences reach a result.

Since the term C b1 has the value 0 according to the PCF semantics, an evaluator, that is, a program that takes a PCF term as input and returns its value, should produce the result 0 when computing C b1. Let us try to evaluate this term using some current compilers. In Caml, the corresponding program, for instance

let rec f x = f x in (fun x -> 0) (f 0)

does not terminate. In Java, we have the same problem with the analogous program.

Only a small number of compilers, using call by name or lazy evaluation, such as Haskell, Lazy-ML or Gaml, produce a terminating program for this term.

This is because the small-step semantics of PCF does not correspond to the semantics of Caml or Java. In fact, it is too general: when a term has several redexes, it does not specify which one should be reduced first, and it thereby gives a result to every program that can reach an irreducible term by some reduction sequence. An ingredient is missing in this semantic definition: the notion of a strategy, which specifies the order in which redexes must be reduced.

A strategy is a partial function that associates to each term in its domain one of its redex occurrences. Given a strategy s, we can define another semantics, replacing the relation ▷ by a new relation ▷s such that t ▷s u if s t is defined and u is obtained by reducing the redex s t in t. We then define the relation ▷s* as the reflexive-transitive closure of ▷s, and the relation ↪s as before.

Instead of defining a strategy, an alternative is to weaken the reduction rules, in particular the congruence rules, so that only some specific reductions can be performed.

### 2.3.2 Weak Reduction

Before defining outermost or innermost strategies for the term C b1, let us give another example to show that the operational semantics defined above is too liberal, and to motivate the definition of strategies or weaker reduction rules.
Let us apply the program fun x -> x + (4 + 5) to the constant 3. We obtain the term (fun x -> x + (4 + 5)) 3, which contains two redexes. We can thus reduce it to 3 + (4 + 5) or to (fun x -> x + 9) 3. The first reduction is part of the execution of the program, but not the second. Usually, when we execute a function before passing arguments to it, we say that we are optimising or specialising the program.

A weak reduction strategy never reduces a redex that is under a fun. Thus, weak reduction does not specialise programs, it just executes them. It follows that, with a weak strategy, all terms of the form fun x -> t are irreducible.

Alternatively, we can define weak reduction by weakening the reduction rules, more precisely, by discarding the congruence rule

$$\frac{t \mathrel{\triangleright} u}{\textsf{fun}\ x \to t \mathrel{\triangleright} \textsf{fun}\ x \to u}$$

Exercise 2.11

(Classification of weak irreducible closed terms) Show that, under weak reduction, a closed irreducible term must have one of the following forms:

  * fun x -> t, where t has at most x free,

  * n, where n is a number,

  * V1 V2, where V1 and V2 are irreducible closed terms and V1 is not a term of the form fun x -> t,

  * V1 ⊗ V2, where V1 and V2 are irreducible closed terms and are not both numbers,

  * ifz V1 then V2 else V3, where V1, V2 and V3 are irreducible closed terms and V1 is not a number.

What is the difference with Exercise 2.6?

Numbers and closed terms of the form fun x -> t are called values.

### 2.3.3 Call by Name

Let us analyse again the reductions available for the term C b1. We need to decide whether we should evaluate the arguments of the function C before they are passed to the function, or pass the arguments to the function unevaluated.

The call by name strategy always reduces the leftmost redex first, and the weak call by name strategy always reduces the leftmost redex that is not under a fun. Thus, the term C b1 reduces to 0. This strategy is interesting because of the following property, called standardisation: if a term can be reduced to an irreducible term, then the call by name strategy terminates. In other words, ↪n = ↪. Moreover, when we evaluate the term (fun x -> 0) (fact 10) using a call by name strategy, we do not need to compute the factorial of 10. However, if we evaluate the term (fun x -> x + x) (fact 10) using a call by name strategy, we will compute it twice, because this term reduces to (fact 10) + (fact 10). Most call by name evaluators use sharing to avoid this duplication of computation, and in this case we speak of lazy evaluation.

### 2.3.4 Call by Value

Call by value, in contrast, always evaluates the arguments of a function before passing them to the function. It is based on the following convention: we can only reduce a term of the form (fun x -> t) u if u is a value. Thus, when we evaluate the term (fun x -> x + x) (fact 10), we start by reducing the argument to obtain (fun x -> x + x) 3628800, and then we reduce the leftmost redex. This way, we compute the factorial of 10 only once.

All the strategies that evaluate arguments before passing them are in this class: for instance, the strategy that always reduces the leftmost redex amongst those that are authorised. Thus, call by value is not a unique strategy, but a family of strategies.

This convention can also be defined by weakening the β-reduction rule: the term (fun x -> t) u is a redex only if the term u is a value.

A weak strategy is said to implement call by value if it reduces a term of the form (fun x -> t) u only when u is a value and the redex is not under a fun.
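The contrast between the two disciplines can be observed in OCaml itself, which is a call by value language. A small sketch of ours: applying a constant function to a looping term does not terminate, but delaying the argument with Lazy.t restores the call by name behaviour on this example.

```ocaml
let rec loop (x : int) : int = loop x

let const_zero (_ : int Lazy.t) : int = 0

(* const_zero (lazy (loop 0)) evaluates to 0: the thunk is never forced.
   By contrast, (fun _ -> 0) (loop 0) runs forever, because the argument
   is evaluated before the call. *)
let () = assert (const_zero (lazy (loop 0)) = 0)
```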
### 2.3.5 A Bit of Laziness Is Needed

Even under a call by value strategy, the conditional construct ifz must be evaluated in a call by name fashion: in a term of the form ifz t then u else v, we should not evaluate all three arguments. Instead, we should first evaluate t and, depending on the result, evaluate either u or v.

It is easy to see that, if we evaluated the three arguments of an ifz, the evaluation of the term fact 3 would not terminate.

Exercise 2.12

Characterise the irreducible closed terms under weak call by name, then characterise the irreducible closed terms under weak call by value.

## 2.4 Big-Step Operational Semantics for PCF

Instead of defining a strategy, or weakening the reduction rules of the small-step operational semantics, we can control the order in which redexes are reduced by defining a big-step operational semantics.

The big-step operational semantics of a programming language provides an inductive definition of the relation ↪, without first defining ⟶ and ▷.

### 2.4.1 Call by Name

Let us start with the call by name semantics for PCF. Consider a term of the form t u that reduces under call by name to an irreducible term V. We start by reducing the redexes that occur in t until we obtain an irreducible term. If this term is of the form fun x -> t', then the whole term reduces to (fun x -> t') u, and the leftmost redex is the term itself. It reduces to (u/x)t', which in turn reduces to V. We can therefore say that the term t u reduces under call by name to the irreducible term V if t reduces to fun x -> t' and (u/x)t' reduces to V.

This can be expressed as a rule

$$\frac{t \hookrightarrow \textsf{fun}\ x \to t' \qquad (u/x)t' \hookrightarrow V}{t\ u \hookrightarrow V}$$

which will be part of the inductive definition of the relation ↪ (without first defining ⟶ and ▷).

Other rules state that the result of the computation of a term of the form fun is the term itself, that is, we are defining a weak reduction relation,

$$\frac{}{\textsf{fun}\ x \to t \hookrightarrow \textsf{fun}\ x \to t}$$

and that the result of the computation of a term of the form n is the term itself

$$\frac{}{n \hookrightarrow n}$$

Also, there is a rule giving the semantics of the arithmetic operators

$$\frac{t \hookrightarrow p \qquad u \hookrightarrow q}{t \otimes u \hookrightarrow n}\ (\text{if } n = p \otimes q)$$

two rules defining the semantics of the ifz construct

$$\frac{t \hookrightarrow 0 \qquad u \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V} \qquad \frac{t \hookrightarrow p \qquad v \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V}\ (\text{if } p \text{ is a number different from } 0)$$

a rule defining the semantics of the fixed point operator

$$\frac{((\textsf{fix}\ x\ t)/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V}$$

and finally a rule defining the semantics of a let

$$\frac{(t/x)u \hookrightarrow V}{\textsf{let}\ x = t\ \textsf{in}\ u \hookrightarrow V}$$

We can prove, by structural induction on the evaluation relation, that the result of the computation of a term is always a value, that is, a number or a closed term of the form fun. There are no stuck terms. The computation of a term such as ((fun x -> x) 1) 2, which gave rise to the stuck term 1 2 with the small-step semantics, simply produces no result with the big-step semantics, since none of the rules applies to this term. Indeed, there is no rule in the big-step semantics that explains how to evaluate an application whose left part evaluates to a number.

### 2.4.2 Call by Value

The rules defining the call by value semantics are similar, except for the application rule, where we compute the value of the argument before passing it to the function,

$$\frac{t \hookrightarrow \textsf{fun}\ x \to t' \qquad u \hookrightarrow W \qquad (W/x)t' \hookrightarrow V}{t\ u \hookrightarrow V}$$

and the let rule

$$\frac{t \hookrightarrow W \qquad (W/x)u \hookrightarrow V}{\textsf{let}\ x = t\ \textsf{in}\ u \hookrightarrow V}$$

Summarising, we have the following rules

$$\frac{}{\textsf{fun}\ x \to t \hookrightarrow \textsf{fun}\ x \to t} \qquad \frac{}{n \hookrightarrow n} \qquad \frac{t \hookrightarrow \textsf{fun}\ x \to t' \quad u \hookrightarrow W \quad (W/x)t' \hookrightarrow V}{t\ u \hookrightarrow V}$$

$$\frac{t \hookrightarrow p \quad u \hookrightarrow q}{t \otimes u \hookrightarrow n}\ (\text{if } n = p \otimes q) \qquad \frac{((\textsf{fix}\ x\ t)/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V} \qquad \frac{t \hookrightarrow W \quad (W/x)u \hookrightarrow V}{\textsf{let}\ x = t\ \textsf{in}\ u \hookrightarrow V}$$

Notice that, even under call by value, we keep the call by name rules for the ifz

$$\frac{t \hookrightarrow 0 \qquad u \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V} \qquad \frac{t \hookrightarrow p \qquad v \hookrightarrow V}{\textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v \hookrightarrow V}\ (\text{if } p \text{ is a number different from } 0)$$

that is, we do not evaluate the second and third arguments of an ifz until they are needed.
Note also that, even under call by value, we keep the rule

$$\frac{((\textsf{fix}\ x\ t)/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V}$$

We must resist the temptation to evaluate the term fix x t to a value W before substituting it in t, because a rule of the form

$$\frac{\textsf{fix}\ x\ t \hookrightarrow W \qquad (W/x)t \hookrightarrow V}{\textsf{fix}\ x\ t \hookrightarrow V}$$

would require, in order to evaluate fix x t, to start by evaluating fix x t itself, which would create a loop, and the term fact 3 would never produce a value—its evaluation would give rise to an infinite computation.

Note finally that other rule combinations are possible. For example, some variants of the call by name semantics use call by value in the let rule.

Exercise 2.13

Which values do we obtain under the big-step semantics for the terms

and

Compare your answer with that of Exercise 2.7.

Exercise 2.14

Does the big-step semantics associate the value 10 or the value 11 to the term

Compare your answer with that of Exercise 2.8.

## 2.5 Evaluation of PCF Programs

A PCF evaluator is a program that takes a closed PCF term as input and produces its value as output. When read in a bottom-up fashion, the rules of the big-step semantics can be seen as the kernel of such an evaluator: to evaluate an application t u, one starts by evaluating u and t, ...; this is easy to program in a language like Caml.
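As a sketch of what such a program can look like (ours, reusing the `term` type, `apply_op` and the naive `subst` from the earlier sketches), here is a call by value evaluator obtained by reading the rules bottom-up:

```ocaml
exception Error of string

let rec eval t =
  match t with
  | Fun _ | Nat _ -> t                        (* values evaluate to themselves *)
  | App (t1, t2) ->
      let w = eval t2 in                      (* call by value: argument first *)
      (match eval t1 with
       | Fun (x, t') -> eval (subst x w t')
       | _ -> raise (Error "application of a non-function"))
  | Op (o, t1, t2) ->
      (match eval t1, eval t2 with
       | Nat p, Nat q -> Nat (apply_op o p q)
       | _ -> raise (Error "arithmetic on non-numbers"))
  | Ifz (t1, t2, t3) ->
      (match eval t1 with
       | Nat 0 -> eval t2                     (* only one branch is evaluated *)
       | Nat _ -> eval t3
       | _ -> raise (Error "ifz on a non-number"))
  | Fix (x, t1) -> eval (subst x t t1)        (* substitute the whole fix term *)
  | Let (x, t1, t2) -> eval (subst x (eval t1) t2)
  | Var _ -> raise (Error "free variable")

(* eval (App (fact, Nat 3)) returns Nat 6. *)
```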
PCF's denotational semantics is more difficult to define. This may seem a paradox: since PCF is a functional language, it should be easy to interpret its programs as functions. However, in PCF, any object can be applied to any object, and nothing stops us writing, for instance, the term fun x -> (x x). In contrast with mathematical functions, PCF functions do not have a domain. For this reason, we will give a denotational semantics for PCF only after we add types, in Chap. 5.

# 3. From Evaluation to Interpretation

Abstract

This chapter introduces an essential notion in the implementation of programming languages: that of environment. It then uses this notion to transform the evaluator built in the previous chapter into an interpreter. Finally, it discusses several optimisations: the use of de Bruijn indices and recursive closures.

## 3.1 Call by Name

Using the rules of the big-step operational semantics, we can build an evaluator for PCF where a term of the form (fun x -> t) u is evaluated by first substituting the variable x by the term u everywhere in the body t of the function. For example, to evaluate the term (fun x -> (x * x) + x) 4, we substitute x by 4 in the term (x * x) + x and then evaluate the term (4 * 4) + 4. Substitutions are costly operations; to increase the efficiency of the evaluator we could instead keep the association x = 4 in a separate structure called an environment, and evaluate the term (x * x) + x in that environment. A program that evaluates terms in this way is called an interpreter.

An environment is a function from variables to terms, with a finite domain. It is in essence the same thing as a substitution, but different notations are used. We write an environment as a list of pairs x₁ = t₁, ..., xₙ = tₙ, where the same variable x may occur several times; in that case the rightmost pair has priority. Thus, in the environment x = 3, y = 4, x = 5, z = 8 we only consider x = 5, not x = 3, which is said to be hidden by the pair x = 5. Finally, if e is an environment and x = t a pair, we denote by e, x = t the list obtained by extending e with the pair x = t.

During the evaluation of a term we might reach a free variable x. In this case, we will look for the term associated to this variable in the environment. It can be shown that, if we start with a closed term, then each time we reach a variable we will find an associated term in the environment.

In fact, the situation is slightly more complicated, because in addition to the term u associated to the variable in the environment, we will also need to find the environment associated to u. A pair of a term and an environment is called a thunk. We will write it 〈u, e〉.

Similarly, when we interpret a term of the form fun x -> t in an environment e, the result cannot simply be the term fun x -> t, because it might contain free variables, and when interpreting the term t we will need the thunks associated to these variables in e. We therefore introduce a new notion of value, called a closure, consisting of a term, which must be of the form fun x -> t, and an environment e. We will write such values 〈x, t, e〉. Values are no longer a subset of terms, and we will have to define a language of values independently from the language of terms.

As a consequence, we will need to rewrite the rules for the call by name big-step operational semantics of PCF, in order to consider a relation of the form e ⊢ t ↪ V, read "t is interpreted as V in e", where e is an environment, t a term and V a value. When the environment e is empty, this relation is written ⊢ t ↪ V. The rules that extend the environment are the application rule, which adds a pair consisting of a variable x and a thunk 〈u, e〉; the let rule, which adds a pair consisting of the variable x and the thunk 〈t, e〉; and the fix rule, which adds a pair consisting of the variable x and the thunk 〈fix x t, e〉. In the latter rule, the term t is duplicated: one of the copies is interpreted, and the other is kept in the environment for any recursive calls arising from the interpretation of the first one.

Exercise 3.1

Write a call by name interpreter for PCF.

Exercise 3.2

Which values will be obtained for the following terms according to the interpretation rules given above for PCF?

and

Compare with Exercises 2.7 and 2.13.

Exercise 3.3

Will the interpretation rules for PCF compute the value 10 or the value 11 for the term

Compare with Exercises 2.8 and 2.14.
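A sketch of such an interpreter is given below (cf. Exercise 3.1). The constructor names are assumptions, not the book's code; environments are association lists where the pair added last is found first, which implements the priority of the rightmost pair.

```ocaml
(* A sketch of the call by name interpreter of Sect. 3.1: environments map
   variables to thunks <u, e>, functions are interpreted as closures <x, t, e>. *)
type term =
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term | Fix of string * term
  | Let of string * term * term

type value =
  | Vnum of int
  | Vclos of string * term * env            (* closure <x, t, e> *)
and thunk = Thunk of term * env             (* thunk <u, e> *)
and env = (string * thunk) list

let rec interp (e : env) : term -> value = function
  | Num n -> Vnum n
  | Fun (x, t) -> Vclos (x, t, e)
  | Var x ->                                (* interpret the thunk found in e *)
      let Thunk (u, e') = List.assoc x e in
      interp e' u
  | App (t, u) ->                           (* u is frozen in a thunk *)
      (match interp e t with
       | Vclos (x, t', e') -> interp ((x, Thunk (u, e)) :: e') t'
       | _ -> failwith "application of a non-function")
  | Op (o, t, u) ->
      (match interp e t, interp e u with
       | Vnum m, Vnum n ->
           Vnum (match o with
                 | '+' -> m + n | '-' -> m - n | '*' -> m * n | _ -> m / n)
       | _ -> failwith "arithmetic on non-numbers")
  | Ifz (t, u, v) ->
      (match interp e t with
       | Vnum 0 -> interp e u
       | Vnum _ -> interp e v
       | _ -> failwith "ifz applied to a non-number")
  | Fix (x, t) as f -> interp ((x, Thunk (f, e)) :: e) t   (* t duplicated *)
  | Let (x, t, u) -> interp ((x, Thunk (t, e)) :: e) u
```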
## 3.2 Call by Value

The situation is simpler with a call by value semantics. Indeed, when interpreting a term of the form (fun x -> t) u, we start by interpreting the term u. The result is a value, that is, a number or a closure, and it suffices to bind the variable x to this value in the environment. Similarly, to interpret a term of the form let x = t in u, we start by interpreting the term t. The result is a value and it suffices to bind the variable x to this value in the environment. Thus, environments associate values to variables, rather than thunks, that is, terms whose interpretation is suspended; we no longer need the notion of a thunk.

However, the rule for fix, unlike the application rule or the let rule, requires a variable to be substituted by a term of the form fix x t, which is not a value, and evaluating such a term before substituting it or storing it in the environment would give rise to infinite computations (as mentioned above). Environments will therefore have to contain extended values, which are either values or thunks consisting of a term of the form fix x t and an environment e. When we access such an extended value, we interpret it if it is a thunk. This leads us to the following rules

$$\frac{}{e \vdash n \hookrightarrow n} \qquad \frac{}{e \vdash \mathtt{fun}\ x \rightarrow t \hookrightarrow \langle x, t, e\rangle}$$

$$\frac{}{e \vdash x \hookrightarrow V}\ \text{if } e \text{ contains } x = V \qquad \frac{e' \vdash \mathtt{fix}\ y\ t \hookrightarrow V}{e \vdash x \hookrightarrow V}\ \text{if } e \text{ contains } x = \langle\mathtt{fix}\ y\ t, e'\rangle$$

$$\frac{e \vdash t \hookrightarrow \langle x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

$$\frac{e \vdash t \hookrightarrow W \qquad (e, x = W) \vdash u \hookrightarrow V}{e \vdash \mathtt{let}\ x = t\ \mathtt{in}\ u \hookrightarrow V} \qquad \frac{(e, x = \langle\mathtt{fix}\ x\ t, e\rangle) \vdash t \hookrightarrow V}{e \vdash \mathtt{fix}\ x\ t \hookrightarrow V}$$

$$\frac{e \vdash t \hookrightarrow p \qquad e \vdash u \hookrightarrow q}{e \vdash t \otimes u \hookrightarrow p \otimes q} \qquad \frac{e \vdash t \hookrightarrow 0 \qquad e \vdash u \hookrightarrow V}{e \vdash \mathtt{ifz}\ t\ \mathtt{then}\ u\ \mathtt{else}\ v \hookrightarrow V} \qquad \frac{e \vdash t \hookrightarrow n \qquad e \vdash v \hookrightarrow V}{e \vdash \mathtt{ifz}\ t\ \mathtt{then}\ u\ \mathtt{else}\ v \hookrightarrow V}\ (n \neq 0)$$

Exercise 3.4

When we compute the value of the term (fact 3), where the function fact is defined by fix f fun n -> ifz n then 1 else n * (f (n - 1)), we start by calling recursively the function fact with argument 2, which creates an association between the variable n and the value 2. When we come back from the recursive call to compute the value of n and perform the multiplication, is the variable n associated to the value 2 or the value 3? Why?

Exercise 3.5

Write a call by value interpreter for PCF.
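Here is a sketch of this call by value interpreter (cf. Exercise 3.5), with the same hedges as before: the constructor names are assumptions, and extended values are either genuine values or fix-thunks, interpreted when accessed.

```ocaml
(* A sketch of the call by value interpreter of Sect. 3.2: environments bind
   variables to extended values. Names are assumptions, not the book's code. *)
type term =
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term | Fix of string * term
  | Let of string * term * term

type value = Vnum of int | Vclos of string * term * env
and xvalue = Val of value | Fixthunk of string * term * env   (* <fix x t, e> *)
and env = (string * xvalue) list

let rec interp (e : env) : term -> value = function
  | Num n -> Vnum n
  | Fun (x, t) -> Vclos (x, t, e)
  | Var x ->
      (match List.assoc x e with
       | Val v -> v
       | Fixthunk (y, t, e') -> interp e' (Fix (y, t)))  (* interpreted on access *)
  | App (t, u) ->
      let w = interp e u in                  (* the argument is evaluated first *)
      (match interp e t with
       | Vclos (x, t', e') -> interp ((x, Val w) :: e') t'
       | _ -> failwith "application of a non-function")
  | Op (o, t, u) ->
      (match interp e t, interp e u with
       | Vnum m, Vnum n ->
           Vnum (match o with
                 | '+' -> m + n | '-' -> m - n | '*' -> m * n | _ -> m / n)
       | _ -> failwith "arithmetic on non-numbers")
  | Ifz (t, u, v) ->
      (match interp e t with
       | Vnum 0 -> interp e u
       | Vnum _ -> interp e v
       | _ -> failwith "ifz applied to a non-number")
  | Fix (x, t) -> interp ((x, Fixthunk (x, t, e)) :: e) t
  | Let (x, t, u) -> interp ((x, Val (interp e t)) :: e) u
```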
## 3.3 An Optimisation: de Bruijn Indices

In the big-step operational semantic rules, environments are lists of pairs consisting of a variable and an extended value. We could replace this structure by a pair of lists of the same length, one containing the variables and the other the values. Thus, the list x = 12, y = 14, z = 16, w = 18 could be replaced by the list of variables x, y, z, w and the list of extended values 12, 14, 16, 18. To find the extended value associated to a variable, we just need to search through the first list to find the variable's position, and then find in the other list the element at the same position. The position of a variable in the first list is a number, called the de Bruijn index of the variable in the environment. In general, we associate the number 0 to the last element of the list, the rightmost one; 1 to the previous one; ...; and n - 1 to the first element of the list, the leftmost one.

The list of variables needed for the interpretation of each subterm can be computed before starting the interpretation. In fact, we can associate a de Bruijn index to each occurrence of a variable before interpreting the term. For example, if we interpret the term fun x -> fun y -> (x + (fun z -> fun w -> (x + y + z + w)) (2 * 8) (14 + 4)) (5 + 7) (20 - 6), the occurrence of the variable y will necessarily be interpreted in an environment of the form x = ., y = ., z = ., w = ., that is, to find the value associated to y we need to find the value with index 2. We can then associate this index to the variable from the start.

To compute the de Bruijn indices of the variables, we simply traverse the term maintaining a variable environment, that is, a list of variables, and associate the index p to the variable x in the environment e if p is the position of the variable x in the environment e, starting from the end.

 * |x|e = xₚ where p is the position of x in the environment e

 * |t u|e = |t|e |u|e

 * |fun x -> t|e = fun x -> |t|e,x

 * |n|e = n

 * |t + u|e = |t|e + |u|e

 * |t - u|e = |t|e - |u|e

 * |t * u|e = |t|e * |u|e

 * |t / u|e = |t|e / |u|e

 * |ifz t then u else v|e = ifz |t|e then |u|e else |v|e

 * |fix x t|e = fix x |t|e,x

 * |let x = t in u|e = let x = |t|e in |u|e,x

For example, the term above will be written fun x -> fun y -> (x₁ + (fun z -> fun w -> (x₃ + y₂ + z₁ + w₀)) (2 * 8) (14 + 4)) (5 + 7) (20 - 6).

It is easy to show that an occurrence of a subterm translated in the variable environment x₁, ..., xₙ will always be interpreted in an environment of the form x₁ = ., ..., xₙ = . For this reason, to find the value of the variable associated to the index p we just look up the pth element of the environment.

This suggests an alternative way to interpret a term: we start by computing the de Bruijn index of each occurrence of a variable; once the indices are known, we no longer need to keep the list of variables in the environment. The environment is simply a list of extended values. Similarly, we can dispose of variable names in closures and in thunks. Indeed, variable names are now useless, and we could for instance rewrite the term above as follows: fun _ -> fun _ -> (_₁ + (fun _ -> fun _ -> (_₃ + _₂ + _₁ + _₀)) (2 * 8) (14 + 4)) (5 + 7) (20 - 6).

The big-step operational semantic rules can then be reformulated accordingly: environments become lists of extended values, and the rule for a variable with index n fetches the nth element of the environment, starting from the end.

Exercise 3.6

Write a program to replace each variable by its de Bruijn index. Write an interpreter for this language.

Exercise 3.7

Write the rules of the call by name big-step operational semantics using de Bruijn indices.

We will highlight the advantages of this notation, which eliminates variable names, when we study compilation in the next chapter.

In the meantime, notice that two terms have the same de Bruijn translation if and only if they are α-equivalent. This gives us a new definition of alphabetical equivalence. Replacing variables by indices that indicate the position where they are bound can be seen as a radical point of view that highlights the fact that bound variables are "dummies".
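The translation itself is a straightforward traversal; the following sketch (cf. Exercise 3.6) makes the variable environment explicit as a list of names, most recent first, so that the position counted from the head is exactly the position counted from the end of the book's left-to-right lists. The constructor names are assumptions.

```ocaml
(* A sketch of the de Bruijn translation of Sect. 3.3. *)
type term =                                   (* named syntax, as before *)
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term | Fix of string * term
  | Let of string * term * term

type dterm =                                  (* nameless syntax *)
  | DVar of int                               (* a de Bruijn index *)
  | DFun of dterm | DApp of dterm * dterm
  | DNum of int | DOp of char * dterm * dterm
  | DIfz of dterm * dterm * dterm | DFix of dterm
  | DLet of dterm * dterm

(* position of x in the variable environment, starting from the end *)
let rec index x = function
  | [] -> failwith "free variable"
  | y :: e -> if x = y then 0 else 1 + index x e

let rec debruijn (e : string list) : term -> dterm = function
  | Var x -> DVar (index x e)
  | Fun (x, t) -> DFun (debruijn (x :: e) t)
  | App (t, u) -> DApp (debruijn e t, debruijn e u)
  | Num n -> DNum n
  | Op (o, t, u) -> DOp (o, debruijn e t, debruijn e u)
  | Ifz (t, u, v) -> DIfz (debruijn e t, debruijn e u, debruijn e v)
  | Fix (x, t) -> DFix (debruijn (x :: e) t)
  | Let (x, t, u) -> DLet (debruijn e t, debruijn (x :: e) u)
```

For example, debruijn [] (Fun ("x", Fun ("y", Var "x"))) yields DFun (DFun (DVar 1)).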
## 3.4 Construction of Functions via Fixed Points

In most programming languages, only functions can be recursively defined. The fix construct then applies only to terms of the form fun, and we can replace the symbol fix by a symbol fixfun, where fixfun f x -> t binds two variables in its argument. The call by value big-step semantic rule for fixfun can be derived from the rules given above for fix and fun

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle x, t, (e, f = \langle\mathtt{fixfun}\ f\ x \rightarrow t, e\rangle)\rangle}$$

In this case, we can define simpler variations of the rules for the call by value interpreter.

### 3.4.1 First Variation: Recursive Closures

We will distinguish closures of the form 〈x, t, (e, f = 〈fixfun f x -> t, e〉)〉, which we will write 〈f, x, t, e〉 and call recursive closures.

The rule that we have given to interpret the construction fixfun f x -> t can then be reformulated as follows

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle f, x, t, e\rangle}$$

When we interpret an application t u under a call by value semantics, if the term t is interpreted as the recursive closure 〈f, x, t', e'〉, that is, 〈x, t', (e', f = 〈fixfun f x -> t', e'〉)〉, and the term u as the value W, then the application rule requires us to interpret the term t' in the environment e', f = 〈fixfun f x -> t', e'〉, x = W.

We can anticipate the interpretation of the thunk 〈fixfun f x -> t', e'〉 that appears in this environment: by the rule above, its value is the recursive closure 〈f, x, t', e'〉 itself. In the case of recursive closures, the application rule can then be specialised as follows

$$\frac{e \vdash t \hookrightarrow \langle f, x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', f = \langle f, x, t', e'\rangle, x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

Thunks are no longer used in this rule; thus, under call by value, by introducing recursive closures we eliminate thunks and we no longer need the rule to interpret them.

A final simplification: standard closures 〈x, t, e〉 can be replaced by recursive closures 〈f, x, t, e〉 where f is an arbitrary variable that does not occur in t. We can then discard the application rule for the case of standard closures.

Finally, we obtain the rules

$$\frac{}{e \vdash \mathtt{fun}\ x \rightarrow t \hookrightarrow \langle f, x, t, e\rangle} \qquad \frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle f, x, t, e\rangle}$$

$$\frac{e \vdash t \hookrightarrow \langle f, x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', f = \langle f, x, t', e'\rangle, x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

where f is an arbitrary variable, different from x, that does not occur in t or e.

Exercise 3.8

Write a call by value interpreter for PCF, using recursive closures.

Exercise 3.9

How will the rules of the big-step operational semantics with recursive closures change if variables are replaced by de Bruijn indices—see Sect. 3.3?
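The following sketch (cf. Exercise 3.8) implements these rules. It assumes source variables are never named "_", so that "_" can play the role of the arbitrary f; all names are assumptions, not the book's code.

```ocaml
(* A sketch of the call by value interpreter with recursive closures
   (Sect. 3.4.1): every function value is a closure <f, x, t, e>, no thunks. *)
type term =
  | Var of string | Fun of string * term | App of term * term
  | Num of int | Op of char * term * term
  | Ifz of term * term * term
  | Fixfun of string * string * term             (* fixfun f x -> t *)
  | Let of string * term * term

type value = Vnum of int | Vclos of string * string * term * env
and env = (string * value) list

let rec interp (e : env) : term -> value = function
  | Num n -> Vnum n
  | Var x -> List.assoc x e                      (* environments contain values *)
  | Fun (x, t) -> Vclos ("_", x, t, e)           (* f arbitrary, not in t *)
  | Fixfun (f, x, t) -> Vclos (f, x, t, e)       (* a recursive closure *)
  | App (t, u) ->
      let w = interp e u in
      (match interp e t with
       | Vclos (f, x, t', e') as c ->
           (* bind f to the closure itself, then x to the argument *)
           interp ((x, w) :: (f, c) :: e') t'
       | _ -> failwith "application of a non-function")
  | Op (o, t, u) ->
      (match interp e t, interp e u with
       | Vnum m, Vnum n ->
           Vnum (match o with
                 | '+' -> m + n | '-' -> m - n | '*' -> m * n | _ -> m / n)
       | _ -> failwith "arithmetic on non-numbers")
  | Ifz (t, u, v) ->
      (match interp e t with
       | Vnum 0 -> interp e u
       | Vnum _ -> interp e v
       | _ -> failwith "ifz applied to a non-number")
  | Let (x, t, u) -> interp ((x, interp e t) :: e) u
```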
### 3.4.2 Second Variation: Rational Values

In the rule

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \langle x, t, (e, f = \langle\mathtt{fixfun}\ f\ x \rightarrow t, e\rangle)\rangle}$$

we can anticipate the interpretation of the thunk 〈fixfun f x -> t, e〉. Of course, the value of this thunk is the term 〈x, t, (e, f = 〈fixfun f x -> t, e〉)〉 where the thunk occurs again. We could decide to interpret it again, and again....

As previously said, this kind of interpretation of a term of the form fix f t before substituting it or storing it in the environment leads to an infinite computation. Here, it leads to the construction of the infinite value 〈x, t, (e, f = 〈x, t, (e, f = 〈x, t, (e, f = 〈x, t, (e, f = ...)〉)〉)〉)〉, which is an infinite term, but a rational one. There are well-known techniques for the representation of rational trees in the computer's memory; here, we could represent this value by a finite cyclic structure.

Using the notation FIX X 〈x, t, (e, f = X)〉 for this rational value, we can replace the rule above by

$$\frac{}{e \vdash \mathtt{fixfun}\ f\ x \rightarrow t \hookrightarrow \mathrm{FIX}\ X\ \langle x, t, (e, f = X)\rangle}$$

and again thunks will no longer be needed.

Note that it is sometimes better to represent such a rational value in an equivalent way, with the cycle going through the environment rather than the value; in this case we would define rational environments instead.

Exercise 3.10

Write a call by value interpreter for PCF using rational values.

Exercise 3.11

How do these big-step operational semantic rules change if we replace variables by their de Bruijn indices—see Sect. 3.3?

Exercise 3.12

Could the technique of rational values be used to design an interpreter for the full PCF, that is, where we could define via fixed points not only functions but also arbitrary objects? Hint: what is the rational representation of the value of the term fix x x?

To summarise, in this section we have seen that if a variable x has an occurrence in the term t, the reduction rule fix x t ⟶ (fix x t/x)t can be applied an infinite number of times starting from the term fix x t, because the term (fix x t/x)t again contains the term fix x t as a subterm. This corresponds to the replacement, in a recursive definition f = G(f), of f by G(f) an infinite number of times, which leads to the infinite program f = G(G(G(...))). In a sense, this explains the intuition that recursive programs are infinite programs. For example, the term fact could be written fun x -> ifz x then 1 else x * (ifz x - 1 then 1 else (x - 1) * (ifz x - 2 then 1 else (x - 2) * ⋅⋅⋅)). This replacement must only be done on demand: in a lazy way.

We have seen that there are several ways to express this behaviour in the semantics of PCF—and finally in the code of a PCF interpreter: substitute x by fix x t and freeze this redex if it is under a fun or an ifz; store this redex as a thunk or a recursive closure and "unfreeze" the thunk on demand; or represent the term f = G(G(G(...))) as a rational tree and traverse it on demand. A final method could be to use the encoding of fix given in Exercise 2.10, and only reduce this term (which requires the duplication of a subterm) when needed.

Exercise 3.13

(An extension of PCF with pairs) We extend PCF with the following constructions: t,u represents the pair where the first component is t and the second is u; fst t and snd t are, respectively, the first and second component of the pair t. Write small-step and big-step operational semantic rules for this extension of PCF. Write an interpreter for this extension of PCF.

Exercise 3.14

(An extension of PCF with lists) We extend PCF with the following constructions: nil denotes the empty list; cons n l denotes a list where the first element is the natural number n and l is the rest of the list; ifnil t then u else v checks whether a list is empty or not; hd l returns the first element of the list l; and tl l returns the list l without its first element. Write small-step and big-step operational semantic rules for this extension of PCF. Write an interpreter for this extension of PCF. Write a program to implement a sorting algorithm over these lists.

Exercise 3.15

(An extension of PCF with trees) We extend PCF with the following constructions: L n denotes a tree that consists of one leaf labelled by the natural number n; N t u denotes a tree with two subtrees t and u; ifleaf t then u else v checks whether its first argument is a tree of the form L n or N t u; content t denotes the content of the tree t if it is a leaf; left t and right t denote, respectively, the left and right subtrees of t if it is not a leaf. Write small-step and big-step operational semantic rules for this extension of PCF. Write an interpreter for this extension of PCF.

# 4. Compilation

Abstract

In this chapter the interpreter is transformed into a compiler. The emphasis is put on the construction of an abstract machine, whose language is the target language of the compilation. The chapter ends with the bootstrapping of this compiler.
When a computer comes out of the factory, it is not capable of interpreting a PCF term, nor even a Caml or Java program. For a computer to run a PCF, Caml or Java program, we need an interpreter for the language, written in the machine language of the computer. In the previous chapter we described the principles underlying PCF interpretation, and we wrote an interpreter in a high-level language, such as Caml. We could continue this line of thought, and try to write an interpreter in machine language....

One possibility is to leave the realm of interpretation and move towards compilation. An interpreter takes a PCF term as input and returns its value. A compiler, instead, is a program that takes a PCF term as input and returns a program, in machine language, whose execution returns the value of the term. In other words, a PCF compiler is a program that translates PCF terms into machine language, that is, into a language that can be directly executed by the machine.

One of the advantages of using a compiler is that the program is translated once and for all, when it is compiled, rather than each time it is executed. Once compiled, execution is usually faster. Another advantage comes from the fact that a compiler can compile itself (this is called bootstrapping, see Exercise 4.4), whereas an interpreter cannot interpret itself.

The implementation of a compiler should be guided by the rules of the operational semantics of the language (as was the case for the interpreter). To simplify, we will focus on a fragment of PCF where only functions can be defined recursively, and we will use the big-step semantics with recursive closures—see Sect. 3.4.

The machine language that we will use is not a commercial one: it is the machine language of an imaginary computer. This kind of machine is called an abstract machine. We will write a program to simulate the behaviour of this machine. The use of an abstract machine is not only motivated by pedagogical reasons; there are practical reasons too: the main compilers for Caml and Java, for instance, use abstract machines. Compiled programs are executed by a program that simulates the workings of the abstract machine, or are further translated (in a second compilation phase) into the machine language of a concrete machine.

## 4.1 An Interpreter Written in a Language Without Functions

In Chap. 2, we gave a big-step operational semantics for PCF and used it to derive an interpreter for this language. For example, the rule

$$\frac{e \vdash u \hookrightarrow n \qquad e \vdash t \hookrightarrow m}{e \vdash t + u \hookrightarrow m + n}$$

results in the following piece of Caml code for the PCF interpreter.
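The original listing is not preserved in this copy; the following fragment is a plausible reconstruction, consistent with the discussion that follows (the names interp, env and w are taken from that discussion; the constructors and the rest are assumptions).

```ocaml
(* A plausible reconstruction of the missing listing, not the book's exact
   code: the branch of the interpreter dealing with an addition. *)
type term = Num of int | Sum of term * term   (* ... other constructs elided *)

let rec interp env t =
  match t with
  | Num n -> Num n
  | Sum (t, u) ->
      let w = interp env u in        (* the value w of u is computed first *)
      let v = interp env t in        (* then the value of t *)
      (match v, w with
       | Num m, Num n -> Num (m + n)
       | _ -> failwith "addition of non-numbers")
```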
Since Caml allows us to write local definitions, we can compute the value of the term interp env t and recover the value w after this computation, even if the variable w is bound to other values during the computation.

If we tried to write the interpreter in machine language, or in any language that does not permit local definitions, we would need to devise a mechanism to memorise the value w, for example a stack: we would interpret the term u, put the result on the stack, then interpret the term t, and finally pop the top of the stack and add it to the result of the interpretation.

In this way, to interpret the term ((((1 + 2) + 3) + 4) + 5) + 6 we need to put the number 6, then the number 5, ..., then the number 2 on the stack, then pop the number on the top of the stack (that is, 2) and add it to the number 1, then pop the number 3 and add it to the previous result, then ... pop the number 6 and add it to the previous result, to obtain the final result: 21.

## 4.2 From Interpretation to Compilation

This interpreter can be decomposed into two programs. The first one can be seen as an object with two fields: a field that contains a natural number, called the accumulator, and a field that contains a list of natural numbers, called the stack. We have the following operations

 * Ldi n: puts the number n in the accumulator,

 * Push: puts the contents of the accumulator on the top of the stack,

 * Add: adds the top of the stack and the accumulator, leaves the result in the accumulator, and pops the top of the stack.

This object is our abstract machine, and the three instructions above constitute its machine language. The fields are called registers.

The second program takes a PCF term as input and, depending on the term, produces machine instructions, which are executed by the machine one by one. If t is a PCF term, we denote by |t| the sequence of abstract machine instructions generated by this program during the interpretation of the term. For instance, for the term ((((1 + 2) + 3) + 4) + 5) + 6, the machine instructions generated are: Ldi 6, Push, Ldi 5, Push, Ldi 4, Push, Ldi 3, Push, Ldi 2, Push, Ldi 1, Add, Add, Add, Add, Add.

Exercise 4.1

Which instructions will be executed by the abstract machine when interpreting the term 1 + (2 + (3 + (4 + (5 + 6))))?

This way of sharing the work resembles the behaviour of a car driver and a passenger in an unfamiliar city: the passenger reads the map and gives instructions to the driver, who follows the instructions without really knowing where the car is.

If the passenger could generate the instructions just by looking at the map, it would be possible to record the list of instructions on a compact disk, which the driver could then listen to in the car. In this scenario, the passenger does not need to be in the car to guide the driver. Similarly, the interpreter could leave the sequence |t| of instructions in a file, and the file could then be executed later by the abstract machine. We have just transformed the interpreter into a compiler.

In general, we consider that the abstract machine contains, in addition to the accumulator and the stack, a third register: the code, the list of instructions that have to be executed. At the beginning, the abstract machine looks for an instruction in the code register, executes it, then looks for another instruction... until the code register becomes empty. As we will see, the fact that the execution of an instruction may add new instructions to the code register will allow us to write loops and recursive definitions.
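A minimal sketch of this three-instruction machine, with the instruction names above (the Caml representation is an assumption):

```ocaml
(* A sketch of the machine of Sect. 4.2: an accumulator, a stack of
   natural numbers, and a code register. *)
type instr = Ldi of int | Push | Add

(* execute the code, returning the final contents of the accumulator *)
let rec exec (acc : int) (stack : int list) = function
  | [] -> acc
  | Ldi n :: c -> exec n stack c               (* load n in the accumulator *)
  | Push :: c -> exec acc (acc :: stack) c     (* push the accumulator *)
  | Add :: c ->
      (match stack with
       | m :: s -> exec (m + acc) s c          (* add the top, then pop it *)
       | [] -> failwith "empty stack")

(* the compiled form of ((((1 + 2) + 3) + 4) + 5) + 6 from the text *)
let result =
  exec 0 []
    [Ldi 6; Push; Ldi 5; Push; Ldi 4; Push; Ldi 3; Push;
     Ldi 2; Push; Ldi 1; Add; Add; Add; Add; Add]
(* result = 21 *)
```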
## 4.3 An Abstract Machine for PCF

### 4.3.1 The Environment

So far we have only compiled a fragment of PCF: numbers and addition. Can this principle be generalised to the full language?

First, recall that in PCF a term has to be interpreted in an environment. In addition to the accumulator, the stack, and the code, our abstract machine needs a fourth register: the environment. The machine must also include an instruction Extend x to extend the environment, adding the definition x = V where V is the content of the accumulator, and an instruction Search x to look for the value associated to x in the environment and put it in the accumulator.

When the machine executes the code generated by the compilation of several nested applications, the environment will change several times, and at the end of the execution the initial environment should be restored. The abstract machine therefore needs instructions Pushenv and Popenv to put the contents of the environment register on the stack and to recover them. These operations are often further decomposed into several operations that push and pop individual elements of the environment, but here we will not decompose them in this way.

### 4.3.2 Closures

In PCF it is also necessary to define closures as values. In addition to the instruction Ldi n, we will need an instruction Mkclos(f, x, t), with two variables f and x and a term t as arguments. This instruction builds the closure 〈f, x, t, e〉, where e is the content of the environment register, and puts the closure in the accumulator.

### 4.3.3 PCF Constructs

It is not difficult to compile a term of the form fun x -> t or fixfun f x -> t: we simply generate the instruction Mkclos(f, x, t) to build a closure, which is the value of this kind of term.

In the same way, it is easy to compile a term of the form x: we just generate the instruction Search x to look for the value associated to x in the environment.

Let us now consider the compilation of a term of the form t u. The corresponding big-step semantics rule is

$$\frac{e \vdash t \hookrightarrow \langle f, x, t', e'\rangle \qquad e \vdash u \hookrightarrow W \qquad (e', f = \langle f, x, t', e'\rangle, x = W) \vdash t' \hookrightarrow V}{e \vdash t\ u \hookrightarrow V}$$

To interpret the term t u in the environment e, we start by interpreting u in the environment e, which returns the value W. We then interpret the term t in the environment e, obtaining the closure 〈f, x, t', e'〉, and finally we interpret t' in the environment (e', f = 〈f, x, t', e'〉), x = W, to obtain the final result.

Now, let us see how an interpreter running on an abstract machine deals with this term: to interpret the term t u, the abstract machine starts by interpreting u, and puts the result on the stack. Then, it interprets the term t, obtaining the closure 〈f, x, t', e'〉, and puts in the environment register the environment e', f = 〈f, x, t', e'〉, x = W, where W is the value at the top of the stack, which is then removed from the stack. Finally, the machine interprets the term t'. To ensure that the contents of the environment register are restored at the end of these operations, they should be put on the stack at the beginning of the interpretation, and recovered from the stack at the end.

Let us now consider the compilation process for such a term. The interpretation of the term u is replaced by the execution of the sequence |u| of instructions, and similarly the interpretation of the term t is replaced by the execution of the sequence |t| of instructions. The interpretation of t' has to be replaced by the execution of the sequence |t'| of instructions. However, there is a difficulty here: t' is not a subterm of t u; it is provided by the closure resulting from the interpretation of t. We therefore need to modify the notion of closure, and replace the term t in 〈f, x, t, e〉 by a sequence i of instructions. Thus, terms of the form fun x -> t and fixfun f x -> t should not be compiled into Mkclos(f, x, t); instead, they should be compiled into Mkclos(f, x, |t|), building the closure 〈f, x, |t|, e〉 where e is the content of the environment register.
Finally, we need to include in the machine an instruction Apply that takes a closure 〈f, x, i, e〉 from the accumulator, puts the environment e, f = 〈f, x, i, e〉, x = W, where W is the top of the stack, in the environment register, discards the top of the stack, and adds the sequence i of instructions at the front of the code register.

The term t u can then be compiled as the sequence of instructions Pushenv, |u|, Push, |t|, Apply, Popenv.

Summarising, the abstract machine has the set of instructions Ldi n, Push, Add, Extend x, Search x, Pushenv, Popenv, Mkclos(f, x, i) and Apply. To complete it, we just need to add the arithmetic operations Sub, Mult, Div and the test Test(i, j) to compile the operators -, *, / and ifz.

### 4.3.4 Using de Bruijn Indices

To simplify the machine we can use de Bruijn indices—see Sect. 3.3. Recall that the instruction Search x is generated by the compilation of variables, and we have already seen that it is possible to determine the index of each variable occurrence statically. We can then compile a variable x using an instruction Search n, where n is a number, instead of Search x.

De Bruijn indices can be computed at the same time as the compilation is performed: it suffices to compile a term in a variable environment, and to compile the variable x in the environment e by the instruction Search n, where n is the position of the variable x in the environment e, starting from the end.

This mechanism allows us to dispose of variables in environments, closures, and the instructions Mkclos and Extend. Our abstract machine then includes the instructions Ldi n, Push, Extend, Search n, Pushenv, Popenv, Mkclos i, Apply, Test(i, j), Add, Sub, Mult and Div.

### 4.3.5 Small-Step Operational Semantics

The machine state, that is, the contents of its registers, is a tuple consisting of a value (the accumulator), a list where each element is either a value or a list of values (the stack), a list of values (the environment), and a sequence of instructions (the code).

A small execution step consists of fetching an instruction from the code register and executing it. The small-step semantics of the machine is easily defined:

 * (a, s, e, (Mkclos i, c)) ⟶ (〈i, e〉, s, e, c)

 * (a, s, e, (Push, c)) ⟶ (a, (a, s), e, c)

 * (a, s, e, (Extend, c)) ⟶ (a, s, (e, a), c)

 * (a, s, e, (Search n, c)) ⟶ (V, s, e, c) if V is the nth value in e (starting from the end)

 * (a, s, e, (Pushenv, c)) ⟶ (a, (e, s), e, c)

 * (a, (e', s), e, (Popenv, c)) ⟶ (a, s, e', c)

 * (〈i, e'〉, (W, s), e, (Apply, c)) ⟶ (〈i, e'〉, s, (e', 〈i, e'〉, W), i c)

 * (a, s, e, (Ldi n, c)) ⟶ (n, s, e, c)

 * (n, (m, s), e, (Add, c)) ⟶ (n + m, s, e, c)

 * (n, (m, s), e, (Sub, c)) ⟶ (n - m, s, e, c)

 * (n, (m, s), e, (Mult, c)) ⟶ (n * m, s, e, c)

 * (n, (m, s), e, (Div, c)) ⟶ (n / m, s, e, c)

 * (0, s, e, (Test(i, j), c)) ⟶ (0, s, e, i c)

 * (n, s, e, (Test(i, j), c)) ⟶ (n, s, e, j c) if n is a number different from 0

An irreducible state is a tuple where the fourth component—the contents of the code register—is empty. If i is a sequence of instructions and the state (0, [ ], [ ], i) reduces to an irreducible state of the form (V, _, _, [ ]), then we say that V is the result of the execution of i, and we write i ⇒ V.
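These transitions translate almost line by line into a simulator for the machine. A sketch, with the same hedges as before (the representation of values and the constructor names are assumptions):

```ocaml
(* A sketch of the abstract machine of Sect. 4.3.5, with de Bruijn indices. *)
type instr =
  | Ldi of int | Push | Extend | Search of int
  | Pushenv | Popenv | Mkclos of instr list | Apply
  | Test of instr list * instr list
  | Add | Sub | Mult | Div

type value =
  | Num of int
  | Clos of instr list * value list      (* a closure <i, e> *)
  | Env of value list                    (* a saved environment, on the stack *)

let rec exec (a : value) (s : value list) (e : value list) (c : instr list) =
  match c, a, s with
  | [], _, _ -> a                                     (* the machine stops *)
  | Ldi n :: c, _, _ -> exec (Num n) s e c
  | Push :: c, _, _ -> exec a (a :: s) e c
  | Extend :: c, _, _ -> exec a s (a :: e) c
  | Search n :: c, _, _ -> exec (List.nth e n) s e c  (* nth from the end *)
  | Pushenv :: c, _, _ -> exec a (Env e :: s) e c
  | Popenv :: c, _, Env e' :: s -> exec a s e' c
  | Apply :: c, Clos (i, e'), w :: s ->
      exec a s (w :: Clos (i, e') :: e') (i @ c)      (* enter the closure *)
  | Test (i, _) :: c, Num 0, _ -> exec a s e (i @ c)
  | Test (_, j) :: c, Num _, _ -> exec a s e (j @ c)
  | Add :: c, Num n, Num m :: s -> exec (Num (n + m)) s e c
  | Sub :: c, Num n, Num m :: s -> exec (Num (n - m)) s e c
  | Mult :: c, Num n, Num m :: s -> exec (Num (n * m)) s e c
  | Div :: c, Num n, Num m :: s -> exec (Num (n / m)) s e c
  | _ -> failwith "ill-formed machine state"
```

Here environments are lists with the most recently added value at the head, so the nth value "starting from the end" of the book's notation is simply List.nth e n.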
+ +## 4.4 Compilation of PCF + +We can now give the compilation rules for PCF + + * |x|e =Search n where n is the position of x in the environment e + + * |t u|e =Pushenv, |u|e,Push, |t|e,Apply,Popenv + + * |fun x -> t|e =Mkclos |t|e,_,x + + * |fixfun f x -> t|e =Mkclos |t|e, f, x + + * |n|e =Ldi n + + * |t + u|e = |u|e,Push, |t|e,Add + + * |t - u|e = |u|e,Push, |t|e,Sub + + * |t * u|e = |u|e,Push, |t|e,Mult + + * |t / u|e = |u|e,Push, |t|e,Div + + * |ifz t then u else v|e = |t|e,Test(|u|e,|v|e) + + * |let x = t in u|e =Pushenv, |t|e,Extend, |u|e, x,Popenv + +For example, the compilation of + +generates the sequence of instructions Pushenv, Mkclos [Search0, Test([Ldi1], [Pushenv, Ldi1, Push, Search0, Sub, Push, Search1, Apply, Popenv, Push, Search0, Mult])], Extend, Pushenv, Ldi6, Push, Search0, Apply, Popenv, Popenv and the result of its execution is the number 720. + +The correctness of the compilation, and of the semantics of the abstract machine, can be stated as follows: if V is a numeric value, then ⊢ t ↪ V if and only if |t| ⇒ V. + +Exercise 4.2 + +Write an abstract machine and a compiler for PCF. + +The state of the abstract machine at the beginning of the 14th execution step for the program Pushenv, Ldi1, Extend, Ldi6, Push, Ldi5, Push, Ldi4, Push, Ldi 3, Push, Ldi2, Push, Search0, Add, Add, Add, Add, Add, Popenv. + +Exercise 4.3 + +We extend PCF with the tree operators described in Exercise 3.15. Write a compiler and an abstract machine for this extension of PCF. + +Exercise 4.4 + +(A bootstrapping compiler) Many kinds of data structures can be represented using the trees described in Exercise 3.15. To start with, we can represent a natural number n as a tree L n. The character c can be represented by the tree L n where n is a code, for instance the ASCII code of the character c. If t1, t2,...,tn are trees, the list t1,t2, ...,tn can be represented by the tree N(t1, N(t2, ..., N(tn, L 0)...)). Finally, values of a type defined by constructors that are themselves representable could be defined by enumerating the constructors and representing the value C(V1,V2, ...,Vn) by the list L p,t1,t2, ...,tn where p is the number associated to the constructor C and t1, t2, ...,tn represent the values V1, V2, ...,Vn. + +We could, in particular, represent in this way programs written in the extended PCF language, or in the language of the abstract machine in Exercise 4.3. Modify the compiler and the abstract machine in Exercise 4.3 to accept programs represented by binary trees. The abstract machine will take two inputs: a compiled program, represented by a tree, and a value, and will apply the program to the value. + +Translate the compiler in Exercise 4.3 to PCF. After writing the compiler, compile it with the compiler defined in Exercise 4.3. The result is the first compiler executed by the PCF abstract machine. Compile this compiler (it will compile itself). Verify that the code produced is the same that was obtained with the compiler in Exercise 4.3. If this is true, we can destroy the first compiler and use instead the second: this is the bootstrap process. +Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_5© Springer-Verlag London Limited 2011 + +# 5. PCF with Types + +Gilles Dowek1 and Jean-Jacques Lévy2 + +(1) + +Labo. 
Exercise 4.2

Write an abstract machine and a compiler for PCF. Give the state of the abstract machine at the beginning of the 14th execution step for the program Pushenv, Ldi 1, Extend, Ldi 6, Push, Ldi 5, Push, Ldi 4, Push, Ldi 3, Push, Ldi 2, Push, Search 0, Add, Add, Add, Add, Add, Popenv.

Exercise 4.3

We extend PCF with the tree operators described in Exercise 3.15. Write a compiler and an abstract machine for this extension of PCF.

Exercise 4.4

(A bootstrapping compiler) Many kinds of data structures can be represented using the trees described in Exercise 3.15. To start with, we can represent a natural number n as a tree L n. The character c can be represented by the tree L n where n is a code, for instance the ASCII code of the character c. If t₁, t₂, ..., tₙ are trees, the list t₁, t₂, ..., tₙ can be represented by the tree N(t₁, N(t₂, ..., N(tₙ, L 0)...)). Finally, values of a type defined by constructors that are themselves representable can be defined by enumerating the constructors and representing the value C(V₁, V₂, ..., Vₙ) by the list L p, t₁, t₂, ..., tₙ where p is the number associated to the constructor C and t₁, t₂, ..., tₙ represent the values V₁, V₂, ..., Vₙ.

We can, in particular, represent in this way programs written in the extended PCF language, or in the language of the abstract machine in Exercise 4.3. Modify the compiler and the abstract machine in Exercise 4.3 to accept programs represented by binary trees. The abstract machine will take two inputs: a compiled program, represented by a tree, and a value, and will apply the program to the value.

Translate the compiler in Exercise 4.3 to PCF. After writing the compiler, compile it with the compiler defined in Exercise 4.3. The result is the first compiler executed by the PCF abstract machine. Compile this compiler (it will compile itself). Verify that the code produced is the same as that obtained with the compiler in Exercise 4.3. If it is, we can discard the first compiler and use the second instead: this is the bootstrap process.

# 5. PCF with Types

Abstract

This chapter opens a new part of the book, dedicated to types. The language PCF is extended by adding types. A type verification algorithm is described, and its application to the static detection of errors is discussed at length. The chapter also describes the denotational semantics of PCF with types.

In Chap. 2, we remarked that, in contrast with mathematical functions, the domain of a PCF function is not specified. For this reason, it is possible to apply the function fun x -> x + 1 to the function fun x -> x + 2, even if this application is meaningless.

It is sometimes convenient to be able to apply any object to any other object. For example, we can apply the identity function fun x -> x to itself, using the term (fun x -> x) (fun x -> x), which reduces to fun x -> x. More generally, the identity function in PCF is defined for any object, whereas in Mathematics it always has to be restricted to a specific domain. The ability to apply an object to itself was essential to show that the fix construct can be simulated in PCF using application and fun—see Exercise 2.10.

However, the unrestricted application of an object to another may raise a number of problems. For example, we saw that the terms 1 2, 1 + (fun x -> x) and ifz (fun x -> x) then 1 else 2 are irreducible closed terms according to the small-step semantics of PCF, but they are not values.

The big-step operational semantics, instead, does not associate any result to a term such as (fun x -> x) 1 2. In practice, if we interpret a term of the form t u where t results in a number instead of a term of the form fun, an error is raised. This error is detected at run time, instead of being detected statically (before execution) as one would expect.

The fact that the domain of a PCF function is not specified also makes it more difficult to give a denotational semantics for PCF.

The goal of this chapter is to define a version of PCF where functions come with associated domains, and to show that if a program is well-formed in this language, its interpretation cannot produce the errors mentioned above. We will also give a simple denotational semantics for this language.

## 5.1 Types

In Mathematics, the domain of a function is a set (any set). For example, we can define a function m from 2ℕ to ℕ that associates to each even number its half. Then, to check whether the expression m (3 + (4 + 1)) is well-formed or not, that is, to check whether the argument is in the domain of the function, we need to check whether 3 + (4 + 1) is even or not. For arbitrary sets, the problem of deciding whether a given element belongs to the set is undecidable in general. Therefore, the problem of checking the validity of terms is also undecidable in general. Besides, to know whether a term such as ifz t then u else v will produce an error or not, we need to know whether the value of t is a natural number or a term of the form fun (the parity of the number is not relevant in this case).

These two remarks lead us to restrict the class of sets used to define the domains of functions. The sets in this restricted class will be called types.
+ +### 5.1.1 PCF with Types + +In PCF, types are inductively defined by + + * nat—that is, ℕ—is a type, + + * if A and B are types then A -> B—that is, the set of all the functions from A to B—is a type. + +Types can then be defined using a language that includes the constant nat and the symbol -> with two arguments that do not bind any variables. Such a term is also called a type. + +Functions in PCF were written fun x -> t, but will now include the type of the variable x. Thus, we will write fun x:nat -> x for the identity function on the natural number, and fun x:(nat -> nat) -> x for the identity function on functions from natural numbers to natural numbers. In general, the symbol fun will now have two arguments, a type and a term; it will bind a variable in the second argument. The typed version of the language PCF is a language with two sorts of objects: terms and types, and the arity of the symbol fun is ((type), (term, term), term). Also the symbols fix and let must now indicate the type of the bound variable. + +Summarising, typed PCF includes + + * a term symbol fun with a type argument and a term argument, which binds a variable in the second argument, + + * a term symbol α with two term arguments, that does not bind any variable, + + * an infinite number of term constants to represent the natural numbers, + + * four term symbols +, -, * and /, each with two arguments which do not bind any variables in their arguments, + + * a term symbol ifz with three term arguments which do not bind any variables, + + * a term symbol fix with a type argument and a term argument, which binds a variable in the second argument, + + * a term symbol let with three arguments, where the first is a type and the others terms, binding a variable in the third argument, + + * a type constant nat, + + * a type symbol -> with two type arguments and which does not bind any variable in its arguments. + +Alternatively, we can define the syntax of the typed version of PCF inductively + +### 5.1.2 The Typing Relation + +We can now define by induction the relation t : A, read "the term t has type A ". More precisely, we will define by induction a ternary relation e ⊢ t : A, as we did for the interpretation relation, where t is a term that might have free variables and e is a typing environment that associates a type to each variable. This is an inductive definition, similar to the inductive definition of PCF's big-step operational semantics. We could imagine that it is the operational semantics of a language with the same syntax as PCF but where the interpretation of a term returns a type instead of a value—for this reason, it is called an abstract interpretation of the term. + +In the first rule only the rightmost declaration for x is taken into account, the others are hidden. + +The language includes variables of various sorts, in particular type variables for which we will use capital letters. Since no symbol can bind a type variable, a closed term will not contain type variables. Moreover, if a closed term t has the type A in the empty environment, then the type A must be closed too. So, type variables are not really used here; they will be used in the next chapter. + +Let e be an environment and t a term. Reasoning by induction on t, we can show that the term t has at most one type in the environment e. + +We can build a type checking algorithm based on the typing rules given above. The algorithm will check whether a term t has a type in an environment e, and if it does, it will give the type as a result. 
Exercise 5.1

Write a type checker for PCF.

Reduction is still confluent in the typed language, and types bring us an additional property: all typed terms that do not contain the operator fix terminate—this is Tait's Theorem. In particular, it is impossible to build a typed term such as (fun x -> (x x)) (fun x -> (x x)), which does not terminate although it does not contain fix.

Exercise 5.2

Write typing rules for the version of PCF that uses de Bruijn indices instead of variable names—see Sect. 3.3.

Exercise 5.3

We extend PCF with the constructs described in Exercise 3.13 to define pairs, and we introduce a symbol × to denote the Cartesian product of two types. Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

Exercise 5.4

We extend PCF with the constructs described in Exercise 3.14 to define lists, and we introduce a type natlist for these lists. Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

Exercise 5.5

We extend PCF with the constructs described in Exercise 3.15 to define trees, and we introduce a type nattree for these trees. Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

## 5.2 No Errors at Run Time

We will now show that the interpretation of a correctly typed term cannot produce a type error at run time. For this we can use either the small-step or the big-step semantics; the proof is slightly different depending on which we use.

### 5.2.1 Using Small-Step Operational Semantics

Using the small-step operational semantics of the language, the property can be formulated as follows: the result of the computation of a typed closed term, if it exists, is a value. In other words, a typed closed term evaluates to a natural number or a closed term of the form fun x -> t; it can never be a stuck term, that is, a term V₁ V₂, where V₁ and V₂ are irreducible closed terms and V₁ is not of the form fun x -> t; a term V₁ ⊗ V₂, where V₁ and V₂ are irreducible closed terms that are not both numbers; or a term ifz V₁ then V₂ else V₃ where V₁, V₂ and V₃ are irreducible closed terms and V₁ is not a number.

The first lemma, which we will not prove here, is usually called subject reduction. It says that if a closed term t of type A reduces in one step to the term u (t ⟶ u), then u also has type A. We can deduce that if a closed term t of type A reduces to u in any number of steps (t ⟶* u), then u also has type A.

The next step in the proof consists of showing that a term of the form fun cannot have the type nat and, similarly, that a numeric constant cannot have a type of the form A -> B. This is done by a simple structural induction over the typing relation.

The proof proceeds by showing that an irreducible closed term t of type nat is a constant representing a natural number, and that an irreducible closed term t of type A -> B has the form fun. This is done by structural induction on t.

Since t is a closed term, it cannot be a variable. Since it is irreducible, it cannot be a fix or a let.

We show that t cannot be an application, an arithmetic operation or a conditional. If t is an application t = u v, then u has a type of the form C -> D. By the induction hypothesis, this term must be of the form fun, and therefore t is a redex, contradicting our assumption (t is irreducible).
If t is an arithmetic operator t = u ⊗ v then u and v have type nat. By induction hypothesis, they are numeric constants and therefore t is a redex, contradicting our assumption (t is irreducible). If t is a term of the form t = ifz u then v else w then u has type nat. By induction hypothesis, u is a numeric constant and therefore t is a redex, contradicting our assumption (t is irreducible). + +An irreducible closed term t is then either a numeric constant or a term of the form fun. If it has type nat, it is a constant; if it has type A -> B, it is a fun. + +If a well-typed closed term can be reduced to an irreducible closed term, this irreducible term will also be well typed, and will therefore be either a numeric constant or a term of the form fun. + +### 5.2.2 Using Big-Step Operational Semantics + +The property is formulated differently using the big-step operational semantics of the language. This is because in this style of semantics only values can be associated to terms (even if the terms are ill typed). One could say that the rules of the big-step operational semantics are incomplete, since they do not specify how to associate a value to an application whose left-hand side has a value that is a numeric constant, or how to associate a value to an arithmetic operation where the value of one of the arguments is a term of the form fun, or a value to a conditional where the first argument has a value that is of the form fun. However, for well-typed terms the rules are complete. In other words, the three examples that we have just mentioned cannot arise. + +We start by showing a type-preservation-by-interpretation lemma, which states that if a closed term t has type A then its value, if it exists, also has type A. This lemma corresponds to the subject reduction lemma of the small-step operational semantics. + +Then we show, as for the small-step semantics, that a term of the form fun cannot have type nat and, similarly, that a numeric constant cannot have a type of the form A -> B. + +Since we know that the value of a term is either a number or a term of the form fun, we deduce that the value of a term of type nat is a numeric constant, and the value of a term of type A -> B is a term of the form fun. Therefore, when interpreting a well-typed term, the left-hand side of an application will always be interpreted as a term of the form fun, the arguments of arithmetic operators will always be interpreted as numeric constants, and the first argument of an ifz will always be interpreted as a numeric constant. + +Exercise 5.6 + +(Equivalent semantics) Show that the computation of a well-typed term produces a result under call by name small-step operational semantics if and only if it produces a result under call by name big-step operational semantics. Moreover, the result is the same in both cases. Show that the same property is true of the call by value semantics. + +Does this result hold also for the untyped version of PCF? Hint: what is the result of ((fun x -> x) 1) 2? + +## 5.3 Denotational Semantics for Typed PCF + +### 5.3.1 A Trivial Semantics + +We mentioned above that one of the goals of functional languages is to shorten the distance between the notion of a program and the notion of a function. In other words, the goal is to bring the program closer to its denotational semantics. + +We also said that it was difficult to give a denotational semantics for PCF without types, because functions did not have a domain of definition. 
Now that we have a type system for PCF, it is easier to give a denotational semantics. 

We associate to each type a set

 * 〚nat〛 = ℕ,

 * 〚A -> B〛 = 〚A〛 -> 〚B〛

and to each term t of type A an element 〚t〛 of 〚A〛. If the term t has free variables, we associate meanings to these variables via a semantic environment e.

 * 〚x〛e = a, if e includes the pair x = a,

 * 〚fun x:A -> t〛e = fun a:〚A〛 -> 〚t〛e,x=a,

 * 〚t u〛e = 〚t〛e 〚u〛e,

 * 〚n〛e = n,

 * 〚t + u〛e = 〚t〛e + 〚u〛e, 〚t - u〛e = 〚t〛e - 〚u〛e, 〚t * u〛e = 〚t〛e * 〚u〛e, 〚t / u〛e = 〚t〛e / 〚u〛e,

 * 〚ifz t then u else v〛e = 〚u〛e if 〚t〛e = 0, and 〚v〛e otherwise,

 * 〚let x:A = t in u〛e = 〚u〛e,x=〚t〛e.

This is really trivial: a program is a function, and its semantics is that same function. Achieving this "triviality" is one of the goals in the design of functional languages.

Two remarks are in order. First, division by 0 produces an error in PCF, whereas it is not defined in Mathematics. To be precise, we should add a value error to each set 〚A〛 and adapt the definition given above. Second, this definition says nothing about the construction fix.

### 5.3.2 Termination

The only construct with a non-trivial denotational semantics is fix, because this construct is not usually found in everyday mathematical definitions of functions. Unlike PCF, mathematical definitions may only take fixed points of functions that do have a fixed point, and even then, if there are several fixed points, it is essential to specify which one is taken. We left these issues aside when we defined PCF; it is now time to deal with them.

Consider a function that does not have a fixed point: the function fun x:nat -> x + 1. In PCF, we can build the term fix x:nat (x + 1). Similarly, the function fun f:(nat -> nat) -> fun x:nat -> (f x) + 1 does not have a fixed point, but we can build the term fix f:(nat -> nat) fun x:nat -> (f x) + 1. On the other hand, the function fun x:nat -> x has many fixed points, and still we can build the term fix x:nat x.

When we defined the operational semantics of PCF, we gave a reduction rule

fix x t ⟶ (fix x t/x)t

that explains the idea of a fixed point. Using this rule, we can see that the term a = fix x:nat (x + 1) reduces to a + 1, then to (a + 1) + 1, ... without ever reaching an irreducible term. Similarly, if g = fix f:(nat -> nat) fun x:nat -> (f x) + 1, the term g 0 reduces in two steps to (g 0) + 1, and then to ((g 0) + 1) + 1, ... and again will never reach an irreducible term. The same happens with the term b = fix x:nat x, which reduces to b, and again to b, ... and will never reach an irreducible term. In other words, it appears that in PCF, when we take the fixed point of a function that does not have any, or that has more than one, the program does not terminate.

The situation is similar in Caml and in Java, where the analogous circular definitions loop.

There are even functions, such as fun x:nat -> x + x, which have a unique fixed point but for which the fix construct in PCF produces a non-terminating computation: fix x:nat (x + x).

In other words, to understand the denotational semantics of the fixed point operator, we first need to understand the semantics of terms that do not terminate.

The small-step operational semantics does not associate any result to these terms: there is no term V such that fix x:nat (x + 1) ↪ V. And the big-step operational semantics does not give us more information.
As we have already said, we could complete the relation ↪ by adding a value ⊥ such that fix x:nat (x + 1) ↪ ⊥.

We have the same options in denotational semantics. We could define a partial function 〚 〛 and leave 〚fix x:nat (x + 1)〛 undefined, or we could add a value ⊥ to 〚nat〛 and define 〚fix x:nat (x + 1)〛 = ⊥.

If we include the value ⊥, the interpretation of a term of the form t + u is obtained by interpreting first u and then t, and if one of these terms loops, then the whole term t + u loops too. Thus, the denotational semantics of a term of the form t + u is defined as follows

 * 〚t + u〛 = 〚t〛 + 〚u〛 if 〚t〛 and 〚u〛 are natural numbers,

 * 〚t + u〛 = ⊥ if 〚t〛 = ⊥ or 〚u〛 = ⊥.

We can now remark that the function 〚fun x:nat -> x + 1〛, which did not have a fixed point when ⊥ was not included, now has one: ⊥. This value is precisely the one we take as the semantics of the term fix x:nat (x + 1), which does not terminate. The function 〚fun x:nat -> x〛, which had several fixed points, now has an additional one, ⊥, and we choose this one as the semantics of the term fix x:nat x. The function 〚fun x:nat -> x + x〛, which had a unique fixed point 0, now has two, 0 and ⊥, and again we choose ⊥ as the semantics of the term fix x:nat (x + x), which does not terminate.

All the functions that we mentioned have fixed points now, and when they have more than one, including ⊥, we choose the latter as our privileged value.

### 5.3.3 Scott's Ordering Relation

To make the ideas discussed above more precise, we define an ordering relation on the set 〚nat〛, called Scott's ordering relation: x ≤ y if and only if x = ⊥ or x = y. We then define 〚fix x:nat t〛 as the least fixed point of the function 〚fun x:nat -> t〛, forcing the use of the fixed point ⊥ when more than one fixed point exists. It remains to prove that this least fixed point exists; we will use the fixed point theorem for this. To apply this theorem, we must show that the ordering relation we defined on 〚nat〛 is weakly complete, and that the semantics of a program of type nat -> nat is a continuous function.

More generally, we will build for each type A a set 〚A〛 endowed with a weakly complete ordering relation, and we will show that the semantics of a program of type A -> B is a continuous function from 〚A〛 to 〚B〛.

We start by defining the sets 〚A〛. The set 〚nat〛 is defined as ℕ ∪ {⊥}, with the ordering relation given above. The set 〚A -> B〛 is defined as the set of all continuous functions from 〚A〛 to 〚B〛, with the ordering relation f ≤ g if for all x in 〚A〛, f x ≤ g x.

We can show that these ordering relations are weakly complete. The ordering on 〚nat〛 is weakly complete because any increasing sequence is either constant or of the form ⊥, ⊥, ..., ⊥, n, n, ..., and in both cases it has a limit.

We now show that if the ordering relations on 〚A〛 and 〚B〛 are weakly complete, then so is the ordering on 〚A -> B〛. Consider an increasing sequence fₙ in 〚A -> B〛. By the definition of the ordering on 〚A -> B〛, for all x in 〚A〛, the sequence fₙ x, whose values are in 〚B〛, is also increasing, and therefore has a limit. Let F be the function that associates to x the element limₙ (fₙ x). We can show—but we will not do it here—that the function F is in 〚A -> B〛, that is, that it is a continuous function (this requires a lemma permuting limits). By construction, the function F is greater than all the functions fₙ, and it is the least such function. Therefore it is the limit of the sequence fₙ.
Any increasing sequence thus has a limit, and the ordering relation on 〚A -> B〛 is therefore weakly complete.

Each set 〚A〛 has a least element, written ⊥A. The least element of 〚nat〛 is ⊥, and the least element of 〚A -> B〛 is the constant function that returns the value ⊥B for all arguments.

### 5.3.4 Semantics of Fixed Points

We can now go back to the denotational semantics of PCF, and add to the definition the missing case for fix

 * 〚x〛e = a, if e contains the definition x = a,

 * 〚fun x:A -> t〛e = fun a:〚A〛 -> 〚t〛e,x=a,

 * 〚t u〛e = 〚t〛e 〚u〛e,

 * 〚n〛e = n,

 * 〚t ⊗ u〛e = 〚t〛e ⊗ 〚u〛e if 〚t〛e and 〚u〛e are natural numbers, and ⊥ otherwise,

 * 〚ifz t then u else v〛e = 〚u〛e if 〚t〛e = 0, 〚v〛e if 〚t〛e is a natural number different from 0, and ⊥A, where A is the type of this term, if 〚t〛e = ⊥nat,

 * 〚fix x:A t〛e = FIX (fun a:〚A〛 -> 〚t〛e,x=a), where FIX(f) is the least fixed point of the continuous function f,

 * 〚let x:A = t in u〛e = 〚u〛e,x=〚t〛e.

To show that this definition is correct, we need to prove that if t is a term of type A then 〚t〛 is in 〚A〛; in particular, we need to prove that the function a ↦ 〚t〛e,x=a is continuous. This is true, but we will not prove it here.

Exercise 5.7

What is the semantics of the term fun x:nat -> 0? And the semantics of fix x:nat x and (fun x:nat -> 0) (fix x:nat x)?

Exercise 5.8

What is the value of 〚ifz t then u else v〛e, if 〚t〛e = 0, 〚u〛e = 0 and 〚v〛e = ⊥nat?

We can now state the equivalence theorem for the two semantics. Let t be a closed term of type nat and n a natural number: t ↪ n under call by name if and only if 〚t〛 = n. The direct implication is not difficult to prove, but the converse is not trivial.

Exercise 5.9

Show, using the equivalence theorem, that if t is a closed term of type nat such that 〚t〛 = ⊥, there is no natural number n such that t ↪ n.

Exercise 5.10

Let G be the denotational semantics of the term fun f:(nat -> nat) -> fun n:nat -> ifz n then 1 else n * (f (n - 1)).

The denotational semantics of the term fix f:(nat -> nat) fun n:nat -> ifz n then 1 else n * (f (n - 1)) is the least fixed point of G. By the first fixed point theorem, this is the limit of the sequence Gⁿ(⊥nat->nat). Which function is denoted by ⊥nat->nat? And by Gⁿ(⊥nat->nat)? Identify the limit of this sequence.

Show that for any natural number p, there exists a natural number m such that Gᵐ(⊥nat->nat)(p) = limₙ Gⁿ(⊥nat->nat)(p).

Exercise 5.11

We consider the following elements of the set 〚nat -> nat〛: the function u that maps ⊥ to ⊥ and all other elements to 0; the function vᵢ that maps ⊥ to ⊥, i to 1 and all other elements to 0; and the function wᵢ that maps ⊥ to ⊥, the elements 0, 1, ..., i - 1 to 0 and all other elements to ⊥.

Let F be an increasing function from 〚nat -> nat〛 to 〚nat〛 such that F u = 0 and, for all i, F vᵢ = 1. Show that for all i, F wᵢ = ⊥. Show that the function F is not continuous.

Show that it is not possible to write a PCF function that takes as argument a function g of type nat -> nat and returns 0 if g n = 0 for all n, and 1 otherwise.

Exercise 5.12

(An information-based approach to continuity) It might seem surprising that the notion of continuity is used to define the semantics of PCF, even though PCF works only with natural numbers, not with real numbers. In fact, the set of functions from ℕ to ℕ, or the set of sequences of natural numbers, is very similar to the set of real numbers.
Exercise 5.12

(An information-based approach to continuity) It might seem surprising that the notion of continuity is used to define the semantics of PCF, even though PCF works only with natural numbers, not with real numbers. In fact, the set of functions from ℕ to ℕ, or the set of sequences of natural numbers, is very similar to the set of real numbers.

The intuition is that a real function f is continuous if, to compute the first n decimal places of f x, it is sufficient to know a finite number of decimals of x. Unfortunately, this is technically false if x or f x are decimal numbers. We will say that a decimal number approximates a real number to the nth decimal place if the distance between the two is smaller than 10^-n. Thus, the number π has two approximations to the second decimal place, 3.14 and 3.15, and it makes sense to say that the function f is continuous if to compute a decimal approximation of f x to the nth place it is sufficient to have some decimal approximation of x.

The goal of this exercise is to show that, similarly, a function f from sequences of natural numbers to sequences of natural numbers is continuous if to compute the first n terms of f x it is sufficient to have an initial segment of x. If we agree to call a finite initial segment of the sequence a finite approximation, then we can rephrase this as follows: to compute an approximation of f x with n terms, it is sufficient to have a certain approximation of x.

Let u be a sequence of natural numbers, and let U be the element of 〚nat -> nat〛 that associates ⊥ to ⊥ and ui to i.

Let V be the sequence with elements in 〚nat -> nat〛 whose ith element Vi associates ⊥ to ⊥, uj to each j smaller than i, and ⊥ to all other elements; that is, Vi is the finite approximation of U consisting of its first i terms.

Show that the sequence V converges to U. Let F be a continuous function on 〚nat -> nat〛. Show that the sequence F Vi converges to F U. Show that the sequence F Vi p converges to F U p. Show that there exists a natural number k such that F Vk p = F U p. Show that to compute F U p, it suffices to have the first k terms of U. Show that to compute the first n terms of F U it is sufficient to know a finite number of terms of U.

Consider the function that associates to a sequence u the number 0 if u is always 0, and 1 otherwise. Is this function continuous? Can it be written in PCF?

Finally, notice that in these two examples, the approximations—decimal numbers or finite sequences—contain a finite amount of information, whereas the objects that they approximate—real numbers or infinite sequences—contain an infinite amount of information.

Exercise 5.13

(Gödel's System T) To avoid non-terminating computations, we can replace fix by a rec construct to define functions by induction. All the programs in this language terminate, but the language is no longer Turing complete. Still, it is not easy to find a program that cannot be represented in this language; one needs to be an expert logician to build such a program.

The function f defined by f 0 = a and f (n + 1) = g n (f n) is written rec a g. The small-step operational semantic rules for this construct are

rec a g 0 ⟶ a

rec a g n ⟶ g (n - 1) (rec a g (n - 1))

if n is a natural number different from 0.

Program the factorial function in this language. Give typing rules for rec. Give a denotational semantics for this language.
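As an illustration of Exercise 5.13, here is a small OCaml sketch of the recursor. The encoding of rec as an ordinary recursive function is ours; the point is that rec only ever calls itself on smaller numbers, so every application terminates.

```ocaml
(* rec a g: the function f with f 0 = a and f (n+1) = g n (f n).
   Structural recursion on n guarantees termination. *)
let rec recursor (a : int) (g : int -> int -> int) (n : int) : int =
  if n = 0 then a
  else g (n - 1) (recursor a g (n - 1))

(* Factorial in System T style: fact (n+1) = (n+1) * fact n,
   i.e. g n r = (n + 1) * r. *)
let fact = recursor 1 (fun n r -> (n + 1) * r)

let () = assert (fact 5 = 120)
```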
Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_6© Springer-Verlag London Limited 2011

# 6. Type Inference

Gilles Dowek1 and Jean-Jacques Lévy2

(1)

Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France

(2)

Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France

Gilles Dowek (Corresponding author)

Email: gilles.dowek@polytechnique.edu

Jean-Jacques Lévy

Email: jean-jacques.levy@inria.fr

Abstract

This chapter continues with types, but with a much more operational orientation. Powerful type inference algorithms are described, in particular one with polymorphic typing.

In many programming languages, for instance Java and C, programmers must declare a type for each of the variables used in the program, writing for example fun x:nat -> x + 1. However, if we know that + can only work with numbers, it is not difficult to show that in the term fun x -> x + 1 the variable x has to be of type nat. We can then let the computer infer the types, rather than asking the programmer to write them. This is the goal of a type inference algorithm.

## 6.1 Inferring Monomorphic Types

### 6.1.1 Assigning Types to Untyped Terms

We will now use the original syntax of PCF, where variables are not explicitly typed. Instead of writing fun x:nat -> x + 1, we will write fun x -> x + 1 as in Chap. 2.

We can now define the language of terms and the language of types independently. The language of terms in PCF is defined as in Chap. 2 and the language of types consists of

  * a constant nat, and

  * a symbol -> with two arguments, which does not bind any variable in its arguments.

As before, the relation e ⊢ t : A (read "the term t has type A in the environment e") can be defined by induction

$$\frac{}{e \vdash x : A}\ \text{if } e \text{ contains } x : A$$

$$\frac{e \vdash u : A \quad e \vdash t : A \rightarrow B}{e \vdash t\ u : B}$$

$$\frac{(e, x : A) \vdash t : B}{e \vdash \textsf{fun}\ x \rightarrow t : A \rightarrow B}$$

$$\frac{}{e \vdash n : \textsf{nat}}$$

$$\frac{e \vdash u : \textsf{nat} \quad e \vdash t : \textsf{nat}}{e \vdash t \otimes u : \textsf{nat}}$$

$$\frac{e \vdash t : \textsf{nat} \quad e \vdash u : A \quad e \vdash v : A}{e \vdash \textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v : A}$$

$$\frac{(e, x : A) \vdash t : A}{e \vdash \textsf{fix}\ x\ t : A}$$

$$\frac{e \vdash t : A \quad (e, x : A) \vdash u : B}{e \vdash \textsf{let}\ x = t\ \textsf{in}\ u : B}$$
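Before describing the algorithm, it may help to fix a concrete representation. Here is a hedged OCaml sketch of the two languages just defined, untyped PCF terms and monomorphic types; the constructor names are ours.

```ocaml
(* Untyped PCF terms (Chap. 2 syntax: variables carry no type). *)
type term =
  | Var of string
  | Fun of string * term            (* fun x -> t *)
  | App of term * term
  | Nat of int
  | Op of string * term * term      (* t ⊗ u, e.g. Op ("+", t, u) *)
  | Ifz of term * term * term
  | Fix of string * term
  | Let of string * term * term

(* Types: the constant nat, arrows, and the type variables X, Y, ...
   introduced by the inference algorithm. *)
type ty =
  | Tnat
  | Arrow of ty * ty
  | Tvar of string
```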
Some terms, for example the term fun x -> x, may have more than one type in this system. For instance, we can derive the judgement ⊢ fun x -> x : nat -> nat and also the judgement ⊢ fun x -> x : (nat -> nat) -> (nat -> nat). A closed term may have a type with free variables; for example, the term fun x -> x has type X -> X in the empty environment.

We can prove that if a closed term t has, in the empty environment, a type A which contains variables, then t also has type θA for any substitution θ. For example, if we substitute the variable X by the type nat -> nat in X -> X, we obtain the type (nat -> nat) -> (nat -> nat), and this is one of the possible types for the term fun x -> x.

### 6.1.2 Hindley's Algorithm

We can now describe the type inference algorithm. We will first describe a version of the algorithm that has two phases. The first phase is similar to the type checking algorithm: it traverses the term, recursively, checking that the type constraints are satisfied, and computes the type of the term. There are however two important differences. First, when we are trying to type a term of the form fun x -> t in an environment e, since we do not know the type of the variable x, we need to create a type variable X, extend the environment e with the declaration x : X, and type the term t in this extended environment. The second difference is that when typing an application t u, after computing types A and B for u and t, respectively, we cannot simply check that the type B has the form A -> C. Indeed, these two types might contain variables. For this reason, at this point we generate an equation between types: B = A -> X, for a fresh variable X. The second phase of the type inference algorithm solves these equations.

Let us illustrate the idea with an example: to type the term fun f -> 2 + (f 1) we must type the term 2 + (f 1) in the environment f : X. For this, we need to type the term 2, which has type nat, and the term f 1. The term 1 has type nat and the term f has type X. We generate the equation X = nat -> Y and the type of f 1 is Y. Once the terms 2 and f 1 are typed, we generate the equations nat = nat and Y = nat, and the type of the term 2 + (f 1) is nat. Finally, the type of the term fun f -> 2 + (f 1) is X -> nat and the equations that we need to solve are

X = nat -> Y, nat = nat, Y = nat

This system of equations has a unique solution X = nat -> nat, Y = nat, and therefore the only type that we can assign to the term fun f -> 2 + (f 1) is (nat -> nat) -> nat.

We can describe the first part of the algorithm using a set of rules in the style of the big-step operational semantics (as we did for the type checking algorithm), but in this case the result of the interpretation of a term will not be a value or a type: it will be a pair of a type and a set of equations on types. We write e ⊢ t ↪ A, E to denote the relation between the environment e, the term t, the type A and the set of equations E. The rules mirror the typing rules of Sect. 6.1.1, except that instead of checking equalities between types they record them as equations.

In the application rule, the variable X is an arbitrary variable that does not occur in e, A, B, E and F. In the rules for fun and fix, it is an arbitrary variable that does not occur in e.

Let t be a closed term and let A and E be the type and the set of equations computed by this algorithm, that is, we have ⊢ t ↪ A, E. A substitution σ = B1/X1, ..., Bn/Xn is a solution of E if, for each equation C = D in E, the types σC and σD are identical. We can show that if a substitution σ is a solution of the set E, then the type σA is a type for t in the empty environment. In general, if e ⊢ t ↪ A, E, then for any solution σ of E, σA is a type for t in the environment σe. Conversely, if A' is a type for t in the empty environment, then there exists a substitution σ such that A' = σA and σ is a solution of the set E of equations.
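Here is a hedged OCaml sketch of this first phase, using the term and ty representations introduced above. It returns a type together with a list of equations; fresh type variables come from a counter. This is our reconstruction of the rules, not the book's exact presentation.

```ocaml
let counter = ref 0
let fresh () = incr counter; Tvar ("X" ^ string_of_int !counter)

(* infer e t = (A, E): a type for t in environment e, plus the
   equations E that this type must satisfy. *)
let rec infer (env : (string * ty) list) (t : term) : ty * (ty * ty) list =
  match t with
  | Var x -> (List.assoc x env, [])
  | Nat _ -> (Tnat, [])
  | Fun (x, t1) ->
      let x_ty = fresh () in
      let b, e = infer ((x, x_ty) :: env) t1 in
      (Arrow (x_ty, b), e)
  | App (t1, u1) ->
      let b, e1 = infer env t1 in
      let a, e2 = infer env u1 in
      let x = fresh () in
      (x, (b, Arrow (a, x)) :: e1 @ e2)          (* B = A -> X *)
  | Op (_, t1, u1) ->
      let a, e1 = infer env t1 in
      let b, e2 = infer env u1 in
      (Tnat, (a, Tnat) :: (b, Tnat) :: e1 @ e2)
  | Ifz (t1, u1, v1) ->
      let a, e1 = infer env t1 in
      let b, e2 = infer env u1 in
      let c, e3 = infer env v1 in
      (b, (a, Tnat) :: (b, c) :: e1 @ e2 @ e3)
  | Fix (x, t1) ->
      let x_ty = fresh () in
      let a, e = infer ((x, x_ty) :: env) t1 in
      (x_ty, (x_ty, a) :: e)
  | Let (x, t1, u1) ->
      let a, e1 = infer env t1 in
      let b, e2 = infer ((x, a) :: env) u1 in
      (b, e1 @ e2)
```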
The second part of the algorithm deals with the type equations. The language of types does not have binders: it is a language generated by a constant nat and a symbol -> with two arguments. To solve the type equations, we use Robinson's unification algorithm, which solves equations in an arbitrary language without binders. This algorithm is in some respects similar to Gauss's algorithm for solving systems of equations. It proceeds by a series of transformations, defined as follows

  * if an equation in the system is of the form A -> B = C -> D, it is replaced by the equations A = C and B = D,

  * if an equation in the system is of the form nat = nat, it is removed from the system,

  * if an equation in the system is of the form nat = A -> B or A -> B = nat, the algorithm fails,

  * if an equation in the system is of the form X = X, it is removed from the system,

  * if an equation in the system is of the form X = A or A = X, where X occurs in A and A is different from X, the algorithm fails,

  * if an equation in the system is of the form X = A or A = X, where X does not occur in A and X occurs in other equations of the system, then X is substituted by A in all the other equations of the system.

This algorithm terminates, but the proof is not trivial. If the algorithm fails, then the system does not have a solution. If it terminates without failure, then the final system is of the form X1 = A1, ..., Xn = An, where the Xi are distinct variables that do not occur in the Ai. In this case, the substitution σ = A1/X1, ..., An/Xn is a solution of the initial system. We can prove that this substitution is a principal solution of this system, in other words, for any solution θ of the initial system, there is some substitution η such that θ = η ∘ σ. We write σ = mgu(E) (for most general unifier) for this principal solution.

Let t be a closed term, and let A and E be such that ⊢ t ↪ A, E. Let σ be a principal solution of E. Then the term t has type σA in the empty environment. Moreover, σA is a principal type of t, that is, for any other type B of t, there exists a substitution η such that B = ησA.

### 6.1.3 Hindley's Algorithm with Immediate Resolution

There is a variant of Hindley's algorithm where, instead of waiting until the end of the first phase to start solving the equations, the equations are solved as they are generated. In this case, instead of returning a type and a set of equations, the algorithm returns a type A and a substitution ρ that is a principal solution of the equations. We can also apply the substitution ρ to the type A as it is built.

The algorithm has the following property: if e ⊢ t ↪ A, ρ, then A is a principal type of t in the environment ρe. The rules defining the algorithm are those of the two-phase version, except that they now carry side conditions: the application rule has the side condition σ = mgu(B = ρ′A -> X), the rule for ⊗ the side conditions σ = mgu(A = nat) and σ′ = mgu(B = nat), and the rule for ifz the side conditions σ = mgu(A = nat) and σ′ = mgu(ρ″B = C).

Again, in the application rule X is an arbitrary variable that does not occur in e, A, B, ρ and ρ′, and in the rules for fun and fix, it is a variable that does not occur in e.

Exercise 6.1

Give a principal type for the term fun x -> fun y -> (x (y + 1)) + 2. Describe all of its types.

Give a principal type for the term fun x -> x. Describe all of its types.
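Returning to the unification phase: here is a hedged OCaml sketch of Robinson's algorithm on the ty representation above, written in the common substitution-composing style rather than as literal equation rewriting; unify raises Failure where the rules above fail (a clash, or the occurs check).

```ocaml
(* Apply a substitution (an association list from variable names to types). *)
let rec apply s = function
  | Tnat -> Tnat
  | Arrow (a, b) -> Arrow (apply s a, apply s b)
  | Tvar x -> (try List.assoc x s with Not_found -> Tvar x)

let rec occurs x = function
  | Tnat -> false
  | Arrow (a, b) -> occurs x a || occurs x b
  | Tvar y -> x = y

(* unify returns a most general unifier of a list of equations. *)
let rec unify = function
  | [] -> []
  | (a, b) :: rest ->
      match a, b with
      | Tnat, Tnat -> unify rest
      | Arrow (a1, b1), Arrow (a2, b2) ->
          unify ((a1, a2) :: (b1, b2) :: rest)
      | Tvar x, t | t, Tvar x ->
          if t = Tvar x then unify rest
          else if occurs x t then failwith "no solution (occurs check)"
          else
            (* substitute X by t everywhere else, then compose. *)
            let s = unify (List.map (fun (l, r) ->
                      (apply [(x, t)] l, apply [(x, t)] r)) rest) in
            (x, apply s t) :: s
      | _ -> failwith "no solution (nat vs arrow)"
```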
Exercise 6.2

(Unicity of principal types) A substitution σ is called a renaming if it is an injective map associating a variable to each variable. For example, the substitution y/x, z/y is a renaming. Let A be a type and σ, σ′ two substitutions. Show that if σ′σA = A then σ|FV(A) is a renaming.

Deduce that if A and A' are two principal types of a term t, then there exists a renaming θ, with domain FV(A), such that A' = θA.

Exercise 6.3

In the general case of a language without binders, we can replace the first three rules in Robinson's unification algorithm by the two rules

  * if an equation is of the form f(u1, ..., un) = f(v1, ..., vn), replace it by u1 = v1, ..., un = vn,

  * if an equation is of the form f(u1, ..., un) = g(v1, ..., vp) where f and g are different symbols, fail.

In a language that consists of a symbol + with two arguments and integer constants, does the equation (2 + (3 + X)) = (X + (Y + 2)) have a solution? And the equation X + 2 = 4?

What is the difference between the equations in this language and the equations over integers studied at high school?

Define the high school notion of solution using the small-step operational semantics of PCF. Does the equation X + 2 = 4 have a solution in this case?

## 6.2 Polymorphism

We have seen that the principal type of the term id = fun x -> x is X -> X. This means that the term id has type A -> A for any type A. We could give it a new type ∀X (X -> X) and add a rule stating that if a term t has type ∀X A then it has the type (B/X)A for any type B. A type language that includes a universal quantifier is called polymorphic.

In the system presented in the previous section, the term let id = fun x -> x in id id is not typeable. Indeed, the typing rule for let requires that we type both fun x -> x and id id, and the latter is not typeable because we cannot assign the same type to both occurrences of the variable id. This could be seen as a flaw in the type system, because the term (fun x -> x) (fun x -> x), obtained by replacing id by its definition, is typeable: to type this term it is sufficient to assign the type nat -> nat to the first occurrence of the bound variable x and the type nat to the second.

If we give the type ∀X (X -> X) to the symbol id in the term let id = fun x -> x in id id, we can then use a different type for each occurrence of id in the term id id, and the term becomes typeable.

Typing the term let id = fun x -> x in id id might seem a minor issue, and adding quantifiers to the type language might seem a high price to pay for a marginal increase in power. This impression is wrong, however. In the extension of PCF with lists—see Exercise 3.14—, this feature allows us to write a single sorting algorithm and apply it to all lists, irrespective of the type of their elements: let sort = t in u. Polymorphism entails more code reuse, and therefore more concise programs.

We will therefore give a quantified type to the variables bound in a let, but a standard type to the variables bound in a fun or a fix.

### 6.2.1 PCF with Polymorphic Types

We need to distinguish between types without quantifiers—we will continue to use the word types for these—and quantified types, which we will call type schemes. A scheme has the form ∀X1 ... ∀Xn A where A is a type. We will then define a language with two sorts: a sort for types and a sort for schemes.
Since the sets of terms of each sort are disjoint in a many-sorted language, the set of types cannot be a subset of the set of schemes, and we will need to use a symbol [ ] to inject a type in the sort of the schemes. Thus, if A is a type, [A] will be the scheme consisting of the type A without any quantified variable.

The language of types and schemes is defined by

  * a type constant nat,

  * a type symbol -> with two type arguments, which does not bind any variable in its arguments,

  * a scheme symbol [ ] with one type argument, which does not bind any variable in its argument,

  * a scheme symbol ∀ with one scheme argument, which binds a variable in its argument.

This language includes variables for every sort, in particular scheme variables. However, these variables will not be used.

An environment is now a list associating a scheme to each variable. We define inductively the relation "the term t has the scheme S in the environment e"

$$\frac{}{e \vdash x : S}\ \text{if } e \text{ contains } x : S$$

$$\frac{e \vdash u : [A] \quad e \vdash t : [A \rightarrow B]}{e \vdash t\ u : [B]}$$

$$\frac{(e, x : [A]) \vdash t : [B]}{e \vdash \textsf{fun}\ x \rightarrow t : [A \rightarrow B]}$$

$$\frac{}{e \vdash n : [\textsf{nat}]}$$

$$\frac{e \vdash u : [\textsf{nat}] \quad e \vdash t : [\textsf{nat}]}{e \vdash t \otimes u : [\textsf{nat}]}$$

$$\frac{e \vdash t : [\textsf{nat}] \quad e \vdash u : [A] \quad e \vdash v : [A]}{e \vdash \textsf{ifz}\ t\ \textsf{then}\ u\ \textsf{else}\ v : [A]}$$

$$\frac{(e, x : [A]) \vdash t : [A]}{e \vdash \textsf{fix}\ x\ t : [A]}$$

$$\frac{e \vdash t : S \quad (e, x : S) \vdash u : [B]}{e \vdash \textsf{let}\ x = t\ \textsf{in}\ u : [B]}$$

$$\frac{e \vdash t : S}{e \vdash t : \forall X\ S}\ \text{if } X \text{ does not occur free in } e$$

$$\frac{e \vdash t : \forall X\ S}{e \vdash t : (A/X)S}$$
This inductive definition assigns a scheme to each term, in particular to variables. This is why variables are associated to schemes in the environment. However, when we type a term of the form fun x -> t or fix x t, we type t in an extended environment where the variable x is associated to a scheme [A] without quantifiers. A scheme can be associated to a term t only during the typing of a term of the form let x = t in u, and then this scheme is associated to the variable x.

To introduce quantifiers in the scheme associated to t we use the penultimate rule, which allows us to quantify a variable in the scheme S if the variable does not occur free in e. Thus, in the empty environment, after assigning the scheme [X -> X] to the term fun x -> x we can assign the scheme ∀X [X -> X] to it. Note that in the environment x : [X], after assigning the scheme [X] to the variable x we cannot assign it the scheme ∀X [X].

Finally, note that if we have assigned a quantified scheme to a variable, or to an arbitrary term, we can remove the quantifier and substitute the free variable using the last rule. For example, in the environment x : ∀X [X -> X] we can assign the scheme [nat -> nat] to the variable x.

### 6.2.2 The Algorithm of Damas and Milner

We are now ready to define the inference algorithm. We will solve the equations on the fly, as we did in the second variant of Hindley's algorithm. The algorithm is applied to a term t and an environment e, and it returns a type A and a substitution ρ such that the term t has the scheme [A] in the environment ρe. The only difference with respect to the second variant of Hindley's algorithm is in the first two rules: the rule for a variable x instantiates its scheme, replacing the quantified variables with fresh ones (if e contains x : ∀X1 ... ∀Xn [A] and Y1, ..., Yn are new variables, the type returned is (Y1/X1, ..., Yn/Xn)A), and the rule for let associates to the bound variable the scheme Gen(A, e), obtained by quantifying in [A] all the type variables that are free in [A] but not in e.

We can prove that if t is a closed term, the type A computed by this algorithm is a principal type of t, that is, if ⊢ t : [B] then B is an instance of A.

Exercise 6.4

Consider the extension of PCF with a type symbol list with one argument, which is a type. We write nat list for the type of lists of natural numbers, (nat -> nat) list for the type of lists of functions from natural numbers to natural numbers, and (nat list) list for the type of lists whose elements are lists of natural numbers.

We add the following constructs to the language:

  * a constant nil, of type (A list) for any type A, representing the empty list,

  * cons a l, of type (A list) for any type A such that a has type A and l has type A list, representing the list whose first element is a and whose rest is l,

  * ifnil t then u else v, of type A if t has type B list and u, v are terms of type A, to check whether the list t is empty or not,

  * hd l, of type A if l is of type A list, which returns the first element of the list l,

  * tl l, of type A list if l is of type A list, which returns the list l without its first element.

Write typing rules for this extension of PCF. Write a type checker for this extension of PCF.

Program the function map that associates to a function f and a list t1, ..., tn the list f t1, ..., f tn. What is the type of this function?

Program a sorting algorithm. What is the type of this algorithm?
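A hedged OCaml sketch of the two operations that distinguish this algorithm from Hindley's: instantiating a scheme with fresh variables at each occurrence of a variable, and generalizing a type when typing a let. The scheme representation and the helper names are ours; fresh and apply are the helpers introduced in the earlier sketches.

```ocaml
(* A scheme ∀X1 ... ∀Xn [A]: quantified variable names plus a type. *)
type scheme = { vars : string list; body : ty }

(* Free variables of a type. *)
let rec ftv = function
  | Tnat -> []
  | Tvar x -> [x]
  | Arrow (a, b) -> ftv a @ ftv b

(* Rule for variables: replace each quantified Xi by a fresh Yi. *)
let instantiate (s : scheme) : ty =
  let sub = List.map (fun x -> (x, fresh ())) s.vars in
  apply sub s.body

(* Rule for let: Gen(A, e) quantifies the variables free in A
   but not free in the environment. *)
let generalize (env : (string * scheme) list) (a : ty) : scheme =
  let env_fv = List.concat_map (fun (_, s) ->
    List.filter (fun x -> not (List.mem x s.vars)) (ftv s.body)) env in
  let vs = List.filter (fun x -> not (List.mem x env_fv)) (ftv a) in
  { vars = List.sort_uniq compare vs; body = a }
```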
In the type system described in this chapter, we can use quantified types for variables that are bound in a let. We could try to give a quantified type to variables that are bound in a fun. For example, we could give the type ∀X (X -> X) to the variable x in the term fun x -> x x, which would allow us to type this term. The language obtained in this way is called System F, and was defined by Girard and Reynolds. However, the typing relation of System F is undecidable, as shown by Wells, and we cannot hope to have a type inference algorithm for System F. Similarly, if we allow the variable bound by a fix to be polymorphic, the system becomes undecidable, as shown by Kfoury. Restricting the polymorphic aspects of the system to the let construct can thus be seen as a good compromise: it offers a good level of code reuse while keeping type inference possible.

Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_7© Springer-Verlag London Limited 2011

# 7. References and Assignment

Gilles Dowek1 and Jean-Jacques Lévy2

(1)

Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France

(2)

Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France

Gilles Dowek (Corresponding author)

Email: gilles.dowek@polytechnique.edu

Jean-Jacques Lévy

Email: jean-jacques.levy@inria.fr

Abstract

This chapter opens the last part of the book, where PCF is extended with new features. This chapter focuses on references and assignment, and thus shifts from functional to imperative programming. A semantics is given for PCF with references, and the interpreter of Chap. 3 is extended.

Consider two numbers: π and the temperature in Paris. Today, the number π has a value between 3.14 and 3.15 and the temperature in Paris is between 16 and 17 degrees. Tomorrow, π will have the same value, but the temperature in Paris will probably change. In Mathematics, numbers are entities that do not change over time: the temperature in Paris is not a number that changes, it is a function that varies over time.

However, formalising the temperature of a system as a function of time is perhaps too general. It does not take into account the fact that the variation in temperature at a given point in time depends, in general, on the temperature at this point, and not on the temperature ten seconds earlier or ten seconds later. In general, a system does not have access to the full temperature function, just to the current value of the function. This is why equations in Physics are generally differential equations, and not arbitrary equations on functions.

In Computer Science, programs also use objects that vary over time. For example, in the program that manages the sale of tickets for a concert, the number of seats available varies over time: it decreases by one each time a ticket is sold. From the mathematical point of view, it is a function of time. However, to know whether it is possible to sell a ticket, or whether booking is no longer possible, the program only needs to know the current value of this function, not the full function: at a certain point t in time, it needs the value of the function at t.

For this reason, when we write such a program, we do not represent the number of places available for the concert as a function, that is, as a term of type nat -> nat—assuming a discrete clock—, which would mean that at each instant t we know the number of seats still available for the concert at each instant t'.
This is clearly impossible, since it requires to know the number of seats available at each instant t' in the future. We cannot express this number by a term of type nat either, because as a number the value of a term of type nat in PCF cannot change over time. We have to introduce another sort of terms for the values that change over time: references, also called variables but we prefer not to use the word variable in this context, since the notion of a reference is very different from the notion of a variable in Mathematics and in functional languages. + +If x is a reference, we can do two things with it, get its current value !x and modify its value x := t, that is, contribute to the construction of the function that we mentioned above, asserting that the value of the function is now, and until further notice, the current value of the term t. + +The issue of equality of "numbers that vary over time" is subtle. We could compare such a number, the temperature in Paris for instance, with a leaf in a tree: small, green and flexible in Spring, it becomes bigger, yellow and brittle in Autumn. There is clearly a change, but we know that it is the same leaf: nobody would believe that the little green leaf disintegrated and suddenly the big yellow leaf appeared ex nihilo. Although there is a transformation, the same leaf remains in the tree from March till October. This is an instance of the old paradox, that something can change while remaining the same. Similarly, the notion of temperature in Paris is always the same, even if the temperature changes over time. On the other hand, we can easily distinguish the temperature in Paris from the temperature in Rome: these are two different things, even if from time to time the temperature is the same in both cities. + +One way to deal with this paradox is to consider the temperature in Paris and the temperature in Rome as functions: a function may take different values at two different points and remain the same function, and two different functions might take the same value at a given point. + +In a program, if x and y are two references and we need to compare them, we should distinguish carefully between their equality as references, that is, whether x and y are the same thing or not—in mathematical terms: whether they are the same function of time—and equality of their contents, that is, whether the numbers !x and !y are the same at a particular point in time. In particular, equality of references implies that if we modify the value of x then the value of y also changes, but this is not the case if they are different references with the same value. + +## 7.1 An Extension of PCF + +We will now extend the language PCF with two new term constructors, written ! and :=. + +The term x := 4 denotes an action: it updates the value associated to the reference x. Compare with the term fact 3, that we have already seen, and which also denotes an action: the computation of the factorial of 3. There is a difference between these two actions: the effect of the computation of the factorial of 3 is a value, whereas the effect of the action x := 4 is a change in the "global state" of the universe. Before this action, the reference x had, for instance, the value 0, and after this action it has the value 4. When we add references to PCF, the interpretation of a term is not just a value, but a value and a new state of the universe. This modification of the state is a side effect of the interpretation of a term. 
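Before looking at the formal rules, here is a hedged OCaml sketch of one way to realise this "global state": a store mapping references to values, threaded through the interpreter. The names (store, alloc, deref, assign) and the minimal value type are ours; they preview the state m that the semantics below threads through its judgements.

```ocaml
(* References are just identifiers; the store maps them to values. *)
type reference = int
type value = VNat of int | VRef of reference
type store = (reference * value) list

(* Allocate a fresh reference r, initialised to v: the side effect of ref t. *)
let alloc (v : value) (m : store) : reference * store =
  let r = 1 + List.fold_left (fun a (r, _) -> max a r) 0 m in
  (r, (r, v) :: m)

(* !t: read the current value of a reference. *)
let deref (r : reference) (m : store) : value = List.assoc r m

(* t := u: the value of r is now, and until further notice, v. *)
let assign (r : reference) (v : value) (m : store) : store =
  (r, v) :: List.remove_assoc r m

let () =
  let r, m1 = alloc (VNat 0) [] in
  let m2 = assign r (VNat 4) m1 in
  assert (deref r m2 = VNat 4)
```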
+ +The formal semantics of references in PCF defines the global state as a function from a finite set R to the set of values of PCF terms. The elements of the set R are called references. In the native programming language of a computer, its machine language, the set of references is fixed: it is the set of memory addresses of the computer. In other languages, the set R is arbitrary. In particular, when we define the semantics of a language, we do not distinguish between sets R and R' of the same cardinality (i.e., with the same number of elements). This means that programmers cannot know the exact set of memory addresses used to store the data. + +In PCF, as well as in most programming languages, the values associated to references may change over time. Moreover, the set R itself may vary over time: it is possible to create a reference during the execution of the program. To do this, the language includes a construct ref. The side effect associated to the interpretation of the term ref t is the creation of a new reference whose initial value is the current value of the term t. The value computed by this interpretation is the reference itself. + +Since the interpretation of the term ref t produces a value which is a reference, it is clear that references must be values in this extension of PCF. + +## 7.2 Semantics of PCF with References + +In the big-step operational semantics of this extension of PCF, the relation is of the form e, m ⊢ t ↪ V, m' where t is the term to be interpreted, e the environment where it will be interpreted, m the global state in which the interpretation will take place, V the value produced by the interpretation, and m' the new global state produced by the interpretation. + +We can now give rules for the three new constructs, ref, ! and := + +if r is any reference not occurring in m' + +The construction t; u whose semantics is obtained by interpreting t, throwing away the value obtained, then interpreting u, is not very interesting in a language without side effects, because in that case the value of the term t; u is always the same as the value of u, assuming t terminates. We can now add it to PCF + +We can also add now constructions whilez, for,... which were of no interest in a language without side effects. + +Exercise 7.1 + +Write an interpreter for the language PCF with references. + +The uncertainty that we mentioned at the beginning of the book regarding the evaluation of nested functions is finally elucidated. + +Exercise 7.2 + +Consider the term + +What is the value of this term? In which order will the arguments be interpreted in PCF? Why? + +Modify the rules given above to obtain the value 2 instead of the value 9 for this term. + +In Sect. 2.5 we remarked: "In the case of an application...". What do you think of this remark? + +What is the value of this term in Caml? + +Consider the following Java program + +What is the value of this term? + +In which order does Caml interpret its arguments? and Java? + +Exercise 7.3 + +Is the value of the term + +10 or 11? Compare with the answer for Exercise 2.8. + +Exercise 7.4 + +Give the big-step operational semantics of the construction whilez. What is the value of the term given below? + +Exercise 7.5 + +(The quirks of references under call by name) Consider the rules given above to define the big-step semantics of references. Do they follow a call by name or a call by value strategy? Give a similar rule for application under call by name, but keep the let in call by value. 
What is the value of the term let n = ref 0 in ((fun x -> x + x) (n := !n + 1; 4)); !n in call by value? And in call by name? What is the value of the term let n = ref 0 in ((fun x -> 2 * x) (n := !n + 1; 4)); !n in call by value? And in call by name?

Exercise 7.6

(Typing references) To type terms in the extension of PCF with references, we extend the language of types with a symbol ref, so that nat ref, for instance, is the type of references to a natural number. Thus, if t is a term of type A ref then !t is a term of type A.

Extend the typing rules given in Sect. 5.1 in order to type the language PCF with references.

Write a type-checking program for PCF with references.

The combination of references and polymorphism is subtle; we will not attempt to mix them in this exercise.

Exercise 7.7

(From imperative to functional programs) Consider a term t defining a function from natural numbers to natural numbers, with p arguments and a free variable n of type nat ref. We associate to this term a function with p + 1 arguments that returns a pair of natural numbers—see Exercise 3.13—such that the image of a1, ..., ap, m is the pair of natural numbers consisting of the value of the term let n = ref m in (t a1 ... ap) and the value of the term !n at the end of the interpretation. Which function will be associated to the term

  * fun z -> (n := !n + z; !n)?

And to the term

  * (fun z -> (n := !n + z; !n)) 7?

And to the term

  * (fun x -> fun y -> x) ((fun z -> (n := !n + z; !n)) 2) ((fun z -> (n := !n + z; !n)) 7)?

Is it possible to program these functions in PCF without references?

More generally,

  * which function is associated to the term fun y1 -> ... -> fun yp -> 2?

  * And to the term fun y1 -> ... -> fun yp -> y1?

  * And to the term fun y1 -> ... -> fun yp -> !n?

  * If t is a term of type nat and f is the function associated to the term fun y1 -> ... -> fun yp -> t, which function is associated to fun y1 -> ... -> fun yp -> n := t?

  * If t and u are terms of type nat, and f and g are the functions associated to the terms fun y1 -> ... -> fun yp -> t and fun y1 -> ... -> fun yp -> u, which function is associated to fun y1 -> ... -> fun yp -> (t + u)?

  * If t and u are terms of type nat, and f and g are the functions associated to the terms fun y1 -> ... -> fun yp -> t and fun y1 -> ... -> fun yp -> u, which function is associated to fun y1 -> ... -> fun yp -> (t; u)?

  * If t is a term of type nat -> ... -> nat -> nat—with q arguments of type nat—, u1, ..., uq are terms of type nat, and f, g1, ..., gq are the functions associated to the terms fun y1 -> ... -> fun yp -> t and fun y1 -> ... -> fun yp -> u1, ..., fun y1 -> ... -> fun yp -> uq, which function is associated to fun y1 -> ... -> fun yp -> (t u1 ... uq)?

Is it possible to program these functions in PCF without references?

Write a program to transform a PCF term containing these symbols and a free variable of type nat ref into a program without it and with the same semantics.

Exercise 7.8

(For those who prefer to write x := x + 1 instead of x := !x + 1) Consider now a finite set of references, and let us extend PCF by introducing a constant for each of these references. These references will be called mutable variables. The symbol := now applies to a mutable variable and a term, written X := t.

If X is a mutable variable, the value that the operational semantics associates to the term X is the value associated to the reference X in the state available at the time of interpretation.
+ +Give a big-step operational semantics for this extension of PCF. + +Write an interpreter for this extension of PCF. + +Exercise 7.9 + +(A minimal imperative language) Consider a language including integer constants, arithmetic operations, mutable variables—see Exercise 7.8—, assignment :=, sequence ;, a conditional ifz and a whilez loop (but without the usual notion of variable, fun, fix, let or application). + +Give rules to define the operational semantics of this language. Write an interpreter for this language. Write a program to compute factorial in this language. What can we program in this language? + +To conclude this chapter, we remark that in most programming languages there are two different ways to program the factorial function. For example, in Java, we can program it recursively + +or iteratively + +Should we prefer the first version or the second? + +Of course, the theory of programming languages does not give us an answer to "moral" questions of the form "Should we...?" We could nevertheless say a few words about the way this question has evolved. + +In the first programming languages—machine languages, assembly languages, Fortran, Basic,...—only the second version could be programmed. Indeed, a program with loops and references is easier to execute in a machine that is itself, in fine, a physical system with a mutable state, than a program that requires evaluating a function defined via a fixed point. + +Lisp was one of the first languages to promote the use of recursive definitions. With Lisp, for the first time, programs did away with references and side effects, and this simplified the semantics of the language, brought it close to mathematical language, allowed programmers to reason over programs in an easier way, and facilitated the task of writing complex programs. For example, it is much easier to write a program to compute the derivative of an algebraic expression using recursion than keeping track of a stack of expressions that are waiting to be treated. It was then natural to contrast the pure functional style of programming with the "impure" imperative one. + +But the first implementations of functional languages were very slow in comparison with those of imperative languages, precisely because, as we have said, it is more difficult to execute a functional program on a machine, which is a physical system, than it is to execute an imperative program. During the 1990's, the compilation techniques for functional languages made such a huge progress that efficiency is no longer a valid argument against functional programming today, except in the domain of intensive computation. + +Moreover, all modern languages include both functional and imperative features, which means that today the only valid argument to justify the choice of a particular style should be its simplicity and ease of use. + +From this point of view, it is clear that not all problems are identical. A program that computes derivatives for functional expressions is easier to express in functional style. In contrast, when we program the Logo turtle it is more natural to talk about the position of the turtle, its orientation,...—that is, its state at a given instant. It is also natural to talk about the actions that the turtle does: to move, to write a line,..., and it is not easy to express all this in a functional way: in fact, it is not natural to think of the turtle's actions as functions over the space of drawings. 
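The two styles contrasted in this chapter's closing discussion can be seen side by side on the factorial example (the Java versions mentioned above are not reproduced here). This is a hedged OCaml sketch: one definition by a fixed point, one by a loop over a reference.

```ocaml
(* Functional style: factorial as a recursive (fixed point) definition. *)
let rec fact_rec n = if n = 0 then 1 else n * fact_rec (n - 1)

(* Imperative style: factorial with a reference and a loop. *)
let fact_iter n =
  let r = ref 1 in
  for i = 1 to n do
    r := !r * i
  done;
  !r

let () = assert (fact_rec 5 = 120 && fact_iter 5 = 120)
```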
+ +There is still one point that remains mysterious: programs, whether functional or imperative, are always functions from inputs to outputs. If imperative programming brought us new ways of defining functions, which in certain cases are more practical from a Computer Science point of view than the mathematical definitions that are typical of functional languages, we could wonder whether they would also be more practical for mathematicians. However, so far the mathematical language has not adopted the notion of reference. +Gilles Dowek and Jean-Jacques LévyUndergraduate Topics in Computer ScienceIntroduction to the Theory of Programming Languages10.1007/978-0-85729-076-2_8© Springer-Verlag London Limited 2011 + +# 8. Records and Objects + +Gilles Dowek1 and Jean-Jacques Lévy2 + +(1) + +Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France + +(2) + +Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France + +Gilles Dowek (Corresponding author) + +Email: gilles.dowek@polytechnique.edu + +Jean-Jacques Lévy + +Email: jean-jacques.levy@inria.fr + +Abstract + +The final chapter of the book is dedicated to object oriented programming languages. An extension of PCF with objects is defined and implemented. But before that, an extension of PCF with records is. Then objects are introduced as records with functional fields. + +## 8.1 Records + +In the equations describing the movement of two bodies that exert a force on each other, for example, a star and a planet, their positions are represented by three coordinates (functions of time). This leads to a system of differential equations with six variables. However, instead of "flattening" them, we can group them in two packages of three variables each, obtaining a system of differential equations with vector variables. There are mathematical tools to pack several values into one: the notion of a pair, which can be iterated to build tuples, and the notion of a finite sequence. + +In programming languages we also need tools to pack several values into one. The tools that we have for this are the notion of a pair, the notion of an array, the notion of a record, the notion of an object and the notion of a module. The components of those structures are called fields. + +### 8.1.1 Labelled Fields + +To represent the position of an object on Earth by latitude, longitude and altitude, we can use a tuple with three components: the first one is the latitude of the object, the second its longitude and the third its altitude. If we decide that the tuple (a,b,c) is the pair (a,(b,c)), then the element in the left-hand side is the latitude, the one in the left-hand side of the right-hand side component is its longitude and the one on the right of the right-hand side component is its altitude. There are several other combinations, and our choice here is clearly arbitrary. + +If instead we decide that the tuple (a,b,c) is represented by a function from {0,1,2} to ℝ that associates a to 0, b to 1 and c to 2, then the latitude of the object is the real number associated by this function to 0, its longitude is the number associated to 1 and its altitude is the number associated to 2. Again, there are other alternatives, and our choice is arbitrary. + +There is no reason to place these values in a specific position in the tuple, or to associate them with one number rather than another. 
Moreover, if in a program we need to change the data structure to add another field, we will have to update the program in several places. These modifications are likely to introduce errors, and we might end up confusing longitude and temperature...

Since it is more convenient for programmers to identify the fields by a name—"latitude", "longitude",...—instead of a position or a number, programming languages offer this possibility. This leads to a notion of tuple with labelled fields, called a record. From a mathematical point of view, a record is a function whose domain is an arbitrary finite set (rather than an initial segment of ℕ), and the elements of this domain are the labels of the record.

The idea of referring to the fields by name instead of by their position in the tuple can also be used in the context of a function call. In some experimental languages, instead of writing f(4,2) we write f(abscissa = 4, ordinate = 2) or, equivalently, f(ordinate = 2, abscissa = 4).

### 8.1.2 An Extension of PCF with Records

To extend PCF with records, we add three symbols to the language: a symbol {} to build records, a symbol . to access a field in a record, and a symbol <- to build a new record, identical to one previously constructed except for the value of one field.

Before introducing these symbols we need to introduce a new sort for labels and an infinite set of constants, one for each label. Notice that there is no symbol that binds a variable of sort label, so there will be no such variables in a closed term. Moreover, the language does not include any other symbol to build terms of sort label, just the constants. Therefore, in a closed term the only subterms of sort label are constants. We can then add to PCF

  * a symbol {} with 2n arguments that does not bind any variables; the arguments at odd positions are labels and the ones at even positions are terms,

  * a symbol . with two arguments, where the first is a term and the second a label, which does not bind any variable,

  * a symbol <- with three arguments, where the first is a term, the second a label and the third a term, which does not bind any variable.

Exercise 8.1

In the definition of language that we gave in Chap. 1, each symbol has a fixed number of arguments. We cannot have, then, a symbol like {} which may have, for instance, 6 or 8 arguments. How could we fix the definition given above to make it compatible with the notion of language defined in Chap. 1? Hint: What is a list?

The term {}(l1, t1, ..., ln, tn) will be written {l1 = t1, ..., ln = tn}, the term .(t, l) will be written t.l and the term <-(t, l, u) will be written t(l <- u).

The small-step operational semantics of PCF will now include the following rules

Similarly, the big-step operational semantics is extended with the following rules

Notice that in these rules the terms of sort label are not interpreted. This is because, as mentioned above, these terms are constants.

Exercise 8.2

Write an interpreter for PCF with records.

Exercise 8.3

The goal of this exercise is to represent a Logo turtle by a record containing an abscissa, an ordinate, and an angle. The turtle should have an internal state so that it can move without changing its identity—see the introduction to Chap. 7. There are two alternatives: the turtle can be defined as a record of references to real numbers, or as a reference to a record of real numbers. Write the function move-forward in both cases. In this exercise we assume that there is a type of real numbers and all the necessary operations.
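For Exercise 8.2, here is a hedged OCaml sketch of the record operations in the call by value style: a record value maps labels to already-interpreted values. The representation by association lists and the names (rvalue, access, update) are our choices.

```ocaml
(* A record value: fields are already interpreted (call by value). *)
type rvalue = RNat of int | Record of (string * rvalue) list

(* {l1 = V1, ..., ln = Vn}.li: access a field. *)
let access (r : rvalue) (l : string) : rvalue =
  match r with
  | Record fields -> List.assoc l fields
  | RNat _ -> failwith "not a record"

(* t(l <- u): a new record, identical except for the field l. *)
let update (r : rvalue) (l : string) (v : rvalue) : rvalue =
  match r with
  | Record fields -> Record ((l, v) :: List.remove_assoc l fields)
  | RNat _ -> failwith "not a record"

let () =
  let p = Record [("a", RNat 7); ("b", RNat 2)] in
  assert (access p "a" = RNat 7);
  assert (access (update p "a" (RNat 9)) "a" = RNat 9)
```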
In the big-step operational semantics that we gave for PCF with records, the interpretation of the term {a = 3 + 4, b = 2} requires performing the addition of 3 and 4. In contrast, once the value {a = 7, b = 2} is built, an access to the field a does not require any arithmetic operation.

An alternative would be to delay the addition and consider the term {a = 3 + 4, b = 2} as a value that is interpreted as itself. In this case, we will need to interpret the term 3 + 4 each time the field a is accessed. We could say that this semantics is a call by name one, as opposed to the semantics we gave above, which follows the call by value strategy.

In call by name, the rules of the operational semantics are

Exercise 8.4

Write an interpreter for PCF with records following the call by name semantics.

If we compare these two semantics of records, we are led to make the same comments as for the semantics of functions in call by value vs. call by name: the interpretation of let x = {a = fact 10, b = 4} in x.b requires the computation of the factorial of 10 in call by value, but not in call by name. On the other hand, the interpretation of let x = {a = fact 10, b = 4} in x.a + x.a under call by name triggers the computation of the factorial of 10 twice. The interpretation of let x = {a = fix y y, b = 4} in x.b produces an infinite loop under call by value, whereas it successfully returns 4 under call by name. Finally, when we also have references, the side effects of the interpretation of a field may be repeated several times if we access the field several times—see Exercise 7.5.

For example, if we build a record x with a field a that is a reference to a natural number, initially 0, and a function inc that increases this number by one, and then we write a term that increases this value and returns it, we obtain

Under call by value, this term produces the result 1, as one expects. However, a call by name interpretation will access the field a of the record x three times, that is, it will interpret the term ref 0 three times, creating three references that point to the value 0. The third reference, created by the interpretation of the term !(x.a), is never updated, and therefore the interpretation of the program above under call by name produces the result 0.

To make sure that the call by value and the call by name interpretations produce the same result, we should avoid side effects—such as the creation of a reference in the example above—during the interpretation of fields. We can rewrite the term as follows

which guarantees that the value will be 1, whether in call by value or call by name.

Exercise 8.5

(Types for records) Consider a type person for records with three fields: surname, name and telephone. Show that we can program the three functions x(surname <- y), x(name <- y) and x(telephone <- y) without using the symbol <-, which means that this symbol is superfluous.

Would this symbol still be superfluous if we had a type contactable including all the records which contain at least the field telephone?

If we have a type person and a type contactable, do we still have unicity of types?

## 8.2 Objects

Programs usually deal with various kinds of data, often structured as records. For example, a company's computer system might deal with order forms from customers, invoices, pay slips....
A customer order might be represented as a record including the identification of the object ordered, the quantity requested... To print the data there are several alternatives. We could write a unique function print that starts by checking which kind of data we want to print—order form, pay slip...—and then prints it in a different format depending on the kind of data. Or we could write several functions: print_order_form, print_pay_slip... Alternatively, we could define a record print where each field is a printing function. Yet another option would be to make each printing function a part of the type. Such a data type is called a class, and its elements are called objects. + +In the most radical object-oriented programming style, each object, for instance, each order form, includes a different function print. An order form is then a record that contains, in addition to the standard fields—identification of the item requested, number of items ordered,...—a field print defining the printing function that should be used to print the object. + +Some languages, for instance Java, associate a print function to each class rather than each object. Thus, all the objects in the class share the printing function—whether static or dynamic. If we do not want to share the printing function for two objects t and u in the same class C, we need to define two sub-classes T and U of C, which inherit all the fields of C but redefine print differently. + +### 8.2.1 Methods and Functional Fields + +An object is simply a record where some fields are functions. In Java, where functions are not first-class objects, we must distinguish the fields that are functions from those that are not; the functional ones are called methods. + +In a language where functions are first-class objects, like PCF, this distinction is not necessary. Objects are then simply records, and we can program in an object-oriented style in the extension of PCF with records defined previously in this chapter. + +Exercise 8.6 + +The program that manages the sale of tickets for a concert is an object with the following fields + + * a reference to a natural number: the number of orchestra seats available, + + * a reference to a natural number: the number of balcony seats available, + + * a function that takes an object and a natural number as arguments—0 for orchestra and 1 for balcony—and returns the number 0 or the number 1 to indicate whether the booking is closed or there are still seats in that area, + + * a function that takes an object and a natural number as arguments—0 for orchestra and 1 for balcony—, and reserves a seat by decreasing the number of seats available in that area; by convention it returns the value 0. + +Program this object in PCF with records. + +Typing systems for records and objects are out of the scope of this book. We will only say that if we give type A to the object defined in Exercise 8.6, then A must be the Cartesian product of nat ref, nat ref, A -> nat -> nat and A -> nat -> nat. We cannot define the type A as (nat ref) × (nat ref) × (A -> nat -> nat) × (A -> nat -> nat), because this is a circular definition. To define this type, we need to introduce a fixed point operator on types. + +If X -> Y denotes the space of functions from X to Y and B is a set with at least two elements, then the recursive equation A = (A -> B) does not have a solution. Indeed, it follows from Cantor's theorem that the cardinal of the set A -> B is strictly greater than that of A. 
The equation A = (nat ref) × (nat ref) × (A -> nat -> nat) × (A -> nat -> nat) does not have a solution either. As with the construction fix in PCF, it is not trivial to give a denotational semantics for the fixed point operator on types.

### 8.2.2 What Is "Self"?

If t is the object built in Exercise 8.6, to know whether the booking is closed or there are still orchestra tickets, we need to interpret the term t.free t 0. Indeed, the function t.free takes an object u and a natural number n and indicates whether the field associated to n in u—orchestra if n = 0, balcony if n = 1—is zero or not. In other words, the method free is static, as defined for example in Java.

We now want the method free of the object t to apply to the object t itself, that is, we want to invoke it by interpreting the term t#free 0 instead of t.free t 0. In other words, we want this method to be dynamic.

One way to achieve this is to consider the term t#l as an abbreviation for t.l t. The difficulty here is that if t is an object and l a label in this object, we can only use the term t#l if the field l is a function of type A -> ... where A is the type of t itself. In other words, we can only use the term t#l if l is the label of a method. If l is the label of a field that is not a method, we still need to write t.l.

To avoid this distinction, we can state that all fields are functions. If a field a of an object t has the value 3, we transform it into a field with the functional value fun s -> 3. Thus, the term t#a, that is, t.a t, or (fun s -> 3) t, is interpreted as the value 3.

The first argument of each method in the object is then a bound variable, which is usually called self or this. In fact, most programming languages use a special variable self or this, which is implicitly bound in the object, and which denotes the object itself.

When all the methods in a record are terms of the form fun x -> ..., they can be interpreted as themselves, and we can simplify the rule

by using

Similarly, the rule

specialises to

and finally the rule

can be replaced by

To force all fields to be functions, we can modify the language of records, passing from a record language to an object-oriented language. The symbol {} now binds a variable in each even argument—the terms—, the symbol . is replaced by the symbol #, and the symbol <- now binds a variable in its third argument.

The term {}(l1, s1 t1, ..., ln, sn tn) is written {l1 = ς s1 t1, ..., ln = ς sn tn}, the term #(t, l) is written t#l and the term <-(t, l, s u) is written t(l <- ς s u). The rules of the big-step operational semantics are now

Exercise 8.7

Write an interpreter for the language PCF with objects.

Exercise 8.8

(Late binding) Consider the term

Is the value of this term 10 or 11? Compare this result with that of Exercise 2.8.

### 8.2.3 Objects and References

The standard definition of an object includes a notion of internal state, which evolves over time. Thus, it combines the notions of object and reference, which are clearly separate in the definition of functional object given above.

In a language with objects and references, when a non-functional field a = u is transformed into a = fun x -> u, the interpretation of fun x -> u does not produce the side effects produced by the interpretation of u. It is only when we access the field that the side effects become visible. Thus, the behaviour is similar to that of records under call by name. The term

is interpreted as the value 0, and not 1 as the term

in call by value. We need to rewrite this term as follows

if we want the interpretation to be the value 1.
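To make the encoding t#l = t.l t concrete, here is a hedged OCaml sketch of an object as a record whose fields are all functions of the object itself ("self"). The obj type and the send helpers are ours, loosely modelled on the booking object of Exercise 8.6.

```ocaml
(* An object whose fields are all functions of self. *)
type obj = {
  seats : obj -> int;         (* a "non-method" field, also a function of self *)
  free : obj -> int -> int;   (* a method: free self n *)
}

(* t#l is the abbreviation for t.l t. *)
let send_seats t = t.seats t
let send_free t n = t.free t n

let o = {
  seats = (fun _ -> 3);
  (* free consults the object through self, hence late binding. *)
  free = (fun self n -> if send_seats self > n then 1 else 0);
}

let () = assert (send_free o 0 = 1)
```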
We need to rewrite this term as follows [displayed term omitted in this extraction] if we want the interpretation to be the value 1.

Exercise 8.9

When we interpret a term of the form t#l, how many times is the term t interpreted? If the interpretation of t includes side effects, how many times will they take place? How can we force the term t to be interpreted only once?

Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2_9, © Springer-Verlag London Limited 2011

# 9. Epilogue

Gilles Dowek1 and Jean-Jacques Lévy2

(1) Labo. d'Informatique, École polytechnique, route de Saclay, 91128 Palaiseau, France

(2) Centre de Recherche Commun, INRIA-Microsoft Research, Parc Orsay Université, 28 rue Jean Rostand, 91893 Orsay Cedex, France

Gilles Dowek (Corresponding author) Email: gilles.dowek@polytechnique.edu

Jean-Jacques Lévy Email: jean-jacques.levy@inria.fr

Abstract

The Epilogue discusses the goals of the theory of programming languages: is its purpose to build tools that describe existing languages, or to define new ones?

The first goal of this book was to present the main tools used to define the semantics of a programming language: small-step operational semantics, big-step operational semantics, and denotational semantics.

We have stressed the fact that these three tools have the same purpose. In the three cases, the goal is to define a relation ↪ between a program, an input value and an output value. Since the goal is to define a relation, the question that arises naturally is: how do we define relations in mathematical language?

The answer is the same in the three cases: the means to achieve the goal is the fixed point theorem. However, the similarity is superficial, because the fixed point theorems are used in different ways in the three semantics. By giving rise to inductive definitions, and hence to reflexive-transitive closures, the fixed point theorem plays a major rôle in operational semantics. In contrast, it plays a minor rôle in denotational semantics, because it is only used to give the meaning of the construction fix. The denotational semantics of a language without a fixed point construction, such as Gödel's System T—see Exercise 5.13—can be defined without using the fixed point theorem.

To highlight the differences, we can look at the rôle of derivations. To establish that a term t has the value V in operational semantics, it is sufficient to exhibit a derivation or a sequence of reductions, that is, a finite object. In contrast, in denotational semantics the meaning of a term of the form fix is given as the least fixed point of a function, that is, a limit. For this reason, to establish that the value of a term t is V we sometimes need to compute the limit of a sequence, that is, we sometimes need to deal with an infinite object.

Operational semantics have an advantage over denotational ones, because the relation ↪ can be defined in a more concrete way operationally. But on the other hand, operationally we can only define relations that are recursively enumerable, whereas denotationally we can define arbitrary relations. For this reason, in operational semantics we cannot complete the definition of the relation ↪ by adding a value ⊥ for the terms that do not terminate, because the resulting relation is not recursively enumerable; it cannot be effectively defined by induction. In contrast, denotationally it is not a problem to add such a value.
We see here the dilemma that arises from the undecidability of the halting problem: we cannot complete the relation ↪ by adding ⊥ for the non-terminating terms and at the same time define it inductively. We have to choose between completing the relation and defining it inductively, which leads to two different semantics. Readers who have taken logic courses will recognise here the same issues that distinguish the truth judgements that are inductively defined, by the existence of a proof, from those that are defined by their validity in a model.

The second goal of this book was to give the semantics of some programming language features: explicit definitions of functions, functions defined by fixed points, assignment, records, objects.... Here again, since the goal is to define functions, it is useful to start by looking at the ways in which functions are defined in Mathematics. In general, the comparison between the mathematical language and programming languages is fruitful, since the mathematical language is the closest thing we have to a programming language. This comparison shows some common points, but also some differences.

The purpose of the study of programming language features is not to be exhaustive, but to show some informative examples. The point to remember is that, in the same way that Zoology is not the study of all the animal species one after the other, the study of programming languages should not consist of studying all languages one after the other. Languages should be organised according to their main features.

We could continue this study by defining data types and exceptions. The study of data types would give us the opportunity to use the fixed point theorem again, as well as Robinson's unification algorithm, of which matching is a particular case. Going forward in this direction, we could study the notion of backtracking, which leads to Prolog. Other important points that we have left aside are the polymorphic typing of references, the notion of array, imperative objects, modules, type systems for records and objects (in particular the notion of sub-type), concurrency....

The final goal of this book was to present a number of applications of these tools, in particular the design and implementation of interpreters and compilers, and also the implementation of type inference systems. The main point here is that the structure of a compiler is derived directly from the operational semantics of the language to be compiled. The next step would be the study of implementation techniques for abstract machines, which would lead us to the study of memory management and garbage collection. We could also study program analysis, and design systems to deduce properties of programs in an automatic or interactive way, for instance, the property that states that the value returned by a sorting algorithm is a sorted list.

The last point that remains to be discussed is the rôle of the theory of programming languages, and in particular whether its purpose is to describe the existing programming languages or to propose new ones.

Astronomers study the galaxies that exist, and do not build new ones, whereas chemists study the existing molecules and build new ones. We know that in the latter case, the order in which theories and production techniques appear may vary: the transformation of mass into energy was achieved a long time after the theory of relativity, whereas the steam engine appeared before the principles of thermodynamics were established.
The theory of programming languages has enabled the development of new features, such as static binding, type inference, polymorphic types, garbage collection, ..., which are now available in commercial languages. In contrast, other features, such as assignments and objects, were introduced into programming languages in an ad hoc way, and the theory has been slow to follow. The development of a formal semantics for these constructs led in turn to new proposals, such as the recent extensions of Java with polymorphic types.

The theory of programming languages has neither an exclusively descriptive rôle nor an exclusively leading rôle. It is this going backwards and forwards between the description of existing features and the design of new ones that gives the theory of programming languages its dynamism.

Gilles Dowek and Jean-Jacques Lévy, Undergraduate Topics in Computer Science: Introduction to the Theory of Programming Languages, DOI 10.1007/978-0-85729-076-2, © Springer-Verlag London Limited 2011

References

1. Abadi, M., Cardelli, L.: A Theory of Objects. Springer, Berlin (1998)

2. Dybvig, R.K.: The Scheme Programming Language, 2nd edn. Prentice Hall, New York (1996). www.scheme.com/tspl2d/

3. Gunter, C.A.: Semantics of Programming Languages: Structures and Techniques. MIT Press, Cambridge (1992)

4. Kahn, G.: Natural semantics. In: Proceedings of the Symposium on Theoretical Aspects of Computer Science, STACS, Passau (1987)

5. Mitchell, J.C.: Foundations for Programming Languages. MIT Press, Cambridge (1996)

6. Mitchell, J.C.: Concepts in Programming Languages. Cambridge University Press, Cambridge (2002)

7. Peyton Jones, S., Lester, D.: Implementing Functional Languages: A Tutorial. Prentice Hall, New York (1992)

8. Pierce, B.C.: Types and Programming Languages. MIT Press, Cambridge (2002)

9. Plotkin, G.D.: LCF considered as a programming language. Theor. Comput. Sci. 5, 223–255 (1977)

10. Plotkin, G.D.: A structural approach to operational semantics. Technical Report DAIMI FN-19, Computer Science Department, Aarhus University, Aarhus, Denmark, September 1981

11. Reynolds, J.C.: Theories of Programming Languages. Cambridge University Press, Cambridge (1998)

12. Scott, D.: Continuous Lattices. Lecture Notes in Math., vol. 274, pp. 97–136. Springer, Berlin (1972)

13. Weis, P., Leroy, X.: Le langage Caml, 2nd edn. Dunod, Paris (1999)

14. Winskel, G.: The Formal Semantics of Programming Languages. MIT Press, Cambridge (1993)
Index

A

Abstract machine

Algorithm

Damas and Milner

Hindley's

Robinson's

α-equivalence

Alphabetic equivalence

Arity

Array

B

β-reduction

Binding

dynamic

late

static

C

Call by name

Call by value

Church numeral

Closed set

Closure

recursive

Compiler

bootstrapping

Composition

Confluence

Constant

Continuous function

D

De Bruijn index

Definition

explicit

inductive

Derivation

Deterministic

E

Environment

semantic

typing

Evaluate

Evaluator

F

Fields

Fixed point

Curry

first theorem

function construction via

in PCF

second theorem

Functionalisation

H

Height

I

Interpreter

L

Label

Language

Limit

List

M

Method

dynamic

static

Module

N

Number of arguments

O

Object

Ordering

Scott's

strongly complete

weakly complete

P

Pair

PCF (Programming language for computable functions)

Polymorphism

Position numerals

R

Record

in call by name

in call by value

Redex

Reduction

call by name

call by value

lazy

weak

Reference

Register

accumulator

code

environment

stack

Renaming

Result

Rule

S

Semantics

big-step operational

denotational

small-step operational

Side effect

Solution

principal

Sort

Strategy

Subject reduction

Substitution

System

F

T

T

Term

closed

irreducible

stuck

Thunk

Tree

Type

checking

inference

principal

Type preservation

by interpretation

Type scheme

U

Unification

V

Value

extended

rational

Variable

capture

environment

mutable

diff --git a/kag/examples/csqa/builder/data/joe_celko_s_sql_programming_style.txt b/kag/examples/csqa/builder/data/joe_celko_s_sql_programming_style.txt
new file mode 100644
index 00000000..344c34c9
--- /dev/null
+++ b/kag/examples/csqa/builder/data/joe_celko_s_sql_programming_style.txt
@@ -0,0 +1,4811 @@

Joe Celko's SQL Programming Style

Joe Celko's SQL Programming Style

Joe Celko

Morgan Kaufmann

Publishing Director: Michael Forster
Publisher: Diane Cerra
Publishing Services Manager: Andre Cuello
Senior Production Editor: George Morrison
Editorial Assistant: Asma Stephan
Cover Design: Side by Side Studios
Cover Image: Side by Side Studios
Composition: Multiscience Press, Inc.
Copyeditor: Multiscience Press, Inc.
Proofreader: Multiscience Press, Inc.
Indexer: Multiscience Press, Inc.
Interior printer: The Maple-Vail Book Manufacturing Group
Cover printer: Phoenix Color Corp.

Morgan Kaufmann Publishers is an imprint of Elsevier. 500 Sansome Street, Suite 400, San Francisco, CA 94111

This book is printed on acid-free paper.

Designations used by companies to distinguish their products are often claimed as trademarks or registered trademarks. In all instances in which Morgan Kaufmann Publishers is aware of a claim, the product names appear in initial capital or all capital letters. Readers, however, should contact the appropriate companies for more complete information regarding trademarks and registration.

No part of this publication may be reproduced, stored in a retrieval system, or transmitted in any form or by any means, electronic, mechanical, photocopying, scanning, or otherwise, without prior written permission of the publisher.
Permissions may be sought directly from Elsevier's Science & Technology Rights Department in Oxford, UK: phone: (+44) 1865 843830, fax: (+44) 1865 853333, e-mail: permissions@elsevier.com.uk. You may also complete your request on-line via the Elsevier homepage () by selecting "Customer Support" and then "Obtaining Permissions".

Library of Congress Cataloging-in-Publication Data

Application submitted.

ISBN: 0-12-088797-5

For information on all Morgan Kaufmann publications, visit our Web site at www.mkp.com or www.books.elsevier.com

Printed in the United States of America

05 06 07 08 5 4 3 2 1

To Eve Astrid Andersson, Miss American Π and April Wilson, who rubs me the right way

Table of Contents

Instructions for online access

Cover

Title Page

Introduction

1.1 Purpose of the Book

1.2 Acknowledgments

1.3 Corrections, Comments, and Future Editions

Chapter 1: Names and Data Elements

1.1 Names

1.2 Follow the ISO-11179 Standards Naming Conventions

1.3 Problems in Naming Data Elements

Chapter 2: Fonts, Punctuation, and Spacing

2.1 Typography and Code

2.2 Word Spacing

2.3 Follow Normal Punctuation Rules

2.4 Use Full Reserved Words

2.5 Avoid Proprietary Reserved Words if a Standard Keyword Is Available in Your SQL Product

2.6 Avoid Proprietary Statements if a Standard Statement Is Available

2.7 Rivers and Vertical Spacing

2.8 Indentation

2.9 Use Line Spacing to Group Statements

Chapter 3: Data Declaration Language

3.1 Put the Default in the Right Place

3.2 The Default Value Should Be the Same Data Type as the Column

3.3 Do Not Use Proprietary Data Types

3.4 Place the PRIMARY KEY Declaration at the Start of the CREATE TABLE Statement

3.5 Order the Columns in a Logical Sequence and Cluster Them in Logical Groups

3.6 Indent Referential Constraints and Actions under the Data Type

3.7 Give Constraints Names in the Production Code

3.8 Put CHECK() Constraints Near What They Check

3.9 Put Multiple Column Constraints as Near to Both Columns as Possible

3.10 Put Table-Level CHECK() Constraints at the End of the Table Declaration

3.11 Use CREATE ASSERTION for Multi-table Constraints

3.12 Keep CHECK() Constraints Single Purposed

3.13 Every Table Must Have a Key to Be a Table

3.14 Do Not Split Attributes

3.15 Do Not Use Object-Oriented Design for an RDBMS

Chapter 4: Scales and Measurements

4.1 Measurement Theory

4.2 Types of Scales

4.3 Using Scales

4.4 Scale Conversion

4.5 Derived Units

4.6 Punctuation and Standard Units

4.7 General Guidelines for Using Scales in a Database

Chapter 5: Data Encoding Schemes

5.1 Bad Encoding Schemes

5.2 Encoding Scheme Types

5.3 General Guidelines for Designing Encoding Schemes

5.4 Multiple Character Sets

Chapter 6: Coding Choices

6.1 Pick Standard Constructions over Proprietary Constructions

6.2 Pick Compact Constructions over Longer Equivalents

6.3 Use Comments

6.4 Avoid Optimizer Hints

6.5 Avoid Triggers in Favor of DRI Actions

6.6 Use SQL Stored Procedures

6.7 Avoid User-Defined Functions and Extensions inside the Database

6.8 Avoid Excessive Secondary Indexes

6.9 Avoid Correlated Subqueries

6.10 Avoid UNIONS

6.11 Testing SQL

Chapter 7: How to Use Views

7.1 VIEW Naming Conventions Are the Same as Tables

7.2 VIEWs Provide Row- and Column-Level Security

7.3 VIEWs Ensure Efficient Access Paths

7.4 VIEWs Mask Complexity from the User

7.5 VIEWs Ensure Proper Data Derivation
7.6 VIEWs Rename Tables and/or Columns

7.7 VIEWs Enforce Complicated Integrity Constraints

7.8 Updatable VIEWs

7.9 Have a Reason for Each VIEW

7.10 Avoid VIEW Proliferation

7.11 Synchronize VIEWs with Base Tables

7.12 Improper Use of VIEWs

7.13 Learn about Materialized VIEWs

Chapter 8: How to Write Stored Procedures

8.1 Most SQL 4GLs Are Not for Applications

8.2 Basic Software Engineering

8.3 Use Classic Structured Programming

8.4 Avoid Portability Problems

8.5 Scalar versus Structured Parameters

8.6 Avoid Dynamic SQL

Chapter 9: Heuristics

9.1 Put the Specification into a Clear Statement

9.2 Add the Words "Set of All . . ." in Front of the Nouns

9.3 Remove Active Verbs from the Problem Statement

9.4 You Can Still Use Stubs

9.5 Do Not Worry about Displaying the Data

9.6 Your First Attempts Need Special Handling

9.7 Do Not Think with Boxes and Arrows

9.8 Draw Circles and Set Diagrams

9.9 Learn Your Dialect

9.10 Imagine That Your WHERE Clause Is "Super Ameba"

9.11 Use the Newsgroups and Internet

Chapter 10: Thinking in SQL

10.1 Bad Programming in SQL and Procedural Languages

10.2 Thinking of Columns as Fields

10.3 Thinking in Processes, Not Declarations

10.4 Thinking the Schema Should Look Like the Input Forms

Resources

Military Standards

Metadata Standards

ANSI and ISO Standards

U.S. Government Codes

Retail Industry

Code Formatting and Naming Conventions

Bibliography

Reading Psychology

Programming Considerations

Index

About the author

Introduction

I AM NOT trying to teach you to program in SQL in this book. You might want to read that again. If that is what you wanted, there are better books. This ought to be the second book you buy, not the first.

I assume that you already write SQL at some level and want to get better at it. If you want to learn SQL programming tricks, get a copy of my other book, _SQL for Smarties_ (3rd edition, 2005). I am trying to teach the reader how to work in logical and declarative terms, instead of in a procedural or OO manner—"Query Eye for the Database Guy," if you will forgive a horrible contemporary pun.

Few, if any, SQL programmers came to SQL before learning and writing for years in a procedural or object-oriented language. They then got one particular SQL product and were told to learn it on their own or with a book that has a title like "SQL for Brain-Dead Morons," "Learn SQL in Ten Easy Lessons or Five Hard Ones," or worse.

This is absurd! It takes at least five years to learn to be a master carpenter or chef. Why would you believe people could become SQL gurus in a weekend? What they become is bad SQL programmers, who speak SQL in dialect from the local SQL product with a strong accent from their previous languages. You might want to read "Teach Yourself Programming in Ten Years" by Peter Norvig (www.norvig.com/21-days.html) or "No Silver Bullet" by Fred Brooks, _Computer_, 20(4): 10–19, April 1987, to get a reality check.

The horrible part is that these people often don't know they are bad programmers. At one extreme, the entire shop where they work is just as bad, and they never see anything else. At the other extreme, if anyone tries to tell them about their problems, they become defensive or angry. If you look at postings on SQL newsgroups, many programmers just want to get a kludge for an immediate problem and not actually obtain a true long-term solution.
If these were woodworking newsgroups, their questions would be the equivalent of "What are the best kind of rocks to use to pound screws into fine furniture?" When someone tells them to use large chunks of granite, they are happy, but if you try to tell them about screwdrivers, they explode into a rage.

You might want to read an essay on this phenomenon: "Unskilled and Unaware of It: How Difficulties in Recognizing One's Own Incompetence Lead to Inflated Self-Assessments" by Justin Kruger and David Dunning (Department of Psychology, Cornell University, www.apa.org/journals/psp/psp7761121.html).

Or look at the actual and self-assessments of American high school students in mathematics and sciences that were part of the Bush administration's No Child Left Behind Act.

# 1.1 Purpose of the Book

So how did we old farts learn to be better programmers when dinosaurs walked the earth? One of the best helpers we had in the late 1970s when the structured programming revolution came along was a series of books entitled "[Pascal | FORTRAN | COBOL | BASIC] with Style: Programming Proverbs" by Henry Ledgard and some of his colleagues at MIT. The covers were done like a Victorian novel with angels, scrolls, and old-style typographical elements. And like a Victorian novel, the books were subtitled "Principles of Good Programming with Numerous Examples to Improve Programming Style and Proficiency." These books and others made a big difference for most of us because they taught us how to think like good programmers.

My goals in this book are to improve SQL programming style and proficiency. To be more exact:

1. To _help an individual programmer write Standard SQL without an accent or a dialect._ It is difficult to unlearn old habits but not impossible, and it is best to learn the right way from the start. Amateurs write code for themselves. A professional writes code to be maintained and used by other people. My rule of thumb has been that you need to have a full year of SQL programming before you have your epiphany and suddenly see the world in three-valued logic, data models, and sets.

2. _To give an SQL shop a coding standard for internal use._ I have tried carefully to give a rationale for each of my rules, and I have given exceptions to those rules when I could think of them. You may disagree with some of my choices, but you will have to provide research and examples to defend your position. It is not good enough to simply declare: "Well, that's the way we wrote code in FooTran, so it must be the will of God!" as an argument.

If you are the team leader, you now have a book (and author) that you can hold up and blame for anything that your people do not like. Even if I am later shown to be wrong about something, you will have been consistent. It is much easier to repair errors if they were made consistently.

3. _To give programmers the mental tools to approach a new problem with SQL as their tool._ I tell people it takes about a year to "get it" and drop your procedural programming habits.

# 1.2 Acknowledgments

Craig Mullins provided the structure of the chapter on VIEWs in an article in www.DBAzine.com. The formatting style is taken from a house style I have used in CMP magazines and other publications for more than a decade. Peter Gulutzan provided the data for the naming conventions in actual products from an article in www.DBAzine.com. The affix conventions in Chapter 1 are based on internal standards from Teradata Corporation.
The scales and measurements and the encoding schemes material appeared in several of my old magazine columns in _DBMS_ and _Database Programming & Design_ before they were collected into a chapter in my book _Data and Databases_ (Morgan Kaufmann Publishers). I have tried to give credit in the text, but so many people have participated in the newsgroups over the years that I know I am forgetting someone.

And, obviously, thanks to Henry Ledgard and his "Programming Proverbs" series for the inspiration.

I would also like to thank all of the newbie programmers who wrote bad code. It sounds a bit sarcastic, but it is not meant to be. Many of the newbies are programmers who were thrown into a DBA or SQL programmer job by management without training or an experienced mentor. I do not want to blame the victims unless they are really not working on getting better. Your errors in syntax, semantics, and style showed me how you were thinking. Diagnosis is the first step to treatment.

# 1.3 Corrections, Comments, and Future Editions

Corrections and additions for future editions can be sent to Morgan-Kaufmann publishers directly or to me at my e-mail address, jcelko212@earthlink.net.

CHAPTER 1 Names and Data Elements

This is the old joke:

"When I was a kid, we had three cats."

"What were their names?"

"Cat, cat, and cat."

"That sounds screwed up. How did you tell them apart?"

"Who cares? Cats don't come when you call them anyway!"

YOUR DATA WILL not come when it is called either if you do not give it a name that is always distinct and recognizable. This is an important part of any database project. Bad names for the data elements make the code difficult, or even impossible, to read.

I am not kidding about impossible to read. In the old days, software companies used to deliberately scramble source code names and remove formatting to hide the algorithm from the buyers. The tradition seems to linger on, even if not by intent. In August 2004, a SQL newsgroup had a posting in which all of the names were one letter and a long string of digits.

There are now ISO-11179 metadata standards that describe rules for naming data elements and for registering standards. Because they are an ISO standard, they are what you should be using in SQL as well as everywhere else.

That standard, a bit of typography, and some common sense will give you the rules you need to get started.

# 1.1 Names

In the early days, every programmer had his or her own personal naming conventions. Unfortunately, they were often highly creative. My favorite was a guy who picked a theme for his COBOL paragraph names: one program might use countries, another might use flowers, and so forth. This is obviously weird behavior even for a programmer, but many programmers had personal systems that made sense to themselves but not to other people.

For example, the first FORTRAN I used allowed only six-letter names, so I became adept at using and inventing six-letter names. Programmers who started with weakly typed or typeless languages like to use Hungarian notation (see Leszynski and Reddick). Old habits are hard to give up.

When software engineering became the norm, every shop developed its own naming conventions and enforced them with some kind of data dictionary. Perhaps the most widespread set of rules was MIL STD 8320.1, set up by the U.S. Department of Defense, but it never became popular outside of the federal government.
This was a definite improvement over the prior nonsystem, but each shop varied quite a bit; some had formal rules for name construction, whereas others simply registered whatever the first name given to a data element was.

Today, we have ISO-11179 standards, which are becoming increasingly widespread, required for certain government work, and being put into data repository products. Tools and repositories of standardized encoding schemes are being built to this standard. Given this and XML as a standard exchange format, ISO-11179 will be the way that metadata is referenced in the future.

## 1.1.1 Watch the Length of Names

#### Rationale:

The SQL-92 standards have a maximum identifier length of 18 characters. This length came from the older COBOL standards. These days, SQL implementations allow longer names, but if you cannot say it in 18 characters, then you have a problem. Table 1.1 shows the maximum length for names of the most important SQL schema objects according to ISO and several popular SQL products.

Table 1.1 _Identifier lengths_ [table body not reproduced in this extraction]

The numbers in the table are either bytes or characters. A maximum character length can be smaller than a maximum byte length if you use a multibyte character set.

Do not use super-long names. People have to read them, type them, and print them out. They also have to be able to understand those names when they look at the code, search for them in the data dictionary, and so forth. Finally, the names need to be shared in host programs that might not allow the same maximum length.

But do not go to the other extreme of highly condensed names that are impossible to read without weeks of study. The old Bachman design tool was used to build DB2 databases back when column length was limited to 18 bytes. Sometimes the tool would change the logical attribute name to a physical column name by removing all of the vowels. Craig Mullins referred to this as "Bachman having a vowel movement on my DDL." This is a bad approach to getting the name to fit within a smaller number of characters.

#### Exceptions:

These exceptions would be on a case-by-case basis and probably the result of legacy systems that had different naming restrictions.

## 1.1.2 Avoid All Special Characters in Names

#### Rationale:

Special characters in a name make it difficult or impossible to use the same name in the database and the host language programs or even to move a schema to another SQL product.

Table 1.2 shows the characters allowed in names by the standards and popular SQL products.

Table 1.2 _Identifier character sets_ [table body not reproduced in this extraction]

Generally, the first character of a name must be a letter, whereas subsequent characters may be letters, digits, or _ (underscore). Any database management system (DBMS) might also allow $, #, or @, but no DBMS allows all three, and in any case the special characters are not usable everywhere (Microsoft attaches special meaning to names that begin with @ or # and Oracle discourages special characters in the names of certain objects).

But what is a letter? In the original SQL, all letters had to be uppercase Latin, so there were only 26 choices. Nowadays the repertoire is more extensive, but be wary of characters outside the Latin-1 character set for the following reasons:

1. _IBM cannot always recognize a letter._ It just accepts that any multibyte character except space is a letter and will not attempt to determine whether it's uppercase or lowercase.

2. _IBM and Oracle use the database's character set and so could have a migration problem with exotic letters._ Microsoft uses Unicode and so does not have this problem.
Intermediate SQL-92 does not allow an identifier to end in an underscore. It is also not a good idea to put multiple underscores together; modern printers make it difficult to count the number of underscores in a chain.

#### Exceptions:

None

## 1.1.3 Avoid Quoted Identifiers

#### Rationale:

This feature was added to SQL-92. Its main use has been to alias column names to make printouts look like reports. This kludge defeats the purpose of a tiered architecture. Instead, it destroys portability of the code and invites poorly constructed names. Table 1.3 shows the characteristics of delimited identifiers.

Table 1.3 _Quoted identifier character sets_ [table body not reproduced in this extraction]

If you find the character-set restrictions of names onerous, you can avoid them by putting identifiers inside double quotes. The result is a delimited identifier (or quoted identifier in Oracle terminology). Delimited identifiers may start with, and contain, any character. It is a bit uncertain how one can include the double quote (") character. The standard way is to double it, as in "Empl""oyees", but that's not always documented.

Support for delimited names is nearly universal, with only two major exceptions: (1) IBM will not allow nonalphanumeric characters for labels and variable names inside stored procedures, and (2) Microsoft will not allow quoted identifiers if the QUOTED_IDENTIFIER switch is off. The reason for the first exception is, perhaps, that IBM converts SQL procedures into another computer language before compilation. Suppose you make a table with a delimited identifier, for example (the book's display is not reproduced here; this reconstruction matches the discussion of "t" in the next section):

CREATE TABLE "t" (a INTEGER NOT NULL); -- reconstructed example

Now try to get that table with a regular identifier, thus:

SELECT a FROM t; -- reconstructed example

Will this work? According to the SQL standard, it should not, but with Microsoft, it might. The reason is case sensitivity, which we discuss in Section 1.1.4.

The quoted identifiers do not work well with host languages, especially when they have spaces or special characters. For example, this is a valid insertion statement: [displayed statement omitted in this extraction] ADO generates the following code: [displayed statement omitted in this extraction] which is a syntax error.

#### Exceptions:

If you need to communicate a result to someone who cannot read or understand the properly constructed column names in Latin-1, then use quoted aliases to format the output. I have done this for Polish and Chinese speakers.

I also use quoted names inside documentation so that they will immediately read as the name of a schema object and not a regular word in the sentence.

The usual reason for this error is that the programmer confuses a data element name with a display header. In traditional procedural languages, the data file and the application are in the same tier; in SQL, the database is totally separate from the front end where the data is displayed.

## 1.1.4 Enforce Capitalization Rules to Avoid Case-Sensitivity Problems

#### Rationale:

Case-sensitivity rules vary from product to product.

Standard SQL, IBM, and Oracle will convert regular identifiers to uppercase but will not convert delimited identifiers to uppercase. For Microsoft, the case-sensitivity rule has nothing to do with whether the name is regular or delimited. Instead, identifiers depend on the default collation. If the default collation is case insensitive, then t equals T. If it's case sensitive, then t does not equal T.

To sum up, there are two case-sensitivity problems.
The first is that the delimited identifier "t" and the regular identifier t differ if one follows the SQL standard. The second is that Microsoft does not follow the SQL standard. These problems make it difficult for one naming convention to fit everyone.

#### Exceptions:

I will give a simple set of rules based on principles of readability and typography, but there are other possible conventions:

1. Avoid delimited identifiers so you have no problems.

2. IBM uses only uppercase. Unfortunately, this is difficult to read and looks like you are still programming on a punchcard system.

3. Microsoft and Oracle use lowercase except where it would look odd. Unfortunately, the definition of looking odd is not at all precise. Sometimes reserved words are uppercased, sometimes lowercased, and so forth.

# 1.2 Follow the ISO-11179 Standards Naming Conventions

This is a fairly new ISO standard for metadata, and it is not well understood. Fortunately, the parts that a SQL programmer needs to know are pretty obvious and simple. The real problem is in the many ways that people violate them. A short summary of the NCITS L8 Metadata Standards Committee rules for data elements can be found on the committee's public sites, in a related PDF file, and in a draft document. [The URLs are omitted in this extraction.]

The ISO-11179 standard is broken down into six sections. [The list of sections is omitted in this extraction.]

## 1.2.1 ISO-11179 for SQL

#### Rationale:

Although the formal standards are good, they are very general. It is handy to have a set of rules aimed at the SQL developer in his or her own language. Some of the interpretations given here are the consensus of experts, as taken from newsgroups and private e-mails.

Taking the rules from Section ISO-11179-4, a scalar data element should do the following:

1. Be unique (within any data dictionary in which it appears).

2. Be stated in the singular.

3. State what the concept is, not only what it is not.

4. Be stated as a descriptive phrase or sentence(s).

5. Contain only commonly understood abbreviations.

6. Be expressed without embedding definitions of other data elements or underlying concepts.

7. Tables, sets, and other collections shall be named with a collective, class, or plural name.

8. Procedures shall have a verb in their name.

9. A copy (alias) of a table shall include the base table name as well as the role it is playing at that time.

This formalism is nice in theory, but names are subject to constraints imposed by software limitations in the real world, such as maximum name length and character sets. Another problem is that one data element may have many names depending on the context in which it is used. It might be called something in a report and something else in an electronic data interchange (EDI) file, and it might be different from the name in the database. But you want to avoid using multiple names in the same database, and you should be able to detect them with metadata tools. Furthermore, you want to avoid using multiple names in different databases in the same enterprise. Unfortunately, this is much more difficult to detect without very good data dictionary tools. The data dictionary should include the external names and their context.

#### Exceptions:

The curse of legacy databases, legacy file systems, and other traditions can make this very difficult. If there is a common, well-understood name for a data element, then you can use this name instead of a constructed name.
For example, "us_postal_code" is formally correct, but "zip_code" is well understood, and you can argue for simply "zip" or "zip4" as a name because it is a familiar term. + +## 1.2.2 Levels of Abstraction + +Name development begins at the conceptual level. An object class represents an idea, abstraction, or thing in the real world, such as tree or country. A property is something that describes all objects in the class, such as height or identifier. This lets us form terms such as "tree height" or "country identifier" from the combination of the class and the property. + +The level in the process is the logical level. A complete logical data element must include a form of representation for the values in its data value domain (the set of possible valid values of a data element). The representation term describes the data element's representation class. The representation class is equivalent to the class word of the prime/class naming convention with which many data administrators are familiar. This gets us to "tree height measure," "country identifier name," and "country identifier code" as possible data elements. + +There is a subtle difference between "identifier name" and "identifier code," and it might be so subtle that we do not want to model it, but we would need a rule to drop the property term in this case. The property would still exist as part of the inheritance structure of the data element, but it would not be part of the data element name. + +Some logical data elements can be considered generic elements if they are well defined and are shared across organizations. Country names and country codes are well defined in the ISO 3166 standard, "Codes for the Representation of Names of Countries," and you might simply reference this document. + +Note that this is the highest level at which true data elements, by the definition of ISO-11179, appear: They have an object class, a property, and a representation. + +The next is the application level. This is usually done with a quantifier that applies to the particular application. The quantifier will either subset the data value domain or add more restrictions to the definition so that we work with only those values needed in the application. + +For example, assume that we are using ISO-3166 country codes, but we are only interested in Europe. This would be a simple subset of the standard, but it will change slowly over time. However, the subset of countries with more than 20 centimeters of rain this year will vary greatly in a matter of weeks. + +Changes in the name to reflect this fact will be accomplished by addition of qualifier terms to the logical name. For example, if a view were to list all of the countries with which a certain organization had trading agreements, the query data element might be called "trading_partner_country_name" to show its role in the context of the VIEW or query that limits it. The data value domain would consist of a subset of countries listed in ISO-3166. + +The physical name is the lowest level. These are the names that actually appear in the database table column headers, file descriptions, EDI transaction file layouts, and so forth. They may be abbreviations or use a limited character set because of software restrictions. However, they might also add information about their origin or format. + +In a registry, each of the data element names and name components will always be paired with its context so that we know the source or usage of the name or name component. 
## 1.2.3 Avoid Descriptive Prefixes

#### Rationale:

Another silly convention among newbies is to use prefixes that describe something about the appearance of the data element in the current table. In the old days, when we worked with sequential file systems, the physical location of the file was very important.

The "tbl-" prefix is particularly silly. Before you counter that this prefix answers the question of what something is, remember that SQL has only one data structure. What else could it be? Do you put "n-" in front of every noun you write? Do you think this would make English easier to read? It is like infants announcing that everything is "thingie!" as they grab them.

"_To be something is to be something in particular; to be nothing in particular or anything in general is to be nothing._" —Aristotle

The next worst affix is the <table name>. Why does a data element become something totally different from table to table? For example, "orders_upc" and "inventory_upc" are both UPC codes no matter where they appear, but by giving them two names, you are saying that they are totally, logically different things in your data model.

A total nightmare is the combination of "id" in a base table (vague name) with a reference in a second table using the base table name as a prefix in the foreign key or non-foreign-key references. The queries fill up with code like "Orders.ID = OrderID," which quickly becomes a game of looking for the period and trying to figure out what a thousand different "ID" columns mean in the data dictionary.

Affixes like "vw" for views tell you how the virtual table is implemented in the schema, but this has nothing to do with the data model. If I later decide to replace the view with a base table, do I change the name? The bad news is that a table often already exists with the same root name, which makes for more confusion.

Equally silly and dangerous are column names that are prefixed with the data type. This is how it is physically represented and not what it means in the data model. The data dictionary will be trashed, because you have no idea if there are "intorder_nbr," "strorder_nbr," and perhaps even "forder_nbr," all trying to be the simple "order_nbr" at the same time. The user can also look at the data declaration language (DDL) and see the data type, defaults, and constraints if he or she does not remember them.

The final affix problem is telling us that something is the primary key with a "PK_" or a foreign key with an "FK_" affix. That is how it is used in that particular table; it is not a part of its fundamental nature. The user can also look at the DDL and see the words "PRIMARY KEY" or "FOREIGN KEY .. REFERENCES.." in the column declarations.

The strangest version of this is a rule on a Web site for a company that specializes in Oracle programming.
_CK_" for CHECK() constraints. This not only gives you no help in determining the errors that caused the violation, but it also limits you to one and only one constraint per column per table, and it leaves you to ask about constraints that use two or more columns. + +The same rules and warnings about affixes apply to all schema objects. You will see "usp_" for user-defined stored procedures, "trig_" for triggers, and so forth. In MS SQL Server, this is a serious problem, because the prefix "sp_" is used for system procedures and has special meaning in the architecture. + +If the schema object does something (triggers, procedures), then use a format for the name; the subject of the sentence is understood to be the procedure. We will go into more details on this topic in Chapter 8. + +#### Exceptions: + +You can find other opinions at: + + + +There was also a series of articles at: + +http://www.sqlservercentral.com/​columnists/​sjones/​codingstandardspart2formatting.asp + +http://www.sqlservercentral.com/​columnists/​sjones/​codingstandardspart1formatting.asp + +## 1.2.4 Develop Standardized Postfixes + +This list of postfixes is built on Teradata's internal standards and common usage. The Teradata standards are given in the Appendix. + +"_id" = identifier. It is unique in the schema and refers to one entity anywhere it appears in the schema. Never use "
_id"; that is a name based on location and tells you this is probably not a real key at all. Just plain "id" is too vague to be useful to anyone and will screw up your data dictionary when you have to find a zillion of them, all different, but with the same data element name and perhaps the same oversized data type. + +"_date" or "dt" = date, temporal dimension. It is the date of something—employment, birth, termination, and so forth; there is no such column name as just a date by itself. + +"_nbr" or "num" = tag number. This is a string of digits that names something. Do not use "_no" because it looks like the Boolean yes/no value. I prefer "nbr" to "num" because it is used as a common abbreviation in several European languages. + +"_name" or "nm" = alphabetic name. This explains itself. It is also called a nominal scale. + +"_code" or "_cd" = a code is a standard maintained by a trusted source, usually outside of the enterprise. For example, the ZIP code is maintained by the U.S. Postal Service. A code is well understood in its context, so you might not have to translate it for humans. + +"_size" = an industry standard or company scale for a commodity, such as clothing, shoes, envelopes, or machine screws. There is usually a prototype that defines the sizes kept with a trusted source. + +"_tot" = a sum, an aggregated dimension that is logically different from its parts. + +"_seq" = sequence, ordinal numbering. This is not the same thing as a tag number, because it cannot have gaps. + +"_tally" = a count of values. Also called an absolute scale. + +"_cat" = category, an encoding that has an external source that has distinct groups of entities. There should be strong, formal criteria for establishing the category. The classification of Kingdom in Biology is an example. + +"_class" = an internal encoding that does not have an external source that reflects a subclassification of the entity. There should be strong formal criteria for the classification. The classification of plants in Biology is an example. + +"_type" = an encoding that has a common meaning both internally and externally. Types are usually less formal than a class and might overlap. For example, a driver's license might be typed for motorcycles, automobiles, taxis, trucks, and so forth. + +The differences among type, class, and category are an increasing strength of the algorithm for assigning the type, class, or category. A category is distinct; you will not often have to guess if something is animal, vegetable, or mineral to put it in one of those categories. + +A class is a set of things that have some commonality; you have rules for classifying an animal as a mammal or a reptile. You may have some cases for which it is more difficult to apply the rules, such as the platypus, an egg-laying mammal that lives in Australia, but the exceptions tend to become their own classification—monotremes in this example. + +A type is the weakest of the three, and it might call for a judgment. For example, in some states a three-wheeled motorcycle is licensed as amotorcycle, but in other states, it is licensed as an automobile, and in some states, it is licensed as an automobile only if it has a reverse gear. + +The three terms are often mixed in actual usage. Stick with the industry standard, even if it violates the aforementioned definitions. + +"_status" = an internal encoding that reflects a state of being, which can be the result of many factors. For example, "credit_status" might be computed from several sources. 
+ +"_addr" or "_loc" = an address or location for an entity. There can be a subtle difference between an address and a location. + +"_img" = an image data type, such as.jpg,.gif, and so forth. + +Then an application might have some special situations with units of measurement that need to be shown on an attribute or dimension. And _always_ check to see if there is an ISO standard for a data element. + +## 1.2.5 Table and View Names Should Be Industry Standards, Collective, Class, or Plural Nouns + +#### Rationale: + +Industry standards should always be used. People in that industry will understand the name, and the definition will be maintained by the organization that sets those standards. + +For example, the North American Industry Classification System (NAICS) has replaced the old Standard Industrial Classification (SIC) system in the United States. This new code was developed jointly by the United States, Canada, and Mexico to provide new comparability in statistics about business activity across North America. The names "NAICS" and "naics_code" are clear to people who do business statistics, even though they look weird to the rest of us. + +If an industry standard is not right for your situation, then try to base your names on that standard. For example, if I am dealing only with automobiles made in Mexico, I could have a table named "VIN_Mexico" to show the restriction. Moving down the priority list, if I cannot find an industry standard, I would look for a collective or class name. I would never use a singular name. + +Collective or class table names are better than singular names because a table is a set and not a scalar value. If I say "Employee," the mental picture is of Dilbert standing by himself—one generic employee. If I say "Employees," the mental picture is of the crew from Dilbert—acollection of separate employees. If I say "Personnel," the mental picture is suddenly more abstract—a class without particular faces on it. + +It is legal in SQL to give a table and a column the same name, but it is a really bad idea. First of all, the column's name would be in violation of the rules we just discussed because it would lack a qualifier, but it would also mean that either the table name is not a set or the column name is not a scalar. + +#### Exceptions: + +Use a singular name if the table actually has one and only one row in it. The one example I can think of is a table for constants that looks like this: + +The insertion creates one row, so the table ought to have a singular name. The "lock" column assures you that there is always only one row. Another version of this is to create a VIEW that cannot be changed using SQL-99 syntax. + +The advantage is that this view cannot be changed; the disadvantage is that this view cannot be changed. + +## 1.2.6 Correlation Names Follow the Same Rules as Other Names . . . Almost + +#### Rationale: + +Correlation names are names. They should be derived from the base table or view name, the column name, or from the expression thatcreates them. The nice part is that the readers have the context in front of them, so you can often use a more abbreviated name. + +A correlation name is more often called an _alias,_ but I will be formal. In SQL-92, they can have an optional AS operator, and it should be used to make it clear that something is being given a new name. + +This explicitly means that you do not use an alphabetical sequence unrelated to the base table name. 
This horrible practice is all too common and makes maintaining the code much more difficult. Consider looking at several statements where the table "Personnel" is aliased as "A" in one, "D" in another, and "Q" in a third because of its position in a FROM clause.

Column correlation names for a computed data element should name the computed data element in the same way that you would name a declared column. That is, try to find a common term for the computation. For example, "(salary + COALESCE(commission, 0.00)) AS total_pay" makes sense to the reader.

A simple table or view correlation name should have a short, simple name derived from the base table name or descriptive of the role that copy of the table is playing in the statement (e.g., "SELECT .. FROM Personnel AS Management, Personnel AS Workers" as the two uses of the table in the query).

Now to explain the "almost" part of this section's title. In the case of multiple correlation names on the same table, you may find it handy to postfix abbreviated names with a number (e.g., "SELECT .. FROM Personnel AS P1, Personnel AS P2"). The digit is to tell the reader how many correlation names are used in the statement for that table.

In effect, these are "correlation pronouns"—a shorthand that makes sense in a local context. They are used for the same reason as pronouns in a natural language: to make the statement shorter and easier to read.

A table expression alias should have a short, simple name derived from the logical meaning of the table expression.

Although not required, the correlation name on a table expression can be followed by a list of new column names in parentheses. If this list is missing, the correlation name inherits the names from the base tables or views in the table expression. In the case of a simple table correlation name, such a list would probably be redundant because we usually want to use the original column names.

In the case of a table expression correlation name, such a list would probably be a good idea to avoid ambiguous column names. It also forces the programmer to trim the expression of extraneous columns that were not actually needed in the query.

#### Exceptions:

If there is no obvious, clear, simple name for the table correlation name, then use an invented name, such as a single letter like X. Likewise, if a computation has no immediate name, then you might use an invented name.

## 1.2.7 Relationship Table Names Should Be Common Descriptive Terms

#### Rationale:

Tables and views can model relationships, usually one-to-many or many-to-many, as well as entities. If the relationship has a common name that is understood in the context, then use it. There is a tendency for newbies to concatenate the names of the tables involved to build a nonce word. For example, name a table "Marriages," because that is the common term for that relationship, rather than "ManWoman," "HusbandsWives," or something really weird. Likewise, "Enrollment" makes more sense than "Students_Courses"; once you start looking for the names, they come easily.

This concatenation falls apart when the relationship is not a simple binary one, such as an escrow on a house that has a buyer, a seller, and a lender.

#### Exceptions:

If there is no common term for the relationship, you will need to invent something, and it might well be a concatenation of table names.
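Pulling the fragments quoted in section 1.2.6 into a single runnable statement, here is a hedged sketch of the convention; the Personnel column names are invented for the example:

SELECT Workers.emp_name,
       (Workers.salary + COALESCE(Workers.commission, 0.00)) AS total_pay
  FROM Personnel AS Workers,
       Personnel AS Management
 WHERE Workers.boss_emp_nbr = Management.emp_nbr; -- role names, not arbitrary letters

The role names "Workers" and "Management" carry meaning that single letters like A, D, or Q cannot.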
## 1.2.8 Metadata Schema Access Objects Can Have Names That Include Structure Information

This rule does not apply to the schema information tables, which come with standardized names. It is meant for naming indexes and other things that deal directly with storage and access. The postfix "_idx" is acceptable.

#### Rationale:

This is simply following the principle that a name should tell you what something is. In the case of indexes and other things that deal directly with storage and access, that is what they are. They have nothing to do with the data model.

#### Exceptions:

This does not apply to schema objects that are seen by the user. Look for the rules for the other schema objects as we go along.

# 1.3 Problems in Naming Data Elements

Now that we have talked about how to do it right, let's spend some time on common errors in names that violate the rules we set up.

## 1.3.1 Avoid Vague Names

#### Rationale:

_"That sounds vaguely obscene to me! I can't stand vagueness!"_ —Groucho Marx

At one extreme the name is so general that it tells us nothing. The column is a reserved word such as "date," or it is a general word like "id," "amount," "date," and so forth. Given a column called "date," you have to ask, "date of what?" An appointment? Birth? Hire? Termination? Death? The name begs the question on the face of it.

At the other extreme, the name is made useless by telling us a string of qualifiers that contradict each other. Consider the typical newbie column name like "type_code_id" as an example. If it is an identifier, then it is unique for every entity that has it, like the vehicle identification number (VIN) on an automobile. If it is a code, then what is the trusted source that maintains it, like a ZIP code? It is drawn from a domain of values that is not unique. If it is a type, then what is the taxonomy to which it belongs? Why not go all the way and call it "type_code_id_value" instead?

Why did we not find a mere "customer_type" that would have been understood on sight?

#### Exceptions:

None

Improperly formed data element names seem to be the result of ignorance and object-oriented (OO) programming. In particular, OO programmers put "_id" on every primary key in every table and have problems understanding that SQL is a strongly typed language in which things do not change their data types in programs. The names get absurd at times. Consider a lookup table for colors (the book's display is not reproduced here; this reconstructed sketch shows the kind of DDL being criticized):

CREATE TABLE Colors -- reconstructed sketch of the bad design
(color_value_id INTEGER NOT NULL PRIMARY KEY,
 color_value VARCHAR(15) NOT NULL);

But what does "_value_id" mean? Names like this are generated without thought or research. Assume that we are using the Pantone color system in the database, so we have a trusted source and a precise description—we did the research! This might have been written as follows (again a reconstructed sketch):

CREATE TABLE Colors -- reconstructed sketch of the researched design
(pantone_nbr CHAR(7) NOT NULL PRIMARY KEY,
 color_description VARCHAR(25) NOT NULL);

## 1.3.2 Avoid Names That Change from Place to Place

#### Rationale:

The worst possible design flaw is changing the name of an attribute on the fly, from table to table. As an example, consider this slightly cleaned-up piece of actual code from a SQL newsgroup: [the query is omitted in this extraction]

Those full table names are difficult to read, but the newbie who wrote this code thinks that the table name must _always_ be part of the column name. That is the way that a file worked in early COBOL programs.

This means that if you have hundreds of tables, each appearance of the same attribute gets a new name, so you can never build a proper data dictionary. Did you also notice that it is not easy to see underscores, commas, and periods?
The cleaned-up version clearly shows a simple star schema centered on the IPC table.

I have no idea what a URN is, but it looks like a standard identifier of some kind. Look at all of the kinds of "URNs" (i.e., URN, IPCURN, and OffenseURN) in the original version of the query. It gives you the feeling of being in a crematorium gift shop.

As you walk from room to room in your house, do you also change your name based on your physical location? Of course not! The name we seek identifies the entity, not the location.

#### Exceptions:

Aliases inside a query can temporarily give a new name to an occurrence of a data element. These aliases are temporary and disappear at the end of the statement. We discussed the rules for this in Section 1.2.6.

## 1.3.3 Do Not Use Proprietary Exposed Physical Locators

#### Rationale:

The most basic idea of modern data modeling is to separate the logical model and the physical implementation from each other. This allows us to reuse the model on different platforms and not be tied to just one platform.

In the old days, the logical and physical implementations were fused together. I will explain this in more detail in the next chapter, but for now the rule is to never use proprietary physical locators. We want to have portable code. But the real problem is that a proprietary physical locator violates the basic idea of a key in the relational model.

When new SQL programmers use IDENTITY, GUID, ROWID, or other auto-numbering vendor extensions to get a key that can be used for locating a given row, they are imitating a magnetic tape's sequential access. It lets them know the order in which a row was added to the table—just like individual records went onto the end of the magnetic tape!

We will spend more time discussing this flaw in Chapter 3.

#### Exceptions:

You might want to fake a sequential file when you are using a SQL table structure for some purpose other than a relational database management system (RDBMS). For example, tables used for staging and scrubbing data outside the "Real Schema" do not have any data integrity issues.

# CHAPTER 2 Fonts, Punctuation, and Spacing

CODE IS USUALLY set in a monospace font. After more than a century of manual typewriters and decades of punchcards, we find that it is actually easier to read code in a monospace font than in a proportional font. Punctuation marks get the same spacing as a letter in a monospace font but would be lost in a proportional font.

# 2.1 Typography and Code

Your brain and eyes do not follow code the same way that they follow text, process mathematics, read maps, or look at pictures. In fact, there are a lot of individual differences in human brains.

Some people like text editors that use colors for various syntax elements in a programming language. Other people get headaches from colored program editors and want to see black-and-white text. Likewise, a newspaper that put nouns in red, verbs in green, and other such things would simply not work. Yet black-and-white maps are much more difficult to read than those with colors. Why? This has to do with color perception and how fast you can switch between the left and right halves of your brain.

There is a test for brain damage in which the examiner flashes cards with words printed in various colored inks (e.g., the word "RED" written in green ink). The examiner asks the subject for the word or the color and times the responses.
The rate is fairly constant over the subject's lifetime, so a change is a symptom of some physical or chemical change. Now, try reading this phrase:

    Paris in the
    the spring

Almost nobody reading this for the first time catches the fact that the word "the" appears twice. The point is that there is a vertical component to how we read text in chunks of words.

Code on a page is read from left to right and from top to bottom, with a lot of vertical eye movement that you would not have if you were reading pure text.

A few years ago, the following posting made the rounds in newsgroups. I am not sure if it is genuinely from Cambridge University, but it makes its point very nicely:

Aoccrdnig to rscheearch at Cmabrigde Uinervtisy, it deosn't mttaer in waht oredr the ltteers in a wrod are, the only iprmoetnt tihng is taht the frist and lsat ltteer be at the rghit pclae. The rset can be a total mses and you can sitll raed it wouthit porbelm. Tihs is bcuseae the huamn mnid does not raed ervey lteter by istlef, but the wrod as a wlohe.

Because the parser guarantees that running code will not have syntax and spelling errors like those in the above text, the reader knows what token to expect next with far more certainty than in plain text. Not only are words seen as wholes, but they are also anticipated within each statement in the programming language. That is, if I see an "IF" token in Pascal or another member of the Algol family, I anticipate the matching "THEN" that completes the statement.

Let's discuss some basic typographic conventions for programming code, which are based on how people read it.

## 2.1.1 Use Only Upper- and Lowercase Letters, Digits, and Underscores for Names

#### Rationale:

This subset of characters will port to any other programming language. It is very handy to be able to use the same names in both the database and the host languages of the applications.

For example, the octothorpe or number sign (#) is allowed in several SQL products, but it has a special meaning in other programming languages and could not be used in them.

#### Exceptions:

If you are still programming on a machine that uses punchcards, then you have no choice but to use the limited, uppercase-only character set. It is hard to imagine such a situation in the 21st century.

If the SQL implementation requires special symbols for certain names, then you have no choice. For example, temporary table names begin with an octothorpe and parameter names begin with a "petite snail" or "at sign" (@) in the Sybase/SQL Server T-SQL dialects. However, it is a good idea to be sure that the names are unique without the special characters, so you can port the code to a more modern implementation.

Do not use an underscore as the first or last letter in a name. It looks like the name is missing another component. Leading or trailing underscores also get lost visually without letters or digits around them, thanks to laser-quality printers. Likewise, do not use more than one underscore in a row. The old mechanical line printers could not align underscores, so you could eyeball them, whereas laser printers are microscopically precise.

## 2.1.2 Lowercase Scalars Such as Column Names, Parameters, and Variables

#### Rationale:

Words in books and newspapers are written in lowercase letters because they are easier to read than uppercase words. This is basic typography. Using all uppercase letters is the worst choice. Lowercase text is also read faster than uppercase text.
The first measurements are in Woodworth (1938), and Smith and Fisher (1975) have confirmed it. Participants were asked to read comparable passages of text, half completely in uppercase text and half presented in standard lowercase text. In each study, participants read reliably faster with the lowercase text by a 5 percent to 10 percent speed difference.

#### Exceptions:

Unless there is a compelling physical reason, use lowercase. The only compelling physical reason I can think of is that you are still using punchcards in the 21st century.

## 2.1.3 Capitalize Schema Object Names

#### Rationale:

Schema objects include tables, views, stored procedures, and so forth. Capitalized words begin a sentence in languages that use the Latin alphabet. Additionally, capitalization represents proper nouns—like the names of sets being modeled by tables in SQL—in English, German, and other natural languages. This is the way that readers expect to see these names; don't surprise them.

#### Exceptions:

Unless the name naturally begins with a lowercase letter, there is no reason not to capitalize it.

## 2.1.4 Uppercase the Reserved Words

#### Rationale:

Uppercase words are seen as a unit, rather than being read as a series of syllables or letters. The eye is drawn to them, and they act to announce a statement or clause. That is why headlines and warning signs work.

Typographers use the term _bouma_ for the shape of a word. The term appears in Paul Saenger's book (1975). Imagine each letter on a rectangular card that just fits it, so you see the ascenders, descenders, and baseline letters as various-sized "Lego blocks" that are snapped together to make a word.

The bouma of an uppercase word is always a simple, dense rectangle, and it is easy to pick out of a field of lowercase words. Consider a statement written entirely in lowercase, then the same statement with its reserved words in uppercase: see how quickly you can find each clause when the keywords stand out, reading from left to right. Next, if you put each clause on a line of its own, you can read the code faster still. Sketches of all three forms appear at the end of this subsection.

We will deal with rules for the vertical components later.

#### Exceptions:

None

Keywords come in two types, reserved and nonreserved words. The reserved words are part of the SQL language; the nonreserved words are metadata names that appear in the environment and will not cause syntax errors in an actual SQL program. They are also not very likely to be used in a real application.

Vendors will also have proprietary reserved words, which should also be uppercased.
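Here is a minimal sketch of the three forms (an invented query; any simple statement shows the effect). First, everything in lowercase:

```sql
select city_name, sum(order_amt) from Orders group by city_name having sum(order_amt) > 100.00;
```

versus the same statement with the reserved words in uppercase:

```sql
SELECT city_name, SUM(order_amt) FROM Orders GROUP BY city_name HAVING SUM(order_amt) > 100.00;
```

and again with each clause on a line of its own:

```sql
SELECT city_name, SUM(order_amt)
  FROM Orders
 GROUP BY city_name
HAVING SUM(order_amt) > 100.00;
```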
## 2.1.5 Avoid the Use of CamelCase

#### Rationale:

The eye tends to look for a word in its usual lowercase or capitalized form, so CamelCase words tend to lead the eye to the pieces rather than to the whole word. In particular, a CamelCase word that begins with a lowercase letter will be scanned starting at the first uppercase letter and then scanned backward to get the first syllable.

Another problem is that you need to agree on how to mix the cases. For example, should it be "upcCode," "UpcCode," "UPCcode," or "UPCCode"? In practice, you can wind up with several versions of the same name.

It is even more difficult to read text in alternating case; that is, where the letters of a word change from uppercase to lowercase multiple times within a word (e.g., "AlTeRnAtInG cAsE"). The bouma shape is different from the same word in its lowercase form. Alternating case has been shown to be more difficult than either lowercase or uppercase text in a variety of studies.

Smith (1969) showed that it slowed the reading speed of a passage of text. Mason (1978) showed that the time to name a word was slowed.

Pollatsek, Well, and Schindler (1975) showed that word matching was hindered. Meyer and Gutschera (1975) showed that category decision times increased.

#### Exceptions:

If the word naturally appears in CamelCase, such as "MacDonald," then use it. If you begin the object name with an uppercase letter, then you can optionally use it. However, never use CamelCase for a scalar.

# 2.2 Word Spacing

Put one space between language tokens and do not jam things into a stream. For example, do write "foobar = 21" instead of "foobar=21," as you will often see. Many programmers who grew up with punchcards were taught to use minimal white space to save the limited number of columns. For example, FORTRAN II does not need any spaces at all in its code, nor does the original IBM job control language (JCL) for the IBM/360 family. Modern programming languages are not this restricted, and we now have the ability to write code as if people were more important than computers.

#### Rationale:

We are now living in the 21st century, and you can add white space for readability without running over the edge. That is a screen and not a punchcard in front of you.

#### Exceptions:

You might have to wrap exceptionally long lines. This is not as big a problem in a concise language like SQL as it was in a verbose language like COBOL.

# 2.3 Follow Normal Punctuation Rules

#### Rationale:

Try to follow the rules that you would for English punctuation, because people are used to reading English and their eyes expect certain conventions.

1. In SQL in particular, you need to follow the rule about having a space after a comma, because the comma and the period are easy to confuse or to miss visually. Compare a SELECT list written as "a,b,c" with one written as "a, b, c."

2. Put commas at the end of a line, not the start. A comma, semicolon, question mark, or period is a visual signal that something has just ended, not that it is starting. Having a comma at the start of a line will make the eye tick leftward as it looks for that missing word that was expected before the comma.

Instead, put comma-separated lists on one line so they can be read left to right instead of vertically. If you split the list into two or more lines, see that each line contains related data elements.

3. Put a new line or at least a space after a semicolon to separate statements.

4. Put a space between words even when you could crowd them together.

#### Exceptions:

If SQL does not work the same way as English, then you have to follow the SQL syntax rules.

Many of the code-formatting habits people have go back to habits they were taught by programmers who grew up with punchcard data processing. Because we have video terminals and text editors today, a lot of those habits no longer have any basis.

The practice of putting a comma in front of a single variable on a single line goes back to punchcards. It was often difficult for programmers to get to a keypunch machine to create their decks of cards. In this format, you could pull or insert a card to change your code. There is no excuse for this practice since we now have video terminals.

English and European languages are read left to right and then top to bottom. This scanning pattern is so deeply learned that we arrange schematics, comic books, maps, and other graphics the same way.
To see how much changing that order can throw you off, try to read a Japanese or Chinese comic book. The panels are in right-to-left order, and the word balloons are read top to bottom. This is why typographers have a rule that you do not set long lists of words vertically, one word to a line: shown such a list, about one-third of readers fail to spot a misspelled word in it. Likewise, it is difficult to locate duplicates and errors in those long vertical lists of names. SQL formatting can use vertical alignment to advantage, but only for things that should be chunked together.

# 2.4 Use Full Reserved Words

#### Rationale:

SQL allows you to skip some reserved words and to abbreviate others. Try to use the full forms to document the program. This is a good thing in COBOL, and it works in SQL as well.

For example, an alias can be written with or without an AS operator. That is, "Personnel AS P1" is equivalent to "Personnel P1" in a FROM clause, and "(salary + commission) AS total_pay" is equivalent to "(salary + commission) total_pay" in a SELECT list. But the AS reserved word makes it easier to see that there is an alias and not a comma in these situations.

Technically, you can abbreviate INTEGER to INT and DECIMAL to DEC, but the full names are preferred. The abbreviations look like the reserved word "into" or the month "Dec" in English.

#### Exceptions:

The exception is to use the shorter forms of the character data types. That is, CHAR(n) instead of CHARACTER(n), VARCHAR(n) instead of CHARACTER VARYING(n), NCHAR(n) instead of NATIONAL CHARACTER(n), and NVARCHAR(n) instead of NATIONAL CHARACTER VARYING(n). The full names are too long to be comfortable to a reader. Even COBOL, the most verbose programming language on earth, allows some abbreviations.

# 2.5 Avoid Proprietary Reserved Words if a Standard Keyword Is Available in Your SQL Product

#### Rationale:

Sticking to standards will make your code readable to other SQL programmers who might not know your dialect. It also means that your code can run on other products without being rewritten.

Standard code will protect you from failure when the proprietary syntax is dropped or modified. That unwelcome surprise occurred in several products when the vendors added the Standard SQL versions of OUTER JOINs and deprecated their old proprietary versions. In particular, SQL Server programmers had to unlearn their *= syntax and semantics for outer joins.

The other disadvantage of proprietary features is that they change over time and have no standard behavior. For example, the BIT data type in SQL Server changed its NULL-ability between product releases. Oracle could not tell an empty string from a NULL. There are lots of other examples. Because there is no external standard to appeal to, a vendor is free to do anything it wishes.

#### Exceptions:

If your SQL product does not yet support standard syntax for something, then you have no choice. This is true for temporal functions. They were late getting to Standard SQL, so the early vendors made up their own syntax and internal temporal models.

# 2.6 Avoid Proprietary Statements if a Standard Statement Is Available

#### Rationale:

This rule ought to be obvious. Sticking to standards will make your code readable to other SQL programmers who might not know your dialect. It also means that your code can run on other products without being rewritten. Standard code will protect your code from failure when the proprietary syntax is dropped or modified.
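As a hedged illustration of that risk (the table names are invented; the *= form is the old T-SQL syntax mentioned above, shown purely as history), compare the deprecated outer join with the standard form that replaced it:

```sql
-- old proprietary T-SQL outer join, later deprecated by the vendor
SELECT C.cust_name, O.order_nbr
  FROM Customers AS C, Orders AS O
 WHERE C.cust_nbr *= O.cust_nbr;

-- Standard SQL outer join; runs on any conforming product
SELECT C.cust_name, O.order_nbr
  FROM Customers AS C
       LEFT OUTER JOIN
       Orders AS O
       ON C.cust_nbr = O.cust_nbr;
```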
In fact, a vendor can actually give you proprietary features that are unpredictable! In the "Books On Line" interactive manual that comes with Microsoft SQL Server, we get a warning in the REMARKS section about the proprietary "UPDATE .. FROM .." syntax that tells us:

The results of an UPDATE statement are undefined if the statement includes a FROM clause that is not specified in such a way that only one value is available for each column occurrence that is updated (in other words, if the UPDATE statement is not deterministic). For example, given the UPDATE statement in the following script, both rows in table S meet the qualifications of the FROM clause in the UPDATE statement, but it is undefined which row from S is used to update the row in table T.

This replaces a prior behavior found in the Sybase and Ingres family where the UPDATE .. FROM would do multiple updates, one for each joined row in the second table.

In older versions of Sybase/SQL Server, if a base table row is represented more than once in the embedded query, then that row is operated on multiple times instead of just once. This is a total violation of relational principles, but it's easy to do with the underlying physical implementation. Here is a quick example, reconstructed in the sketch later in this section: create a one-row table T1 and a four-row table T2, and then try to update T1 by doubling all the rows that have a match in T2.

The FROM clause gives you a CROSS JOIN, so you get a series of four actions on the same row (1 => 2 => 4 => 8 => 16). These are pretty simple examples, but you get the idea. There are subtle things with self-joins and the diseased mutant T-SQL syntax that can hang you in loops by changing things, or you can have tables that depend on the order of the rows for their results, and so forth.

SQL Server and Sybase used different fixes for this problem in later versions of their products. Sybase did a hidden "SELECT DISTINCT" in the implied query, and SQL Server gets an unpredictable row. Standard SQL is consistent and clear about aliases, views, and derived tables, as well as being a highly orthogonal language.

If the UPDATE clause could take an alias, according to the Standard SQL model, then you would create a copy of the contents of that base table under the alias name, then update that copy, and delete it when the statement was over—in effect doing nothing to the base table.

If the UPDATE clause could take a FROM clause, according to the Standard SQL model, then you would create a result set from the table expression, then update that copy, and delete it when the statement was over—in effect doing nothing to the base tables.

Because this syntax is so proprietary, inconsistent with the standard model, and ambiguous, why does it exist? In the original Sybase product, the physical model made this "extension" relatively easy to implement, and there were no standards or a good understanding of the relational model back then. Programmers got used to it, and then it was almost impossible to fix.
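Here is a hedged reconstruction of that example (the table names T1 and T2 come from the text; the column name is invented):

```sql
CREATE TABLE T1 (a INTEGER NOT NULL);
INSERT INTO T1 VALUES (1);

CREATE TABLE T2 (a INTEGER NOT NULL);
INSERT INTO T2 VALUES (1), (2), (3), (4);

-- proprietary UPDATE..FROM; the bare FROM clause is a CROSS JOIN,
-- so the single row of T1 is paired with all four rows of T2
UPDATE T1
   SET a = 2 * T1.a
  FROM T1, T2;
```

Under the old Sybase behavior, the one row of T1 is updated once per joined row, so it is doubled four times (1 => 2 => 4 => 8 => 16); under the later SQL Server rule, the result is simply undefined.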
When I lived in Indianapolis in the mid-1970s, my neighbor had graduated from General Motors' private college and gone to work for the company. His first job was investigating industrial accident reports. We were having a beer one night, and he got to telling war stories from the various General Motors plants he had been to for his job. His conclusion after a year on that job was that all industrial accidents are bizarre suicide attempts. People would go to the machine shop and build clever devices to short around the safety features on their equipment so they could work a little faster.

For example, if you make a clamp that holds in one of the two safety switches that operates a small stamping machine, you can push the other button with one hand and work material with your free hand. Well, you can do this until that free hand is crushed just above the wrist and squirts across the back wall of the shop anyway. Trading speed for safety and correctness will eventually catch up with you.

#### Exceptions:

If your SQL product does not yet support standard syntax for something, then you have no choice. For example, Oracle did not support the CASE expression, but its DECODE() function is quite close to it and can be substituted in older versions of Oracle.

# 2.7 Rivers and Vertical Spacing

When you look at a magazine or newspaper, you will notice that the text is set in a column that is even on both sides. This is called justified text, as opposed to ragged right or ragged left text. Extra spacing is added to each line to justify the text, but if this extra spacing appears in the same location on several rows, you get rivers.

A river is a vertical open space in text, and it is considered to be bad typography. You want to read text from left to right, top to bottom, with a visual break at the indentation or new line that marks the start of a paragraph. A river pulls your eye downward and makes the text more difficult to read.

It is easy to set up what typographers call rivers in program code in a monospace font, because you can add spacing as needed. In code, however, that same downward river effect aligns the statement on a vertical axis and makes the program easier to read; the sketch at the end of section 2.8 shows a statement set along such a river. Take the river out, and the statement turns back into a wall of text.

# 2.8 Indentation

When you have to indent in block-structured 3GL programming languages, use three spaces. A single space is too short to be read as anything but a word separator. Two spaces will work because that is what you were probably taught to use in typing classes at the end of a sentence, but three spaces or a new line is clearly a paragraph to the reader.

Indenting five or more spaces actually hurts readability. The eye has to skip over too far to grab the code. In particular, the use of an eight-space tab character is historical. The early Teletype machines had 80 characters per line and set tabs at eight spaces for mechanical reasons. That became the definition when we moved to electronic terminals.

The rule for SQL is that rivers override what we were doing in the old 3GL languages.

#### Rationale:

What we need in data manipulation language (DML) is a balance of indentation and the use of rivers to show the logical nesting. In the sketch below, note how the subquery has a river to hold it together and that the subquery is placed against the river.

#### Exceptions:

A subquery is always inside parentheses, so one can make a case that the closing parenthesis should align vertically with its mate.

The advantage is that you can quickly find the limits of the subquery, but at the cost of extra lines that hold only one or two tokens.

When you have a group of related columns in the SELECT clause list or other places, then use the three-space rule to indent the members of the group when you have to go to a second line. In the sketch below, the customer columns are on one line, while the 10 payments are split over three lines with an indentation to group them.
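The following is a minimal sketch pulling these layout rules together (the table and column names are invented): the keywords line up along a vertical river, the subquery sits against the river, and the group of payment columns is indented three spaces under the customer columns.

```sql
SELECT C.cust_name, C.cust_addr,
          C.payment_01, C.payment_02, C.payment_03, C.payment_04,
          C.payment_05, C.payment_06, C.payment_07, C.payment_08,
          C.payment_09, C.payment_10
  FROM Customers AS C
 WHERE C.cust_nbr
       IN (SELECT O.cust_nbr
             FROM Orders AS O
            WHERE O.order_amt > 100.00);
```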
# 2.9 Use Line Spacing to Group Statements

#### Rationale:

Use one new line between related statements and two new lines between separate steps in the same process.

Clusters of related code on a page show the reader which statements perform each step of a process. It is also a good idea to introduce each step with a high-level comment, but we will get into that later.

As an experiment to demonstrate how important visual clustering is, make some flash cards with some red circles on them. On one set of flash cards, arrange the spots in the patterns in which they appear on a double-nine set of dominoes. On a second set of flash cards, put the spots on at random.

Show the cards to your subjects for one second each and call out the number of the card. Ask them to write down the number of spots on each card. When there is no arrangement, most people start having problems at five spots, and almost nobody can handle eight or more randomly arranged spots. However, nine spots in a three-by-three arrangement present no problems. Even the 10 spots on a playing card are easy to count because they are broken into two clusters of five spots.

#### Exceptions:

The double spacing between steps can be optional if it breaks up the flow of the code.

# CHAPTER 3 Data Declaration Language

_"[I need] Data! Data! Data! I can't make bricks without clay."_
—Sherlock Holmes (fictional detective of author Sir Arthur Conan Doyle)

_"Smart data structures and dumb code works a lot better than the other way round."_
—Eric S. Raymond

I BELIEVE THAT MOST of the bad SQL queries in the world are the result of bad schema design. A bad schema can be ambiguous, require extra work to fetch data, and not return valid results even when good data was input into it.

Let's start with the syntax rules that should be followed when writing data declaration language (DDL), and then in the following chapters, talk about the content and semantics of the DDL.

# 3.1 Put the Default in the Right Place

#### Rationale:

The DEFAULT constraint appears after the data type, and the NOT NULL constraint appears after the DEFAULT value.

The SQL-92 standard requires that ordering, but most products allow you to place the DEFAULT either after the data type or after the NOT NULL constraint. A NULL-able column can also have a DEFAULT value, so the standard makes sense. Because we need a consistent pattern, let's go with the standard. Because NOT NULL is so common, it can be left on the same line as the DEFAULT and data type.

#### Exceptions:

None

# 3.2 The Default Value Should Be the Same Data Type as the Column

#### Rationale:

That rule sounds obvious, but programmers do not follow it. You will see columns with decimal places defaulted to integer zero, columns of CHAR(n) defaulted to strings of less than (n) characters, and columns of TIMESTAMP defaulted to DATE. The result in many SQL products was an implicit type conversion whenever a default value was used. Why incur that overhead, when you could get it right in the first place?

#### Exceptions:

None

# 3.3 Do Not Use Proprietary Data Types

#### Rationale:

Proprietary data types do not port to other products or from one release to another of the same product. Standard SQL has more than enough data types to model most of the things you will find in the real world.

As an example, only the SQL Server/Sybase family has a MONEY data type. It adds currency symbols and commas to a numeric string for display, but it has different rules for doing computations than the NUMERIC or DECIMAL data types. The front end has to handle the currency symbols and commas and be sure that the basic math is correct. Why do something in the DDL only to undo it in the front end?
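A hedged sketch pulling rules 3.1 through 3.3 together (the table is invented): each DEFAULT follows its data type, NOT NULL follows the DEFAULT, each default value matches its column's type, and DECIMAL does the work that a proprietary MONEY type would otherwise do.

```sql
CREATE TABLE Invoices
(invoice_nbr CHAR(10) NOT NULL PRIMARY KEY,
 invoice_date DATE DEFAULT CURRENT_DATE NOT NULL,
 total_amt DECIMAL(12,2) DEFAULT 0.00 NOT NULL);
```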
Even worse, machine-level things like a BIT or BYTE data type have no place in a high-level language like SQL. SQL is a high-level language; it is abstract and defined without regard to physical implementation. This basic principle of data modeling is called _data abstraction._

Bits and bytes are the lowest units of hardware-specific, physical implementation you can get. Are you on a high-end or low-end machine? Does the machine have 8-, 16-, 32-, 64-, or 128-bit words? Twos-complement or ones-complement math? Hey, the standards allow decimal-based machines, so bits do not exist at all! What about NULLs? To be a data type, you have to have NULLs, so what is a NULL bit? By definition, a bit is on or off and has no NULL.

What does the implementation of the host languages do with bits? Did you know that +1, +0, -0, and -1 are all used for Booleans, but not consistently? And that is across all of the host languages: present, future, and not yet defined. Surely no good programmer would ever write nonportable code by getting down to such a low level as bit fiddling!

You might also ask if zero is used for "successful completion" in the functions of the host language or the vendor's own 4GL. There are two situations in practice. Either the bits are individual attributes, or they are used as a vector to represent a single attribute. In the case of a single attribute, the encoding is limited to two values, which do not port to host languages or other SQLs, cannot be easily understood by an end user, and cannot be expanded.

In the second case, what some newbies, who are still thinking in terms of second- and third-generation programming languages or even punchcards, do is build a vector for a series of yes/no status codes, failing to see the status vector as a single attribute. Did you ever play the children's game "20 Questions" when you were young?

Imagine you have six components for a loan approval, so you allocate bits in your second-generation model of the world. You have 64 possible vectors, but only 5 of them are valid (i.e., you cannot be rejected for bankruptcy and still have good credit). For your data integrity, you can:

1. Ignore the problem. This is actually what most newbies do. When the database becomes a mess without any data integrity, they move on to the second solution.

2. Write elaborate ad hoc CHECK() constraints with user-defined functions or proprietary bit-level library functions that cannot port and that run like cold glue.

Now we add a seventh condition to the vector: Which end does it go on? Why? How did you get it in the right place on all the possible hardware that it will ever use? Did the code that references a bit in a word by its position still do it right after the change?

You need to sit down and think about how to design an encoding of the data that is high level, general enough to expand, abstract, and portable. For example, is that loan approval a hierarchical code? A concatenation code? A vector code? Did you provide codes for unknown, missing, and N/A values? It is not easy to design such things!

#### Exceptions:

Very, very special circumstances where there is no alternative at the present time might excuse the use of proprietary data types. In 20 years of consulting on SQL programming, I have never found a situation that could not be handled by a basic data type or a CREATE DOMAIN statement.
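As a minimal sketch of that last point (the domain name and rules are invented), a CREATE DOMAIN statement can package a standard data type with its default and constraints, which is usually all a "special" proprietary type was buying you:

```sql
-- a portable stand-in for a proprietary MONEY type
CREATE DOMAIN Money_Amt AS DECIMAL(12,2)
    DEFAULT 0.00
    CHECK (VALUE >= 0.00);
```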
Next, consider porting a proprietary data type by building a user-defined distinct type that matches the proprietary data type. This is not always possible, so check your product. If the data type is exotic, such as Geo/Spatial data, sound, images, or documents, you should probably do the job in a specialized system and not in SQL.

# 3.4 Place the PRIMARY KEY Declaration at the Start of the CREATE TABLE Statement

#### Rationale:

Having the key as the first thing you read in a table declaration gives you important information about the nature of the table and how you will find the entities in it. For example, if I have a table named "Personnel" and the first column is "ssn," I immediately know that we track employees via their Social Security numbers.

#### Exceptions:

In the case of a compound primary key, the columns that make up the key might not fit nicely into the next rule (3.5). If this is the case, then put a comment by each component of the primary key to make it easier to find.

# 3.5 Order the Columns in a Logical Sequence and Cluster Them in Logical Groups

#### Rationale:

The physical order of the columns within a table is not supposed to matter in the relational model. Their names, and not their ordinal positions, identify columns, but SQL has ordinal positions for columns in tables in default situations. The SELECT * and INSERT INTO statements use the order of declaration in their default actions.

This rule is obvious; people prefer a logical ordering of things to a random mix. For example, the columns for an address are best put in their expected order: name, street, city, state, and postal code.

#### Exceptions:

Thanks to columns being added after the schema is in place, you might not be able to arrange the table as you would like in your SQL product. Check to see if your product allows column reordering.

If you have a physical implementation that uses the column ordering in some special way, you need to take advantage of it. For example, DB2 for z/OS logs changes from the first byte changed to the last byte changed, unless the row is variable; then it logs from the first byte changed to the end of the row. If the change does not cause the length of the variable row to change size, it goes back to logging from the first byte changed to the last byte changed. The DBA can take advantage of this knowledge to optimize performance by placing:

 * Infrequently updated nonvariable columns first
 * Infrequently updated variable-length columns next
 * Frequently updated columns last
 * Columns that are frequently modified together next to each other

Following this approach will cause DB2 to log the least amount of data most of the time. Because the log can be a significant bottleneck for performance, this approach is handy. You can always create the table and then create a view for use by developers that resequences the columns into the logical order if it is that important.

# 3.6 Indent Referential Constraints and Actions under the Data Type

#### Rationale:

The idea is to make the full column declaration appear as one visual unit when you read down the CREATE TABLE statement. In particular, put the ON DELETE and ON UPDATE clauses on separate lines.

The standard does not require that they appear together in any particular order. As an arbitrary decision, I am going to tell you to use alphabetical order, so ON DELETE comes before ON UPDATE if both are present.
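Here is a minimal sketch of the rule (the tables are invented, and it assumes a Customers table exists): the REFERENCES clause and its actions indent under the data type, with ON DELETE before ON UPDATE.

```sql
CREATE TABLE Orders
(order_nbr INTEGER NOT NULL PRIMARY KEY,
 cust_nbr INTEGER NOT NULL
    REFERENCES Customers (cust_nbr)
    ON DELETE CASCADE
    ON UPDATE CASCADE);
```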
#### Exceptions:

None

# 3.7 Give Constraints Names in the Production Code

#### Rationale:

The constraint name will show up in error messages when it is violated. This gives you the ability to create meaningful messages and easily locate the errors.

The syntax is simply "CONSTRAINT <constraint name>," and the name should be a clear statement, in name form, of what has been violated: for example, "valid_upc_format" or "nonnegative_unit_price," as in the sketch at the end of section 3.8.2.

If you do not provide a name, the SQL engine will probably provide a machine-generated name that is very long, impossible to read, and will give you no clue about the nature of your problem.

#### Exceptions:

You can leave off constraint names on PRIMARY KEY, UNIQUE, and FOREIGN KEY constraints, because most SQL products will give an explicit error message about them when they are violated. The exception to this exception is that Oracle will use the system-generated name when it displays the execution plans.

You can leave off constraint names during development work. However, remember that constraint names are global, not local, because the CREATE ASSERTION statement would have problems otherwise.

# 3.8 Put CHECK() Constraints Near What They Check

#### Rationale:

Put a single-column CHECK() constraint on its column and multicolumn constraints near the columns involved.

We want as much information about a column on that column as possible. Having to look in several places for the definition of a column can only cost us time and accuracy. Likewise, put multicolumn constraints as near to the columns involved as is reasonable.

#### Exceptions:

If your SQL product has a CREATE DOMAIN statement, you will include DEFAULT and CHECK() constraints in the domain declaration, so the use of the DOMAIN is enough. Multicolumn constraints on columns that are far apart should be moved to the end of the table declaration. This will give you one place to look for the more complex constraints, rather than trying to look all over the DDL statement.

It can also be argued that none of this really matters, because most of the time we should be going to the schema information tables to retrieve the constraint definitions, not the DDL. Constraints may have been removed or added with subsequent ALTER statements, and the system catalog will have the correct, current state, whereas the DDL may not.

## 3.8.1 Consider Range Constraints for Numeric Values

#### Rationale:

The whole idea of a database is that it is a single trusted repository for all of the data in the enterprise. This is the place where the business rules must be enforced.

The most common constraint on numbers in a data model is that they are not less than zero. Now look at actual DDL and see how often you find that constraint. Programmers are lazy and do not bother with this level of detail.

#### Exceptions:

When the column really can take any value whatsoever.

## 3.8.2 Consider LIKE and SIMILAR TO Constraints for Character Values

#### Rationale:

Again, the whole idea of a database is that it is a single trusted repository for all of the data in the enterprise. This is the place where the business rules must be enforced.

An encoding will have a format that can be validated with a LIKE or SIMILAR TO predicate. Now look at actual DDL and see how often you find that constraint. This is not as portable an option as numeric range checking, and many programmers who did not use UNIX in their youth have problems with regular expressions, but it is still important.

#### Exceptions:

When the column really can take any value whatsoever.
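The following is a minimal sketch of rules 3.7, 3.8.1, and 3.8.2 together (the table, the simplified UPC format, and the constraint names are invented for illustration): a named range constraint on a number and a named format constraint on an encoding.

```sql
CREATE TABLE Products
(upc CHAR(12) NOT NULL PRIMARY KEY
    CONSTRAINT valid_upc_format
       CHECK (upc SIMILAR TO '[0-9]{12}'),
 unit_price DECIMAL(10,2) NOT NULL
    CONSTRAINT nonnegative_unit_price
       CHECK (unit_price >= 0.00));
```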
## 3.8.3 Remember That Temporal Values Have Duration

There is no such thing as a point in time. You can ask Einstein or go back to the Greek philosopher Zeno and his famous paradoxes. Temporal values have duration, and you need to remember that they have a start and finish time, either explicitly or implicitly, that includes all of the continuum bound by them. The implicit model is a single column, and the explicit model uses a pair of temporal values.

For example, when you set a due date for a payment, you usually mean any point from the start of that day up to but not including the start of the following day. When you say an employee worked on a given date, you usually mean the event occurred during an eight-hour duration within that day.

Remember that you can use a DEFAULT CURRENT_TIMESTAMP on a temporal column and that a NULL can be used as a marker for "eternity" in the finish time column. A CHECK() constraint can round off time values to the start of the nearest year, month, day, hour, minute, or second as needed.

## 3.8.4 REAL and FLOAT Data Types Should Be Avoided

Most commercial applications do not need floating-point math. SQL has NUMERIC and DECIMAL data types that can be set to a great deal of scale and precision and do not have floating-point rounding errors. There will be exceptions for scientific and statistical data.

# 3.9 Put Multiple Column Constraints as Near to Both Columns as Possible

#### Rationale:

Do not make the reader look in multiple physical locations to find all of the columns involved in the constraint. You do not have to indent this constraint, but it is a good idea to split it onto two lines: one with the CONSTRAINT clause and one with the CHECK() clause.

#### Exceptions:

This is not always physically possible, especially when many columns are involved.

# 3.10 Put Table-Level CHECK() Constraints at the End of the Table Declaration

#### Rationale:

These constraints are not yet well supported in SQL products, but they are legal SQL-92 syntax. Their predicates involve the entire table as a whole rather than just single rows. This implies that they will involve aggregate functions.

#### Exceptions:

None

# 3.11 Use CREATE ASSERTION for Multi-table Constraints

#### Rationale:

Put multiple-table CHECK() constraints in CREATE ASSERTION statements rather than in a table declaration.

These constraints are not yet well supported in SQL products, but they are legal SQL-92 syntax. Their predicates involve several different tables, not just one table. This implies that they are at a higher level and should be modeled there. The practical consideration is that all constraints are TRUE on an empty table, so the CREATE ASSERTION statement lets you control that possibility. The assertion name acts as the constraint name.

#### Exceptions:

If the SQL product does not support the CREATE ASSERTION statement, then this cannot be done. Otherwise, violating this rule would require a strong reason having to do with the schema design.

# 3.12 Keep CHECK() Constraints Single Purposed

#### Rationale:

Put simple CHECK() constraints in their own clauses rather than writing one long constraint with multiple tests.

When you give a constraint a name, that name will appear in error messages and can help the user to correct data. If all of the validation is in one single CHECK() clause, what name would you give it? For example, imagine a single validation for an address line that looks for correct capitalization, extra spaces, and a length over five characters. About all you can call it is "bad address line" and hope the user can figure out how to fix it. However, if there were separate checks for capitalization, extra spaces, and a length over five characters, then those constraint names would be obvious and give the user a clue as to the actual problem.
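A minimal sketch of that idea (the table and constraint names are invented): one test per named constraint, so the error message itself tells the user what is wrong.

```sql
CREATE TABLE Mailing_Addresses
(addr_line VARCHAR(35) NOT NULL
    CONSTRAINT addr_line_long_enough
       CHECK (CHAR_LENGTH(addr_line) > 5)
    CONSTRAINT addr_line_has_no_double_spaces
       CHECK (addr_line NOT LIKE '%  %')
    CONSTRAINT addr_line_starts_with_capital
       CHECK (SUBSTRING(addr_line FROM 1 FOR 1) BETWEEN 'A' AND 'Z'));
```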
#### Exceptions:

If your SQL product supports the SIMILAR TO predicate (a version of grep() based on the POSIX standard in Standard SQL), then you might consider having a longer regular expression with OR-ed patterns that fall under a general constraint name.

If you do not want to give details about errors to users for security reasons, then you can use a single constraint with a vague name. This would be a strange situation.

# 3.13 Every Table Must Have a Key to Be a Table

#### Rationale:

This is the very definition of a table. The problem is that many newbies do not understand what a key really is. A key must be a subset of the attributes (columns) in the table. There is no such thing as a universal, one-size-fits-all key. Just as no two sets of entities are the same, the attributes that make them unique have to be found in the reality of the data. God did not put a 17-letter Hebrew number on the bottom of everything in creation.

Here is my classification of the types of keys (Table 3.1).

Table 3.1 _Types of keys_

1. A _natural key is a subset of attributes that occurs in a table and acts as a unique identifier._ The user sees them. You can go to the external reality and verify them. You would also like to have some validation rule. Example: UPC codes on consumer goods are easily seen (read the package bar code), and you validate them with a scanner, a manual check-digit calculation, or a manufacturer's Web site.

2. _An artificial key is an extra attribute added to the table that is seen by the user._ It does not exist in the external reality but can be verified for syntax or check digits inside itself. Example: the open codes in the UPC scheme that a user can assign to his or her own products. The check digit still works the same way, but you have to verify the codes inside your own enterprise.

If you have to construct a key yourself, it takes time to design it, to invent a validation rule, and so forth. There is a chapter on that topic in this book: Chapter 5 discusses the design of encoding schemes.

3. _An exposed physical locator is not based on attributes in the data model and is exposed to the user._ There is no way to predict it or verify it. The system obtains a value through some physical process in the storage hardware that is totally unrelated to the logical data model. Example: IDENTITY columns in the T-SQL family; other proprietary, nonrelational auto-numbering devices; and cylinder and track locations on the hard drive used in Oracle.

Technically, these are not really keys at all, because they are attributes of the physical storage and are not even part of the logical data model, but they are handy for lazy, non-RDBMS programmers who don't want to research or think! This is the worst way to program in SQL.

4. _A surrogate key is system generated to replace the actual key behind the covers where the user never sees it._ It is based on attributes in the table. Example: Teradata hashing algorithms, pointer chains.
The fact that you can never see or use them for DELETE and UPDATE or create them for INSERT is vital. When users can get to them, they will screw up the data integrity by getting the real keys and these physical locators out of sync. The system must maintain them.

Notice that people get exposed physical locators and surrogates mixed up; they are totally different concepts.

## 3.13.1 Auto-Numbers Are Not Relational Keys

In an RDBMS, the data elements exist at the schema level. You put tables together from attributes, with the help of a data dictionary, to model entities in SQL.

But in a traditional 3GL application, the names are local to each file because each application program gives them names and meaning. Fields and subfields had to be completely specified to locate the data. There are important differences between a file system and a database, a table and a file, a row and a record, and a column and a field. If you do not have a good conceptual model, you hit a ceiling and cannot get past a certain level of competency.

In 25 words or less, it is "logical versus physical," but it goes beyond that. A file system is a loose collection of files, which have a lot of redundant data in them. A database system is a single unit that models the entire enterprise as tables, constraints, and so forth.

## 3.13.2 Files Are Not Tables

Files are independent of each other, whereas tables in a database are interrelated. You open an entire database, not single tables within it, but you do open individual files. An action on one file cannot affect another file unless they are in the same application program; tables can interact without your knowledge via DRI actions, triggers, and so on.

The original idea of a database was to collect data in a way that avoided redundant data in too many files and did not depend on a particular programming language.

A file is made up of records, and records are made up of fields. A file is ordered and can be accessed by a physical location, whereas a table is not. Saying "first record," "last record," and "next n records" makes sense in a file, but there is no concept of a "first row," "last row," and "next row" in a table.

A file is usually associated with a particular language—ever try to read a FORTRAN file with a COBOL program? A database is language independent; the internal SQL data types are converted into host language data types.

A field exists only because of the program reading it; a column exists because it is in a table in a database. A column is independent of any host language application program that might use it.

In a procedural language, "READ a, b, c FROM FileX;" does not give the same results as "READ b, c, a FROM FileX;" and you can even write "READ a, a, a FROM FileX;" so you overwrite your local variable. In SQL, "SELECT a, b, c FROM TableX" returns the same data as "SELECT b, c, a FROM TableX" because things are located by name, not position.

A field is fixed or variable length and can repeat with an OCCURS in COBOL, a struct in C, and so on. A field can change data types (union in C, VARIANT in Pascal, REDEFINES in COBOL, EQUIVALENCE in FORTRAN).

A column is a scalar value, drawn from a single domain (domain = data type + constraints + relationships) and represented in one and only one data type. You have no idea whatsoever how a column is physically represented internally, because you never see it directly.
Consider temporal data types: in SQL Server, DATETIME (their name for the TIMESTAMP data type) is a binary number internally (a UNIX-style system clock representation), but TIMESTAMP is a string of digits in DB2 (a COBOL-style time representation). When you have a field, you have to worry about that physical representation. SQL says not to worry about the bits; you think of data in the abstract.

Fields have no constraints, no relationships, and no data type; each application program assigns such things, and they don't have to assign the same ones! That lack of data integrity was one of the reasons for RDBMS.

Rows and columns have constraints. Records and fields can have anything in them and often do! Talk to anyone who has tried to build a data warehouse about that problem. My favorite is finding the part number "I hate my job" in a file during a data warehouse project.

Dr. Codd (1979) defined a row as a representation of a single simple fact. A record is usually a combination of a lot of facts. That is, a file is not normalized; you stuff data into it and hope that you have everything you need for an application. When the system needs new data, you add fields to the end of the records. That is how we got records that were measured in Kbytes.

## 3.13.3 Look for the Properties of a Good Key

#### Rationale:

A checklist of desirable properties for a key is a good way to do a design inspection. There is no need to be negative all the time.

1. _Uniqueness._ The first property is that the key be unique. This is the most basic property it can have, because without uniqueness it cannot be a key by definition. Uniqueness is necessary, but not sufficient.

Uniqueness has a context. An identifier can be unique in the local database, unique in the enterprise across databases, or unique universally. We would prefer the last of those three options. We can often get universal uniqueness with industry-standard codes such as VINs. We can get enterprise uniqueness with things like telephone extensions and e-mail addresses. An identifier that is unique only in a single database is workable but pretty much useless, because it will lack the other desired properties.

2. _Stability._ The second property we want is stability or invariance. The first kind of stability is within the schema, and this applies to both key and nonkey columns. The same data element should have the same representation wherever it appears in the schema. It should not be CHAR(n) in one place and INTEGER in another. The same basic set of constraints should apply to it. That is, if we use the VIN as an identifier, then we can constrain it to be only for vehicles from Ford Motors; we cannot change the format of the VIN in one table and not in all others.

The next kind of stability is over time. You do not want keys changing frequently or in unpredictable ways. Contrary to a popular myth, this does not mean that keys cannot ever change. As the scope of their context grows, they should be able to change.

On January 1, 2005, the United States added one more digit to the UPC bar codes used in the retail industry. The reason was globalization and erosion of American industrial domination. The global bar-code standard will be the European Article Number (EAN) Code. The American Universal Product Code (UPC) turned 30 years old in 2004 and was never so universal after all.

The EAN was set up in 1977 and uses 13 digits, whereas the UPC has 12 digits, of which you see 10 broken into two groups of 5 digits on a label.
The Uniform Code Council, which sets the standards in North America, has the details for the conversion worked out.

More than 5 billion bar-coded products are scanned every day on earth. Bar coding has made data mining in retail possible and saved millions of hours of labor. Why would you make up your own code and stick labels on everything? Thirty years ago, consumer groups protested that shoppers would be cheated if price tags were not on each item, labor protested possible job losses, and environmentalists said that laser scanners in the bar-code readers might damage people's eyes. The neo-Luddites have been with us a long time.

For the neo-Luddite programmers who think that changing a key is going to kill you, let me quote John Metzger, chief information officer of A&P. The grocery chain had 630 stores in 2004, and the grocery industry works on 1 percent to 3 percent profit margins—the smallest margins of any industry that is not taking a loss. A&P has handled the new bar-code problem as part of a modernization of its technology systems. "It is important," Mr. Metzger said, "but it is not a shut-the-company-down kind of issue."

Along the same lines, the ISBN in the book trade is being changed to 13 digits, and VINs are being redesigned.

3. _Familiarity._ It helps if the users know something about the data. This is not quite the same as validation, but it is related. Validation can tell you if the code is properly formed via some process; familiarity can tell you if it feels right because you know something about the context. Thus, ICD codes for disease would confuse a patient but not a medical records clerk.

4. _Validation._ Can you look at the data value and tell that it is wrong, without using an external source? For example, I know that "2004-02-30" is not a valid date because no such day exists on the Common Era calendar. Check digits and fixed-format codes are one way of obtaining this validation.

5. _Verifiability._ How do I verify a key? This also comes in context and in levels of trust. When I cash a check at the supermarket, the clerk is willing to believe that the photo on the driver's license I present is really me, no matter how ugly it is. Or rather, the clerk used to believe it was me; the Kroger grocery store chain is now putting an inkless fingerprinting system in place, just like many banks have done.

When I get a passport, I need a birth certificate and fingerprinting. There is a little less trust here. When I get a security clearance, I also need to be investigated. There is a lot less trust.

A key without a verification method has no data integrity and will lead to the accumulation of bad data.

6. _Simplicity._ A key should be as simple as possible, but no simpler. People, reports, and other systems will use the keys. Long, complex keys are more subject to error; storing and transmitting them is not an issue anymore, the way it was 40 or 50 years ago.

One person's simple is another person's complex. For an example of a horribly complex code that is in common international usage, look up the International Bank Account Number (IBAN). A country code at the start of the string determines how to parse the rest of the string, and it can be up to 34 alphanumeric characters in length. Why? Each country has its own account numbering systems, currencies, and laws, and they seldom match.
In effect, the IBAN is a local banking code hidden inside an international standard (see the European Committee for Banking Standards Web site for publications).

More and more programmers who have absolutely no database training are being told to design a database. They are using GUIDs, IDENTITY, ROWID, and other proprietary auto-numbering features in SQL products to imitate either a record number (the sequential file system mindset) or an OID (the OO mindset) because they don't know anything else. This magical, universal, one-size-fits-all numbering is totally nonrelational, depends on the physical state of the hardware at a particular time, and is a poor attempt at mimicking a magnetic tape file system.

Experienced database designers tend toward intelligent keys they find in industry-standard codes, such as UPC, VIN, GTIN, ISBN, and so on. They know that they need to verify the data against the reality they are modeling. A trusted external source is a good thing to have.

The reasons given for this poor programming practice are many, so let me go down the list:

**Q:** Couldn't a natural compound key become very long?

**A1:** So what? This is the 21st century, and we have much better computers than we did in the 1950s, when key size was a real physical issue. What is funny to me is the number of idiots who replace a natural two- or three-integer compound key with a huge GUID, which no human being or other system can possibly understand, because they think it will be faster and easy to program.

**A2:** This is an implementation problem that the SQL engine can handle. For example, Teradata is a SQL product designed for very large database (VLDB) applications that uses hashing instead of B-tree or other indexes. They guarantee that no search requires more than two probes, no matter how large the database. A tree index requires more and more probes as the size of the database increases.

**A3:** A long key is not always a bad thing for performance. For example, if I use (city, state) as my key, I get a free index on just (city). I can also add extra columns to the key to make it a super-key when such a super-key gives me a covering index (i.e., an index that contains all of the columns required for a query, so that the base table does not have to be accessed at all).

**Q:** Can't I make things really fast on the current release of my SQL software?

**A1:** Sure, if I want to lose all of the advantages of an abstract data model and SQL's set-oriented programming, carry extra data, and destroy the portability of the code. Look at any of the newsgroups and see how difficult it is to move the various exposed physical locators even between releases of the same product.

The auto-numbering features are a holdover from the early SQLs, which were based on contiguous-storage file systems. The data was kept in physically contiguous disk pages, in physically contiguous rows, made up of physically contiguous columns. In short, just like a deck of punchcards or a magnetic tape. Most programmers still carry that mental model, too.

But physically contiguous storage is only one way of building a relational database, and it is not the best one. The basic idea of a relational database is that the user is not supposed to know how or where things are stored at all, much less write code that depends on the particular physical representation in a particular release of a particular product on particular hardware at a particular time.
**Q:** Can't I make things really fast on the current release of my SQL software?

**A1:** Sure, if I am willing to lose all of the advantages of an abstract data model and SQL set-oriented programming, carry extra data, and destroy the portability of my code. Look at any of the newsgroups and see how difficult it is to move the various exposed physical locators even between releases of the same product.

The auto-numbering features are a holdover from the early SQLs, which were based on contiguous-storage file systems. The data was kept in physically contiguous disk pages, in physically contiguous rows, made up of physically contiguous columns. In short, just like a deck of punchcards or a magnetic tape. Most programmers still carry that mental model, too.

But physically contiguous storage is only one way of building a relational database, and it is not the best one. The basic idea of a relational database is that the user is not supposed to know how or where things are stored at all, much less write code that depends on the particular physical representation in a particular release of a particular product on particular hardware at a particular time.

The first practical consideration is that auto-numbering is proprietary and nonportable, so you know that you will have maintenance problems when you change releases or port your system to other products. Newbies actually think they will never port code! Perhaps they only work for companies that are failing and will be gone. Perhaps their code is such a disaster that nobody else wants their application.

But let's look at the logical problems. First, try to create a table with two columns and try to make them both auto-numbered. If you cannot declare more than one column to be of a certain data type, then that thing is not a data type at all, by definition. It is a property that belongs to the physical table, not to the logical data in the table.

Next, create a table with one column and make it an auto-number. Now try to insert, update, and delete different numbers from it. If you cannot insert, update, and delete rows, then it is not really a table by definition.

Finally, create a simple table with one hidden auto-number column and a few other columns, using statements like the ones in the sketch below. Put a few rows into the table and notice that the auto-numbering feature sequentially numbered them in the order they were presented. If you delete a row, the gap in the sequence is not filled in, and the sequence continues from the highest number that has ever been used in that column in that particular table. This is how we did record numbers in preallocated sequential files in the 1950s, by the way. A utility program would then pack or compress the records that were flagged as deleted or unused to move the empty space to the physical end of the physical file.

But we now use a statement with a query expression in it, as the sketch also shows. Because a query result is a table, and a table is a set that has no ordering, what should the auto-numbers be? The entire, whole, completed set is presented to Foobar all at once, not a row at a time. There are (n!) ways to number (n) rows, so which one do you pick? The answer has been to use whatever the physical order of the result set happened to be. That nonrelational phrase "physical order" again!

But it is actually worse than that. If the same query is executed again, but with new statistics or after an index has been dropped or added, the new execution plan could bring the result set back in a different physical order. Can you explain from a logical model why the same rows in the second query get different auto-numbers? In the relational model, they should be treated the same if all the values of all the attributes are identical.

Using auto-numbering as a primary key is a sign that there is no data model, only an imitation of a sequential file system. Because this magic, all-purpose, one-size-fits-all pseudo identifier exists only as a result of the physical state of a particular piece of hardware, at a particular time, as read by the current release of a particular database product, how do you verify that an entity has such a number in the reality you are modeling? People run into this problem when they have to rebuild their database from scratch after a disaster.

You will see newbies who design tables with nothing but an auto-number as the key, like the last table in the sketch below. Now input data and submit the same row a thousand times or a million times. Your data integrity is trashed. The natural key was sitting in the other columns all along. Another problem is that if a natural key exists (which it must, if the data model is correct), then the rows can be updated either through the key or through the auto-number. But because there is no way to reconcile the auto-number and the natural key, you have no data integrity.
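The statements elided above might have looked something like this minimal sketch. All of the table and column names are mine, and IDENTITY stands in for whatever auto-numbering feature the product offers, so the exact syntax will vary:

```sql
-- A hidden auto-number plus one real column.
CREATE TABLE Foobar
(id INTEGER IDENTITY NOT NULL,
 item_name VARCHAR(20) NOT NULL);

-- Rows are numbered 1, 2, 3 in arrival order.
INSERT INTO Foobar (item_name) VALUES ('alpha');
INSERT INTO Foobar (item_name) VALUES ('beta');
INSERT INTO Foobar (item_name) VALUES ('gamma');

-- Delete a row: the gap stays, and the next single-row insert gets 4, not 2.
DELETE FROM Foobar WHERE item_name = 'beta';
INSERT INTO Foobar (item_name) VALUES ('delta');

-- A statement with a query expression: the whole set arrives at once,
-- so which of the (n!) possible numberings should the new rows get?
INSERT INTO Foobar (item_name)
SELECT part_name FROM Parts;  -- Parts is hypothetical

-- The newbie design: nothing but an auto-number as the key, so the
-- same row can be submitted a million times. The natural key was
-- (item_name) all along.
CREATE TABLE FoobarNoKey
(id INTEGER IDENTITY NOT NULL PRIMARY KEY,
 item_name VARCHAR(20) NOT NULL);  -- should have been the key, or at least UNIQUE
```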
To demonstrate, consider a typical newbie schema like the Personnel table sketched below. I call these designers "id-iots" because they always name the auto-number column "id" in every table. Now change a row in Personnel, first through the "id" column and then through the natural key; finally, rebuild the row from scratch, as the sketch shows. What happened to the tables that referenced Personnel? Imagine a company bowling team table that also had the "id" column and the "ssn" of the players. I need cascaded DRI actions if the "ssn" changes, but I only have the "id," so I have no idea how many "ssn" values the same employee can have. The "id" column is at best redundant, but now we can see that it is also dangerous.
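A minimal sketch of such an "id-iot" schema and the statements just described; the names are mine, and IDENTITY again stands in for the product's auto-numbering feature:

```sql
CREATE TABLE Personnel
(id INTEGER IDENTITY NOT NULL PRIMARY KEY,  -- the "id-iot" column
 ssn CHAR(9) NOT NULL,                      -- the natural key, left undeclared
 last_name VARCHAR(20) NOT NULL);

-- Change a row through the auto-number ...
UPDATE Personnel SET last_name = 'Smith' WHERE id = 42;

-- ... or through the natural key; nothing reconciles the two paths.
UPDATE Personnel SET last_name = 'Smith' WHERE ssn = '123456789';

-- Rebuild the row from scratch after a disaster: the same employee now
-- gets a brand-new "id," and every table that referenced the old value
-- is silently orphaned.
DELETE FROM Personnel WHERE ssn = '123456789';
INSERT INTO Personnel (ssn, last_name) VALUES ('123456789', 'Smith');
```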
Finally, an appeal to authority, with a quote from Dr. Codd (1979): "Database users may cause the system to generate or delete a surrogate, but they have no control over its value, nor is its value ever displayed to them."

This means that a surrogate ought to act like an index: created by the user, managed by the system, and never seen by a user. That means it is never used in queries, DRI, or anything else that a user does.

Codd also wrote the following:

There are three difficulties in employing user-controlled keys as permanent surrogates for entities.

1. The actual values of user-controlled keys are determined by users and must therefore be subject to change by them (e.g., if two companies merge, the two employee databases might be combined, with the result that some or all of the serial numbers might be changed).

2. Two relations may have user-controlled keys defined on distinct domains (e.g., one of them uses Social Security numbers, while the other uses employee serial numbers) and yet the entities denoted are the same.

3. It may be necessary to carry information about an entity either before it has been assigned a user-controlled key value or after it has ceased to have one (e.g., an applicant for a job and a retiree).

These difficulties have the important consequence that an equi-join on common key values may not yield the same result as a join on common entities. A solution—proposed in part [4] and more fully in [14]—is to introduce entity domains, which contain system-assigned surrogates. Database users may cause the system to generate or delete a surrogate, but they have no control over its value, nor is its value ever displayed to them . . . (Codd, 1979).

#### Exceptions:

If you are using the table as a staging area for data scrubbing or some other purpose than as a database, then feel free to use any kind of proprietary feature you wish to get the data right. We did a lot of this in the early days of RDBMS. Today, however, you should consider using ETL and other software tools that did not exist even a few years ago.

# 3.14 Do Not Split Attributes

#### Rationale:

Attribute splitting consists of taking an attribute and modeling it in more than one place in the schema. This violates Domain-Key Normal Form (DKNF) and makes programming insanely difficult. There are several ways to do this, discussed in the following sections.

## 3.14.1 Split into Tables

The values of an attribute are each given their own table. If you were to do this with gender and have a "MalePersonnel" and a "FemalePersonnel" table, you would quickly see the fallacy. But if I were to split data by years (temporal values), by location (spatial values), or by department (organizational values), you might not see the same problem.

In order to get any meaningful report, these tables would have to be UNION-ed back into a single "Personnel" table. The bad news is that constraints to prevent overlaps among the tables in the collection can be forgotten or wrong.

Do not confuse attribute splitting with a partitioned table, which is maintained by the system and appears as a whole to the users.

## 3.14.2 Split into Columns

The attribute is modeled as a series of columns that make no sense until all of the columns are reassembled (e.g., having a measurement in one column and the unit of measure in a second column). The solution is to pick one scale and keep all measurements in it.

Look at section 3.3 on BIT data types for one of the worst offenders. You will also see attempts at formatting long text columns by splitting (e.g., having two 50-character columns instead of one 100-character column so that the physical display code in the front end does not have to calculate a word-wrap function). When you get a 25-character-wide printout, though, you are in trouble.

Another common version of this is to program dynamic domain changes in a table. That is, one column contains the domain, which is metadata, for another column, which is data.

Glenn Carr posted a horrible example of a column in a table changing its domain on the fly on September 29, 2004, on the SQL Server programming newsgroup. His goal was to keep football statistics; what follows is a simplification of his original schema design. I have removed about a dozen other errors in design so that we can concentrate on just the shifting domain problem.

The "stat_field_id" column held the names of the statistics whose values are given in the "number_value" column of the same row. A better name for this column would have been "yardage_or_completions_or_interceptions_or_.." because that is what it has in it. A rewrite is sketched below.

We found by inspection that a player is identified by a (league_id, player_nbr) pair. Player_id was originally another IDENTITY column in the Players table. I see sports games where the jersey of each player has a number; let's use that for identification. If reusing jersey numbers is a problem, then I am sure that leagues have some standard in their industry for this, and I am sure that it is not an auto-incremented number that was set by the hardware in Mr. Carr's machine.

What he was trying to find were composite statistics, such as "Yards per Completion," which is trivial in the rewritten schema. The hardest part of the code is avoiding a division by zero in a calculation. Using the original design, you had to write elaborate self-joins that had awful performance. I leave this as an exercise to the reader.
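The rewrite referred to above might have looked something like this minimal sketch; the particular statistics columns are my guesses, not Mr. Carr's actual schema:

```sql
-- One row per player per game; each statistic keeps its own column,
-- so each column keeps a single domain.
CREATE TABLE PlayerGameStats
(league_id INTEGER NOT NULL,
 player_nbr INTEGER NOT NULL,
 game_date DATE NOT NULL,
 completions INTEGER DEFAULT 0 NOT NULL,
 yardage INTEGER DEFAULT 0 NOT NULL,
 PRIMARY KEY (league_id, player_nbr, game_date));

-- "Yards per Completion" is now trivial; the CASE expression avoids the
-- division by zero mentioned in the text.
SELECT league_id, player_nbr,
       CASE WHEN SUM(completions) = 0
            THEN 0.0
            ELSE SUM(yardage) * 1.0 / SUM(completions)
       END AS yards_per_completion
  FROM PlayerGameStats
 GROUP BY league_id, player_nbr;
```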
#### Exceptions:

This is not really an exception. You can use a column to change the scale, but not the domain, used in another column. For example, I record temperatures in degrees Absolute, Celsius, or Fahrenheit and put the standard abbreviation code in another column. But I have to have a VIEW for each scale used so that I can show Americans everything in Fahrenheit and the rest of the world everything in Celsius. I also want people to be able to update through those views in the units their equipment gives them.

A more complex example would be the use of the ISO currency codes with a decimal amount in a database that keeps international transactions. The domain is constant; the second column is always currency, never shoe size or body temperature. When I do this, I need to have a VIEW that will convert all of the values to the same common currency: euros, yen, dollars, or whatever. But now there is a time element, because the exchange rates change constantly. This is not an easy problem.

## 3.14.3 Split into Rows

The attribute is modeled as a flag and a value on each row of the same table. The classic example is temporal: a list of events in which each row carries a timestamp and a flag marking it as the start or the stop of an event. Time is measured by duration, not by instants, so the correct DDL gives each event one row with a start and an end; see the sketch at the end of this section.

#### Exceptions:

None. These are simply bad schema designs that are often the result of confusing the physical representation of the data with the logical model. This tends to be done by older programmers carrying old habits over from file systems.

For example, in the old days of magnetic tape files, the tapes were dated, and processing was based on the one-to-one correspondence between time and a physical file. Creating tables with temporal names like "Payroll_Jan," "Payroll_Feb," and so forth just mimics magnetic tapes.

Another source of these errors is mimicking paper forms or input screens directly in the DDL. The most common is an order detail table that includes a line number because the paper form or screen for the order has a line number. Customers buy products that are identified in the inventory database by SKU, UPC, or other codes, not by a physical line number on a form in the front end of the application. But the programmer splits the quantity attribute into multiple rows.
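A minimal sketch of the DDL that section 3.14.3 calls for; the table and column names are mine:

```sql
-- Wrong: the duration attribute is split across rows, with a flag column
-- telling you how to read each timestamp.
CREATE TABLE EventsSplit
(event_name VARCHAR(20) NOT NULL,
 event_flag CHAR(5) NOT NULL CHECK (event_flag IN ('start', 'stop')),
 event_time TIMESTAMP NOT NULL);

-- Right: time is a duration, so each event is one row with a pair of
-- timestamps, and a CHECK() keeps the period well formed.
CREATE TABLE Events
(event_name VARCHAR(20) NOT NULL PRIMARY KEY,
 start_time TIMESTAMP NOT NULL,
 end_time TIMESTAMP NOT NULL,
 CHECK (start_time <= end_time));
```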
# 3.15 Do Not Use Object-Oriented Design for an RDBMS

#### Rationale:

Many years ago, the INCITS H2 Database Standards Committee (née ANSI X3H2 Database Standards Committee) had a meeting in Rapid City, South Dakota. We had Mount Rushmore and Bjarne Stroustrup as special attractions. Mr. Stroustrup did his slide show about Bell Labs inventing C++ and OO programming for us, and we got to ask questions.

One of the questions was how we should put OO stuff into SQL. His answer was that Bell Labs, with all its talent, had tried four different approaches to this problem and came to the conclusion that you should not do it. OO was great for programming but deadly for data.

## 3.15.1 A Table Is Not an Object Instance

Tables in a properly designed schema do not appear and disappear like instances of an object. A table represents a set of entities or a relationship. For them to appear (CREATE TABLE) and disappear (DROP TABLE) is like living in a world of magic, where whole new species of creatures are created by any user, on the fly. Likewise, there are no OIDs in SQL. GUIDs, auto-numbering, and all of those proprietary exposed physical locators will not work in the long run.

I have watched people try to force OO models into SQL, and it falls apart in about a year. Every typo becomes a new attribute, class queries that would have been so easy in a relational model become multitable monster outer joins, redundancy grows at an exponential rate, and constraints are virtually impossible to write, so you can kiss data integrity goodbye.

In a thread discussing OO versus relational modeling, entitled "impedance mismatch," in the comp.databases.theory newsgroup in October 2004, one experienced programmer reported:

I'm here to tell you what you already know—you are 100 percent correct. I am stuck with working with an OO schema superimposed on an RDBMS. The amount of gymnastics that I need to go through to do what should be the simplest query is unimaginable. It took six man-hours (me and one of the OO developers for three hours) to come up with a query that was the equivalent of a simple one-table SELECT.

The data needed consisted of basic information: the name of the office location, address, manager, and phone. The final query was almost a full page long and required the joining of all the various tables for each data element (as each data element is now an object, and each object has its own attributes, so it requires its own table), plus, of course, the monster object-linking tables needed to obtain the correct instance of each object.

By the way, which instance is the correct one? Why, the latest one, of course, unless it is marked as not being the one to use, in which case look for the one that is so marked. And the marking indicator is not always the same value, as there are several potential values. These object-linking tables are the biggest in the entire database: millions of rows in each of them in just one year's time to keep track of fewer than 80,000 entity instances. Self-joins on these monster tables and a few smaller ones are needed in some cases.

Fortunately, there are extracts that run nightly to transform the data into a relational schema set up for reporting, but not all the data is there, or it is wrong, so sometimes I need to go through the above.

## 3.15.2 Do Not Use EAV Design for an RDBMS

The Entity-Attribute-Value (EAV) design flaw is particularly popular among newbies who come from the agile or extreme school of software development. This school used to be called "Code first, design and think later" when it was first popular.

The idea is that you have one huge table with three columns of metadata: entity name, attribute name, and attribute value. This lets your users invent new entities as they use the database. If the American wants to create something called a "tire" and the British speaker wants to create something called a "tyre," then they are both free to do so.

The values have to be recorded in the most general data type in the SQL engine, so you use a lot of VARCHAR(n) columns in the EAV model. Now try to put a constraint on the column, as in the sketch below.
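A minimal sketch of the EAV pattern and of why a constraint on it has to fight the design; all names are mine:

```sql
-- One huge table of metadata instead of a schema.
CREATE TABLE EAV
(entity_name VARCHAR(30) NOT NULL,
 attribute_name VARCHAR(30) NOT NULL,
 attribute_value VARCHAR(100) NOT NULL);  -- everything becomes a string

-- Even "a birth date is shaped like a date" has to be smuggled in one
-- attribute name at a time; numeric and referential rules are hopeless.
ALTER TABLE EAV ADD CONSTRAINT birth_date_shape
CHECK (attribute_name <> 'birth_date'
       OR attribute_value LIKE '____-__-__');
```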
#### Exceptions:

None. There are better tools for collecting free-form data.

CHAPTER 4 Scales and Measurements

BEFORE YOU CAN put data into a database, you actually need to think about how it will be represented and manipulated. Most programmers have never heard of measurement theory or thought about the best way to represent their data. Although this topic is not specifically about SQL style, it gives a foundation for decisions that have to be made in the design of any schema.

# 4.1 Measurement Theory

_Measure all that is measurable and attempt to make measurable that which is not yet so._

—Galileo (1564–1642)

Measurement theory is a branch of applied mathematics that is useful in data analysis. Measurements are not the same as the attribute being measured. Measurement is not so much assigning numbers to things or their attributes as it is assigning to things a structural property that can be expressed in numbers or other computable symbols. This structure is the scale used to take the measurement; the numbers or symbols represent units of measure.

Strange as it might seem, measurement theory came from psychology, not mathematics or computer science. In particular, S. S. Stevens originated the idea of levels of measurement and the classification of scales. Scales are classified into types by the properties they do or do not have. The properties with which we are concerned are the following:

1. _A natural origin point on the scale._ This is sometimes called a zero, but it does not have to be literally a numeric zero. For example, if the measurement is the distance between objects, the natural zero is zero meters—you cannot get any closer than that. If the measurement is the temperature of objects, the natural zero is zero degrees Kelvin—nothing can get any colder than absolute zero. However, consider time: It goes from an eternal past into an eternal future, so you cannot find a natural origin for it.

2. _Meaningful operations can be performed on the units._ It makes sense to add weights together to get a new weight. However, adding names or shoe sizes together is absurd.

3. _A natural ordering of the units._ It makes sense to speak about an event occurring before or after another event, or a thing being heavier, longer, or hotter than another thing, but the alphabetical order imposed on a list of names is arbitrary, not natural—a foreign language, with different names for the same objects, would impose another ordering.

4. _A natural metric function on the units._ A metric function has nothing to do with the metric system of measurements, which is more properly called SI, for Système International d'unités in French. Metric functions have the following three properties:

a. The metric between an object and itself is the natural origin of the scale. We can write this in a semimathematical notation as M(a, a) = 0.

b. The order of the objects in the metric function does not matter. Again in the notation, M(a, b) = M(b, a).

c. There is a natural additive function that obeys the rule that M(a, b) + M(b, c) ≥ M(a, c), which is also known as the _triangular inequality._

This notation is meant to be more general than just arithmetic. The zero in the first property is the origin of the scale, not just a numeric zero. The third property, defined with a plus sign and a greater-than-or-equal-to sign, is a symbolic way of expressing general ordering relationships. The greater-than-or-equal-to sign refers to a natural ordering on the attribute being measured. The plus sign refers to a meaningful operation in regard to that ordering, not just arithmetic addition.

The special case of the third property, where the relation is always an equality, is desirable to people because it means that they can use numbers for units and do simple arithmetic with the scales. This is called a _strong metric property._ For example, human perceptions of sound and light intensity follow a cube root law—that is, if you double the intensity of light, the perception of the intensity increases by only 20 percent (Stevens, 1957). The actual formula is "Physical intensity to the 0.3 power equals perceived intensity" in English. Knowing this, designers of stereo equipment use controls that work on a logarithmic scale internally but that show evenly spaced marks on the control panel of the amplifier.

It is possible to have a scale that has any combination of the metric properties. For example, instead of measuring the distance between two places in meters, measure it in units of effort. This is the old Chinese system, which had uphill and downhill units of distance.

Does this system of distances have the property that M(a, a) = 0? Yes. It takes no effort to get to where you already are located. Does it have the property that M(a, b) = M(b, a)? No. It takes less effort to go downhill than to go uphill.
Does it have the property that M(a, b) + M(b, c) ≥ M(a, c)? Yes. The effort needed to go directly to a place can never be more than the effort of making an extra stop along the way.

## 4.1.1 Range and Granularity

Range and granularity are properties of the way the measurements are made. Because we have to store data in a database within certain limits, these properties are important to a database designer. The types of scales are unrelated to whether you use discrete or continuous variables. Although measurements are always discrete because of finite precision, attributes can be conceptually either discrete or continuous regardless of measurement level. Temperature is usually regarded as a continuous attribute, so temperature measurement to the nearest degree Kelvin is a ratio-level measurement of a continuous attribute. However, quantum mechanics holds that the universe is fundamentally discrete, so temperature may actually be a discrete attribute. In ordinal scales for continuous attributes, ties are impossible (or have probability zero). In ordinal scales for discrete attributes, ties are possible. Nominal scales usually apply to discrete attributes. Nominal scales for continuous attributes can be modeled but are rarely used.

## 4.1.2 Range

A scale also has other properties that are of interest to someone building a database. First, scales have a range: What are the highest and lowest values that can appear on the scale? It is possible to have a finite or an infinite limit on either the lower or the upper bound. Overflow and underflow errors are the result of range violations inside the database hardware.

Database designers do not have infinite storage, so we have to pick a subrange to use in the database when we have no upper or lower bound. For example, few computer calendar routines will handle geologic time periods, but then few companies have bills that have been outstanding for that long either, so we do not mind.

## 4.1.3 Granularity, Accuracy, and Precision

Look at a ruler and a micrometer. They both measure length, using the same scale, but there is a difference. A micrometer is more precise because it has a finer granularity of units. Granularity is a static property of the scale itself—how many notches there are on your ruler. In Europe, all industrial drawings are done in millimeters; the United States has been using 1/32nd of an inch.

Accuracy is how close the measurement comes to the actual value. Precision is a measure of how repeatable a measurement is. Both depend on granularity, but they are not the same things. Human nature says that a number impresses according to the square of the number of decimal places. Hence, some people will use a computer system to express things to as many decimal places as possible, even when it makes no sense. For example, civil engineering in the United States uses decimal feet for road design. Nobody can build a road any more precisely than that, but many civil engineering students turn in work that is expressed in ten-thousandths of a foot. You don't use a micrometer on asphalt! A database often does not give the user a choice of precision for many calculations. In fact, the SQL standards leave the number of decimal places in the results of many arithmetic operations to be defined by the implementation.

The ideas are easier to explain with handgun targets, which are scales to measure the ability of the shooter to put bullets in the center of a target.
A bigger target has a wider range compared with a smaller target. A target with more rings has a higher granularity.

Once you start shooting, a group of shots that are closer together is more precise because the shots were more repeatable. A shot group that is closer to the center is more accurate because the shots were closer to the goal. Notice that precision and accuracy are not the same thing! If I have a good gun whose sights are off, I can get a tight cluster that is not near the bull's-eye.

# 4.2 Types of Scales

The lack or presence of precision and accuracy determines the kind of scale you should choose. Scales are either quantitative or qualitative. Quantitative scales are what most people mean when they think of measurements, because these scales can be manipulated and are usually represented as numbers. Qualitative scales attempt to impose an order on an attribute, but they do not allow for computations—just comparisons.

## 4.2.1 Nominal Scales

The simplest scales are the nominal scales. They simply assign a unique symbol, usually a number or a name, to each member of the set that they attempt to measure. For example, a list of city names is a nominal scale.

Right away we are into philosophical differences, because many people do not consider listing to be measurement. Because no clear property is being measured, that school of thought would tell us this cannot be a scale.

There is no natural origin point for a set, and likewise there is no ordering. We tend to use alphabetic ordering for names, but it makes just as much sense to use frequency of occurrence or increasing size or almost any other attribute that does have a natural ordering.

The only meaningful operation that can be done with such a list is a test for equality—"Is this city New York or not?"—and the answer will be TRUE, FALSE, or UNKNOWN. Nominal scales are common in databases because they are used for unique identifiers, such as names and descriptions.

## 4.2.2 Categorical Scales

The next simplest scales are the categorical scales. They place an entity into a category that is assigned a unique symbol, usually a number or a name. For example, the class of animals might be categorized as reptiles, mammals, and so forth. The categories have to be within the same class of things to make sense.

Again, many people do not consider categorizing to be measurement. The categories are probably defined by a large number of properties, and there are two potential problems with them. The first problem is that an entity might fall into more than one category. For example, a platypus is a furry, warm-blooded, egg-laying animal. Mammals are warm-blooded but give live birth and optionally have fur. The second problem is that an entity might not fall into any of the categories at all. If we find a creature with chlorophyll and fur on Mars, we do not have a category of animals in which to place it.

The two common solutions are either to create a new category of animals (monotremes for the platypus and echidna) or to allow an entity to be a member of more than one category. There is no natural origin point for a collection of subsets, and, likewise, there is no ordering of the subsets. We tend to use alphabetic ordering for names, but it makes just as much sense to use frequency of occurrence or increasing size or almost any other attribute that does have a natural ordering.
The only meaningful operation that can be done with such a scale is a test for membership—"Is this animal a mammal or not?"—and the answer will be TRUE, FALSE, or UNKNOWN.

## 4.2.3 Absolute Scales

An absolute scale is a count of the elements in a set. Its natural origin is zero, or the empty set. The count is the ordering (a set of five elements is bigger than a set of three elements, and so on). Addition and subtraction are metric functions. Each element is taken to be identical and interchangeable. For example, when you buy a dozen Grade A eggs, you assume that for your purposes any Grade A egg will do the same job as any other Grade A egg. Again, absolute scales are common in databases because they are used for quantities.

## 4.2.4 Ordinal Scales

Ordinal scales put things in order but have no origin and no operations. For example, geologists use a scale to measure the hardness of minerals called the Mohs Scale of Hardness (MSH). It is based on a set of standard minerals, which are ordered by relative hardness (talc = 1, gypsum = 2, calcite = 3, fluorite = 4, apatite = 5, feldspar = 6, quartz = 7, topaz = 8, sapphire = 9, diamond = 10).

To measure an unknown mineral, you try to scratch the polished surface of one of the standard minerals with it; if it scratches the surface, the unknown is harder. Notice that I can get two different unknown minerals with the same measurement that are not equal to each other and that I can get minerals that are softer than my lower bound or harder than my upper bound. There is no origin point, and operations on the measurements make no sense (e.g., if I add 10 talc units, I do not get a diamond).

Perhaps the most common use we see of ordinal scales today is to measure preferences or opinions. You are given a product or a situation and asked to decide how much you like or dislike it, how much you agree or disagree with a statement, and so forth. The scale is usually given a set of labels such as "strongly agree" through "strongly disagree," or the labels are ordered from 1 to 5.

Consider pairwise choices between ice cream flavors. Saying that vanilla is preferred over wet leather in our taste test might well be expressing a universal truth, but there is no objective unit of likeability to apply. The lack of a unit means that such things as opinion polls that try to average such scales are meaningless; the best you can do is a bar graph of the number of respondents in each category.

Another problem is that an ordinal scale may not be transitive. _Transitivity_ is the property of a relationship in which if R(a, b) and R(b, c), then R(a, c). We like this property and expect it in the real world, where we have relationships like "heavier than," "older than," and so forth. This is the result of a strong metric property.

But an ice cream taster, who has just found out that the shop is out of vanilla, might prefer squid over wet leather, wet leather over wood, and wood over squid, so there is no metric function or linear ordering at all. Again, we are into philosophical differences, because many people do not consider a nontransitive relationship to be a scale.

## 4.2.5 Rank Scales

Rank scales have an origin and an ordering but no natural operations. The most common example of this would be military ranks. Nobody is lower than a private, and that rank is a starting point in your military career, but it makes no sense to somehow combine three privates to get a sergeant.
Rank scales have to be transitive: A sergeant gives orders to a private, and because a major gives orders to a sergeant, he or she can also give orders to a private. You will see ordinal and rank scales grouped together in some of the literature if the author does not allow nontransitive ordinal scales. You will also see the same fallacies committed when people try to do statistical summaries of such scales.

## 4.2.6 Interval Scales

Interval scales have a metric function, ordering, and meaningful operations among the units but no natural origin. Calendars are the best example; some arbitrary historical event is the starting point for the scale, and all measurements are related to it using identical units or intervals. Time, then, extends from a past eternity to a future eternity.

The metric function is the number of days between two dates. Look at the three properties: (1) M(a, a) = 0: there are zero days between today and today; (2) M(a, b) = M(b, a): there are just as many days from today to next Monday as there are from next Monday to today; and (3) M(a, b) + M(b, c) ≥ M(a, c): the number of days from today to next Monday plus the number of days from next Monday to Christmas is the same as the number of days from today until Christmas. Ordering is natural and strong: 1900-July-1 occurs before 1993-July-1. Aggregations of the basic unit (days) into other units (weeks, months, and years) are also arbitrary.

Please do not think that the only metric function is simple math; there are log-interval scales, too. The measurements are assigned numbers such that ratios between the numbers reflect ratios of the attribute. You then use formulas of the form (c × m^d), where c and d are constants, to do transforms and operations. For example, density = (mass/volume), fuel efficiency expressed in miles per gallon (mpg), the decibel scale for sound, and the Richter scale for earthquakes are exponential, so their functions involve logarithms and exponents.

## 4.2.7 Ratio Scales

Ratio scales are what people think of when they think about a measurement. Ratio scales have an origin (usually zero units), an ordering, and a set of operations that can be expressed in arithmetic. They are called ratio scales because all measurements are expressed as multiples or fractions of a certain unit or interval.

Length, mass, and volume are examples of this type of scale. The unit is what is arbitrary: The weight of a bag of sand is still weight whether it is measured in kilograms or in pounds. Another nice property is that the units are identical: A kilogram is still a kilogram whether it is measuring feathers or bricks.

# 4.3 Using Scales

Absolute and ratio scales are also called extensive scales because they deal with quantities, as opposed to the remaining scales, which are intensive because they measure qualities. Quantities can be added and manipulated together, whereas qualities cannot. Table 4.1 describes the different types of scales and their attributes.

Table 4.1 _Scale properties_

| Type of scale | Natural origin | Natural ordering | Natural functions |
| --- | --- | --- | --- |
| Nominal | No | No | Equality test only |
| Categorical | No | No | Membership test only |
| Absolute | Yes | Yes | Counting arithmetic |
| Ordinal | No | Yes | Comparisons only |
| Rank | Yes | Yes | Comparisons only |
| Interval | No | Yes | Arithmetic on differences |
| Ratio | Yes | Yes | Full arithmetic |

The origin for the absolute scale is numeric zero, and the natural functions are simple arithmetic. However, things are not always this simple. Temperature has an origin point at absolute zero, and its natural functions average heat over mass. This is why you cannot defrost a refrigerator, which is at 0 degrees Celsius, by putting a chicken whose body temperature is 35 degrees Celsius inside of it. The chicken does not have enough mass relative to heat.
However, a bar of white-hot steel will do a nice job.

# 4.4 Scale Conversion

Scales can be put in a partial order based on the permissible transformations: nominal scales permit any one-to-one mapping, ordinal scales any monotone function, interval scales only linear functions, ratio scales only constant multiples, and absolute scales only the identity.

An attribute might not fit exactly into any of these scales. For example, you can mix nominal and ordinal information in a single scale, such as in questionnaires that have several nonresponse categories. It is common to have scales that mix ordinal and interval scales by assuming the attribute is really a smooth monotone function. Subjective rating scales ("strongly agree," "agree," . . . "strongly disagree") have no equally spaced intervals between the ratings, but there are statistical techniques to ensure that the difference between two intervals is within certain limits. A binary variable is at least an interval scale, and it might be a ratio or absolute scale if it means that the attribute exists or does not exist.

The important principle of measurement theory is that you can convert from one scale to another only if they are of the same type and measure the same attribute. Absolute scales do not convert, which is why they are called absolute scales. Five apples are five apples, no matter how many times you count them or how you arrange them on the table. Nominal scales are converted to other nominal scales by a mapping between the scales.

That means you look things up in a table. For example, I can convert my English city names to Polish city names with a dictionary. The problem comes when there is not a one-to-one mapping between the two nominal scales. For example, English uses the word "cousin" to identify the offspring of your parents' siblings, and tradition treats them all pretty much alike.

Chinese language and culture have separate words for the same relations based on the genders of your parents' siblings and the age relationships among them (e.g., the oldest son of your father's oldest brother is a particular type of cousin, and you have different social obligations to him). Something is lost in translation.

Ordinal scales are converted to ordinal scales by a monotone function. That means you preserve the ordering when you convert. Looking at the MSH for geologists, I can pick another set of minerals, plastics, or metals to scratch, but rock samples that were definitely softer than others are still softer. Again, there are problems when there is not a one-to-one mapping between the two scales. My new scale may be able to tell the difference between rocks, whereas the MSH could not.

Rank scales are converted to rank scales by a monotone function that preserves the ordering, like ordinal scales. Again, there are problems when there is not a one-to-one mapping between the two scales. For example, different military branches have slightly different ranks that don't quite correspond to each other.

In both the nominal and the ordinal scales, the problem was that things that looked equal on one scale were different on another. This has to do with range and granularity, which were discussed in section 4.1.1 of this chapter.

Interval scales are converted to interval scales by a linear function; that is, a function of the form y = a × x + b. This preserves the ordering but shifts the origin point when you convert. For example, I can convert temperature from degrees Celsius to degrees Fahrenheit using the formula F = (9.0/5.0 × C) + 32.

Ratio scales are converted to ratio scales by a constant multiplier, because both scales have the same ordering and origin point. For example, I can convert from pounds to kilograms using the formula k = 0.4536 × p. This is why people like to use ratio scales.
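The VIEWs mentioned earlier are where these conversion formulas belong. A minimal sketch, with hypothetical table and view names:

```sql
-- Store everything in one scale: degrees Celsius.
CREATE TABLE Readings
(sensor_id INTEGER NOT NULL PRIMARY KEY,
 temp_celsius DECIMAL(6,2) NOT NULL);

-- The American office sees the interval-scale conversion y = a*x + b;
-- nobody is aware the conversion is being done for them.
CREATE VIEW FahrenheitReadings (sensor_id, temp_fahrenheit)
AS SELECT sensor_id, (9.0 / 5.0 * temp_celsius) + 32.0
     FROM Readings;
```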
# 4.5 Derived Units

Many of the scales that we use are not primary units but rather derived units. These measures are constructed from primary units, such as miles per hour (time and distance) or square miles (distance and distance). You can use only ratio and interval scales to construct derived units.

If you use an absolute scale with a ratio or interval scale, you are dealing with statistics, not measurements. For example, using weight (ratio scale) and the number of people in New York (absolute scale), we can compute the average weight of a New Yorker, which is a statistic, not a unit of measurement.

The SI measurements use a basic set of seven units (i.e., meter for length, kilogram for mass, second for time, ampere for electrical current, degree Kelvin for temperature, mole for molecules, and candela for light) and construct derived units from them. ISO standard 2955 ("Information processing—Representation of SI and other units for use in systems with limited character sets") has a notation for expressing SI units in ASCII character strings. The notation uses parentheses, spaces, multiplication (shown by a period), division (shown by a solidus, or slash), and exponents (shown by numerals immediately after the unit abbreviation). There are also names for most of the standard derived units. For example, "100 kg.m/s2" is 100 newtons (the unit of force), written as "100 N" instead.

# 4.6 Punctuation and Standard Units

A database stores measurements as numeric data represented in a binary format, but when the data is input or output, a human being wants readable characters and punctuation. Punctuation identifies the units being used and can appear as a prefix, postfix, or infix symbol. It can also be implicit or explicit.

If I write $25.15, you know that the unit of measure is the dollar because of the explicit prefix dollar sign. If I write 160 lbs., you know that the unit of measure is pounds because of the explicit postfix abbreviation for the unit. If I write 1989 MAR 12, you know that this is a date because of the implicit infix separation among month, day, and year, achieved by changing from numerals to letters, and the optional spaces. The ISO and SQL defaults represent the same date, using explicit infix punctuation, as 1989-03-12 instead. Likewise, a column header on a report that gives the units used is explicit punctuation.

Databases do not generally store punctuation. The sole exception might be the proprietary MONEY or CURRENCY data type found in many SQL implementations as a vendor extension. Punctuation wastes storage space, and the units can be represented in some internal format that can be used in calculations. Punctuation is only for display.

It is possible to put the units in a column next to a numeric column that holds their quantities, but this is awkward and wastes storage space. If everything is expressed in the same unit, the units column is redundant. If things are expressed in different units, you have to convert them to a common unit to do any calculations. Why not store them in a common unit in the first place? The DBA has to be sure that all data in a column of a table is expressed in the same units before it is stored.
There are some horror stories about multinational companies sending the same input programs used in the United States to their European offices, where SI and English measurements were mixed into the same database without conversion.

Ideally, the DBA should be sure that data is kept in the same units in all the tables in the database. If different units are needed, they can be provided in a VIEW that hides the conversions (thus the office in the United States sees English measurements and the European offices see SI units and date formats; neither is aware of the conversions being done for it).

# 4.7 General Guidelines for Using Scales in a Database

The following are general guidelines for using measurements and scales in a database, not firm, hard rules. You will find exceptions to all of them.

1. _In general, the more unrestricted the permissible transformations on a scale are, the more restricted the statistics._ Almost all statistics are applicable to measurements made on ratio scales, but only a limited group of statistics may be applied to measurements made on nominal scales.

2. _Use CHECK() clauses on table declarations to make sure that only the allowed values appear in the database._ If you have the CREATE DOMAIN feature of SQL-92, use it to build your scales, as in the sketch after this list. Nominal scales would have a list of possible values; other scales would have range checking. Likewise, use the DEFAULT clauses to be sure that each scale starts with its origin value, a NULL, or a default value that makes sense.

3. _Declare at least one more decimal place than you think you will need for your smallest units._ In most SQL implementations, rounding and truncation will improve with more decimal places.

The downside of SQL is that precision and the rules for truncation and rounding are implementation dependent, so a query with calculations might not give the same results on another product. However, SQL is more merciful than older file systems, because the DBA can ALTER a numeric column so it will have more precision and a greater range without destroying existing data or queries. Host programs may have to be changed to display the extra characters in the results, however.

You also need to consider laws and accounting rules that deal with currencies. The European Union has rules for computing with euros, and the United States has similar rules for dollars in the Generally Accepted Accounting Principles (GAAP).

4. _Try to store primary units rather than derived units._ This is not always possible, because you might not be able to measure anything but the derived unit. Look at your tire gauge; it is set for pascals (newtons per square meter) and will not tell you how many square meters you have on the surface of the tire or the force exerted by the air, and you simply cannot figure these things out from the pascals given. A set of primary units can be arranged in many different ways to construct any possible derived unit desired. Never store both the derived and the primary units in the same table. Not only is this redundant, but it opens the door to possible errors when a primary-unit column is changed and the derived units based on it are not updated. Also, most computers can recalculate the derived units much faster than they can read a value from a disk drive.

5. _Use the same punctuation whenever a unit is displayed._ For example, do not mix ISO and ANSI date formats, or express weight in pounds and kilograms in the same report. Ideally, everything should be displayed in the same way in the entire application system.
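A minimal sketch of guideline 2, using the SQL-92 CREATE DOMAIN feature; the domain and table names are mine:

```sql
-- A nominal scale: a list of allowed values and a sensible default.
CREATE DOMAIN egg_grade AS CHAR(2)
DEFAULT 'A'
CHECK (VALUE IN ('AA', 'A', 'B'));

-- A ratio scale: range checking, with the natural origin as the default.
CREATE DOMAIN weight_kg AS DECIMAL(8,3)
DEFAULT 0.000
CHECK (VALUE >= 0.000);

CREATE TABLE Shipments
(shipment_nbr INTEGER NOT NULL PRIMARY KEY,
 grade egg_grade NOT NULL,
 net_weight weight_kg NOT NULL);
```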
CHAPTER 5 Data Encoding Schemes

YOU DO NOT put data directly into a database. You convert it into an encoding scheme first, then put the encoding into the rows of the tables. Words have to be written in an alphabet and belong to a language; measurements are expressed as numbers. We are so used to seeing words and numbers that we no longer think of them as encoding schemes.

We also often fail to distinguish among the possible ways to identify (and therefore to encode) an entity or property. Do we encode the person receiving medical services or the policy that is paying for them? That might depend on whether the database is for the doctor or for the insurance company. Do we encode the first title of a song or the alternate title, or both? Or should we include the music itself in a multimedia database? And should it be as an image of the sheet music or as an audio recording?

Nobody teaches people how to design these encoding schemes, so they are all too often done on the fly. Where standardized encoding schemes exist, they are too often ignored in favor of some ad hoc scheme. Beginning programmers have the attitude that encoding schemes do not really matter because the computer will take care of it, so they don't have to spend time on the design of their encoding schemes. This attitude has probably gotten worse with SQL than it was before. The new database designer thinks that an ALTER statement can fix any bad things he or she did at the start of the project.

Yes, the computer can take care of a lot of problems, but the data entry and validation programs become complex and difficult to maintain. Database queries that have to follow the same convoluted encodings will cost both computer time and money, and a human being still has to use the code at some point. Bad schemes result in errors in data entry and misreading of outputs and can lead to incorrect data models.

# 5.1 Bad Encoding Schemes

To use an actual example, the automobile tag system for a certain southern state started as a punchcard system written in COBOL. Many readers are likely too young to remember punchcard (keypunch) machines. A punchcard is a piece of stiff paper on which a character is represented as one or more rectangular holes made into one of 80 vertical columns on the card. Contiguous groups of columns make up fixed-length fields of data. The keypunch machine has a typewriter-like keyboard; it automatically feeds cards into the punch as fast as a human being can type. The position, length, and alphabetic or numeric shift for each field on the card can be set by a control card in the keypunch machine to save the operator keystrokes. This is a fixed format and a fast input method, but making changes to a program once it is in place is difficult.

The auto tag system had a single card column for a single-position numeric code to indicate the type of tag: private car, chauffeured car, taxi, truck, public bus, and so forth. As time went on, more tag types were added for veterans of assorted wars, for university alumni, and for whatever other lobbyist group happened to have the political power to pass a bill allowing it a special auto tag.

Soon there were more than 10 types, so a single-digit system could not represent them.
There was room on the punchcard to change the length of the field to two digits, but COBOL uses fixed-length fields, so changing the card layout would require changes in the programs and in the keypunch procedures.

The first new tag code was handled by letting the data-entry clerk press a punctuation-mark key instead of changing from numeric lock to manual shift mode. Once that decision was made, it was followed for each new code thereafter, until the scheme looked like everything on the upper row of keys on a typewriter.

Unfortunately, different makes and models of keypunch machines have different punctuation marks in the same keyboard position, so each deck of cards had to have a special program to convert its punches to the original model IBM 026 keypunch codes before the master file was updated. This practice continued even after all of the original machines had been retired to used-equipment heaven.

The edit programs could not check for a simple numeric range to validate input but had to use a small lookup routine with more than 20 values in it. That does not sound like much until you realize that the system had to handle more than 3 million records in the first quarter of the year. The error rate was high, and each batch needed to know which machine had punched the cards before it could use a lookup table.

If the encoding scheme had been designed with two digits (00 to 99) at the beginning, all of the problems would have been avoided. If I were to put this system into a database today, using video terminals for data entry, the tag type could be an INTEGER, and it could hold as many tag types as I would ever need. This is part of the legacy database problem.

The second example was reported in _Information Systems Week_ in 1987. The first sentence told the whole story: "The chaos and rampant error rates in New York City's new Welfare Management System appear to be due to a tremendous increase in the number of codes it requires in data entry and the subsequent difficulty for users in learning to use it." The rest of the article explained how the new system attempted to merge several old existing systems. In the merger, the error rates increased from 2 percent to more than 20 percent because the encoding schemes used could not be matched up and consolidated.

How do you know a bad encoding scheme when you see one? One bad feature is the failure to allow for growth. Talk to anyone who had to reconfigure a fixed-length record system to allow for the change from the old ZIP codes to the current ZIP+4 codes in their address data. SQL does not have this as a physical problem, but it can show up as a logical problem.

Another bad property is ambiguous encodings in the scheme. Perhaps the funniest example of this problem was the Italian telephone system's attempt at a "time of day" service. It used a special three-digit number, like the 411 information number in the United States, but the three digits they picked were also those of a telephone exchange in Milan, so nobody could call into that exchange without getting the time signal before they completed their call.

This happens more often than you would think, but the form that it usually takes is that of a miscellaneous code that is too general. Very different cases are then encoded as identical, and the user is given incorrect or misleading information when a query is performed.

A bad encoding scheme lacks codes for missing, unknown, not applicable, or miscellaneous values.
The classic story is the man who bought a prestige auto tag reading "NONE" and got thousands of traffic tickets as a result. The police had no special provision for a missing tag on the tickets, so when a car had no tag, they wrote "none" in the field for the tag number. The database simply matched his name and address to every unpaid missing-tag ticket on file at the time.

Before you say that the NULL in SQL is a quick solution to this problem, think about how NULL is ignored in many SQL functions. The SQL query "SELECT tag_nbr, SUM(fine) FROM tickets GROUP BY tag_nbr;" will give the total fines on each car, but it also puts all of the missing tags into one group (i.e., one car), although we want to see each one as a separate case, because it is unlikely that there is only one untagged car in all of California.

There are also differences among "missing," "unknown," "not applicable," "miscellaneous," and erroneous values that are subtle but important. For example, the International Classification of Disease uses 999.999 for miscellaneous illness. It means that we have diagnosed the patient, know that he or she has an illness, and cannot classify it—a scary condition for the patient—but this is not quite the same thing as a missing disease code (just admitted, might not even be sick), an inapplicable disease code (pregnancy complications in a male), an unknown disease code (sick and awaiting lab results), or an error in the diagnosis (the patient's temperature is recorded as 100 degrees Celsius, not Fahrenheit).

# 5.2 Encoding Scheme Types

The following is my classification system for encoding schemes and suggestions for using each of them. You will find some of these same ideas in library science and other fields, but I have never seen anyone else attempt a classification system for data processing.

## 5.2.1 Enumeration Encoding

An enumeration encoding arranges the attribute values in some order and assigns a number or a letter to each value. Numbers are usually a better choice than letters, because they can be increased without limit as more values are added. Enumeration schemes are a good choice for a short list of values but a bad choice for a long list. It is too difficult to remember a long list of codes, and soon any natural ordering principle is violated as new values are tacked onto the end.

A good heuristic is to order the values in some natural manner, if one exists in the data, so that table lookup will be easier. Chronological order (1 occurs before 2) or procedural order (1 must be done before 2) is often a good choice. Another good heuristic is to order the values from most common to least common. That way you will have shorter codes for the most common cases. Other orderings could be based on physical characteristics such as largest to smallest, rainbow-color order, and so on.

After arguing for a natural order in the list, I must admit that the most common scheme is alphabetical order, because it is simple to implement on a computer and makes it easy for a person to look up values in a table. ANSI standard X3.31, "Structure for the Identification of Counties of the United States for Information Interchange," encodes county names within a state by first alphabetizing the names and then numbering them from one to whatever is needed.

## 5.2.2 Measurement Encoding

A measurement encoding is given in some unit of measure, such as pounds, meters, volts, or liters. This can be done in one of two ways.
Either the column contains an implied unit of measure and the numbers represent the quantity in that unit, or the column explicitly contains the unit. The most common example of the second case would be money fields, where a dollar sign is used in the column; you know that the unit is dollars, not pounds or yen, by the sign.

Scales and measurement theory are a whole separate topic and are discussed in detail in Chapter 4.

## 5.2.3 Abbreviation Encoding

Abbreviation codes shorten the attribute values to fit into less storage space, but the reader easily understands them. The codes can be either of fixed length or of variable length, but computer people tend to prefer fixed length. The most common example is the two-letter postal state abbreviations (e.g., CA for California, AL for Alabama), which replaced the old variable-length abbreviations (Calif. for California, Ala. for Alabama).

A good abbreviation scheme is handy, but as the set of values becomes larger, the possibility for misunderstanding increases. The three-letter codes for airport baggage are pretty obvious for major cities: LAX for Los Angeles, SFO for San Francisco, BOS for Boston, ATL for Atlanta. But nobody can figure out the abbreviations for the smaller airports.

As another example, consider the ISO 3166 country codes, which come in two-letter, three-letter, and nonabbreviation numeric forms. The RIPE Network Coordination Centre maintains these codes.

## 5.2.4 Algorithmic Encoding

Algorithmic encoding takes the value to be encoded and puts it through an algorithm to obtain the encoding. The algorithm should be reversible, so that the original value can be recovered. Although it is not required, the encoding is usually shorter (or at least of known maximum size) and more uniform in some useful way compared with the original value. Encryption is the most common example of an algorithmic encoding scheme, but it is so important that it needs to be considered as a topic by itself.

Computer people are used to using Julianized dates, which convert a date into an integer. As an aside, please note that astronomers use the _Julian Date_, which is a large number that represents the number of days since a particular heavenly event. The Julianized date is a number between 1 and 365 or 366, which represents the ordinal position of the day within the year. Algorithms take up computer time in both data input and output, but the encoding is useful because it allows searching or calculations to be done that would be difficult using the original data. Julianized dates can be used for computations; Soundex names give a phonetic matching that would not be possible with the original text.

Another example is hashing functions, which convert numeric values into other numeric values for placing them in storage and retrieving them. Rounding numeric values before they go into the database is also a case of algorithmic encoding.

The difference between an abbreviation and an algorithm is not that clear. An abbreviation can be considered a special case of an algorithm, which tells you how to remove or replace letters. The tests to tell them apart are as follows:

1. When a human being can read it without effort, it is an abbreviation.

2. An algorithmic encoding is not easily human readable.

3. An algorithmic encoding might return the same code for more than one value, but an abbreviation is always one-to-one.
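A minimal sketch of the Julianized-date encoding; EXTRACT(DOY FROM ...) is PostgreSQL's spelling of the day-of-year function, so treat the exact syntax as product specific:

```sql
-- Encode: a calendar date becomes its ordinal position within the year.
SELECT EXTRACT(DOY FROM DATE '2005-12-01') AS julianized_date;  -- 335

-- Decode: the encoding is reversible, given the year, which is what
-- makes it algorithmic rather than a lossy transformation.
SELECT DATE '2005-01-01' + (335 - 1) AS calendar_date;          -- 2005-12-01
```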

## 5.2.5 Hierarchical Encoding Schemes

A hierarchy partitions the set of values into disjoint categories, then partitions those categories into subcategories, and so forth until some final level is reached. Such schemes are shown either as nested sets or as tree charts. Each category has some meaning in itself, and the subcategories refine meaning further.

The most common example is the ZIP code, which partitions the United States geographically. Each digit, as you read from left to right, further isolates the location of the address: first by postal region, then by state, then by city, and finally by the post office that has to make the delivery. For example, given the ZIP code 30310, we know that the 30000 to 39999 range means the southeastern United States. Within the southeastern codes, we know that the 30000 to 30399 range is Georgia and that 30300 to 30399 is metropolitan Atlanta. Finally, the whole code, 30310, identifies substation A in the West End section of the city. The ZIP code can be parsed by reading it from left to right, reading first one digit, then two, and then the last two digits.

Another example is the Dewey Decimal Classification (DDC) system, which is used in public libraries in the United States. The 500-number series covers "Natural Sciences"; within that, the 510s cover "Mathematics"; and, finally, 512 deals with "Algebra" in particular. The scheme could be carried further, with decimal fractions for kinds of algebra.

Hierarchical encoding schemes are great for large data domains that have a natural hierarchy. They organize the data for searching and reporting along that natural hierarchy and make it easy, but there can be problems in designing these schemes. First, the tree structure does not have to be neatly balanced, so some categories may need more codes than others and hence create more breakdowns. Eastern and ancient religions are shortchanged in the Dewey Decimal Classification system, reflecting a prejudice toward Christian and Jewish writings. Asian religions were pushed into a very small set of codes, yet today the Library of Congress has more books on Buddhist thought than on any other religion on earth.

Second, you might not have made the right choices as to where to place certain values in the tree. For example, in the Dewey Decimal system, books on logic are encoded as 164, in the philosophy section, and not under the 510s, mathematics. In the 19th century, there was no mathematical logic. Today, nobody would think of looking for logic under philosophy. Dewey was simply following the conventions of his day, and, like today's programmers, he found that the system specifications changed while he was working.

## 5.2.6 Vector Encoding

A vector is made up of a fixed number of components. These components can be ordered or unordered, but they are always present. They can be of fixed or variable length. The components can be dependent or independent of each other, but the code applies to a single entity and makes sense only as a whole unit. Punctuation, symbol-set changes, or position within the code can determine the components of the vector.

The most common example is a date, whose components are month, day, and year. The parts have some meaning by themselves, but the real meaning is in the vector—the date—as a whole because it is a complete entity. The different date formats used in computer systems give examples of all the options.
The three components can be written in year-month-day order, month-day-year order, or just about any other way you wish.

The limits on the values for the day depend on the year (is it a leap year or not?) and the month (28, 29, 30, or 31 days?). The components can be separated by punctuation (12/1/2005, using slashes and American date format), symbol-set changes (2005 DEC 01, using digits-letters-digits), or position (20051201, using positions 1 to 4, 5 to 6, and 7 to 8 for year, month, and day, respectively).

Another example is the ISO code for tire sizes, which is made up of a wheel diameter (scaled in inches), a tire type (abbreviation code), and a width (scaled in millimeters). Thus, 15R155 means a 15-inch radial tire that is 155 millimeters wide, whereas 15SR155 is a steel-belted radial tire with the same dimensions. Despite the mixed American and ISO units, this is a general physical description of a tire in a single code.

Vector schemes are informative and allow you to pick the best scheme for each component, but they have to be disassembled to get to the components (many database products provide special functions to do this for dates, street addresses, and people's names). Sorting by components is difficult unless you want them in the order given in the encoding; try to sort the tire sizes by construction, width, and diameter instead of by diameter, construction, and width.

Another disadvantage is that a bad choice in one component can destroy the usefulness of the whole scheme. Another problem is extending the code. For example, if the standard tire number had to be expanded to include thickness in millimeters, where would that measurement go? Another number would have to be separated by a punctuation mark. It could not be inserted into a position inside the code without giving ambiguous codes. The code cannot be easily converted to a fixed-position vector encoding without changing many of the database routines.

## 5.2.7 Concatenation Encoding

A concatenation code is made up of a variable number of components that are concatenated together. As in a vector encoding, the components can be ordered or unordered, dependent on or independent of each other, and determined by punctuation, symbol-set changes, or position.

A concatenation code is often a hierarchy that is refined by additions to the right. These are also known as _facet codes_ in Europe. Or the code can be a list of features, any of which can be present or missing. The order of the components may or may not be important.

Concatenation codes were popular in machine shops at the turn of the 20th century: A paper tag was attached to a piece of work, and workers at different stations would sign off on their parts of the manufacturing process. Concatenation codes are still used in parts of the airplane industry, where longer codes represent subassemblies of the assembly in the head (also called the root or parent) of the code.

Another type of concatenation code is a quorum code, which is not ordered. These codes say that n out of k marks must be present for the code to have meaning. For example, three out of five inspectors must approve a part before it passes.

The most common use of concatenation codes is in keyword lists in the header records of documents in textbases. The author or librarian assigns each article in the system a list of keywords that describes the material covered by the article.
The keywords are picked from a limited, specialized vocabulary that belongs to a particular discipline.

Concatenation codes fell out of general use because their variable length made them more difficult to store in older computer systems, which used fixed-length records (think of a punchcard). The codes had to be ordered and stored as left-justified strings to sort correctly.

These codes could also be ambiguous if they were poorly designed. For example, is the head of 1234 the 1 or the 12 substring? When concatenation codes are used in databases, they usually become a set of yes/no checkboxes, represented as adjacent columns in the file. This makes them Boolean vector codes, instead of true concatenation codes.

# 5.3 General Guidelines for Designing Encoding Schemes

These are general guidelines for designing encoding schemes in a database, not firm, hard rules. You will find exceptions to all of them.

## 5.3.1 Existing Encoding Standards

The use of existing standard encoding schemes is always recommended. If everyone uses the same codes, data will be easy to transfer and collect uniformly. Also, someone who sat down and did nothing else but work on this scheme probably did a better job than you could while trying to get a database up and running.

As a rule of thumb, if you don't know the industry in which you are working, ask a subject-area expert. Although that sounds obvious, I have worked on a media library database project where the programmers actively avoided talking to the professional librarians who were on the other side of the project. As a result, recordings were keyed on GUIDs and there were no Schwann catalog numbers in the system. If you cannot find an expert, then Google for standards. First, check to see if ISO has a standard, then check the U.S. government, and then check industry groups and organizations.

## 5.3.2 Allow for Expansion

Allow for expansion of the codes. The ALTER statement can create more storage when a single-character code becomes a two-character code, but it will not change the spacing on the printed reports and screens. Start with at least one more decimal place or character position than you think you will need. Visual psychology makes "01" look like an encoding, whereas "1" looks like a quantity.

## 5.3.3 Use Explicit Missing Values to Avoid NULLs

#### Rationale:

Avoid using NULLs as much as possible by putting special values in the encoding scheme instead. SQL handles NULLs differently than values, and NULLs don't tell you what kind of missing value you are dealing with.

All-zeros are often used for missing values and all-nines for miscellaneous values. For example, the ISO gender codes are 0 = Unknown, 1 = Male, 2 = Female, and 9 = Not Applicable. "Not applicable" means a lawful person, such as a corporation, which has no gender.

Versions of FORTRAN before the 1977 standard read blank (unpunched) columns in punchcards as zeros, so if you did not know a value, you skipped those columns and punched them later, when you did know. Likewise, using encoding schemes with leading zeros was a security trick to prevent blanks in a punchcard from being altered. The FORTRAN 77 standard fixed its "blank versus zero" problem, but it lives on in SQL in poorly designed systems that cannot tell a NULL from a blank string, an empty string, or a zero.

The use of all-nines or all-Z's for miscellaneous values will make those values sort to the end of the screen or report.
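
A minimal sketch of this style of declaration, using the ISO codes above (the table and column names are illustrative):

```sql
CREATE TABLE Personnel
(emp_nbr  INTEGER NOT NULL PRIMARY KEY,
 emp_name VARCHAR(35) NOT NULL,
 sex_code INTEGER DEFAULT 0 NOT NULL      -- 0 = unknown: explicit, not NULL
     CHECK (sex_code IN (0, 1, 2, 9)));   -- 9 = not applicable (lawful persons)
```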

NULLs sort either always to the front or always to the rear, but which way they sort is implementation defined.

#### Exceptions:

Sometimes NULLs cannot be avoided. For example, consider the column "termination_date" in the case of a newly hired employee. The use of a NULL makes computations easier and correct. The code simply leaves the NULL date or uses COALESCE (termination_date, CURRENT_TIMESTAMP) as is appropriate.

## 5.3.4 Translate Codes for the End User

As much as possible, avoid displaying pure codes to users, and try to provide a translation for them. Translation in the front end is not required for all codes, if they are common and well known to users. For example, most people do not need to see the two-letter state abbreviation written out in words. At the other extreme, however, nobody could read the billing codes used by several long-distance telephone companies.

A part of translation is formatting the display so that it can be read by a human being. Punctuation marks, such as dashes, commas, currency signs, and so forth, are important. However, in a tiered architecture, display is done in the front end, not the database. Trying to put leading zeros or add commas to numeric values in the database is a common newbie error. Suddenly, everything is a string and you lose all temporal and numeric computation ability.

These translation tables are one kind of auxiliary table; we will discuss other types later. They do not model an entity or relationship in the schema but are used like a function call in a procedural language. The general form for these tables is a simple read-only table with the encoding as the key and its definition or translation as the dependent column (a sketch appears in section 5.3.5).

Sometimes you might see the definition as part of the primary key or a CHECK() constraint on the "encode" column, but because these are read-only tables, which are maintained outside of the application, we generally do not worry about having to check their data integrity in the application.

### 5.3.4.1 One True Lookup Table

Sometimes a practice is both so common and so stupid that it gets a name, and, much like a disease, if it is really bad, it gets an abbreviation. I first ran into the One True Lookup Table (OTLT) design flaw in a thread on a CompuServe forum in 1998, but I have seen it rediscovered in newsgroups every year since.

Instead of keeping each encoding and its definition in a table of its own, we put all of the encodings in one huge table. The schema for this table is roughly a "code_type" column, a "code_value" column, and a "code_definition" column, all declared as VARCHAR(n) or VARCHAR(m). In practice, _m_ and _n_ are usually something like 255 or 50—default values particular to their SQL product.

The rationale for having all encodings in one table is that it would let the programmer write a single front-end program to maintain all of the encodings. This method really stinks, and I strongly discourage it. Without looking at the following paragraphs, sit down and make a list of all the disadvantages of this method and see if you found anything that I missed. Then read the following list:

1. _Normalization._ The real reason that this approach does not work is that it is an attempt to violate first normal form. I can see that these tables have a primary key and that all of the columns in a SQL database have to be scalar and of one data type, but I will still argue that it is not a first normal form table. The fact that two domains use the same data type does not make them the same attribute. The extra "code_type" column changes the domain of the other columns and thus violates first normal form because the column is not atomic. A table should model one set of entities or one relationship, not hundreds of them.
As Aristotle said, "To be is to be something in particular; to be nothing in particular is to be nothing."

2. _Total storage size._ The total storage required for the OTLT is greater than the storage required for the one-encoding, one-table approach because of the redundant encoding type column. Imagine having the entire International Classification of Diseases (ICD) and the Dewey Decimal system in one table. With separate tables, only the small encoding tables actually needed are brought into main storage; the entire OTLT has to be paged in and out of main storage to jump from one encoding to another.

3. _Data types._ All encodings are forced into one data type, which has to be a string of the largest length that any encoding—present and future—uses in the system, but VARCHAR(n) is not always the best way to represent data. The first thing that happens is that someone inserts a huge string that looks right on the screen but has trailing blanks or an odd character on the far right side of the column. The table quickly collects garbage.

CHAR(n) data often has advantages for access and storage in many SQL products. Numeric encodings can take advantage of arithmetic operators for ranges, check digits, and so forth with CHECK() clauses. Dates can be used as codes that are translated into holidays and other events. Data types are not a one-size-fits-all affair. If one encoding allows NULLs, then all of them must in the OTLT.

4. _Validation._ The only way to write a CHECK() clause on the OTLT is with a huge CASE expression that branches on the "code_type" column, with one WHEN clause per encoding scheme in the system. This means that validation is going to take a long time, because every change will have to be considered by all the WHEN clauses in this oversized CASE expression until the SQL engine finds one that tests TRUE. You also need to add a CHECK() clause to the "code_type" column to be sure that the user does not create an invalid encoding name.

5. _Flexibility._ The OTLT is created with one column for the encoding, so it cannot be used for (n)-valued encodings where (n > 1). For example, if I want to translate (longitude, latitude) pairs into a location name, I would have to carry an extra column.

6. _Maintenance._ Different encodings can use the same value, so you constantly have to watch which encoding you are working with. For example, both the ICD and the Dewey Decimal system have three digits, a decimal point, and three digits.

7. _Security._ To avoid exposing rows in one encoding scheme to unauthorized users, the OTLT has to have VIEWs defined on it that restrict users to the "code_type"s they are allowed to update. At this point, some of the rationale for the single table is gone, because the front end must now handle VIEWs in almost the same way it would handle multiple tables. These VIEWs also have to have the WITH CHECK OPTION clause, so that users do not make a valid change that is outside the scope of their permissions.

8. _Display._ You have to CAST() every encoding for the front end. This can be a lot of overhead and a source of errors when the same monster string is CAST() to different data types in different programs.

## 5.3.5 Keep the Codes in the Database

A part of the database should have all of the codes stored in tables. These tables can be used to validate input, to translate codes in displays, and as part of the system documentation.
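
The general form mentioned in section 5.3.4 is a sketch like this (the names are illustrative; the "date effective" pair anticipates the point below about codes that change over time):

```sql
CREATE TABLE DiseaseCodes
(disease_code         CHAR(7) NOT NULL,
 disease_description  VARCHAR(50) NOT NULL,
 effective_start_date DATE NOT NULL,
 effective_end_date   DATE,               -- NULL = still in effect
 PRIMARY KEY (disease_code, effective_start_date));
```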

I was amazed to go to a major hospital in Los Angeles in mid-1993 and see the clerk still looking up codes in a dog-eared looseleaf notebook instead of bringing them up on her terminal screen. The hospital was still using an old IBM mainframe system, which had dumb 3270 terminals, rather than a client/server system with workstations. There was not even a help screen available to the clerk.

The translation tables can be downloaded to the workstations in a client/server system to reduce network traffic. They can also be used to build picklists on interactive screens and thereby reduce typographical errors. Changes to the codes are thereby propagated in the system without anyone having to rewrite application code. If the codes change over time, the table for a code should include a pair of "date effective" columns. This will allow a data warehouse to correctly read and translate old data.

# 5.4 Multiple Character Sets

Some DBMS products can support ASCII, EBCDIC, and Unicode. You need to be aware of this, so you can set proper collations and normalize your text.

The predicate "<string> IS [NOT] NORMALIZED" in SQL-99 determines if a Unicode string is in one of four normal forms (i.e., D, C, KD, and KC). The use of the words _normal form_ here is not the same as in a relational context. In the Unicode model, a single character can be built from several other characters. Accent marks can be put on basic Latin letters. Certain combinations of letters can be displayed as ligatures (ae becomes æ). Some languages, such as Hangul (Korean) and Vietnamese, build glyphs from concatenating symbols in two dimensions. Some languages have special forms of one letter that are determined by context, such as the terminal sigma in Greek or the accented u in Czech. In short, writing is more complex than putting one letter after another.

The Unicode standard defines the order of such constructions in their normal forms. You can still produce the same results with different orderings and sometimes with different combinations of symbols, but it is handy when you are searching such text to know that it is normalized rather than trying to parse each glyph on the fly. You can find details about normalization and links to free software at www.unicode.org.

CHAPTER 6 Coding Choices

_"Caesar: Pardon him, Theodotus. He is a barbarian and thinks the customs of his tribe and island are the laws of nature."_

—_Caesar and Cleopatra_, by George Bernard Shaw, 1898

THIS CHAPTER DEALS WITH writing good DML statements in Standard SQL. That means they are portable and can be optimized well by most SQL dialects. I define _portable_ to mean one of several things. The code is standard and can be run as-is on other SQL dialects; standard implies portable. Or the code can be converted to another SQL dialect in a simple mechanical fashion, or the feature used is so universal that all or most products have it in some form; portable does not imply standard. You can get some help with this concept from the X/Open SQL Portability Guides.

A major problem in becoming a SQL programmer is that people do not unlearn the procedural or OO programming they had to learn for their first languages. They do not learn how to think in terms of sets and predicates, and so they mimic the solutions they know in their first programming languages. Jerry Weinberg (1978) observed this fact more than 25 years ago in his classic book, _The Psychology of Computer Programming_. He was teaching PL/I.
For those of you younger readers, PL/I was a language from IBM that was a hybrid of FORTRAN, COBOL, and ALGOL, and that enjoyed a brief craze. Weinberg found that he could tell the first programming languages of the students by how they wrote PL/I. My personal experience (1989) was that I could guess the nationality of the students in my C and Pascal programming classes because of their native spoken language.

Another problem in becoming a SQL programmer is that people tend to become SQL dialect programmers and think that their particular product's SQL is some kind of standard. In 2004, I had a job interview for a position where I was being asked to evaluate different platforms for a major size increase in the company's databases. The interviewer kept asking me "general SQL" questions based on the storage architecture of the only product he knew.

His product is not intended for Very Large Database (VLDB) applications, and he had no knowledge of Nucleus, Teradata, Model 204, or other products that compete in the VLDB arena. He had spent his career tuning one version of one product and could not make the jump to anything different, even conceptually. His career is about to become endangered.

There is a place for the specialist dialect programmer, but dialect programming should be a last resort in special circumstances and never the first attempt. Think of it as cancer surgery: You do massive surgery when there is a bad tumor that is not treatable by other means; you do not start with it when the patient comes in with acne.

# 6.1 Pick Standard Constructions over Proprietary Constructions

There is a fact of life in the IT industry called the Code Museum Effect, which works like this: First, each vendor adds a feature to its product. The feature is deemed useful, so it gets into the next version of the standard with slightly different syntax or semantics, but the vendor is stuck with its proprietary syntax. Its users have written code based on it, and they do not want to redo it. The solutions are the following:

1. _Never implement the standard and just retain the old syntax._ The problem is that you cannot pass a conformance test, which can be required for government and industry contracts. SQL programmers who know the standard from other products cannot read, write, or maintain your code easily. In short, you have the database equivalent of last year's cell phone.

2. _Implement the standard, but retain the old syntax, too._ This is the usual solution for a few releases. It gives the users a chance to move to the standard syntax but does not break the existing applications. Everyone is happy for a while.

3. _Implement the standard and deprecate the old syntax._ The vendor is ready for a major release, which lets it redo major parts of the database engine. Changing to the standard syntax and not supporting the old syntax at this point is a good way to force users to upgrade their software and help pay for that major release.

A professional programmer would be converting his or her old code at step two to avoid being trapped in the Code Museum when step three rolls around. Let's be honest: Massive code conversions do not happen until after step three occurs in most shops, and they are a mess, but you can start to avoid the problems by always writing standard code in a step two situation.

## 6.1.1 Use Standard OUTER JOIN Syntax

#### Rationale:

Here is how the standard OUTER JOINs work in SQL-92.
Assume you are given two tables, Table1 and Table2, and the OUTER JOIN expression "Table1 LEFT OUTER JOIN Table2 ON Table1.a = Table2.a". We call Table1 the "preserved table" and Table2 the "unpreserved table" in the query. What I am going to give you is a little different from, but equivalent to, the ANSI/ISO standards.

1. We build the CROSS JOIN of the two tables. Scan each row in the result set.

2. If the predicate tests TRUE for that row, then you keep it. You also remove all rows derived from it from the CROSS JOIN.

3. If the predicate tests FALSE or UNKNOWN for that row, then keep the columns from the preserved table, convert all the columns from the unpreserved table to NULLs, and remove the duplicates.

You can execute this by hand: Write out "Table1 CROSS JOIN Table2", apply the rules above, and compare what remains with the result of "Table1 LEFT OUTER JOIN Table2". The basic rule is that every row in the preserved table is represented in the results in at least one result row.

### 6.1.1.1 Extended Equality and Proprietary Syntax

Before the standard was set, vendors all had a slightly different syntax with slightly different semantics. Most of them involved an extended equality operator based on the original Sybase implementation. There are limitations and serious problems with the extended equality, however. Consider the two classic Chris Date tables of suppliers and shipments, and a Sybase-style extended equality OUTER JOIN that adds a local predicate, "(qty < 200)", on a column of the unpreserved table. If the engine does the OUTER JOIN first and then applies the "(qty < 200)" predicate, you get one result; doing it in the opposite order (filtering first, then preserving rows) produces a different result.

Sybase does it one way, Oracle does it another, and Centura (née Gupta) lets you pick which one to use—the worst of both nonstandard worlds! In SQL-92, you have a choice and can force the order of execution: Either do the predicate after the join, in the WHERE clause, or do it in the joining itself, as part of the ON clause.

Another problem is that you cannot show the same table as preserved and unpreserved in the extended equality version, but it is easy in SQL-92. For example, you can join an enrollment table to itself to find the students who have taken Math 101 and might have taken Math 102.

#### Exceptions:

None. Almost every vendor, major and minor, has the ANSI infixed OUTER JOIN operator today. You will see various proprietary notations in legacy code, and you can convert them by following the discussion given previously.

## 6.1.2 Infixed INNER JOIN and CROSS JOIN Syntax Is Optional, but Nice

SQL-92 introduced the INNER JOIN and CROSS JOIN operators to match the OUTER JOIN operators and complete the notation; other infixed JOIN operators are not widely implemented but exist for completeness. The functionality of the INNER JOIN and CROSS JOIN existed in the FROM clause before and did not give the programmer anything new, as the OUTER JOINs did.

#### Rationale:

The CROSS JOIN is a handy piece of documentation that is much harder to miss seeing than a simple comma. Likewise, writing out INNER JOIN instead of the shorthand JOIN helps document the code.

However, many INNER JOIN operators can be visually confusing, and you might consider using the older syntax. The older syntax lets you put all of the predicates in one place and group them in some manner for readability. A rule of thumb is the "rule of five" in human psychology. This says that we have problems handling more than five things at once, get serious problems with seven, and break down at nine (Miller 1956).

So when you have fewer than five tables, the infixed operators are fine, but they are questionable for more than five INNER JOIN-ed tables. Trying to associate ON clauses to INNER JOIN operators is visually difficult.
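
A minimal sketch of the syntax choices, with illustrative table names:

```sql
-- Old style: join conditions and search arguments together in WHERE.
SELECT O.order_nbr, C.cust_name
  FROM Orders AS O, Customers AS C
 WHERE O.cust_nbr = C.cust_nbr
   AND O.order_date >= DATE '2005-01-01';

-- New style: everything in the FROM clause with infixed operators.
SELECT O.order_nbr, C.cust_name
  FROM Orders AS O
       INNER JOIN Customers AS C
       ON O.cust_nbr = C.cust_nbr
      AND O.order_date >= DATE '2005-01-01';

-- Mixed style: join conditions in the ON clause, search arguments
-- in the WHERE clause, where they are easy to find and change.
SELECT O.order_nbr, C.cust_name
  FROM Orders AS O
       INNER JOIN Customers AS C
       ON O.cust_nbr = C.cust_nbr
 WHERE O.order_date >= DATE '2005-01-01';
```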

In particular, a Star Schema has an easily recognized pattern of joins from the fact table to each dimension table: The WHERE clause is a vertical list of "fact-table column = dimension-table key" predicates, one per dimension. The reader can look down the right-hand side of the WHERE clause and see the dimensions in a vertical list.

One style that is popular is to put the join conditions in the FROM clause with INNER JOIN syntax, and then do the search arguments in the WHERE clause. Some newbies believe that this is required, but it is not. However, if the search arguments change, having them in one place is handy.

A quick heuristic when using old-style joins is that the number of tables in the FROM clause should be one more than the number of join conditions in the WHERE clause. This shows that you do not have cycles in the joins. If the difference between the number of tables and the number of join conditions is more than one, then you might have an unwanted CROSS JOIN caused by a missing join condition. The old, new, and mixed styles are contrasted in the sketch shown above.

#### Exceptions:

The infixed join operators must be used if there is an OUTER JOIN in the FROM clause. The reason is that the order of execution matters with OUTER JOINs, and you can control it better with parentheses and predicates if they are all together.

As a rule of thumb, when you have a FROM clause with five or more tables in it, the traditional syntax is probably easier to read than trying to visually match the ON clauses to the proper tables and correlation names. This rule of five is mentioned in other places as a limit on human data processing ability.

## 6.1.3 Use ISO Temporal Syntax

#### Rationale:

The only display format allowed for temporal data in Standard SQL is based on ISO-8601, and it is the "yyyy-mm-dd hh:mm:ss.sssss" style. The Federal Information Processing Standards (FIPS) require at least five decimal places of precision in the seconds. Anything else is ambiguous and not acceptable if you want to work with other software that follows ISO standards.

Standard SQL defines a minimal set of simple temporal math operators. All of them are available in all SQL products, but the syntax varies. For example, in the T-SQL dialect, the function call "DATEADD (DD, 13, birthdate)" adds 13 days to the date in birthdate. The Standard SQL syntax for the same calculation is "birthdate + INTERVAL '13' DAY" instead.

You can set the display to ISO-8601 in every SQL product, and you can do 99.99 percent of your temporal work without any proprietary temporal functions. The problem is that porting code can be a bother. You need to make a set of notes about any differences between your dialect and the standard.

#### Exceptions:

None. Display formatting is always done in the client layer of a tiered architecture. This is a basic programming principle and has nothing to do with SQL per se. Failure to follow this principle is usually the result of a newbie who came to SQL from a traditional monolithic language with a strong coupling between the application, the display, and the file system.

## 6.1.4 Use Standard and Portable Functions

#### Rationale:

Standard SQL is not a computational language, so it does not have the function library of FORTRAN or a statistical package. SQL is not a text manipulation language, so it does not have the function library of ICON or Snobol. All you have is simple four-function math and basic string operators in SQL-92. Vendors have always provided more than just the basic operators, so you can write portable code that assumes other math and string functions.
The most common extra math functions are modulus, rounding and truncation, powers, and logarithms. The most common extra string functions are replacement, reversal, and repetition.

#### Exceptions:

If your dialect has a function built into it that would require a huge amount of code or a really long running time to implement by hand, then use the proprietary function and comment it for porting.

# 6.2 Pick Compact Constructions over Longer Equivalents

_"Entia non sunt multiplicanda praeter necessitatem."_ (No more things should be presumed to exist than are absolutely necessary.)

—William Occam (c. 1280–1349)

_"Everything should be made as simple as possible, but not simpler."_

—Attributed to Albert Einstein

Writing code in as short, clear, and compact a form as possible is just good software engineering for any programming language. Modules that clearly do one function are easier to modify and to understand. Systems with fewer modules are easier to maintain.

SQL can replace hundreds of lines of procedural code with a few statements. You ought to be predisposed to think of short, clean solutions instead of kludges. However, old habits are hard to kill. Many newbies still think in terms of logical tests based on Boolean logic and simple AND-OR-NOT expressions that they know from their first programming languages.

## 6.2.1 Avoid Extra Parentheses

#### Rationale:

Newbies see generated SQL code that has to have extra levels of parentheses to execute safely and think that this is the way to write code. A simple query whose generated WHERE clause wraps every comparison, and every pairing of comparisons, in its own parentheses is not so bad to read, but by the time you have more than five predicates and useless nesting of parentheses, the code is difficult to read, and a missing parenthesis is a real pain to locate. Let LISP programmers use them; they really need parentheses.

#### Exceptions:

Parentheses in moderation can make nested predicates easier to read; compare "(a = 1 AND b = 2) OR (a = 3 AND b = 4)" versus the same predicate left entirely to the precedence rules of AND and OR. In the following section, we will also see how to use a CASE expression for situations like this one.

## 6.2.2 Use CASE Family Expressions

The CASE expression is an expression and not a control statement; that is, it returns a value of one data type. Because SQL is declarative, there is no flow of control for it to modify, like the CASE statements in other languages. The number of newbies who do not understand the difference between an expression and a statement is frightening.

The idea and the syntax came from the ADA programming language. Here is the formal BNF syntax for a <case specification>; I recommend always giving the ELSE clause, so that you can change it later when you find something explicit to return.

<case specification> ::= <simple case expression> | <searched case expression>

<searched case expression> ::= CASE <searched when clause>... [ELSE <result>] END

<searched when clause> ::= WHEN <search condition> THEN <result>

### 6.2.2.2 Simple CASE Expression

The <simple case expression> is defined as a searched CASE expression in which all of the WHEN clauses are made into equality comparisons against the <case operand>.

### 6.2.2.3 Other CASE Expressions

The SQL-92 standard defines other functions in terms of the CASE expression:

1. COALESCE (<value exp #1>) is equivalent to (<value exp #1>)

2. COALESCE (<value exp #1>, <value exp #2>) is equivalent to:

CASE WHEN <value exp #1> IS NOT NULL THEN <value exp #1> ELSE <value exp #2> END

Then we can recursively define it for (n) expressions, where (n >= 3), in the list, with COALESCE (<value exp #1>, <value exp #2>, ..., <value exp #n>) equivalent to:

CASE WHEN <value exp #1> IS NOT NULL THEN <value exp #1> ELSE COALESCE (<value exp #2>, ..., <value exp #n>) END

Likewise, NULLIF (<value exp #1>, <value exp #2>) is equivalent to:

CASE WHEN <value exp #1> = <value exp #2> THEN NULL ELSE <value exp #1> END

Use the most compact form of these CASE expressions, and do not expand them out to their definitions.

## 6.2.3 Avoid Redundant Expressions

#### Rationale:

Most modern SQL engines are pretty smart. This was not always the case, so older SQL programmers will sometimes add redundant predicates to a WHERE clause.
For example, if none of the columns in the table Foobar is NULL-able, then in a query such as "SELECT * FROM Foobar WHERE a = b AND b = c AND a = c", one of the three search conditions is redundant, because it can be deduced from the other two. Redundant predicates only confuse the human readers and do not give information to a good optimizer.

#### Exceptions:

If your SQL has a bad optimizer and needs the extra help, then add redundant predicates.

## 6.2.4 Seek a Compact Form

#### Rationale:

Many of the earlier SQL engines could not use an index on a column if it were in an expression, and they did not do any algebraic optimizations. Today, we still do this bit of cleanup work because the simpler, algebraically reduced form of an expression is easier to maintain and to read. Move the constants to one side of the comparison and leave the bare column on the other, so that, for example, "(salary * 12) >= 120000.00" becomes "salary >= 10000.00".

#### Exceptions:

If your SQL has a really good optimizer, and the complicated form is easier for a human being to read for some reason, then use it. Sometimes there is no simple form.

### 6.2.4.1 Use BETWEEN, Not AND-ed Predicates

#### Rationale:

Consider the simple search condition "low_score <= score AND score <= high_score", which can be written as "score BETWEEN low_score AND high_score". The BETWEEN is more compact and gives the reader information about the relationship among the three columns that might not be so obvious amid a longer list of search conditions.

#### Exceptions:

This rule makes sense from a readability standpoint, but it does not always stand up in terms of performance. Consider DB2 for z/OS, in which "<column> BETWEEN <value1> AND <value2>" is both indexable and a stage one predicate. Without explaining what a stage one predicate is, it is preferred for performance.

However, "<value> BETWEEN <column1> AND <column2>" is both stage two and nonindexable, but formulating the same condition using two <= predicates could be both stage one and indexable and therefore preferable for performance. Likewise, the same caveat applies to "<column1> BETWEEN <column2> AND <column3>" predicates. This will differ from DBMS to DBMS and platform to platform. As optimizers get better, this will be less and less true.

### 6.2.4.2 Use IN(), Not OR-ed Predicates

#### Rationale:

The IN() predicate was first introduced in the Pascal programming language. In SQL it has two forms: the list and the subquery. The list form has a comma-separated list of values or expressions on the right-hand side. The predicate returns a TRUE result if there is a match in that list with the left-hand side of the predicate. It is shorthand for a list of OR-ed predicates. For example, consider "x = 1 OR x = 2 OR x = 3", which can be written as "x IN (1, 2, 3)". The IN() is more compact and shows the reader the values as a set, which might not be so obvious amid a longer list of search conditions. The list can also consist of scalar expressions, but that is not common.

#### Exceptions:

Watch out for NULLs! The IN() predicate is defined as a chain of OR-ed predicates, thus "x IN (a, b, c)" means "(x = a OR x = b OR x = c)". Therefore, a NULL in the list can never produce a match; "x = NULL" tests UNKNOWN, not TRUE. We are now in SQL's three-valued logic. Remember that a NULL is not the same thing as an UNKNOWN; SQL-92 has no Boolean data type; and you cannot use AND, OR, and NOT on a NULL.

The NOT IN() predicate is defined as the negation of the IN(), so "x NOT IN (a, b, c)" means "NOT (x = a OR x = b OR x = c)", which becomes "(x <> a AND x <> b AND x <> c)". Now put in a NULL for one of the list elements: "(x <> a AND x <> NULL AND x <> c)" can never test TRUE; at best it is UNKNOWN, and the row is rejected.

If you wish to have a match on a NULL in a list, then you can COALESCE() the NULLs to the left-hand expression, thus: "COALESCE (x, 'missing') IN (a, 'missing', c)", where 'missing' is a dummy value standing in for the NULL in the list; this is a little cleaner than "(x IN (a, c) OR x IS NULL)".

### 6.2.4.3 Use CASE Expressions, Not Complex Nested Predicates

An advanced trick in the WHERE clause is to use a CASE expression for a complex predicate with material implications.
If you forgot your freshman logic, a material implication is written as an arrow with two tails, and it means "p implies q" or "if p is true, then q is true" in English.

The use of a function that returns one or zero when given a predicate as its parameter is called a _characteristic function_ in logic and set theory.

Review the rules for the CASE expression in section 6.2.2 first, so you understand it. The order of execution of the WHEN clauses can be used to optimize performance and avoid redundant tests. You can also nest CASE expressions inside the WHEN and THEN clauses of a containing CASE expression and display the logic as an indented tree structure.

The goal of this technique is to replace pages of long lists of simple theta expressions inside horrible levels of parentheses and to provide some short-circuit evaluation as a bonus. When the nesting is too messy to understand, stop and reconsider your logic. Decision table tools, such as Logic Gem, are an excellent way to do this.

# 6.3 Use Comments

#### Rationale:

The best documentation for maintaining a program has always been comments in the code. Perhaps it is easier for procedural language programmers to add comments because they are explaining in a narrative fashion what their program is doing. Unfortunately, procedural language comments are often redundant if you can read the code. How much help did you ever get from a comment like "score = score + 1; -- add one to score", which gives you no information about what the variable score means and why it is incremented?

In Standard SQL, a comment begins with two dashes (--) and ends with a new line, because the first SQL engines were on IBM mainframes and used punchcards. This format is a poor choice with modern computers that can store free-form text. Word wrap in program text can split a comment and give you errors. Because SQL supports the unary minus operator, this notation is ambiguous in some rare situations and makes the compiler work extra hard. Later standards added the C-style /* and */ pairs, and many vendors have similar comment brackets. They are a better choice.

SQL programmers do not like to put comments in their code, not even redundant or useless ones. My guess is that because SQL does a lot of work in one statement and programmers have been taught to comment the code at the statement execution level rather than explain the purpose of the code, the higher level of abstraction confuses them. They are not inclined to put comments at the clause level because the appearance of the code can be crowded.

Get over it. You need a high-level descriptive comment on a block of SQL, and then more detailed comments on a few important clauses. Try to keep the comments aimed at non-SQL programmers and in plain English. For example, don't say "relational division of motor pool vehicles by available drivers" on the assumption that the reader will know what a relational division is. Try "list all drivers who can drive all the vehicles in the motor pool" instead. The other trick is to reference the documentation for the schema and the applications. This assumes that they are current and useful, however.

If you have the time, another guru-level trick is to save the best of the various statements you tried that worked but did not perform as well as the final choice as comments. In SQL, what was the best answer in one situation is often no longer the best answer. Instead of making the next programmer start from scratch, share your notes.
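
A minimal sketch of the commenting style being recommended (the tables are illustrative; QualifiedDrivers holds one row per driver/vehicle-type pair, and MotorPool holds one row per vehicle type):

```sql
/* List all drivers who can drive all the vehicles in the motor pool
   (relational division). See the fleet-scheduling documentation. */
SELECT D1.driver_nbr
  FROM QualifiedDrivers AS D1,
       MotorPool AS M1
 WHERE D1.vehicle_type = M1.vehicle_type   -- keep only pool vehicles
 GROUP BY D1.driver_nbr
HAVING COUNT(*) = (SELECT COUNT(*)         -- driver covers the whole pool
                     FROM MotorPool);
```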

#### Exceptions:

In a well-designed schema with good data element names, much of the code is easy for an experienced SQL programmer to read. You can skip comments on single statements if their intent is really obvious, but remember that one programmer's "obvious" is another's "what the heck?" when you code.

## 6.3.1 Stored Procedures

Always start a stored procedure with a comment that gives at least the author, the date, and the update history. This is simply basic software management. After that, add a high-level description of the function of this module. The procedure name will be in a "<verb>_<object>" format. Each parameter should have a comment as needed.

## 6.3.2 Control Statement Comments

Comments on control statements, such as IF-THEN-ELSE, BEGIN-END, and WHILE-DO loops, will look much like comments in any procedural program. Complicated SQL statements need a comment at the top and often comments at the clause level.

## 6.3.3 Comments on Clauses

This point is difficult to generalize, but things that act as a unit might need a comment. For example, a derived table for which there is no good alias might need a comment to explain what it contains. A series of predicates that define a complicated join might be prefaced with a comment to explain what they are doing at a higher level.

# 6.4 Avoid Optimizer Hints

#### Rationale:

Many products have proprietary syntax for sending parameters to the optimizer to change the execution plan for a statement. Because each physical implementation is different, this syntax will not be portable, but there are other problems, too.

First, the optimizer is usually smarter than the programmer and finds a good plan. People cannot handle computations that involve tens of parameters very well. Second, once a hint is put on a statement, it stays there permanently, long after the reason for the hint is gone. A typical example would be a query hint set up for a skewed statistical distribution; as the database grows, the distribution becomes more normal, or skewed in the opposite direction. The hint that used to be so helpful is now a handicap.

#### Exceptions:

If you do have a skewed statistical distribution or other weirdness in your data that is destroying performance, then use a hint. Set up a review of all statements with hints to see if they actually need to be maintained. Reviews should occur when a new release of the database is installed (the optimizer might be better) or when the statistics of one or more of the tables change (the data might be better), but if the performance is acceptable, then do not use hints.

# 6.5 Avoid Triggers in Favor of DRI Actions

#### Rationale:

Although there is an ANSI/ISO standard for triggers, their syntax and semantics are still highly proprietary. Triggers are blocks of procedural code that are executed (fired) when a database event occurs to a table. This code is usually in a proprietary 3GL language. A database event is something that changes the data—an insert, update, or delete.

The full ANSI version of triggers does not fire on an insertion, but some vendor products do. The full ANSI version of triggers allows more than one trigger on a table and can fire them in a sequence either before or after the database event. Most vendor products do not have that much control over their triggers. On the other hand, the syntax and semantics for DRI actions are well defined and standardized.
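
For example, a minimal sketch of declarative referential actions (the names are illustrative, and a Customers table keyed on cust_nbr is assumed to exist):

```sql
CREATE TABLE Orders
(order_nbr INTEGER NOT NULL PRIMARY KEY,
 cust_nbr  INTEGER NOT NULL
     REFERENCES Customers (cust_nbr)
     ON UPDATE CASCADE      -- renumbering a customer propagates here
     ON DELETE CASCADE);    -- removing a customer removes the orders
```

The engine maintains the relationship declaratively; no procedural trigger code is needed, and the optimizer knows exactly what the actions mean.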

A newbie posted a topic under the title "Need Help with a Calculation Trigger" on the forums of the SQL Server Central Web site in November 2004. This person was having trouble setting up a trigger to check the units of a "number field [sic]"; the real problem was that the poster did not know that a column is not a field.

For some reason, the column was declared as FLOAT and was called length. The trouble was that some people were entering a length in meters, some in centimeters, and some in millimeters. The poster was trying to code a trigger that would fire on UPDATE or INSERT to check the value of length. If it is greater than 20, chances are the number is in millimeters and should be divided by 10. If the number is less than 2, then the number is probably in meters and should be multiplied by 100.

However, this is the wrong answer, because it is in procedural code. The right answer is in the DDL, with something like "length_cm DECIMAL (8,2) NOT NULL CHECK (length_cm BETWEEN 2.00 AND 20.00)", where the unit of measure is part of the column name and the constraint rejects out-of-range values at the door. Triggers tend to fix errors on the fly; the goal is not to permit them in the first place.

#### Exceptions:

Some things should be done with triggers because you cannot do them with DRI. In particular, the INSTEAD OF trigger has to be used for updatable views. This trigger is attached to a VIEW, and instead of taking actions on the VIEW, it changes the base tables from which the VIEW is built, so that the user sees those changes reflected in the VIEW.

Heuristics tend to favor stored procedures over triggers. A trigger fires every time its database event occurs, which puts it out of your control and adds that overhead to each database event. A stored procedure has to be deliberately executed, which puts it completely in your control. Furthermore, the syntax for triggers is proprietary despite the standards, so they do not port well.

# 6.6 Use SQL Stored Procedures

Every SQL product has some kind of 4GL language that allows you to write stored procedures that reside in the database and that can be invoked from a host program. Although there is a SQL/PSM standard, in the real world, only Mimer and IBM have implemented it at the time of this writing. Instead, each vendor has a proprietary 4GL, such as T-SQL for the Sybase/SQL Server family, PL/SQL from Oracle, Informix-4GL from Informix, and so forth. For more details on these languages, I recommend that you get a copy of Jim Melton's excellent book on the subject, _Understanding SQL's Stored Procedures_ (ISBN 1-55860-461-8, out of print). The advantages stored procedures have are considerable, including the following:

* _Security._ The users can only do what the stored procedure allows them to do, whereas dynamic SQL or other ad hoc access to the database allows them to do anything to the database. The safety and security issues ought to be obvious.
* _Maintenance._ The stored procedure can be easily replaced and recompiled with an improved version. All of the host language programs that call it will benefit from the improvements that were made and not be aware of the change.
* _Network traffic._ Because only parameters are passed, network traffic is lower than passing SQL code to the database across the network.
* _Consistency._ If a task is always done with a stored procedure, then it will be done the same way each time. Otherwise, you have to depend on all programmers (present and future) getting it right. Programmers are not evil, but they are human. When you tell someone that a customer has to be at least 18 years of age, one programmer will code "age > 18" and another will code "age >= 18" without any evil intent.
You cannot expect everyone to remember all of the business rules and write flawless code forever.
* _Modularity._ Once you have a library of stored procedures, you can reuse them to build other procedures. Why reinvent the wheel every week?

Chapter 8 is a general look at how to write stored procedures in SQL. If you look at any of the SQL newsgroups, you will see awful code. Apparently, programmers are not taking basic software engineering courses anymore, or they think that the old rules do not apply to a vendor's 4GL language.

# 6.7 Avoid User-Defined Functions and Extensions inside the Database

#### Rationale:

SQL is a set-oriented language and wants to work with tables rather than scalars, but programmers will try to get around this model of programming to return to what they know by writing user-defined functions in other languages and putting them into the database.

There are two kinds of user-defined functions and extensions. Some SQL products allow functions written in another standard language to become part of the database and to be used as if they were just another part of SQL. Others have a proprietary language in the database that allows the user to write extensions.

Even the SQL/PSM allows you to write user-defined functions in any of the ANSI X3J standard programming languages that have data-type conversions and interfaces defined for SQL. There is a LANGUAGE clause in the CREATE PROCEDURE statement for this purpose.

Microsoft has its common language runtime (CLR), which takes this one step further and embeds code from any compiler that can produce a CLR module in its SQL Server. Illustra's "data blade" technology is now part of Informix, IBM has "extenders" to add functionality to the basic RDBMS, and Oracle has various "Cartridges" for its product.

The rationale behind all of these various user-defined functions and extensions is to make the vendor's product more powerful and to avoid having to buy another package for nontraditional data, such as temporal and spatial information. However, user-defined functions are difficult to maintain, destroy portability, and can affect data integrity.

#### Exceptions:

You might have a problem that can be solved with such tools, but this is a rare event in most cases; most data processing applications can be done just fine with standard SQL. You need to justify such a decision and be ready to do the extra work required.

## 6.7.1 Multiple Language Problems

Programming languages do not work the same way, so by allowing multiple languages to operate inside the database, you can lose data integrity. Just as quick examples: How does your language compare strings? The Xbase family ignores case and truncates the longer string, whereas SQL pads the shorter string and is case sensitive. How does your language handle the MOD() function when one or both arguments are negative? How does your language handle rounding and truncation? By hiding the fact that there is an interface between the SQL and the 3GL, you hide the problems without solving them.

## 6.7.2 Portability Problems

The proprietary user-defined functions and extensions will not port to another product, so you are locking yourself into one vendor. It is also difficult to find programmers who are proficient in several languages to even maintain the code, much less port it.

## 6.7.3 Optimization Problems

The code from a user-defined function is not integrated into the compiler. It has to be executed by itself when it appears in an expression.
As a simple example of this principle, most compilers can do algebraic simplifications, because they know about the standard functions. They cannot do this with user-defined functions for fear of side effects. Also, 3GL languages are not designed to work on tables. You have to call the functions at the row level, which can be costly.

# 6.8 Avoid Excessive Secondary Indexes

First, not all SQL products use indexes: Nucleus is based on compressed bit vectors, Teradata uses hashing, and so forth. However, tree-structured indexes of various kinds are common enough to be worth mentioning. The X/Open SQL Portability Guides give a basic syntax that is close to that used in various dialects with minor embellishments. The user may or may not have control over the kind of index the system builds.

A primary index is an index created to enforce PRIMARY KEY and UNIQUE constraints in the database. Without them, your schema is simply not a correct data model, because no table would have a key.

A secondary index is an optional index created by the DBA to improve performance. The schema will return the same answers without its secondary indexes as it does with them, but perhaps not in a timely fashion—or even within the memory of living humans.

Indexes are one thing that the optimizer considers in building an execution plan. When and how the index is used depends on the kind of index, the query, and the statistical distribution of the data. A slight change to any of these could result in a new execution plan later. With that caveat, we can speak in general terms about tree-structured indexes.

If more than a certain percentage of a table is going to be used in a statement, then the indexes are ignored and the table is scanned from front to back. Using the index would involve more overhead than filtering the rows of the target table as they are read.

The fundamental problem is that redundant or unused indexes take up storage space and have to be maintained whenever their base tables are changed. They slow down every update, insert, or delete operation to the table. Although this event is rare, indexes can also fool the optimizer into making a bad decision. There are tools for particular SQL products that can suggest indexes based on the actual statements submitted to the SQL engine. Consider using one.

# 6.9 Avoid Correlated Subqueries

#### Rationale:

In the early days of SQL, the optimizers were not good at reducing complex SQL expressions that involved correlated subqueries. They would blindly execute loops inside loops, scanning the innermost tables repeatedly. The example used to illustrate this point was a pair of queries that produce the same results, where "x" is not NULL-able and table Foo is much larger than table Bar: "SELECT x FROM Foo WHERE x IN (SELECT x FROM Bar)" versus the correlated "SELECT x FROM Foo WHERE EXISTS (SELECT * FROM Bar WHERE Bar.x = Foo.x)".

In older SQL engines, the EXISTS() predicate would materialize a JOIN on the two tables and take longer. The IN() predicate would put the smaller table into main storage and scan it, perhaps sorting it to speed the search. This is not quite as true anymore. Depending on the particular optimizer and the access method, correlated subqueries are not the monsters they once were. In fact, some products let you create indexes that prejoin tables, so they are the fastest way to execute such queries.

However, correlated subqueries are confusing for people to read, and not all optimizers are that smart yet. For example, consider a table that models loans and payments with a status code for each payment. This is a classic one-to-many relationship.
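
The problem is to select the loans where all of the payments have a status code of 'F'. A minimal sketch of both approaches (the table and column names are illustrative, and payment_status is assumed NOT NULL):

```sql
-- Correlated version: works backward from the many side, scanning
-- Payments again for every candidate row.
SELECT DISTINCT P1.loan_nbr
  FROM Payments AS P1
 WHERE (SELECT COUNT(*)
          FROM Payments AS P2
         WHERE P2.loan_nbr = P1.loan_nbr
           AND P2.payment_status <> 'F') = 0;

-- Flattened version: starts from the one side with a simple GROUP BY.
SELECT loan_nbr
  FROM Payments
 GROUP BY loan_nbr
HAVING MIN(payment_status) = 'F'
   AND MAX(payment_status) = 'F';
```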

The first, correlated query is backward. It works from the many side of the relationship to the one side, but with a little thought and starting from the one side, you can get the second answer, in which a simple GROUP BY does all the work. The self-reference and correlation are complicated for both humans and machines. Most optimizers are not smart enough to flatten the first query into the second.

#### Exceptions:

If you have a problem that is easier to understand with correlated subqueries and your optimizer is good, then don't be so afraid of them.

# 6.10 Avoid UNIONs

#### Rationale:

UNIONs are usually not well optimized. Because they require that redundant duplicates be discarded, they force most SQL engines to do a sort before presenting the result set to the user. If possible, use UNION ALL instead. You should never have to build a chain of UNIONs from the same base table. That code can be written with OR-ed predicates or CASE expressions.

As an example of a horrible misuse of SQL, Chris White posted a procedure that built dynamic SQL that would then build a report. Aside from the obvious violations of basic software engineering, the output was so huge that it exceeded the text size limits of SQL Server. He was attempting to construct an entire report in the database by using UNIONs to get the 12 lines of the report in the right order, by assigning each line a letter of the alphabet. The whole thing would take several pages to show; even the extract that handled only the printout lines constructed from the General Ledger ran on and on, and it violated many good coding rules along the way. Notice that section, description, and branch were all placeholders to give a slot for columns in the other UNIONs.

That last part of the code could have been reduced to a single, cohesive procedure whose output would then be formatted in the front end.

#### Exceptions:

Sometimes the UNION [ALL] is what you actually want. The other set operations in SQL-92, EXCEPT [ALL] and INTERSECT [ALL], are not widely available yet.

# 6.11 Testing SQL

When you are first writing a schema, you will probably generate some test data. If you look in the literature, there is a thing called an Armstrong set, which is the minimal number of rows that will test all of the constraints in a schema. Although it is difficult to automatically create an Armstrong set, you can do a good job with a little effort.

## 6.11.1 Test All Possible Combinations of NULLs

#### Rationale:

NULLs behave strangely, and if there are problems, there is a good chance that a NULL will be involved. Newbies using graphic tools often leave more NULL-able columns in a single table than a professional would in an entire schema for a Fortune 500 company payroll.

#### Exceptions:

If the number of combinations is excessive, then look at a redesign rather than a stress test. It means you probably have too many NULL-able columns in the schema.

## 6.11.2 Inspect and Test All CHECK() Constraints

#### Rationale:

You can extract the CHECK() constraint predicates from the DDL and look at them. The first thing is to see if the same data element has the same rules in all of the tables. Some attributes will always have the same CHECK() constraints if the model is correct.
Some attributes may have different constraints in different tables. For example, it would be reasonable to have "quantity INTEGER DEFAULT 0 NOT NULL CHECK (quantity >= 0)" almost everywhere that the quantity attribute appears. However, you might find that there is also a "CHECK (quantity > 0)" on a table. Is this an error, or a situation where a zero quantity is disallowed? You need to look and see.

#### Exceptions:

None

## 6.11.3 Beware of Character Columns

#### Rationale:

Character columns seldom have enough constraints on them. The result is that they have extra blanks in them, allow mixed-case letters, and will pretty much hold any kind of garbage that a user wishes to put in them.

My favorite piece of test data for oversized, unconstrained NVARCHAR(n) columns is a collection of Buddhist sutras in Chinese Unicode. At least the users will learn a bit of classic Buddhist thought.

#### Exceptions:

None

## 6.11.4 Test for Size

#### Rationale:

One of the problems with small test data sets is that they will run just fine in the development shop, but when the size of the tables grows larger, you can get gradually degraded performance or catastrophe points. A catastrophe point is when there is a sudden change in performance—the straw that breaks the camel's back. There is usually a physical component to a catastrophe point, such as excessive paging to a hard drive. Frankly, there is not a lot you can do about it except wait and see if it was a fluke or if it happens again.

Gradually degraded performance is the nicer of the two situations. You can monitor the system, see the loss, and take action before anything bad happens. The bad news is that the term _gradual_ can be very short. The query that ran so well on a few thousand rows of test data is a pig when it goes live on several million rows of production data. Try to stress test on a data set that is larger than the current production database. That will let you know that you have some margin of error.

#### Exceptions:

None

CHAPTER 7 How to Use VIEWS

_The Blind Men and the Elephant_

By John Godfrey Saxe (1816–1887)

It was six men of Indostan
To learning much inclined,
Who went to see the Elephant
(Though all of them were blind),
That each by observation
Might satisfy his mind.

The First approached the Elephant,
And happening to fall
Against his broad and sturdy side,
At once began to bawl:
"God bless me! but the Elephant
Is very like a wall!"

The Second, feeling of the tusk,
Cried, "Ho! what have we here
So very round and smooth and sharp?
To me 'tis mighty clear
This wonder of an Elephant
Is very like a spear!"

The Third approached the animal,
And happening to take
The squirming trunk within his hands,
Thus boldly up and spake:
"I see," quoth he, "the Elephant
Is very like a snake!"

The Fourth reached out an eager hand,
And felt about the knee.
"What most this wondrous beast is like
Is mighty plain," quoth he;
"'Tis clear enough the Elephant
Is very like a tree!"

The Fifth, who chanced to touch the ear,
Said: "E'en the blindest man
Can tell what this resembles most;
Deny the fact who can,
This marvel of an Elephant
Is very like a fan!"
The Sixth no sooner had begun
About the beast to grope,
Than, seizing on the swinging tail
That fell within his scope,
"I see," quoth he, "the Elephant
Is very like a rope!"

And so these men of Indostan
Disputed loud and long,
Each in his own opinion
Exceeding stiff and strong,
Though each was partly in the right,
And all were in the wrong!

Moral:

So oft in theologic wars,
The disputants, I ween,
Rail on in utter ignorance
Of what each other mean,
And prate about an Elephant
Not one of them has seen!

VIEWs are virtual tables, defined by SELECT statements stored in the database. The SQL statement that defines the VIEW is executed only when the VIEW is invoked in another statement. The standard says that VIEWs are to act as if they are materialized, but in practice the optimizer will decide either to materialize them as physical tables or to insert the defining SELECT statement into the query as a derived table and compile it from there. There are six basic uses for VIEWs, which we will discuss.

# 7.1 VIEW Naming Conventions Are the Same as Tables

#### Rationale:

A VIEW is a logical table. It consists of rows and columns, exactly the same as a base table. A VIEW can be used in SELECT, UPDATE, DELETE, and INSERT statements in the same way that a base table can. Therefore, it stands to reason that VIEWs should use the same naming conventions as tables. As an aside, the same can be said for aliases, synonyms, derived tables, table-valued functions, or anything else that returns a table.

In particular, there is an absurd naming convention of putting a "v" or "vw" in the first or last position of a VIEW name. My guess is that it comes from programmers who are used to weakly typed languages that use Hungarian notation, or who worked with file systems that had to have prefixes to locate the physical drive for the file. Under ISO-11179 rules, the "vw" prefix implies that the VIEW is a table dealing with Volkswagens.

Individuals who need to differentiate between tables and VIEWs can use the schema information tables to determine which objects are VIEWs and which are base tables. They should be at the system administration level or higher.

INSERT, UPDATE, and DELETE are operations that cannot be performed on certain types of VIEWs. Users who need to perform these operations can be given INSTEAD OF triggers and never know whether they are dealing with a VIEW or a base table.

#### Exceptions:

None

## 7.1.1 Always Specify Column Names

#### Rationale:

When creating a VIEW, SQL provides the option of specifying new column names in the VIEW clause or defaulting to the same column names as the defining SELECT statement. It is always advisable to explicitly specify VIEW column names instead of allowing them to default, even when using the same names as the underlying base tables. This will provide more accurate documentation.

#### Exceptions:

Make sure that the VIEW clause names are correct. If you misspell them, that is what the user sees.

# 7.2 VIEWs Provide Row- and Column-Level Security

One of the most beneficial purposes served by VIEWs is to extend the data security features of SQL. VIEWs can be created that provide a subset of rows, a subset of columns, or a subset of both rows and columns from the base table.

How do VIEWs help provide row- and column-level security?
Consider a "Personnel" table that contains all of the pertinent information regarding an enterprise's employees. Typically, name, address, position, birthdate, and salary information would be contained in such a table. However, not every user will require access to all of this information. Specifically, it may become necessary to shield the salary information from most users. You can accomplish this by creating a VIEW that does not contain the salary column and then granting most users the ability to access the VIEW, instead of the base table. The salary column will not be visible to users of the VIEW. + +Or perhaps you need to implement security at the row level. Consider a table that contains project information. Typically, this would include project name, purpose, start date, and who is responsible for the project. Assume that the security requirements for projects within your organization deem that only the employee who is responsible for the project can access the project data. By storing the authorization ID of the responsible employee in the "projects" table, a VIEW can be created using the CURRENT_USER value. + +Or, if you need to limit access to a team, you can create a table of teams to which only team managers have access. + +Another trick is to use the CURRENT_TIMESTAMP or CURRENT_DATE in VIEWs to get an automatic update to schedules and other time-related events. + +Each time the VIEW is invoked, it will check the clock and see if anything has changed for you. + +# 7.3 VIEWs Ensure Efficient Access Paths + +By coding the appropriate join criteria into the VIEW definition SQL, you can ensure that the correct join predicate will always be used. Of course, this technique becomes more useful as the SQL becomes more complex. + +# 7.4 VIEWs Mask Complexity from the User + +Somewhat akin to coding appropriate access into VIEWs, complex SQL can be coded into VIEWs to mask the complexity from the user. This can be extremely useful when your shop employs novice SQL users (whether those users are programmers, analysts, managers, or typical end users). + +As an example, consider the code for a relational division. Relational division is one of the eight basic operations in Codd's (1979) relational algebra. The idea is that a divisor table is used to partition a dividend table and produce a quotient or results table. The quotient table consists of those values of one column for which a second column had all of the values in the divisor. + +This is easier to explain with an example. We have a table of pilots and the planes they can fly (dividend); we have a table of planes in the hangar (divisor); we want the names of the pilots who can fly every plane (quotient) in the hangar. To get this result, we divide the PilotSkills table by the planes in the hangar. + +Here is one way to write the query: + +This not the sort of thing that newbie SQL programmers can pull out of their hats, but they can write "SELECT pilot FROM QualifiedPilots;" without much trouble. Furthermore, the VIEW definition can be changed, and the user will never know it. Here is another version of relational division: + +# 7.5 VIEWs Ensure Proper Data Derivation + +Another valid usage of VIEWs is to ensure consistent derived data by creating new columns for VIEWs that are based on arithmetic formulae (e.g., creating a VIEW that contains a column named "tot_comp," which is defined by [salary + commission + bonus]). Because this column name is at the table level, it can be used in the SELECT of the invoking SELECT statement. 
This is not the sort of thing that newbie SQL programmers can pull out of their hats, but they can write "SELECT pilot FROM QualifiedPilots;" without much trouble. Furthermore, the VIEW definition can be changed from one version to the other, and the user will never know it.

# 7.5 VIEWs Ensure Proper Data Derivation

Another valid usage of VIEWs is to ensure consistent derived data by creating new columns for VIEWs that are based on arithmetic formulae (e.g., creating a VIEW that contains a column named "tot_comp," which is defined by [salary + commission + bonus]). Because this column name exists at the table level in the VIEW, it can be used in the SELECT list of the invoking SELECT statement. That is, selecting "tot_comp" directly from the base table is illegal, but defining it as a VIEW column and then selecting it from the VIEW is legal. Although this is an easy formula, it is a good idea to have a complicated one in only one place in the schema. It might not be right, but at least it will be consistent.

# 7.6 VIEWs Rename Tables and/or Columns

You can rename columns in VIEWs. This is particularly useful if a table contains arcane or complicated column names. There are some prime examples of such tables in the schema information tables of most SQL products. Additionally, if other tables exist with clumsy table and/or column names, VIEWs can provide a quick solution until you can rename them. In many SQL products, renaming can require dropping and re-creating the tables.

# 7.7 VIEWs Enforce Complicated Integrity Constraints

Consider a schema for a chain of stores that has three tables. The first two, for stores and for personnel, explain themselves. The third table shows the relationship between stores and personnel—namely, who is assigned to which job at which store and when this happened.

Let job_type 0 = "unassigned", 1 = "stockboy", and so on, until we get to 99 = "Store Manager"; we have a rule that each store has one and only one manager. In full SQL-92 you could write a table-level CHECK() constraint for this rule. But many SQL products do not allow CHECK() constraints that apply to the table as a whole, and they do not support the schema-level CREATE ASSERTION statement. So, how to do this? You might use a trigger, which will involve—ugh!—procedural code. Despite the SQL/PSM and other standards, most vendors implement different trigger models and use their proprietary 4GL languages; but, being a fanatic, I want a pure SQL solution.

The answer is to split the assignments into two tables—one for the store managers, keyed on the store number and constrained to job_type 99, and one for all of the other job assignments—and then build a UNION-ed VIEW of the two. The key and job_type constraints in each table, working together, will guarantee only one manager per store. The next step is to add INSTEAD OF triggers to the VIEW, so that the users can insert, update, and delete from it easily.

As an exercise for the reader: How would you ensure that no store has more than two assistant managers?

# 7.8 Updatable VIEWs

The SQL-92 standard is actually conservative about which VIEWs are updatable. They have to be based on the following:

1. A SELECT statement on one and only one table, though the VIEW can be defined on several layers of VIEWs on top of VIEWs.

2. The VIEW must include all of the columns of a UNIQUE or PRIMARY KEY constraint in the base table. This guarantees that all of the rows in the VIEW map back to one and only one row in the base table from which it is derived.

3. All base table columns not shown in the VIEW must have default values or be NULL-able. The reason for that is obvious: you have to delete or insert a complete row into the base table, so the system must be able to construct such a row.

However, other VIEWs are updatable, and some vendors support more than the basic version given in the SQL-92 standard. The VIEW must have an INSERT, UPDATE, and DELETE rule under the covers, which maps its rows back to a single row in the base table(s).

## 7.8.1 WITH CHECK OPTION Clause

Another feature, which is not used enough, is the WITH CHECK OPTION clause on a VIEW. It is a bit tricky when you nest VIEWs inside each other, but the idea is that an UPDATE or INSERT INTO statement cannot leave the scope of the set selected by the updatable VIEW.
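For example, here is a hedged reconstruction in which the NewYorkSalesmen name comes from the text, while the base table and column names are assumptions:

```sql
-- without WITH CHECK OPTION, this UPDATE quietly moves every row
-- out of the VIEW's scope
CREATE VIEW NewYorkSalesmen (emp_nbr, emp_name, city)
AS SELECT emp_nbr, emp_name, city
     FROM Salespersons
    WHERE city = 'New York';

UPDATE NewYorkSalesmen
   SET city = 'Chicago';

-- an alternative definition of the same VIEW, with the check added
CREATE VIEW NewYorkSalesmen (emp_nbr, emp_name, city)
AS SELECT emp_nbr, emp_name, city
     FROM Salespersons
    WHERE city = 'New York'
WITH CHECK OPTION;
```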
Under the first definition, the result would be that "NewYorkSalesmen" is empty when you come back to it. This is probably not desirable. However, if we had defined the updatable VIEW with the WITH CHECK OPTION clause, as in the second definition, the system would test the update for a violation and would reject it.

## 7.8.2 INSTEAD OF Triggers

Because some VIEWs cannot be updated, you can add INSTEAD OF triggers to fool the users. This trigger is executed instead of the INSERT, UPDATE, or DELETE action, thus overriding the actions of the triggering statements. The syntax will vary from product to product, but expect something like this:
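The following is a hedged sketch in DB2-style syntax; the VIEW, table, and column names are invented for illustration, and every product differs in the details:

```sql
-- route an INSERT against a VIEW to its base table
CREATE TRIGGER StoreStaff_Insert
INSTEAD OF INSERT ON StoreStaff      -- StoreStaff is a VIEW
REFERENCING NEW AS New_Row
FOR EACH ROW
INSERT INTO Personnel (emp_nbr, emp_name, store_nbr)
VALUES (New_Row.emp_nbr, New_Row.emp_name, New_Row.store_nbr);
```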
For obvious reasons, only one INSTEAD OF trigger per INSERT, UPDATE, or DELETE statement can be defined on a table or VIEW. However, it is possible to define VIEWs on VIEWs where each VIEW has its own INSTEAD OF trigger. INSTEAD OF triggers are not allowed on updatable VIEWs that have a WITH CHECK OPTION.

You can also define INSTEAD OF triggers on base tables, but this is a bit weird because you already have BEFORE and AFTER triggers there.

# 7.9 Have a Reason for Each VIEW

#### Rationale:

VIEWs should be created only when they achieve a specific, reasonable goal. Each VIEW should have a specific application or business requirement that it fulfills before it is created. That requirement should be documented somewhere, preferably in a data dictionary or possibly as a remark in the VIEW declaration.

#### Exceptions:

None

# 7.10 Avoid VIEW Proliferation

#### Rationale:

The proliferation avoidance rule is based on common sense. Why create something that is not needed? It just takes up space that could be used for something that is needed.

Whenever a SQL object is created, additional entries are placed in the schema information tables. Creating needless schema objects causes what Craig Mullins calls _catalog clutter_. For example, in DB2, every unnecessary VIEW that is created will potentially insert rows into four VIEW-specific schema information tables (i.e., SYSVTREE, SYSVLTREE, SYSVIEWS, and SYSVIEWDEP) and three table-specific schema information tables (i.e., SYSTABLES, SYSTABAUTH, and SYSCOLUMNS).

It is a good idea to use a utility program to see if you have VIEWs that are not referenced anywhere. Another good idea is to see if you have VIEWs that do the same thing, or almost the same thing, so you can remove one of them.

#### Exceptions:

None

# 7.11 Synchronize VIEWs with Base Tables

#### Rationale:

Whenever a base table changes, all VIEWs that depend on that base table should be analyzed to determine if the change affects them. All VIEWs should remain logically pure. The VIEW should remain useful for the specific reason you created it.

For example, say a VIEW was created to control employee access to a project, and we add new badge numbers to the Personnel table. The badge number probably should also be added to the access VIEW. The badge number column can be added to the Personnel table immediately and then to the VIEW at the earliest convenience of the development team.

The synchronization rule requires that strict change impact analysis procedures be in place. Every change to a base table should trigger the usage of these utility programs and maintenance procedures.

#### Exceptions:

None

# 7.12 Improper Use of VIEWs

Over the years, VIEWs have been used for other purposes that made sense at the time but have been rendered obsolete by new DBMS functionality.

## 7.12.1 VIEWs for Domain Support

#### Rationale:

It is a sad fact of life that most RDBMS do not support domains. Domains were in the original relational model and should have been part of SQL from the start. A domain basically identifies the valid range of values that a column can contain. Of course, domains are more complex than this simple explanation. For example, only columns drawn from the same domain should be comparable within a predicate (unless explicitly overridden).

Some of the functionality of domains can be implemented using VIEWs and the WITH CHECK OPTION clause, which ensures the update integrity of VIEWs. This will guarantee that all data inserted or updated using the VIEW will adhere to the VIEW specification.
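A hedged sketch of the contrast; the table, column, and constraint names are assumptions for illustration:

```sql
-- simulating a "currency code" domain with a checked VIEW
CREATE VIEW ValidAccounts (acct_nbr, currency_code)
AS SELECT acct_nbr, currency_code
     FROM Accounts
    WHERE currency_code IN ('USD', 'EUR', 'JPY')
WITH CHECK OPTION;

-- the simpler modern alternative: a CHECK() constraint on the table
ALTER TABLE Accounts
  ADD CONSTRAINT valid_currency_code
      CHECK (currency_code IN ('USD', 'EUR', 'JPY'));
```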
Now, this method of using VIEWs to simulate domains is still viable, but a better technique providing the same functionality is available—namely, CHECK() constraints—and a CHECK() constraint is simpler than creating VIEWs with the WITH CHECK OPTION.

#### Exceptions:

None

## 7.12.2 Single-Solution VIEWs

#### Rationale:

Another past usage for VIEWs was to enable solutions where VIEWs really were the only way to solve a data access problem. Without VIEWs, some complex data access requests could be encountered that were not capable of being coded in SQL alone. However, sometimes a VIEW can be created to implement a portion of the access. Then the VIEW can be queried to satisfy the remainder.

Consider the scenario where you want to report on both detail and summary information from a single table. For instance, what if you would like to report on stock prices? For each stock, provide all stock details, and also report the maximum, minimum, and average prices for that stock. Additionally, report the difference between the average price and each individual price.

After a VIEW with the aggregates is created, a SELECT statement can be issued that joins the VIEW to the base table, thereby providing both detail and aggregate information on each report row.

Situations such as these were ideal for using VIEWs to make data access a much simpler proposition. However, the advent of table expressions (sometimes referred to as in-line VIEWs) makes this usage of VIEWs obsolete. Why? Instead of coding the VIEW, we can take the SQL from the VIEW and specify it directly in the SQL statement that would have called the VIEW. So we can use a table expression to avoid creating and maintaining a VIEW.

#### Exceptions:

If an expression is used in many places and it has a clear meaning in the data model, then create a VIEW.

## 7.12.3 Do Not Create One VIEW Per Base Table

#### Rationale:

A dubious recommendation is often made to create one VIEW for each base table in a SQL application system. This is what Craig Mullins calls "The Big VIEW Myth." It is supposed to insulate application programs from database changes. This insulation is to be achieved by mandating that all programs be written to access VIEWs instead of base tables. When a change is made to the base table, the programs do not need to be modified because they access a VIEW, not the base table.

There is no adequate rationale for enforcing a strict rule of one VIEW per base table for SQL application systems. In fact, the evidence supports not using VIEWs in this manner. Although this sounds like a good idea in principle, indiscriminate VIEW creation should be avoided. The implementation of database changes requires scrupulous analysis regardless of whether VIEWs or base tables are used by your applications.

Consider the simplest kind of schema change: adding a column to a table. If you do not add the column to the VIEW, no programs can access that column unless another VIEW is created that contains the new column. But if you create a new VIEW every time you add a new column, it will not take long for your environment to be swamped with VIEWs. Then you have to ask which VIEW should be used by which program. Similar arguments can be made for removing columns, renaming tables and columns, combining tables, and splitting tables.

In general, if you follow good SQL programming practices, you will usually not encounter situations where the use of VIEWs would have helped program/data isolation anyway. By dispelling "The Big VIEW Myth," you will decrease the administrative burden of creating and maintaining an avalanche of base table VIEWs.

#### Exceptions:

None

# 7.13 Learn about Materialized VIEWs

#### Rationale:

A materialized VIEW is brought into existence in the physical database, where it can be used like any other table. This is implementation dependent, so you have to know what your product does to get the best use of this feature.

All VIEWs are supposed to act as if they are materialized, but in practice the text of the VIEW can often be put into the parse tree of the statement using it and expanded like an in-line macro. When such a VIEW is used in a query, the effect is as if its defining SELECT were a derived table expression inside that query, which the parser then flattens into a single execution plan.

However, if more than one user references a VIEW, it can be cheaper to materialize it once and share the data among all users. If the materialized result set is small enough to fit into main storage, the performance improvements are even greater.

This is actually a common event, because we tend to build VIEWs that summarize data for reporting periods. Thus, lots of users want to get to the same summary VIEWs at the same time. If you plan the VIEWs to take advantage of this usage pattern, you can get major performance improvements.

#### Exceptions:

None

CHAPTER 8 How to Write Stored Procedures

_"Whatever language you write in, your task as a programmer is to do the best you can with the tools at hand. A good programmer can overcome a poor language or a clumsy operating system, but even a great programming environment will not rescue a bad programmer."_

—Kernighan and Pike

EVERY SQL PRODUCT has some kind of 4GL tool that allows you to write stored procedures that reside in the database and can be invoked from a host program. Each 4GL is a bit different, but they are all block-structured languages. They have varying degrees of power and different language models. For example, T-SQL is a simple, one-pass compiled language modeled after the C and Algol languages. It was not intended as an application development language, but rather as a tool for doing short tasks inside a SQL Server database.

At the other extreme, Oracle's PL/SQL is modeled after Ada and SQL/PSM. It is a complicated language that can be used for application development. Likewise, Informix 4GL is an application development language that generates C code, which can be immediately ported to a large number of platforms.
What this means is that anything I say about SQL stored procedures will have to be general. Perhaps the most frightening thing is that I have to go back and teach basic software engineering principles to SQL programmers. If you look at the SQL code posted in newsgroups, much of it is written as if all of the work done in the 1970s and 1980s by Yourdon, DeMarco, Dijkstra, Wirth, and others never happened. Wake up, people! Those rules still apply to any programming language, because they apply to programming.

# 8.1 Most SQL 4GLs Are Not for Applications

#### Rationale:

Most of the proprietary procedural languages added to SQL by vendors were never meant to replace application development languages (note the exceptions below). They were meant to be micro-languages that could be used for procedural operations inside the database.

The classic micro-language has no real input/output (I/O); you can print a message on the standard system output, and that is about all. There is no file control, no complex computation, and no display formatting. These languages were for writing triggers and short cleanup modules in the schema, and the rule of thumb was never to have a procedure over one page or 50 lines long.

This is fine; in a tiered architecture, display and complex computations are done in the host language of the presentation layer. But if you read the SQL newsgroups, you will constantly find newbie programmers who want to do display formatting in the database. They want to add leading zeros in a SELECT statement, concatenate first and last names, put line numbers on the result set to display ranges of those line numbers, and a host of other things. SQL is strictly a data retrieval language and has nothing to do with application presentation layers.

#### Exceptions:

Informix 4GL, Progress, Oracle's PL/SQL, and a few other languages were actually meant for application development. Sometimes the language came before the SQL database, and sometimes vice versa. A proprietary language can be fast to execute, fast to write in, and have lots of nice features. A lot of mainframe packages are implemented in Informix 4GL under the covers, Oracle sells packages written in PL/SQL, and a lot of midsized systems are implemented in Progress. The trade-off is maintaining these proprietary code bases versus maintaining a standard programming language with embedded SQL.

# 8.2 Basic Software Engineering

I am amazed that so many SQL programmers do not know basic software engineering. Working programmers on newsgroups actually have to ask for definitions of cohesion and coupling. Apparently, programmers are not getting the basics of their trade and simply try to pass certification exams instead of actually learning their craft. With some embarrassment, I will now give what should have been covered in a freshman course.

These principles apply to any procedural programming language, but they have slightly different applications in SQL because it is a nonprocedural, set-oriented language with concurrency issues.

## 8.2.1 Cohesion

Cohesion is how well a module does one and only one thing: that it is logically coherent. Modules should have strong cohesion. You ought to name the module in the format "<verb><object>," where the "<object>" is a specific logical unit in the data model.

There are several types of cohesion. They are ranked here from the worst form of cohesion to the best:

1. Coincidental

2. Logical

3. Temporal

4. Procedural

5. Communicational

6. Informational

7. Functional
This scale is an ordinal scale, and a module can have characteristics of more than one type of cohesion in it. Let's define the terms as follows:

* _Coincidental cohesion._ This is the worst kind of cohesion. A module performs several unrelated tasks under one roof. Think of someone pasting random blocks of code together and somehow getting it to compile. This is what you get with dynamic SQL or with passing table names as parameters.

For example, "InsertNewCustomer()" tells you that you are going to be working with the tables related to the customers. However, a procedure called "InsertNewRecord()," which can put a row into any table in the schema, is too general to have good cohesion. It works on bagpipes, marriages, and octopi, or any new table that gets put into the schema later.

Programmers should not be using dynamic SQL, because it has no cohesion and is dangerous. Users who have to provide, say, a table name can also provide extra SQL code that will be executed. For example, instead of passing just the table name, they pass "Foobar; DELETE FROM Foobar; COMMIT;" and destroy the database. But dynamic SQL also says that the programmer is so incompetent that he or she could not write the program and had to give the job to any random user, present or future, to complete on the fly.

This kind of coding is the result of trying to do metadata operations in an application by using the schema information tables. SQL engines have tools for metadata, and the user should not be writing versions of them.

* _Logical cohesion._ Here modules can perform a series of related tasks, but the calling module selects only one. The worst example of this was a posting in 2004 on a SQL Server newsgroup where a programmer had been ordered to put all procedures into one module. A parameter would then pick which of 50-plus modules would be executed, which parameters would be used, and what they would do in context.

OO programmers like to do this for each table, because they can think of each table as some kind of object, so that the procedures look like methods on that object. It isn't an object.

* _Temporal cohesion._ The module performs a series of actions that are related in time. The classic example is to put all startup or shutdown actions in one module. Older COBOL and file system programmers tend to do this because they worked with batch processing systems that did not have concurrency issues.

* _Procedural cohesion._ The module performs a sequence of steps in a process that has to be executed in a specific order. Again, this style is used by file system programmers who are used to batch processing systems. They often write a lot of temporary tables to hold the process steps, the way we used to allocate working tapes.

* _Communicational cohesion._ All elements operate on the same input data set or produce the same output data set. The parts communicate via common data in a global table.

* _Informational cohesion._ This is also called _sequential cohesion_ in the literature. Output from one element in the module serves as input for some other element, but unlike logical cohesion, the code for each action is completely independent.

* _Functional cohesion._ The module performs exactly one function or achieves a single goal. Math functions are the best example of this kind of cohesion. This is what we are trying to achieve, and it is why SQL is also known as a functional language.
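To make the contrast concrete, here is a minimal SQL/PSM-style sketch; the procedure, table, and parameter names are invented for illustration:

```sql
-- Functional cohesion: one action on one logical unit of the model
CREATE PROCEDURE InsertNewCustomer
(IN in_cust_id INTEGER,
 IN in_cust_name VARCHAR(35))
LANGUAGE SQL
INSERT INTO Customers (cust_id, cust_name)
VALUES (in_cust_id, in_cust_name);

-- Coincidental cohesion, for contrast: a generic "insert anything
-- anywhere" procedure built on dynamic SQL has no logical unit at
-- all and invites injection; avoid this pattern entirely:
-- CREATE PROCEDURE InsertNewRecord (IN table_name ..., ...)
--    ... EXECUTE IMMEDIATE ...
```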
Procedural, communicational, informational, and functional cohesion are a bit more complicated in SQL than in 3GL programming because we have transactions. A transaction is logically one step, although it consists of individual SQL statements. What looks like procedural, communicational, or informational cohesion can be much stronger in SQL.

## 8.2.2 Coupling

If modules have to be used in a certain order, then they are strongly coupled. If they can be executed independently of each other and put together like Lego blocks, then they are loosely or weakly coupled. There are several kinds of coupling, which are ranked from worst to best as follows:

1. Content

2. Common

3. Control

4. Stamp

5. Data

The types of coupling are defined as follows:

* _Content coupling._ This occurs when one module directly references the contents of another module. For example, module X branches to a local label in module Y, or module X modifies a statement of module Y. Such modules are inextricably linked to each other. Content coupling is dangerous but is not often supported in SQL 4GL products. The rule here is not to pass a procedure as a parameter in a SQL 4GL.

* _Common coupling._ This occurs when several modules have access to the same global data. In the 3GL languages, this was the use of global variables in the C family and other languages. In SQL, this can happen with the use of common global tables to pass information. It gets to be dangerous when concurrency controls are not done right.

* _Control coupling._ This occurs when one module has control over the logic of another. If module X calls module Y, and Y determines which action X must take, then control coupling is present. The passing of a control switch as an argument is an example of control coupling. In SQL, you do this with subqueries that reference other parts of the schema in predicates that drive control flow.

* _Stamp coupling._ Entire tables are passed to the called module, but only some columns are used. In SQL, the use of "SELECT *" in production code is the prime example.

* _Data coupling._ Two modules are data coupled if all arguments are scalar data elements. Data coupling is a desirable goal because such modules are easier to maintain. Any change in one module or table is less likely to cause a regression fault in the others.

# 8.3 Use Classic Structured Programming

Although I like to say that SQL is short for "Scarcely Qualifies as a Language," the truth is that it came from "Structured English-like Query Language" from the original project at IBM. A lot of current programmers seem to have missed the structured revolution and have reverted to ad hoc programming, but they call it "extreme" or "agile" these days to make sloppy programming sound better.

In classic structured programming, you have three control structures:

1. _Concatenation._ The statements inside brackets are executed in sequential order. In SQL/PSM this is shown with the keyword brackets "BEGIN [ATOMIC] .. END" and often by just "BEGIN .. END" in proprietary 4GLs. The keyword ATOMIC makes the block into a transaction, which we will not discuss in detail here.

2. _Selection._ A Boolean expression determines which one of two blocks of statements is executed. In SQL/PSM this is shown with the keywords "IF .. THEN .. [ELSE ..] END IF;" and in proprietary 4GLs with "IF .. THEN .. [ELSE ..];" or "IF .. [ELSE ..];" but the syntax is always enough alike not to be a problem.

3. _Iteration._ A block of statements is repeatedly executed while a Boolean expression is TRUE. In SQL/PSM this is shown with the keywords "WHILE .. DO .. END WHILE;" and you will see similar "WHILE .. DO .." keywords in many products. Again, the various products are always enough alike not to be a problem.
The important characteristic of all of these control structures is that they have one entry point and one exit point. Any code written using them will also have one entry and one exit point. You do not use a GO TO statement in classic structured programming.

Some languages allowed a RETURN() statement to jump out of functions and set the value of the function call. Some allowed a switch or case expression as a multiway selection control statement. But by sticking as close as possible to classic structured programming, your code is safe, verifiable, and easy to maintain.

## 8.3.1 Cyclomatic Complexity

So is there a heuristic for telling whether I have a bad stored procedure? There are a lot of metrics, actually. In the 1970s, we did a lot of research on software metrics and came up with some good stuff. Here is one that can be computed by hand when you have short procedures to measure.

Tom McCabe (1976) invented the cyclomatic complexity metric. The score is basically the number of decision points in a module plus one or, equivalently, the number of execution paths through the code. Decision points are where a flow graph of the procedure would branch. In a well-structured 4GL program, the keywords of the language will tell us what the decision points are. For us that means IF, WHILE, and each branch of a CASE or SWITCH statement, if your 4GL supports that feature.

If the module has a score of 1 to 5, it is a simple procedure. If the score is between 6 and 10, it might need simplification. If the score is greater than 10, then you really should simplify the module. There are other metrics and methods, but most of them are not as easy to compute on the fly.

# 8.4 Avoid Portability Problems

#### Rationale:

We already talked about writing portable SQL statements, but you also need to write portable 4GL code. Because these languages are proprietary, they will have some features that will not port to other SQL 4GLs. Also, you cannot expect that you will always find programmers who are experts in these languages or who have time to become experts. Plain, simple code in an unfamiliar language can be a great help.

Stick to the classic three control structures. They will always port with only mechanical syntax changes and can be read by any programmer who knows a typical 3GL language. But there are other tricks and heuristics.

## 8.4.1 Avoid Creating Temporary Tables

In some vendor languages, the programmer can create a temporary table on the fly, whereas in Standard SQL temporary tables are created only by someone holding administrative privileges. Use subquery expressions, derived tables, or VIEWs instead. The use of temporary tables is usually a sign of a bad design. Temporary tables are most often used to hold the steps in a procedural process. They replace the scratch or work tapes we used in 1950s magnetic tape file systems.

There are two major types of error handling. The Sybase/SQL Server family uses a sequential code model. After executing each statement, the SQL engine sets a global error variable, and the programmer has to write code to immediately catch this value and take action. The SQL/PSM model uses an interrupt model.
There is a global SQLSTATE (the old SQLCODE is deprecated), which can return multiple values into a cache. These values can trigger actions that were defined in WHENEVER statements associated with blocks of code. Maintaining the error handling part of a module is difficult, so put plenty of comments in it.

Put as much of the code as possible into SQL statements, not into the 4GL. Ideally, a stored procedure ought to be one SQL statement, perhaps with a few parameters. The next best design would be a "BEGIN [ATOMIC] .. END" block with a straight sequence of SQL statements. You lose points for each "IF .. THEN .. ELSE" and lose lots of points for each loop.

## 8.4.2 Avoid Using Cursors

#### Rationale:

A cursor is a way of converting a set into a sequential file so that a host language can use it. There are a lot of options on the Standard SQL cursor, and there are a lot of vendor options, too.

Cursors are difficult to port and generally run much slower than pure nonprocedural SQL statements. By slower, I mean orders of magnitude slower. For safety, the SQL engine has to assume that anything can happen inside a cursor, so it puts the transaction at the highest isolation level it can and locks out other users.

So why do people use them? The overwhelming reason is ignorance of SQL and old habits. The cursors in SQL are modeled after tape file semantics, and people know that kind of procedural programming: declaring a cursor is mounting a tape, OPEN and CLOSE work like opening and closing a file, and FETCH reads the records one at a time.

Add the use of temporary tables as working or scratch tapes, and you can mimic a 1950s tape system statement for statement and never learn to think relationally at all. In 2004, there was an example of this in the SQL Server Programming newsgroup. The newbie had written one cursor to loop through the first table and select rows that met a criterion into a temporary table. A second cursor looped through a second table ordered on a key; inside this loop, a third cursor looped through the temporary table to match rows and do an update. This was a classic 1950s master/transaction tape file merge but written in SQL. The 25 or so statements used in it were replaced by one UPDATE with a scalar subquery expression. It ran almost three orders of magnitude faster.

#### Exceptions:

The only uses I have found are truly exceptional. Cursors can be used to repair poorly designed tables that have duplicate rows, or data that is so trashed you have to look at every row by itself to clean the data before doing an ALTER TABLE to fix such poor design permanently. Here are some reasons to use cursors:

1. Cursors can be used to build metadata tools, but you really should be using what the vendor has provided. Messing directly with schema information tables is dangerous.

2. Cursors can be used to solve NP-complete problems in SQL where you stop with the first answer you find that is within acceptable limits. The "Traveling Salesman" and "Bin Packing" problems are examples, but they are not exactly common database problems and are better solved with a procedural language and backtracking algorithms.

3. In T-SQL and other products that still use physically contiguous storage, calculating a median is probably much faster with a cursor than with any of the set-based solutions, but in other products with different storage or indexing, computing the median is trivial.

4. It is possible to actually write code that is worse than a cursor. Consider this slightly cleaned-up posting by Curtis Justus in the SQL Server Programming newsgroup in November 2004. He had a table of approximately 1 million rows and needed to "do something with each of the rows" in what he called a traditional "For/Each" type of algorithm. The specifications were never explained beyond that. He posted a pseudocode program in T-SQL dialect, essentially a counter-driven loop that fetched and processed one row at a time.

Yes, you are looking at a sequential tape file algorithm from the 1950s written in SQL in the early 21st century. The poster wanted to know if this was the most efficient way to go after the data. The answer, obviously, is that even a cursor would be better than this approach.

You would be surprised by how many newbies rediscover sequential tape processing in SQL. Perhaps even more remarkable was this person's attitude: he was currently getting a fast enough response time, so it did not have to be coded correctly. The lack of portability, the orders-of-magnitude degradation, and the extra lines of code that had to be maintained were simply not regarded as his responsibility as a professional.

## 8.4.3 Prefer Set-Oriented Constructs to Procedural Code

#### Rationale:

The optimizer cannot use control structures from the 4GL to pick an execution plan. Thus, the more logic you can pass to it via pure SQL statements, the better it will perform. The real cost in a stored procedure is in data access: on a typical 1-GHz PC in the summer of 2001, operations in main storage took nanoseconds, while a disk fetch took milliseconds, a gap of several orders of magnitude. If I can save a few disk fetches, I get a much better return on my efforts than if I write faster-executing computations. The seek times have not gotten, and are not going to get, much better in the foreseeable future.

### 8.4.3.1 Use CASE Expressions to Replace IF-THEN-ELSE Control Flow Statements

As an example of how to do this, consider the problem of updating the prices in a bookstore. This is a version of an exercise in an early Sybase SQL training class to show why we needed cursors. We want to take 10 percent off expensive books ($25 or more) and increase the price of inexpensive books by 10 percent to make up the loss. The first impulse of most new SQL programmers is to write two UPDATE statements, one for each price band, but that does not work. A book priced at $25.00 is reduced to $22.50 by the first update. Then it is raised to $24.75 by the second update. Reversing the order of the update statements does not change the problem. The answer given in the course was to use a cursor and update each book one at a time. But by using a CASE expression to replace the IF .. THEN .. ELSE logic, you can do the whole repricing in one statement.
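A hedged sketch of that single UPDATE; the table and column names are assumptions:

```sql
-- both price bands handled in one pass over the table
UPDATE Books
   SET price = CASE WHEN price >= 25.00
                    THEN price * 0.90
                    ELSE price * 1.10
               END;
```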
This requires less code and will run faster. The heuristic is to look for nearly identical SQL statements in the branches of an IF statement and then replace them inside one statement with a CASE expression.

### 8.4.3.2 Use Sequence Tables to Replace Loop Control Flow

A sequence table is a single-column table that contains the integers from 1 to (n), for some value of (n) that is large enough to be useful. One way of generating such a table is a procedural loop that inserts one integer at a time. However, it is faster to write a CROSS JOIN of the table with itself.
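A hedged sketch of the idea; the Sequence name comes from the text, and the rest is assumed:

```sql
CREATE TABLE Sequence
(seq INTEGER NOT NULL PRIMARY KEY CHECK (seq > 0));

-- seed the table with the ten digits
INSERT INTO Sequence (seq)
VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);

-- one CROSS JOIN turns 10 rows into 100, with no loop at all;
-- repeat the pattern with the new table size as the multiplier
-- (100, then 10,000, ...) to grow the table as large as needed
INSERT INTO Sequence (seq)
SELECT Units.seq + (10 * (Tens.seq - 1))
  FROM Sequence AS Units
       CROSS JOIN Sequence AS Tens
 WHERE Units.seq + (10 * (Tens.seq - 1)) > 10;
```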
This use of CROSS JOINs is another example of how to avoid loops. A weird but useful heuristic is to put the phrase "the set of ..." in front of the nouns in a sentence that describes the problem you are solving. It is bad grammar, but it can help shift your mindset to thinking in terms of sets.

Converting a string with a comma-separated list of values into a proper table with the position and value of each element can be done with a simple WHILE loop that cuts off one substring up to but not including the comma and then converts that substring to an integer. However, the same thing can be done with a Sequence table in a single set-oriented statement.
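A hedged sketch of the set-oriented version; the Sequence table is the one just described, and the InputStrings table and its well-formed, comma-wrapped lists are assumptions:

```sql
-- InputStrings(key_col, input_string) holds CSV lists that start
-- and end with a comma, e.g. ',12,42,' -- assumed well formed
SELECT S1.seq AS place,
       CAST(SUBSTRING(I1.input_string
                      FROM S1.seq + 1
                      FOR S2.seq - S1.seq - 1) AS INTEGER) AS val
  FROM InputStrings AS I1,
       Sequence AS S1,
       Sequence AS S2
 WHERE SUBSTRING(I1.input_string FROM S1.seq FOR 1) = ','
   AND SUBSTRING(I1.input_string FROM S2.seq FOR 1) = ','
   AND S1.seq < S2.seq
   AND NOT EXISTS   -- the bracketing commas must be adjacent ones
       (SELECT *
          FROM Sequence AS S3
         WHERE S3.seq > S1.seq
           AND S3.seq < S2.seq
           AND SUBSTRING(I1.input_string FROM S3.seq FOR 1) = ',');
```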
It makes life easier if the lists in the input strings start and end with a comma. You will also need the Sequence table, which is a set of integers from 1 to (n).

The S1 and S2 copies of Sequence are used to locate bracketing pairs of commas, and the entire set of substrings located between them is extracted and cast as integers in one nonprocedural step. The trick is to be sure that the left-hand comma of the bracketing pair is the closest one to the second comma. The place column tells you the relative position of the value in the input string. The real advantage of the nonprocedural approach comes from modifying this second procedure to handle an entire table whose rows are CSV strings. In fact, the one-row-at-a-time procedure can be replaced with a VIEW instead.

### 8.4.3.3 Use Calendar Tables to Perform Temporal Calculations

#### Rationale:

The first thing to do when you start a new application is to build a Sequence table and a Calendar table. The Calendar table is keyed on a date, and the nonkey columns contain information about that date relative to the enterprise. Is this a workday or a holiday? What is its Julian date number? Which fiscal calendar does it fall in? In short, anything to do with how the enterprise uses time must be detailed.

The table for 20 years of data is only about 7,050 rows, which is nothing. You can look up programming tricks with this table in newsgroups or in Celko (1999).

#### Exceptions:

None

### 8.4.3.4 Consider Auxiliary Tables to Perform Computations

#### Rationale:

If a function or computation returns only a few thousand values, then instead of computing it over and over, put the parameters and the results into an auxiliary table that can be joined to the other tables to get the answer. SQL is good at JOINs but not at computations; play to its strength.

#### Exceptions:

If the computation can be done with simple four-function math, then auxiliary tables could be overkill. If the computation is unpredictable or known to have a huge range, then it might not be possible to put it into an auxiliary table.

# 8.5 Scalar versus Structured Parameters

There are no arrays, lists, or other data structures in Standard SQL-92. There is only one data structure: the table. There are base tables, VIEWs, and derived tables, but the operative word in that list is "table."

Procedural languages depend on other data structures, such as arrays, lists, and records. Newbie programmers who learned to program with such structures want desperately to use them when they get to SQL. The result is that they kludge code with poor performance. Even worse, they use dynamic SQL to construct a statement or an entire program on the fly.

Stored procedure calls expect scalar parameters, not structured or dynamic parameters. By using a few coding tricks, you can still get the advantages of stored procedures and have some flexibility. A typical problem is to pass a list of values to an IN() predicate. The all-too-common kludge is dynamic SQL, which has a string with a list of comma-separated values for the «parameter list».

One answer is to use the code in section 8.4 to put the list into a table and then write a compiled statement against that table. But a better answer is to scrub the list data in the front end and load it into a table with an INSERT INTO statement. The ability to do this will vary with each SQL product, but the standard SQL syntax uses row constructors.
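A hedged sketch of both statements; the Orders and ParamList names, and the host parameters, are assumptions:

```sql
CREATE TABLE ParamList
(param_value INTEGER);  -- scrubbed input list; NULLs pad unused slots

-- the host program binds :p1 .. :p4, padding with NULLs as needed
INSERT INTO ParamList (param_value)
VALUES (:p1), (:p2), (:p3), (:p4);

-- a compiled statement then reads the list from the table
SELECT order_nbr, cust_nbr
  FROM Orders
 WHERE cust_nbr IN (SELECT DISTINCT param_value
                      FROM ParamList
                     WHERE param_value IS NOT NULL);
```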
The VALUES() list has to be a known number of rows, but by putting NULLs or other dummy values in the list, you can get the effect of a dynamic list. You only need to clean them out on the database side, and you can use SELECT DISTINCT to remove duplicate values if needed, as the sketch shows.

# 8.6 Avoid Dynamic SQL

Dynamic SQL is both slow and dangerous. It is also a sign that the programmer did not have a proper design for his or her application and is now turning that job over to any user, present or future. The purpose of dynamic SQL is to build metadata tools, not applications. A metadata tool treats schema objects as schema objects, not as parts of a data model.

## 8.6.1 Performance

A stored procedure will have a cached execution plan in most SQL products, but dynamic SQL has to be prepared repeatedly with each execution. Obviously, this is going to be slower than running compiled code that might already be in main storage. One counterargument is that if the predicates change in some significant way, then recompiling can give a better execution plan. The gist of this execution model is that if I have a predicate with constants instead of parameters, the optimizer can do a better job with it. For example, given a simple query against a personnel table with a parameter ":input_sex_code": if the parameter is male (1, using the ISO sex codes), then a table scan is the best way to process the query; if the parameter is female (2, using the ISO sex codes), then an index is best; if the parameter is anything else, simply return an empty result set.

Obviously, this is implementation dependent. However, more modern optimizers will create several possible execution plans, based on the statistics, and hold them until the parameter is known. In short, we are back to the "Trust the optimizer" rule.

## 8.6.2 SQL Injection

SQL injection is a security attack in which the attacker places SQL code into your procedure and executes it. Whenever you let user input go directly into dynamic SQL in a stored procedure, or into SQL statements generated in client code, you are in danger. A frequently cited example, based on an FAQ at esquel@sommarskog.se, is a function that builds a simple dynamic SQL string from a "custname" parameter. Assume that the input for the parameter comes directly from user input without any filtering or validation, and that a malicious user passes in a value that closes off the intended query and appends destructive statements. The host program will then PREPARE and EXECUTE it, and drop the table for you.

A plain user is not likely to have permissions to drop a table, but an attacker can run all kinds of statements via SQL injection. The attacker looks for inputs that produce a syntax error rather than a runtime error, so that he or she knows there is dynamic SQL on the database side. The attacker then writes the code and, if needed, ends it with semicolons or with the start of a comment that will remove the rest of the query code from compilation. With a little probing, the attacker can find out whether the dynamic SQL is providing a table name and really trash the schema.

The first defense is not to give the users more privileges than are necessary for their jobs. A good heuristic is that plain users should be granted only SELECT privileges on the tables with which they work, but the best defense is not to use dynamic SQL in production code.

CHAPTER 9 Heuristics

THE FOLLOWING TRICKS and heuristics are not exactly mathematically precise scientific methods. In fact, some of them sound pretty weird, but as Larry Constantine once remarked, a method is a list of things that tells you what to do next, when you did not know what to do next, and you hope the method at least gets you to a workable solution, if not a good one.

Let me pick simple programming problems and apply these heuristics as we go along. Consider the "Dance Partner Problem," in which you are given a list of people and their genders, and your task is to pair them into couples.

Then there is the classic Orders problem: given a data model of orders from customers for products from inventory, answer any of several questions. It is not a complete schema, but it will work for demonstration purposes.

# 9.1 Put the Specification into a Clear Statement

This might sound obvious, but the operative word is a _clear_ statement. You need to ask questions at the start. Let me give some examples from actual problem statements having to do with a schema that models a typical orders and order details database:

1. _"I want to see the most expensive item in each order."_ How do I handle ties for the most expensive item? Did you mean the highest unit price or the highest extension (quantity × unit price) on each order?

2. _"I want to see how many lawn gnomes everyone ordered."_ How do I represent someone who never ordered a lawn gnome in the result set? Is that a NULL or a zero? If they returned all of their lawn gnomes, do I show the original order or the net result? Or do I show never having ordered as a NULL and returns as a zero, to preserve information?

3. _"How many orders were over $100?"_ Did you mean strictly greater than $100, or greater than or equal to $100?

In the "Dance Partner" example, we need to ask:

1. How do we pair the couples?

2. What do we do if there are more boys than girls (or vice versa) in the table?

3. Can someone have more than one partner? If so, how do we assign them?

Writing specs is actually harder than writing code. Given a complete, clear specification, the code can almost write itself.

# 9.2 Add the Words "Set of All ..." in Front of the Nouns

The big leap in SQL programming is thinking in sets, not in process steps that handle one unit of data at a time. Phrases like "for each x ..." poison your mental model of the problem. Look for set characteristics, not individual characteristics. For example, given the task of finding all of the orders that contain exactly the same quantity of each item, how would you solve it?

One approach is, for each order, to see if there are two values of quantity that are not equal to each other and then reject that order. This leads to either cursors or a self-join. Alternatively, you can look at each order as a set and test a property of the whole set. Both the self-join version and the set-property version are sketched below; I will not do the cursor version.
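A hedged sketch of both, with an assumed OrderDetails table:

```sql
-- self-join: reject an order when two of its rows disagree
SELECT DISTINCT D1.order_nbr
  FROM OrderDetails AS D1
 WHERE NOT EXISTS
       (SELECT *
          FROM OrderDetails AS D2
         WHERE D2.order_nbr = D1.order_nbr
           AND D2.qty <> D1.qty);

-- set property: the extreme quantities in each order are equal
SELECT order_nbr
  FROM OrderDetails
 GROUP BY order_nbr
HAVING MIN(qty) = MAX(qty);
```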
# 9.3 Remove Active Verbs from the Problem Statement

Words like _traverse_, _compute_, or other verbs that imply a process will poison your mental model. Try to phrase the problem as a "state of being" description instead. This is the same idea as in section 9.2, but with a slight twist. Programmers coming from procedural languages think in terms of actions. They add numbers, whereas a declarative programmer looks at a total. They think of process, whereas we think of completed results.

# 9.4 You Can Still Use Stubs

A famous Sydney Harris cartoon shows the phrase "Then a miracle occurs" in the middle of a blackboard full of equations, and a scientist says to the writer, "I think you should be more explicit here in step 2."

We used that same trick in procedural programming languages by putting in a stub module when we did not know what to do at some point in a program. For example, if you were writing a payroll program and the company had a complex bonus policy that you did not understand or have specifications for, you would write a stub procedure that always returned a constant value and perhaps sent out a message that it had just executed. This allowed you to continue with the parts of the procedure that you did understand.

This is more difficult to do in a declarative language. Procedural language modules can be loosely coupled, whereas the clauses and subqueries of a SELECT statement are a single unit of code. You can set up a "test harness" for procedural language modules; this is more difficult in SQL.

Looking at the "Dance Partner Problem," I might approach it by saying that I need the boys and the girls in two separate subsets, but I don't know how to write the code for that yet. So I stub it with some pseudocode in my text editor. Because this is for a dance, let's pick the pseudocode words from a musical; nobody is going to see this scratch paper, so why not? The «guys» and «dolls» placeholders might expand to multiple columns, subqueries, or just about anything later. Right now they are placemarkers. I also have a "??" placemarker for the relationship between my guys and dolls. I can then go to the next level of nesting and expand the «guys» subquery; the same pattern holds for the «dolls» subquery.

I now need to figure out some way of getting code for the "??" relationship. The first place I look is the columns that appear in the People table. The only thing I can find in that table is gender. I have a rule that tells me guys = 1 and dolls = 2, and I am enforcing it in my subqueries already. (Note: the full ISO sex codes are 0 = unknown, 1 = male, 2 = female, and 9 = lawful persons, corporations, etc.) I could try joining the subsets on gender alone, but it is pretty easy to see that this is a CROSS JOIN in thin disguise. Add something with the names, perhaps? There is no help there. It produces a smaller set of pairs, but you still get multiple couples per person on the dance floor. This is where some experience with SQL helps. One of the customary programming tricks is to use a self-join to get a ranking of the elements in a set based on their collation sequence. Because this works with any table, we can use it in both «guys» and «dolls» to get the final query.

# 9.5 Do Not Worry about Displaying the Data

In a tiered architecture, display is the job of the front end, not the database. Obviously, you do not do rounding, add leading zeros, change case, or pick a date format in the database. The important thing is to pass the front end all of the data it needs to do its job, but it is more than that. You can get your dance partner pairs with the query in section 9.4, but if you do not want to see the pairs on the same row, you can write a more compact query.
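A hedged sketch of that compact form; the People table and its columns are assumptions from the problem statement:

```sql
-- one person per row, ranked alphabetically within each gender;
-- equal ranks across the two genders pair off as dance partners
SELECT P1.person_name, P1.gender,
       (SELECT COUNT(*)
          FROM People AS P2
         WHERE P2.gender = P1.gender
           AND P2.person_name <= P1.person_name) AS couple_nbr
  FROM People AS P1;
```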
Notice that both solutions can leave unpaired people toward the end of the alphabet.

You can add an ORDER BY clause to the cursor that passes the result set to the front-end program in a simple client/server system, but in architectures with multiple tiers, sorting and other display functions might be performed differently in several places. For example, the same data might be displayed in English units sorted by division in the United States but in SI units sorted by country in Europe.

# 9.6 Your First Attempts Need Special Handling

Henry Ledgard (1976) put it very nicely:

Pruning and restoring a blighted tree is almost an impossible task. The same is true of blighted computer programs. Restoring a structure that has been distorted by patches and deletions, or fixing a program with a seriously weak algorithm, isn't worth the time. The best that can result is a long, inefficient, unintelligible program that defies maintenance. The worst that could result, we dare not think of.

This is especially true with SQL, but restarting your first attempts is handled differently in DDL and in DML because of the different natures of the two sublanguages. DDL execution is static once it is put into place, whereas DML is dynamic. That is, if I issue the same CREATE command, it will have the same results each time, but if I issue the same SELECT, INSERT, UPDATE, or DELETE, the execution plan could change each time.

## 9.6.1 Do Not Be Afraid to Throw Away Your First Attempts at DDL

Bad DDL will distort all of the code based on it. Just consider our little "Dance Partner" schema: what if a proprietary BIT data type had been used for gender? The code would not port to other SQL dialects. The host languages would have to handle low-level bit manipulation. It would not interface with other data sources that use the ISO standards.

Designing a schema is hard work. It is unlikely that you will get it completely right in one afternoon. Rebuilding a database will take time and require fixing existing data, but the other choices are worse.

When I lived in Salt Lake City, Utah, a programmer I met at a user group meeting had gotten into this situation: the existing database was falling apart as the workload increased, thanks to poor design at the start. The updates and insertions for a day's work were by then taking almost 24 hours, and the approaching disaster was obvious to the programmers. Management had no real solution, except to yell at the programmers. They used the database to send medical laboratory results to hospitals and doctors.

A few months later, I got to see how an improperly declared column resulted in the wrong quantities of medical supplies being shipped to an African disaster area. The programmer had tried to save a little space by violating first normal form, putting the package sizes into one column and pulling them out with SUBSTRING() operations. The suppliers later agreed to package smaller quantities to help with the fantastic expense of shipping to a war zone. Now the first "subfield" in the quantity column was one unit and not five, but the tightly coupled front end did not know this. Would you like to pick which four children will die because of sloppy programming? Do you see what Ledgard meant by the last sentence of his quote?

## 9.6.2 Save Your First Attempts at DML

Bad DML can run several orders of magnitude slower than good DML. The bad news is that it is difficult to tell what is good and what is bad in SQL.
The procedural programmers had a deterministic environment in which the same program ran the same way every time. SQL decides how to execute a query based on statistics about the data and the resources available, and these can and do change over time. Thus, what is the best solution today could be a poor solution tomorrow.

In 1988, Pascal (1988) published a classic article on the PC database systems of the time. He constructed seven logically equivalent queries for the same database. Both the database and the query set were simple and were run on the same hardware platform to get timings.

The Ingres optimizer was smart enough to find the equivalence, used the same execution plan for every query, and gave the best performance for all of them. The other products of the time gave uneven performances, with the worst timings an order of magnitude or more slower than the best. In the case of Oracle, the worst timing was more than 600 times the best.

I recommend that you save your working attempts so that you can reuse them when the world and/or your optimizer change. The second example for the "Dance Partner" in section 9.5 does a nice job of illustrating this heuristic. Put the code for one of the queries in as a comment, so the maintenance programmer can find it.

# 9.7 Do Not Think with Boxes and Arrows

This is going to sound absolutely insane, but some of us like to doodle when we are trying to solve a problem. Even an informal diagram can be a great conceptual help, especially when you are learning something new. We are visual creatures.

The procedural programmers had the original ANSI X3.5 Flowchart symbols as an aid to their programming. This standard was a first, crude attempt at a visual tool, and it led to the Structure Charts and Data Flow Diagrams (DFDs) of the 1970s. All of these tools are based on "boxes and arrows"—they show the flow of data and/or control in a procedural system. If you use the old tools, you will tend to build the old systems. You might write the code in SQL, but the design will tend toward the procedural.

# 9.8 Draw Circles and Set Diagrams

If you use set-oriented diagrams, you will tend to produce set-oriented solutions. For example, draw a GROUP BY as small, disjoint circles inside a larger containing circle so you see them as subsets of a set. Use a time line to model temporal queries. In a set-oriented model, nothing flows; it exists in a state defined by constraints.

Probably the clearest example of "boxes and arrows" versus "set diagrams" is the Adjacency List model versus the Nested Sets model for trees. You can Google these models or buy a copy of my book _Trees and Hierarchies in SQL for Smarties_ for details. The diagrams for each approach are shown in Figure 9.1.

Figure 9.1 Adjacency List versus Nested Sets trees.

# 9.9 Learn Your Dialect

Although you should always try to write Standard SQL, it is also important to know which constructs your particular dialect and release favor. For example, constructing indexes and keys is important in older products that are based on sequential file structures. At the other extreme, the Nucleus engine from Sand Technology represents the entire database as a set of compressed bit vectors and has no indexing, because in effect everything is automatically indexed.

# 9.10 Imagine That Your WHERE Clause Is "Super Ameba"

That is the weirdest title in this chapter, so bear with me. Your "Super Ameba" computer can split off a new processor at will and assign it a task, in a massively parallel fashion.
Imagine that every row in the working table that was built in the FROM clause is allocated one of these "ameba processors," which will test the WHERE clause search condition on just that row. This is a version of Pournelle's rule: "one task, one processor."

If every row in your table can be independently tested against simple, basic search conditions, then your schema is probably a good relational design. But if a row needs to reference other rows in the same table, consult an outside source, or cannot answer those simple questions, then you probably have some kind of normalization problem.

You have already seen the Nested Sets model and the Adjacency List model for trees. Given one row in isolation from the rest of the table, can you answer a basic node question about the tree being modeled? This leads to asking: what are the basic questions? Here is a short list that applies to trees in graph theory.

1. Is this a leaf node?

2. Is this the root node?

3. How big is the subtree rooted at this node?

4. Given a second node in the same tree, is this node superior, subordinate, or at the same level as my node?

Question 4 is particularly important, because it is the basic comparison operation for hierarchies. As you can see, the Nested Sets model can answer all of these questions and more, whereas the Adjacency List model can answer none of them.

# 9.11 Use the Newsgroups and Internet

The Internet is the greatest resource in the world, so learn to use it. You can find a whole range of newsgroups devoted to your particular product or to more general topics. When you ask a question on a newsgroup, please post DDL, so that people do not have to guess what the keys, constraints, Declarative Referential Integrity, data types, and so forth in your schema are. Sample data is also a good idea, along with clear specifications that explain the results you wanted.

Most SQL products have a tool that will spit out DDL in one keystroke. Unfortunately, the output of these tools is generally less than human-readable. You should prune the real tables down to just what is needed to demonstrate your problem: there is no sense in posting a 100-column CREATE TABLE statement when all you want is two columns. Then clean up the constraints and other things in the output using the rules given in this book. You are asking people to do your job for you for free. At least be polite enough to provide them with sufficient information.

If you are a student asking people to do your homework for you, please be advised that presenting the work of other people as your own is a valid reason for expulsion and/or failure at a university. When you post, announce that this is homework and give the name of your school, your class, and your professor. This will let people verify that your actions are allowed.

CHAPTER 10 Thinking in SQL

_"It ain't so much the things we don't know that get us into trouble. It's the things we know that just ain't so."_

—Artemus Ward (Charles Farrar Browne), American humorist (1834–1867)

THE BIGGEST HURDLE in learning SQL is thinking in sets and logic instead of in sequences and processes. I just gave you a list of heuristics in the previous chapter, but let's take a little time to analyze why mistakes were made. You now have some theory, but can you do diagnostics?

I tried to find common errors that new programmers make, but perhaps the most difficult thing to learn is thinking in sets. Consider the classic puzzle shown in Figure 10.1.
Figure 10.1 Classic block puzzle.

The usual mistake people make is trying to count the 1 × 1 × 2 bricks one at a time. This requires the ability to make a three-dimensional mental model of the boxes, which is really difficult for most of us.

The right approach is to look at the whole block as if it were completely filled in. It is 4 × 5 × 5 = 100 unit cubes, which is 50 of the two-unit bricks. The corner that is knocked off is 3 bricks, which we can count individually, so we must have 47 bricks in the block. The arrangement inside the block does not matter at all.

All of these examples are based on actual postings in a newsgroup that have been translated into SQL/PSM to remove proprietary features. In some cases, I have cleaned up the data element names, and in others I have left them. Obviously, I am guessing at the motivation for each example, but I think I can defend my reasoning.

# 10.1 Bad Programming in SQL and Procedural Languages

As an example of not learning any relational approaches to a problem, consider a posting in the comp.databases.ms-sqlserver newsgroup in January 2005. The title was "How to Find a Hole in Records," which already tells you that the poster is thinking in terms of a file system and not an RDBMS.

The original table declaration had the usual newbie "id" column, without a key or any constraints. The table modeled a year's worth of rows identified by a week-within-year number (1 to 53) and a day-of-the-week number (1 to 7): an auto-numbered id, a week number, and a day number, with nothing to protect the data. By removing the useless, proprietary id column and adding constraints, we then had a proper table (a sketch appears a little further below).

Despite giving some constraints in the narrative specification, the poster never bothered to apply them to the table declaration. Newbies think of a table as a file, not as a set. The only criterion data must meet to be put into a file is that it is written to that file. The file cannot validate anything. The proprietary auto-number acts as a replacement for the nonrelational record number in a sequential file system.

The problem was to find the earliest missing day within each week for inserting a new row. If there was some other value or measurement for that date being recorded, it was not in the specifications. The poster's own T-SQL solution, translated into SQL/PSM with some name changes, stepped through the day numbers one at a time in a loop.

This is a classic imitation of a FOR loop, or counting loop, used in all 3GL programming languages. However, if you look at it for two seconds, you will see that this is bad procedural programming! SQL will not make up for a lack of programming skills. In fact, the bad effects of mimicking 3GL languages in SQL are magnified. The optimizers and compilers in SQL engines are not designed to look for procedural code optimizations. At the very least, the poster should have removed the redundant local variables and gotten rid of the hidden GOTO statements in favor of a simple, classic structured loop.

This points out another weakness in the posting: we were not told how to handle a week that already has all seven days represented. In the original table design, any integer value would have been accepted because of the lack of constraints. In the revised DDL, any weekday value not between 1 and 7 will violate the table's constraints. This is not the best solution, but it at least follows the specs that were given without making too many guesses as to what should have been done.

But can we do this without a loop and get a pure, nonprocedural SQL solution?
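Before answering that, here is a hedged sketch of the repaired DDL; the names WeeklySchedule, week_nbr, and day_nbr are my assumptions standing in for the cleaned-up names:

```sql
-- One row per (week, day) actually worked; the constraints, not
-- the application code, now guard the ranges and the uniqueness.
CREATE TABLE WeeklySchedule
(week_nbr INTEGER NOT NULL
    CHECK (week_nbr BETWEEN 1 AND 53),
 day_nbr INTEGER NOT NULL
    CHECK (day_nbr BETWEEN 1 AND 7),
 PRIMARY KEY (week_nbr, day_nbr));
```

The constraints now protect the data, but the question of replacing the loop still stands.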
Yes, there are several ways. Because the purpose of finding this weekday number is to insert a row into the table, why not do both in one statement, instead of finding the number in a function and then doing the insertion in another procedural step? Think at the level of a whole process and not in sequential steps.

The first answer, a CASE expression that tests the day numbers in order, is ugly looking and difficult to generalize, but it is fast if the optimizer factors out the tabular subquery in the WHEN clauses and computes it once. It also uses no local variables.

The thought process was to get the entire set of weekday numbers present in the week, and then compare them to each value in an ordered list. The CASE expression is just a way to hide that list. Although it is a step forward, it is not yet really a set-oriented solution.

Another version uses a table constructor (the first sketch below, after section 10.2). This is more compact and easy to generalize, and here we are actually using a set-oriented solution! We are subtracting the set of actual days from the set of all possible days, and then looking at the minimum value in the result to get an answer.

You can also use a pure set-operations approach. The set difference operator can remove all of the numbers that are present, so that we can pick the minimum value from the leftovers.

If all seven days are present, we will get an empty set, which will return a NULL for the day_nbr, and the NULL will violate the primary-key constraint.

A third, generalized version uses a Sequence auxiliary table to provide any range of integers desired. Just remember that the DDL also has to match that change.

In the case of only seven values, there is not going to be a huge difference in performance among any of these answers. However, with a huge number of values, the use of hashing or bit-vector indexes would be a noticeable improvement over a loop.

# 10.2 Thinking of Columns as Fields

The original code was actually much worse, because the poster wanted to create and drop tables on the fly. The purpose was to load totals into a summary report table.

Why did the poster create a dozen local variables and then use scalar subqueries to load them? The poster was still thinking in terms of a 3GL programming language. In COBOL or another 3GL, the file containing the Construction Survey data would be read one record at a time, and then each record would be read one field at a time, from left to right. A sequence of IF-THEN statements would look at the fields and increment the appropriate counters. When the entire file had been read, the results would be written to the working file for the survey summary.

The poster looked at each column as if it were a field and asked how to get the value for it, in isolation from the whole. The poster had seen the use of a subquery expression and implemented it that way. The subqueries will not be well optimized, so this will actually run longer than if the poster had used SQL/PSM to mimic the classic COBOL program for this kind of summary.

Without repeating a dozen columns, a set-oriented solution builds each summary row as a whole, in a single pass (the second sketch below).

The trick was to ask what you want in each row of a summary table, as a completed unit of work, and not to start at the column level. The answer is a tally of answers to some questions. The word _tally_ leads you to SUM() or COUNT(), and you remember the trick with the CASE expression.

The final question is: why not use a VIEW to get the summary instead of a procedure?
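The following sketches reconstruct the flavor of the two set-oriented solutions referenced above. They are illustrations, not the original listings: WeeklySchedule comes from the earlier sketch, the Survey and SurveySummary names are invented for section 10.2, and the colon-prefixed names are host-variable placeholders. The table constructor in the FROM clause is Standard SQL, though not every dialect supports it:

```sql
-- Missing-day insertion: subtract the set of days already present
-- in the week from the set of all seven possible days, then take
-- the earliest leftover. An empty difference makes MIN() return
-- NULL, which the key will reject, as described above. Replacing
-- the NOT IN predicate with an EXCEPT gives the pure
-- set-operation flavor of the same statement.
INSERT INTO WeeklySchedule (week_nbr, day_nbr)
SELECT :my_week, MIN(D.day_nbr)
  FROM (VALUES (1), (2), (3), (4), (5), (6), (7)) AS D (day_nbr)
 WHERE D.day_nbr
       NOT IN (SELECT W.day_nbr
                 FROM WeeklySchedule AS W
                WHERE W.week_nbr = :my_week);

-- Section 10.2's summary: tally a whole report row in one pass
-- with the CASE-expression trick, instead of a dozen scalar
-- subqueries loaded into local variables.
INSERT INTO SurveySummary (yes_cnt, no_cnt, no_answer_cnt)
SELECT SUM(CASE WHEN answer = 'yes' THEN 1 ELSE 0 END),
       SUM(CASE WHEN answer = 'no' THEN 1 ELSE 0 END),
       SUM(CASE WHEN answer IS NULL THEN 1 ELSE 0 END)
  FROM Survey;
```

Either SELECT could just as easily sit behind a VIEW, which answers the final question above.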
# 10.3 Thinking in Processes, Not Declarations

This is a simple schema for checking items out of an inventory. The original schema lacked keys and constraints, which had to be added to give us a Users table that carries each user's reservation limit and a Reservations table holding one row for each item a user currently has reserved.

The original narrative specification was:

Each user can reserve a maximum of (n) items. Whenever a user reserves something, the "max_reserves" field [sic] of the user is retrieved and checked. Then a record [sic] is inserted into the Reservations table, and the "max_reserves" field [sic] of the user is updated accordingly. I would like to ask if there is a better way to implement this system, because there is a chance that the user reserves more than the maximum number if he or she is logged in from two computers.

The first proposal was a stored procedure that took the maximum number of items as a parameter, counted the user's current reservations into a local variable, compared the two, and then did the insertion. Passing the maximum number of items as a parameter makes no sense, because you have to look it up anyway; a parameter will let you pass any value you desire. Having a local variable for the count is redundant; SQL is orthogonal, and a scalar subquery can be used wherever a scalar variable can.

Rows are not records, and columns are not fields. SQL is a declarative language, not a procedural one. So a sequence of procedural steps like "retrieve → check → insert → update" does not make sense. Instead, you say that you make a reservation such that the user is not over his or her limit. Think of logic, not process.

Instead of recording the tally of reserved items in local storage, you can get it with a subquery expression (a sketch appears below, near the end of section 10.4). In fact, you might want to have a view to use for reports.

# 10.4 Thinking the Schema Should Look Like the Input Forms

There are several versions of this error. The easiest one is a simple timecard form that gets modeled exactly as it is printed on the paper form, with one row for each punch of the clock.

But to answer even basic questions, you have to match up the in and out times. Dr. Codd (1979) described a row in an RDBMS as containing a fact, but more than that, it should contain a whole fact and not half of one. The "half-fact" that John showed up at the job at 09:00 Hrs has nothing to do with paying him. I need to know that John was on the job from 09:00 to 17:00 Hrs. The correct design holds a whole fact in each row: one row per shift, with both an in_time and an out_time column.

Many new SQL programmers are scared of NULLs, but this is a good use of them. We do not know the future, so we cannot assign a value to the out_time until we have that information.

Another common example is a simple order form that is copied directly into DDL: in skeleton form, an Orders table mirroring the header boxes of the paper form and an OrderDetails table with one row per printed line.

The order total can be computed from the order details, so it is redundant in the Orders table; but the total was a box on the paper form, so the newbie put it in the table.

Nobody is actually buying or shipping a line number. Customers are ordering items, but the lines on the paper form are numbered, so the line numbers went into the OrderDetails table. This is dangerous, because if I repeat the same item on another line, I have to consolidate the lines in the database. Otherwise, quantity discounts will be missed, and I am wasting storage with redundant data.

For example, consider two detail rows, each showing a half-fact: one says that I ordered two pairs of lime green pants on my order #123, and the other says that I ordered three pairs. The whole fact is that I ordered five pairs of lime green pants on my order #123.

In 2004, I pointed this out to a programmer who had such a schema.
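Before that story continues, here is a hedged sketch of the declarative reservation from section 10.3. It assumes a Users (user_id, max_reserves) table and a Reservations (user_id, item_id) table; the names and the colon-prefixed host variables are illustrative only:

```sql
-- Insert the reservation only while the user stays within his or
-- her limit; the tally and the insertion are one unit of work,
-- with no local variables and no procedural steps.
INSERT INTO Reservations (user_id, item_id)
SELECT U.user_id, :my_item
  FROM Users AS U
 WHERE U.user_id = :my_user
   AND (SELECT COUNT(*)
          FROM Reservations AS R
         WHERE R.user_id = U.user_id) < U.max_reserves;
```

Run at an appropriate isolation level, this single statement also closes the two-computers loophole that worried the poster. Now, back to the programmer with the line numbers in her order details.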
She insisted that they needed the line numbers to be able to reproduce the original order exactly as it was keyed in, but then, in a later posting in the same thread, she complained that her people were spending hours every day verifying the quantities of items in the orders they received, because their suppliers did not use the proper model to present a consolidated, sorted display of the data.

APPENDIX Resources

# Military Standards

DoD 8320.1-M-1, "Data Element Standardization Procedures"

DoD Directive 8320.1, "DoD Data Administration"

# Metadata Standards

A short summary of the NCITS L8 Metadata Standards Committee rules for data elements, a PDF version, and the current draft of the standard are available online.

# ANSI and ISO Standards

The SI Basics ("Metric System")

ISO 31, "Quantities and Units" (14 parts)

ISO 1000, "SI Units and Recommendations for the Use of Their Multiples and of Certain Other Units for the Application of the SI"

ISO 2955, "Information Processing—Representation of SI and Other Units for Use in Systems with Limited Character Sets"

A guide to both ISO 31 and ISO 1000 is available for purchase from ISO.

ISO 639-1:2002, "Codes for the Representation of Names of Languages—Part 1: Alpha-2 Code"

ISO 639-2:1998, "Codes for the Representation of Names of Languages—Part 2: Alpha-3 Code"

The language codes are also available online.

ISO 3166, "Codes for the Representation of Names of Countries"

This standard provides a unique two-letter code for each country and a three-letter code for special uses. A three-digit numeric code is also given, intended as an alternative for applications that need to be independent of the alphabet or to save bits in computer storage.

ISO 4217:2001, "Codes for the Representation of Currencies and Funds"

IBAN: International Bank Account Number. See the European Committee for Banking Standards Web site for publications.

ISO 8601, "Data Elements and Interchange Formats—Information Interchange—Representation of Dates and Times"

# U.S. Government Codes

NAICS: North American Industry Classification System. This system replaced the old Standard Industrial Classification (SIC) system.

NAPCS: North American Product Classification System

TIGER: Topologically Integrated Geographic Encoding and Referencing system. This is how the census views geography and reports data. It is available in electronic formats.

DOT: Dictionary of Occupational Titles. This is the U.S. Department of Labor encoding system; some of the codes can be viewed online.

# Retail Industry

**_EAN: European Article Number, now combined with the UPC codes_**

ISO/IEC 15418:1999, "EAN/UCC Application Identifiers and Fact Data Identifiers and Maintenance"

ISO/IEC 15420:2000, "Automatic Identification and Data Capture Techniques—Bar Code Symbology Specification—EAN/UPC"

Bar Code Détente: U.S.
Finally Adds One More Digit, the _New York Times_, July 12, 2004, by Steve Lohr; http://www.nytimes.com/2004/07/12/business/12barcode.html?ex=1090648405&ei=l&en=202cb9baba72e846

**_VIN: Vehicle Identification Number_**

ISO 3779:1983, "Vehicle Identification Number (VIN)"

ISO 4030:1983, "Vehicle Identification Number (VIN)—Location and Attachment"

ISO/TR 8357:1996, "Instructions for the implementation of the assignment of world manufacturer identifier (WMI) codes for vehicle identification number (VIN) systems and for world parts manufacturer identifier (WPMI) codes" (available in English only)

A good news article on the changes that are coming to the VIN: http://www.cars.com/news/stories/070104_storya_dn.jhtml?page=newsstory&aff=national

An explanation of the ISO tire sizes is also available online.

**_ISBN: International Standard Book Number_**

An online converter is available for the new 13-digit ISBN, which is based on the change from 10-digit UPC codes to 13-digit EAN codes in the retail industry on January 1, 2005.

# Code Formatting and Naming Conventions

You can find other opinions in the following sources:

Gulutzan, P. "SQL Naming Conventions."

Bryzek, M. "Constraint Naming Standards."

Celko, J. "Ten Things I Hate about You." http://www.intelligententerprise.com/001205/celkol_l.jhtml?_requestid=304726

ISO/IEC. IS 11179-5, Information Technology Specification and Standardization of Data Elements: Part 5, Naming and Identification Principles for Data Elements. http://metadata-standards.org/Document-library/Draft-standards/11179-Part5-Naming&Identification/

Jones, S. "Standards Part 1—Abbreviated Programming."

Karbowski, J. J. "Naming Standards beyond Programming."

Koch, G., and K. Loney. _Oracle8i: The Complete Reference_ (3rd ed.). Emeryville, CA: Osborne McGraw-Hill, 2000.

Kondreddi, N. V. "Database Object Naming Conventions."

Mullins, C. "What's in a Name?"

Mullins, C.

Sheppard, S. "Oracle Naming Conventions."

Appendix Bibliography

# Reading Psychology

Fisher, D. "Reading and Visual Search," _Memory and Cognition_, 3, 188–196, 1975.

Mason, M. "From Print to Sound in Mature Readers as a Function of Reader Ability and Two Forms of Orthographic Regularity," _Memory and Cognition_, 6, 568–581, 1978.

Meyer, D. E., and K. D. Gutschera. "Orthographic versus Phonemic Processing of Printed Words," Psychonomic Society Presentation, 1975.

Pollatsek, A., A. D. Well, and R. M. Schindler. "Effects of Segmentation and Expectancy on Matching Time for Words and Nonwords," _Journal of Experimental Psychology: Human Perception and Performance_, 1, 328–338, 1975.

Saenger, P. _Space Between Words: The Origins of Silent Reading_. Palo Alto, CA: Stanford University Press, 1975.

# Programming Considerations

Arthur, J. _Measuring Programmer Productivity and Software Quality_. New York: John Wiley & Sons, 1985.

Baecker, R. "Enhancing Program Readability and Comprehensibility with Tools for Program Visualization," _Proceedings of the 10th International Conference on Software Engineering_, 356–366, April 11–15, 1988, Singapore.

Berry, R. E., and A. E. Meekings. "A Style Analysis of C Programs," _Communications of the ACM_, 28(1), 80–88, January 1985.

Brooks, R. "Studying Programmer Behavior Experimentally: The Problems of Proper Methodology," _Communications of the ACM_, 23(4), 207–213, April 1980.

Celko, J. "Observations about Student Programming Practices," _Structured Programming_, Fall 1989, p. 215.

Celko, J. _SQL for Smarties_ (3rd ed.).
San Francisco: Morgan-Kaufmann, 2005.

Celko, J. _SQL Puzzles & Answers_. San Francisco: Morgan-Kaufmann, 1997.

Celko, J. _Data & Databases_. San Francisco: Morgan-Kaufmann, 1999.

Celko, J. _Trees & Hierarchies in SQL_. San Francisco: Morgan-Kaufmann, 2004.

Codd, E. F. "Extending the Database Relational Model to Capture More Meaning," _ACM Transactions on Database Systems_, 4(4), 397–434, December 1979.

Cooper, D., and M. J. Clancy. _Oh! Pascal!_ New York: W. W. Norton, 1985.

Fairley, R. _Software Engineering Concepts_. Boston: McGraw-Hill, 1985.

Gilmore, D. J., and R. G. Green. "Comprehension and Recall of Miniature Programs," _International Journal of Man-Machine Studies_, 21(1), 31–48, July 1984.

Grogono, P. "On Layout, Identifiers and Semicolons in Pascal Programs," _ACM SIGPLAN Notices_, 14(4), 35–40, April 1979.

Kernighan, B., and P. J. Plauger. _The Elements of Programming Style_. Boston: McGraw-Hill, 1982.

Ledgard, H. _Programming Proverbs_. Rochelle Park, NJ: Hayden Books, 1975.

Ledgard, H., and L. J. Chmura. _Fortran with Style: Programming Proverbs_. Indianapolis, IN: Sams, 1978.

Ledgard, H., and J. Tauer. _Professional Software, Volume 2: Programming Practice_. Boston: Addison-Wesley Longman, 1987.

McCabe, T. "A Complexity Measure," _IEEE Transactions on Software Engineering_, 1976.

McKeithen, K., J. Reitman, H. Rueter, and S. Hirtle. "Knowledge Organization and Skill Differences in Computer Programmers," _Cognitive Psychology_, 13, 307–325, 1981.

Meekings, B. "Style Analysis of Pascal Programs," _ACM SIGPLAN Notices_, 18(9), 45–54, September 1983.

Miller, G. A. "The Magical Number Seven, Plus or Minus Two: Some Limits on Our Capacity for Processing Information," _The Psychological Review_, 1956.

Oman, P., and C. Cook. "A Taxonomy for Programming Style," _Proceedings of the 1990 ACM Annual Conference on Cooperation_, February 20–22, 1990, Washington, DC.

Oman, P., and C. Cook. "A Paradigm for Programming Style Research," _ACM SIGPLAN Notices_, 23(12), 69–78, December 1988.

Oman, P., and C. Cook. "Programming Style Authorship Analysis," _Proceedings of the 17th Annual ACM Conference on Computer Science: Computing Trends in the 1990s_, Louisville, Kentucky, 320–326, February 1989.

Oman, P., and C. Cook. "Typographic Style Is More Than Cosmetic," _Communications of the ACM_, 33(5), 506–520, May 1990.

Pascal, F. "SQL Redundancy and DBMS Performance," _Database Programming & Design_, 1(12), 22–28, December 1988.

Pressman, R. S. _Software Engineering: A Practitioner's Approach_ (2nd ed.). Boston: McGraw-Hill, 1986.

Redish, K., and W. Smyth. "Program Style Analysis: A Natural By-Product of Program Compilation," _Communications of the ACM_, 29(2), 126–133, February 1986.

Rees, M. J. "Automatic Assessment Aids for Pascal Programs," _ACM SIGPLAN Notices_, 17(10), 33–42, October 1982.

Sheil, B. A. "The Psychological Study of Programming," _ACM Computing Surveys_ (CSUR), 13(1), 101–120, March 1981.

Weinberg, G. _The Psychology of Computer Programming: Silver Anniversary Edition_. New York: Dorset House, 1998.

Weissman, L. "Psychological Complexity of Computer Programs: An Experimental Methodology," _ACM SIGPLAN Notices_, 9(6), 25–36, June 1974.
ABOUT THE AUTHOR

**Joe Celko** is a noted consultant and lecturer, and one of the most widely read SQL authors in the world. He is well known for his 10 years of service on the ANSI SQL standards committee, his column in _Intelligent Enterprise_ magazine (which won several Reader's Choice Awards), and the war stories he tells to provide real-world insights into SQL programming. His best-selling books include _Joe Celko's SQL for Smarties: Advanced SQL Programming, second edition; Joe Celko's SQL Puzzles and Answers;_ and _Joe Celko's Trees and Hierarchies in SQL for Smarties._
diff --git a/kag/examples/csqa/builder/data/linux_kernel_networking.txt b/kag/examples/csqa/builder/data/linux_kernel_networking.txt new file mode 100644 index 00000000..cea7b922 --- /dev/null +++ b/kag/examples/csqa/builder/data/linux_kernel_networking.txt @@ -0,0 +1,30606 @@

Linux Kernel Networking

Rami Rosen, _Linux Kernel Networking: Implementation and Theory_, DOI 10.1007/978-1-4302-6197-1, © Apress 2014

Rami Rosen

Linux Kernel Networking: Implementation and Theory

Rami Rosen

ISBN 978-1-4302-6196-4; e-ISBN 978-1-4302-6197-1

© Apress 2014

Linux Kernel Networking: Implementation and Theory

President and Publisher: Paul Manning

Lead Editor: Michelle Lowman

Technical Reviewer: Brendan Horan

Editorial Board: Steve Anglin, Ewan Buckingham, Gary Cornell, Louise Corrigan, James DeWolf, Jonathan Gennick, Jonathan Hassell, Robert Hutchinson, Michelle Lowman, James Markham, Matthew Moodie, Jeff Olson, Jeffrey Pepper, Douglas Pundick, Ben Renow-Clarke, Dominic Shakeshaft, Gwenan Spearing, Matt Wade, Steve Weiss, Tom Welsh

Coordinating Editor: Kevin Shea

Copy Editor: Corbin Collins

Compositor: SPi Global

Indexer: SPi Global

Artist: SPi Global

Cover Designer: Anna Ishchenko

Distributed to the book trade worldwide by Springer Science+Business Media New York, 233 Spring Street, 6th Floor, New York, NY 10013. Phone 1-800-SPRINGER, fax (201) 348-4505, e-mail orders-ny@springer-sbm.com, or visit www.springeronline.com.

For information on translations, please e-mail rights@apress.com, or visit www.apress.com.

Apress and friends of ED books may be purchased in bulk for academic, corporate, or promotional use. eBook versions and licenses are also available for most titles. For more information, reference our Special Bulk Sales–eBook Licensing web page at www.apress.com/bulk-sales.

Any source code or other supplementary materials referenced by the author in this text is available to readers at www.apress.com. For detailed information about how to locate your book's source code, go to www.apress.com/source-code.

This work is subject to copyright. All rights are reserved by the Publisher, whether the whole or part of the material is concerned, specifically the rights of translation, reprinting, reuse of illustrations, recitation, broadcasting, reproduction on microfilms or in any other physical way, and transmission or information storage and retrieval, electronic adaptation, computer software, or by similar or dissimilar methodology now known or hereafter developed. Exempted from this legal reservation are brief excerpts in connection with reviews or scholarly analysis or material supplied specifically for the purpose of being entered and executed on a computer system, for exclusive use by the purchaser of the work. Duplication of this publication or parts thereof is permitted only under the provisions of the Copyright Law of the Publisher's location, in its current version, and permission for use must always be obtained from Springer. Permissions for use may be obtained through RightsLink at the Copyright Clearance Center. Violations are liable to prosecution under the respective Copyright Law.

Trademarked names, logos, and images may appear in this book.
Rather than use a trademark symbol with every occurrence of a trademarked name, logo, or image, we use the names, logos, and images only in an editorial fashion and to the benefit of the trademark owner, with no intention of infringement of the trademark. The use in this publication of trade names, trademarks, service marks, and similar terms, even if they are not identified as such, is not to be taken as an expression of opinion as to whether or not they are subject to proprietary rights. While the advice and information in this book are believed to be true and accurate at the date of publication, neither the authors nor the editors nor the publisher can accept any legal responsibility for any errors or omissions that may be made. The publisher makes no warranty, express or implied, with respect to the material contained herein.

To Dr. Joseph Shapira, Qualcomm Israel Founder and Ex-President, coauthor of "CDMA Radio with Repeaters" (Springer, 2007).

Dr. Ruth Shapira.

Iris & Dr. Shye Shapira, made of the stuff dreams are made of.

—Rami Rosen

Preface

This book takes you on a guided, in-depth tour of the current Linux kernel networking implementation and the theory behind it. For almost a decade, no new book about Linux networking has been written. A decade of dynamic and fast-paced Linux kernel development is quite a long time. There are important kernel networking subsystems that are not described in any other book; for example, IPv6, IPsec, Wireless (IEEE 802.11), IEEE 802.15.4, NFC, InfiniBand, and more. There is also very little information on the Web about the implementation details of these subsystems. For all these reasons, I have written this book.

About ten years ago I took my first steps in kernel programming. I was a developer in a startup taking part in a VoIP project for a Linux-based set-top box (STB). There were crashes in the USB stack with some USB cameras, and we had to delve into the code to try to find a solution, because the vendors of that STB did not want to spend time solving the problem. In fact, it was not that they did not want to; they simply did not know how. In those days, there was almost no documentation about the USB stack. The _Linux Device Drivers_ book from O'Reilly was then only in its second edition (the USB chapter was added only in the third edition). Success in that project was crucial for us as a startup. I learned much about kernel programming in the process of solving the USB crash. Later on we had a project where a NAT traversal solution was needed. The userspace solution was so heavy that the device quickly crashed. When I suggested a kernel solution, my managers were very skeptical, but they did let me try. The kernel solution proved to be very stable and took much less CPU than the userspace solution. Since then I have taken part in many kernel networking projects. This book is the result of my many years of development and research.

## Who This Book Is For

This book is intended for computer professionals, including developers, software architects, designers, project managers, and CTOs, who are working on networking-related projects. These projects can be in a wide range of professional areas, such as communication, data centers, embedded devices, virtualization, security, and more. In addition, students and academic researchers and theorists who deal with networking projects, networking research, or operating systems research will find a lot of help in this book.
## How This Book Is Structured

In Chapter 1 you will find a general overview of the Linux kernel and the Linux network stack. Other topics in this chapter include the implementation of the network device, the socket buffer, and the Rx and Tx paths. Chapter 1 concludes with a section about the Linux Kernel Networking Development Model.

In Chapter 2 you will learn about netlink sockets, which provide a mechanism for bidirectional communication between userspace and the kernel, and which are used by the networking subsystem as well as by other subsystems. You will also find a section in this chapter about generic netlink sockets, which can be perceived as advanced netlink sockets, and which you will encounter in Chapter 12 and while browsing the kernel networking source code.

In Chapter 3 you will learn about the ICMP protocol, which helps to keep the system behaving correctly by sending error and control messages about the network layer (L3). You will learn about the implementation of the ICMP protocol both in IPv4 and in IPv6.

Chapter 4 delves into the IPv4 protocol—the Internet and modern life cannot be described without it. You will learn about the structure of the IPv4 header, about the Rx and Tx paths, about IP options, about fragmentation and defragmentation and why they are needed, and about forwarding packets, which is one of the important tasks of IPv4.

Chapters 5 and 6 are devoted to the IPv4 routing subsystem. In Chapter 5 you will learn how a lookup in the routing subsystem is performed, how the routing tables are organized, which optimizations are used in the IPv4 routing subsystem, and about the removal of the IPv4 routing cache. Chapter 6 discusses advanced routing topics such as Multicast Routing, Policy Routing, and Multipath Routing.

Chapter 7 endeavors to explain the neighbouring subsystem. You will learn about the ARP protocol, which is used in IPv4, about the NDISC protocol used in IPv6, and about some of the differences between the two protocols. You will also learn about the Duplicate Address Detection (DAD) mechanism in IPv6.

Chapter 8 discusses the IPv6 protocol, which seems to be the inevitable solution to the shortage of IPv4 addresses. This chapter describes the implementation of IPv6 and discusses topics such as IPv6 addresses, the IPv6 header and extension headers, autoconfiguration in IPv6, the Rx path, and forwarding. It also describes the MLD protocol.

Chapter 9 deals with the netfilter subsystem. You will learn about netfilter hooks and how they are registered, about Connection Tracking, about IP tables and Network Address Translation (NAT), and about the callbacks used by Connection Tracking and NAT.

Chapter 10 deals with IPsec, one of the most complex networking subsystems. Topics like the IKE protocol (which is implemented in userspace) and the cryptography aspects of IPsec are discussed briefly (full treatment is beyond the scope of the book). You will learn about the XFRM framework, which is the basis of the Linux IPsec subsystem, and about its two most important structures: the XFRM policy and the XFRM state. The ESP protocol is briefly described, as well as the IPsec Rx path and Tx path in transport mode. The chapter concludes with a section about XFRM lookup and a short section about NAT traversal.

Chapter 11 describes four Layer 4 protocols, starting with the most commonly used protocols, UDP and TCP, and concluding with two newer protocols, SCTP and DCCP.

Chapter 12 deals with wireless in Linux (IEEE 802.11).
You will learn about the mac80211 subsystem and its implementation, about various wireless network topologies, about power save mode, and about IEEE 802.11n and packet aggregation. There is also a section devoted to Wireless Mesh networks in this chapter.

Chapter 13 delves into the InfiniBand subsystem, a technology enjoying rising popularity in datacenters. You will learn about the RDMA stack organization, about addressing in InfiniBand, about the organization of InfiniBand packets, and about the RDMA API.

Chapter 14 concludes the book with a discussion of advanced topics such as Linux namespaces (and network namespaces in particular), Busy Poll Sockets, the Bluetooth subsystem, the IEEE 802.15.4 subsystem, the Near Field Communication (NFC) subsystem, the PCI subsystem, and more.

Appendices A, "Linux API," and C, "Glossary," provide complete reference information for many topics discussed in the book. Appendix B, "Network Administration," provides information about various tools that you will need while working with Linux kernel networking.

## Conventions

Throughout the book, I've kept a consistent style. All code snippets, whether inside text paragraphs or on lines of their own, along with library paths, shell commands, URLs, and other code-related elements, are set in monospaced font, like this. New terms are set off in italics, and other emphasis may be given in bold.

About the Author

Rami Rosen is a software engineer and a computer science graduate of the Technion, the Israel Institute of Technology. In the last 17 years he has been a software developer for three innovative startups and a semiconductor company. Rami lives in Israel and has participated in highly advanced Linux kernel projects, in particular those related to networking. He has published several articles and given lectures about Linux kernel networking and virtualization.

About the Technical Reviewer

Brendan Horan is a hardware fanatic, with a full-height rack of all types of machine architectures in his home. He has more than ten years of experience working with large UNIX systems and tuning the underlying hardware for optimal performance and stability. Brendan's love for all forms of hardware has helped him throughout his IT career, from fixing laptops to tuning servers and their hardware to suit the needs of high-availability designs and ultra-low-latency applications. Brendan takes pride in the open source movement and is happy to say that every computer in his house is powered by open source technology. He resides in Hong Kong with his wife, Vikki, who continues daily to teach him more Cantonese.

Acknowledgments

Thanks to my editors for giving me the honor of writing this book; to Michelle Lowman, the lead editor, for believing in this book while it was still just an idea; to Kevin Shea, the coordinating editor, who guided and supported me from the initial stages until the book was fully realized; to Brendan Horan, the technical reviewer, for his helpful comments that greatly improved the book; to Troy Mott, the development editor, for his many suggestions and for his hard work; to Corbin Collins and Roger LeBlanc, the copy editors, for shaping up the text; and to Kumar Dhaneesh from the production team.

I would like to thank the Linux kernel networking maintainer, David Miller, for the great work he has done over all these years and all the developers who continue to participate and contribute to the networking subsystem.
I would also like to thank the Linux kernel networking community and all its members who helped me by reviewing my text: Julian Anastasov, Timo Teras, Steffen Klassert, Gerrit Renker, Javier Cardona, Gao feng, Vlad Yasevich, Cong Wang, Florian Westphal, Reuben Hawkins, Pekka Savola, Andreas Steffen, Daniel Borkmann, Joachim Nilsson, David Hauweele, Maxime Ripard, Alexandre Belloni, Benjamin Zores, and too many others to mention. Thanks to Donald Wood and Eliezer Tamir from Intel for their help with the "Busy Poll Sockets" section, and to Samuel Ortiz from Intel for his advice in preparing the NFC section. Thanks to Dotan Barak, an InfiniBand expert, for contributing Chapter 13, "InfiniBand."

—Rami Rosen

Contents

Chapter 1: Introduction
The Linux Network Stack
The Network Device
New API (NAPI) in Network Devices
Receiving and Transmitting Packets
The Socket Buffer
The Linux Kernel Networking Development Model
Summary

Chapter 2: Netlink Sockets
The Netlink Family
Netlink Sockets Libraries
The sockaddr_nl Structure
Userspace Packages for Controlling TCP/IP Networking
Kernel Netlink Sockets
The Netlink Message Header
NETLINK_ROUTE Messages
Adding and Deleting a Routing Entry in a Routing Table
Generic Netlink Protocol
Creating and Sending Generic Netlink Messages
Socket Monitoring Interface
Summary

Chapter 3: Internet Control Message Protocol (ICMP)
ICMPv4
ICMPv4 Initialization
ICMPv4 Header
Receiving ICMPv4 Messages
Sending ICMPv4 Messages: "Destination Unreachable"
ICMPv6
ICMPv6 Initialization
ICMPv6 Header
Receiving ICMPv6 Messages
Sending ICMPv6 Messages
ICMP Sockets ("Ping sockets")
Summary
Quick Reference
Methods
Tables
procfs entries
Creating "Destination Unreachable" Messages with iptables

Chapter 4: IPv4
IPv4 Header
IPv4 Initialization
Receiving IPv4 Packets
Receiving IPv4 Multicast Packets
IP Options
Timestamp Option
Record Route Option
IP Options and Fragmentation
Building IP Options
Sending IPv4 Packets
Fragmentation
Fast Path
Slow Path
Defragmentation
Forwarding
Summary
Quick Reference
Methods
Macros

Chapter 5: The IPv4 Routing Subsystem
Forwarding and the FIB
Performing a Lookup in the Routing Subsystem
FIB Tables
FIB Info
Caching
Nexthop (fib_nh)
Policy Routing
FIB Alias (fib_alias)
ICMPv4 Redirect Message
Generating an ICMPv4 Redirect Message
Receiving an ICMPv4 Redirect Message
IPv4 Routing Cache
Summary
Quick Reference
Methods
Macros
Tables
Route Flags

Chapter 6: Advanced Routing
Multicast Routing
The IGMP Protocol
The Multicast Routing Table
The Multicast Forwarding Cache (MFC)
Multicast Router
The Vif Device
IPv4 Multicast Rx Path
The ip_mr_forward() Method
The ipmr_queue_xmit() Method
The ipmr_forward_finish() Method
The TTL in Multicast Traffic
Policy Routing
Policy Routing Management
Policy Routing Implementation
Multipath Routing
Summary
Quick Reference
Methods
Macros
Procfs Multicast Entries
Table
Chapter 7: Linux Neighbouring Subsystem
The Neighbouring Subsystem Core
Creating and Freeing a Neighbour
Interaction Between Userspace and the Neighbouring Subsystem
Handling Network Events
The ARP protocol (IPv4)
ARP: Sending Solicitation Requests
ARP: Receiving Solicitation Requests and Replies
The NDISC Protocol (IPv6)
Duplicate Address Detection (DAD)
NDISC: Sending Solicitation Requests
NDISC: Receiving Neighbour Solicitations and Advertisements
Summary
Quick Reference
Methods
Macros
The neigh_statistics Structure
Table

Chapter 8: IPv6
IPv6 – Short Introduction
IPv6 Addresses
Special Addresses
Multicast Addresses
IPv6 Header
Extension Headers
IPv6 Initialization
Autoconfiguration
Receiving IPv6 Packets
Local Delivery
Forwarding
Receiving IPv6 Multicast Packets
Multicast Listener Discovery (MLD)
Joining and Leaving a Multicast Group
MLDv2 Multicast Listener Report
Multicast Source Filtering (MSF)
Sending IPv6 Packets
IPv6 Routing
Summary
Quick Reference
Methods
Macros
Tables
Special Addresses
Routing Tables Management in IPv6

Chapter 9: Netfilter
Netfilter Frameworks
Netfilter Hooks
Registration of Netfilter Hooks
Connection Tracking
Connection Tracking Initialization
Connection Tracking Entries
Connection Tracking Helpers and Expectations
IPTables
Delivery to the Local Host
Forwarding the Packet
Network Address Translation (NAT)
NAT Hook Callbacks and Connection Tracking Hook Callbacks
NAT Hook Callbacks
Connection Tracking Extensions
Summary
Quick Reference
Methods
Macros
Tables

Chapter 10: IPsec
General
IKE (Internet Key Exchange)
IPsec and Cryptography
The XFRM Framework
XFRM Initialization
XFRM Policies
XFRM States (Security Associations)
ESP Implementation (IPv4)
IPv4 ESP Initialization
Receiving an IPsec Packet (Transport Mode)
Sending an IPsec Packet (Transport Mode)
XFRM Lookup
NAT Traversal in IPsec
NAT-T Mode of Operation
Summary
Quick Reference
Methods
Table

Chapter 11: Layer 4 Protocols
Sockets
Creating Sockets
UDP (User Datagram Protocol)
UDP Initialization
Sending Packets with UDP
Receiving Packets from the Network Layer (L3) with UDP
TCP (Transmission Control Protocol)
TCP Header
TCP Initialization
TCP Timers
TCP Socket Initialization
TCP Connection Setup
Receiving Packets from the Network Layer (L3) with TCP
Sending Packets with TCP
SCTP (Stream Control Transmission Protocol)
SCTP Packets and Chunks
SCTP Chunk Header
SCTP Chunk
SCTP Associations
Setting Up an SCTP Association
Receiving Packets with SCTP
Sending Packets with SCTP
SCTP HEARTBEAT
SCTP Multistreaming
SCTP Multihoming
DCCP: The Datagram Congestion Control Protocol
DCCP Header
DCCP Initialization
DCCP Socket Initialization
Receiving Packets from the Network Layer (L3) with DCCP
Sending Packets with DCCP
DCCP and NAT
Summary
Quick Reference
Methods
Macros
Tables

Chapter 12: Wireless in Linux
Mac80211 Subsystem
The 802.11 MAC Header
The Frame Control
The Other 802.11 MAC Header Members
Network Topologies
Infrastructure BSS
IBSS, or Ad Hoc Mode
Power Save Mode
Entering Power Save Mode
Exiting Power Save Mode
Handling the Multicast/Broadcast Buffer
The Management Layer (MLME)
Scanning
Authentication
Association
Reassociation
Mac80211 Implementation
Rx Path
Tx Path
Fragmentation
Mac80211 debugfs
Wireless Modes
High Throughput (ieee802.11n)
Packet Aggregation
Mesh Networking (802.11s)
HWMP Protocol
Setting Up a Mesh Network
Linux Wireless Development Process
Summary
Quick Reference
Methods
Table

Chapter 13: InfiniBand
RDMA and InfiniBand—General
The RDMA Stack Organization
RDMA Technology Advantages
InfiniBand Hardware Components
Addressing in InfiniBand
InfiniBand Features
InfiniBand Packets
Management Entities
RDMA Resources
RDMA Device
Protection Domain (PD)
Address Handle (AH)
Memory Region (MR)
Fast Memory Region (FMR) Pool
Memory Window (MW)
Completion Queue (CQ)
eXtended Reliable Connected (XRC) Domain
Shared Receive Queue (SRQ)
Queue Pair (QP)
Work Request Processing
Supported Operations in the RDMA Architecture
Multicast Groups
Difference Between the Userspace and the Kernel-Level RDMA API
Summary
Quick Reference
Methods

Chapter 14: Advanced Topics
Network Namespaces
Namespaces Implementation
UTS Namespaces Implementation
Network Namespaces Implementation
Network Namespaces Management
Cgroups
Cgroups Implementation
Cgroup Devices Controller: A Simple Example
Cgroup Memory Controller: A Simple Example
The net_prio Module
The cls_cgroup Classifier
Mounting cgroup Subsystems
Busy Poll Sockets
Enabling Globally
Enabling Per Socket
Tuning and Configuration
Performance
The Linux Bluetooth Subsystem
HCI Layer
HCI Connection
L2CAP
BNEP
Receiving Bluetooth Packets: Diagram
L2CAP Extended Features
Bluetooth Tools
IEEE 802.15.4 and 6LoWPAN
Neighbor Discovery Optimization
Linux Kernel 6LoWPAN
Near Field Communication (NFC)
NFC Tags
NFC Devices
Communication and Operation Modes
Host-Controller Interfaces
Linux NFC support
Userspace Architecture
NFC on Android
Notification Chains
The PCI Subsystem
Wake-On-LAN (WOL)
Teaming Network Device
The PPPoE Protocol
PPPoE Header
PPPoE Initialization
Sending and Receiving Packets with PPPoE
Android
Android Networking
Android internals: Resources
Summary
Quick Reference
Methods
Macros

Appendix A: Linux API
The sk_buff Structure
struct skb_shared_info
The net_device structure
RDMA (Remote DMA)
RDMA Device
The ib_register_client() Method
The ib_unregister_client() Method
The ib_get_client_data() Method
The ib_set_client_data() Method
The INIT_IB_EVENT_HANDLER Macro
The ib_register_event_handler() Method
The ib_event_handler Struct
The ib_event Struct
The ib_unregister_event_handler() Method
The ib_query_device() Method
The ib_query_port() Method
The rdma_port_get_link_layer() Method
The ib_query_gid() Method
The ib_query_pkey() Method
The ib_modify_device() Method
The ib_modify_port() Method
The ib_find_gid() Method
The ib_find_pkey() Method
The rdma_node_get_transport() Method
The ib_mtu_to_int() Method
The ib_width_enum_to_int() Method
The ib_rate_to_mult() Method
The ib_rate_to_mbps() Method
Protection Domain (PD)
The ib_alloc_pd() Method
The ib_dealloc_pd() Method
eXtended Reliable Connected (XRC)
The ib_alloc_xrcd() Method
The ib_dealloc_xrcd() Method
Shared Receive Queue (SRQ)
The ib_create_srq() Method
The ib_modify_srq() Method
The ib_query_srq() Method
The ib_destroy_srq() Method
The ib_post_srq_recv() Method
Address Handle (AH)
The ib_create_ah() Method
The ib_init_ah_from_wc() Method
The ib_create_ah_from_wc() Method
The ib_modify_ah() Method
The ib_query_ah() Method
The ib_destroy_ah() Method
Multicast Groups
The ib_attach_mcast() Method
The ib_detach_mcast() Method
Completion Queue (CQ)
The ib_create_cq() Method
The ib_resize_cq() Method
The ib_modify_cq() Method
The ib_peek_cq() Method
The ib_req_notify_cq() Method
The ib_req_ncomp_notif() Method
The ib_poll_cq() Method
The ib_destroy_cq() Method
Queue Pair (QP)
The ib_qp_cap Struct
The ib_create_qp() Method
The ib_modify_qp() Method
The ib_query_qp() Method
The ib_open_qp() Method
The ib_close_qp() Method
The ib_post_recv() Method
The ib_post_send() Method
Memory Windows (MW)
The ib_alloc_mw() Method
The ib_bind_mw() Method
The ib_dealloc_mw() Method
Memory Region (MR)
The ib_get_dma_mr() Method
The ib_dma_mapping_error() Method
The ib_dma_map_single() Method
The ib_dma_unmap_single() Method
The ib_dma_map_single_attrs() Method
The ib_dma_unmap_single_attrs() Method
The ib_dma_map_page() Method
The ib_dma_unmap_page() Method
The ib_dma_map_sg() Method
The ib_dma_unmap_sg() Method
The ib_dma_map_sg_attrs() Method
The ib_dma_unmap_sg_attrs() Method
The ib_sg_dma_address() Method
The ib_sg_dma_len() Method
The ib_dma_sync_single_for_cpu() Method
The ib_dma_sync_single_for_device() Method
The ib_dma_alloc_coherent() Method
The ib_dma_free_coherent() Method
The ib_reg_phys_mr() Method
The ib_rereg_phys_mr() Method
The ib_query_mr() Method
The ib_dereg_mr() Method

Appendix B: Network Administration
arp
arping
arptables
arpwatch
ApacheBench (ab)
brctl
conntrack-tools
crtools
ebtables
ether-wake
ethtool
git
hciconfig
hcidump
hcitool
ifconfig
ifenslave
iperf
Using iperf
iproute2
iptables and ip6tables
ipvsadm
iw
iwconfig
libreswan Project
l2ping
lowpan-tools
lshw
lscpu
lspci
mrouted
nc
ngrep
netperf
netsniff-ng
netstat
nmap (Network Mapper)
openswan
OpenVPN
packeth
ping
pimd
poptop
ppp
pktgen
radvd
route
RP-PPPoE
sar
smcroute
snort
suricata
strongSwan
sysctl
taskset
tcpdump
top
tracepath
traceroute
tshark
tunctl
udevadm
unshare
vconfig
wpa_supplicant
wireshark
XORP

Appendix C: Glossary
Index

Contents at a Glance

Chapter 1: Introduction
Chapter 2: Netlink Sockets
Chapter 3: Internet Control Message Protocol (ICMP)
Chapter 4: IPv4
Chapter 5: The IPv4 Routing Subsystem
Chapter 6: Advanced Routing
Chapter 7: Linux Neighbouring Subsystem
Chapter 8: IPv6
Chapter 9: Netfilter
Chapter 10: IPsec
Chapter 11: Layer 4 Protocols
Chapter 12: Wireless in Linux
Chapter 13: InfiniBand
Chapter 14: Advanced Topics
Appendix A: Linux API
Appendix B: Network Administration
Appendix C: Glossary
Index

© Rami Rosen 2014

# 1. Introduction

This book deals with the implementation of the Linux Kernel Networking stack and the theory behind it. You will find in the following pages an in-depth and detailed analysis of the networking subsystem and its architecture. I will not burden you with topics not directly related to networking, which you may encounter while reading kernel networking code (for example, locking and synchronization, SMP, atomic operations, and so on). There are plenty of resources about such topics. On the other hand, there are very few up-to-date resources that focus on kernel networking proper. By this I mean primarily describing the traversal of the packet in the Linux Kernel Networking stack and its interaction with various networking layers and subsystems—and how various networking protocols are implemented.
This book is also not a cumbersome, line-by-line code walkthrough. I focus on the essence of the implementation of each network layer and on the theoretical guidelines and principles that led to this implementation. The Linux operating system has proved itself in recent years as a successful, reliable, stable, and popular operating system, and its popularity seems to be growing steadily, in a wide variety of flavors: from mainframes, data centers, core routers, and web servers to embedded devices like wireless routers, set-top boxes, medical instruments, navigation equipment (like GPS devices), and consumer electronics devices. Many semiconductor vendors use Linux as the basis for their Board Support Packages (BSPs). The Linux operating system, started back in 1991 as a project of a Finnish student named Linus Torvalds and based on the UNIX operating system, has proved to be a serious and reliable operating system and a rival to veteran proprietary operating systems.

Linux began as an Intel x86-based operating system but has been ported to a very wide range of processors, including ARM, PowerPC, MIPS, SPARC, and more. The Android operating system, based upon the Linux kernel, is common today in tablets and smartphones and seems likely to gain popularity in the future in smart TVs. Apart from Android, Google has also contributed some kernel networking features that were merged into the mainline kernel.

Linux is an open source project, and as such it has an advantage over proprietary operating systems: its source code is freely available under the General Public License (GPL). Other open source operating systems, like the different types of BSD, enjoy much less popularity. I should also mention in this context the OpenSolaris project, based on the Common Development and Distribution License (CDDL). This project, started by Sun Microsystems, has not achieved the popularity that Linux has. Among the large community of active Linux developers, some contribute code on behalf of the companies they work for, and some contribute code voluntarily. The entire kernel development process is accessible via the kernel mailing lists. There is one central mailing list, the Linux Kernel Mailing List (LKML), and many subsystems have their own mailing lists. Contributing code is done by sending patches to the appropriate kernel mailing lists and to the maintainers, and these patches are discussed on the mailing lists.

The Linux Kernel Networking stack is a very important subsystem of the Linux kernel. It is quite difficult to find a Linux-based system, whether a desktop, a server, a mobile device, or any other embedded device, that does not use any kind of networking. Even in the rare case when a machine doesn't have any hardware network devices, you will still be using networking (maybe unconsciously) when you use X-Windows, as X-Windows itself is based upon client-server networking. A wide range of projects are related to the Linux Networking stack, from core routers to small embedded devices. Some of these projects deal with adding vendor-specific features. For example, some hardware vendors implement Generic Segmentation Offload (GSO) in some network devices. GSO is a networking feature of the kernel network stack that divides a large packet into smaller ones in the Tx path. Many hardware vendors implement checksumming in hardware in their network devices.
Checksum is a mechanism to verify that a packet was not damaged in transit, by calculating some hash from the packet and attaching it to the packet. Many projects provide security enhancements for Linux. Sometimes these enhancements require some changes in the networking subsystem, as you will see, for example, in Chapter 3, when discussing the Openwall GNU/*/Linux project. In the embedded device arena there are, for example, many wireless routers that are Linux based; one example is the Linksys WRT54GL router, which runs Linux. There is also an open source, Linux-based operating system that can run on this device (and on some other devices), named OpenWrt, with a large and active community of developers (see https://openwrt.org/ ). Learning how the various protocols are implemented by the Linux Kernel Networking stack and becoming familiar with its main data structures and the main paths of a packet in it are essential to understanding it better.

## The Linux Network Stack

There are seven logical networking layers according to the Open Systems Interconnection (OSI) model. The lowest layer is the physical layer, which is the hardware, and the highest layer is the application layer, where userspace software processes are running. Let's describe these seven layers:

1. The physical layer: Handles electrical signals and the low-level details.
2. The data link layer: Handles data transfer between endpoints. The most common data link layer is Ethernet. The Linux Ethernet network device drivers reside in this layer.
3. The network layer: Handles packet forwarding and host addressing. In this book I discuss the most common network layers of the Linux Kernel Networking subsystem: IPv4 and IPv6. There are other, less common network layers that Linux implements, like DECnet, but they are not discussed.
4. The protocol layer/transport layer: Handles data sending between nodes. The TCP and UDP protocols are the best-known protocols.
5. The session layer: Handles sessions between endpoints.
6. The presentation layer: Handles delivery and formatting.
7. The application layer: Provides network services to end-user applications.

Figure 1-1 shows the seven layers according to the OSI model.

Figure 1-1. The OSI seven-layer model

Figure 1-2 shows the three layers that the Linux Kernel Networking stack handles. The L2, L3, and L4 layers in this figure correspond to the data link layer, the network layer, and the transport layer in the seven-layer model, respectively. The essence of the Linux kernel stack is passing incoming packets from L2 (the network device drivers) to L3 (the network layer, usually IPv4 or IPv6) and then to L4 (the transport layer, where you have, for example, TCP or UDP listening sockets) if they are for local delivery, or back to L2 for transmission when the packets should be forwarded. Outgoing packets that were locally generated are passed from L4 to L3 and then to L2 for actual transmission by the network device driver. Along this way there are many stages, and many things can happen. For example:

* The packet can be changed due to protocol rules (for example, due to an IPsec rule or to a NAT rule).
* The packet can be discarded.
* The packet can cause an error message to be sent.
* The packet can be fragmented.
* The packet can be defragmented.
* A checksum should be calculated for the packet.
The kernel does not handle any layer above L4; those layers (the session, presentation, and application layers) are handled solely by userspace applications. The physical layer (L1) is also not handled by the Linux kernel.

If you feel overwhelmed, don't worry. You will learn a lot more about everything described here, in much more depth, in the following chapters.

Figure 1-2. The Linux Kernel Networking layers

## The Network Device

The lower layer, Layer 2 (L2), as seen in Figure 1-2, is the link layer. The network device drivers reside in this layer. This book is not about network device driver development, because it focuses on the Linux kernel networking stack. I will briefly describe here the net_device structure, which represents a network device, and some of the concepts related to it. You should have a basic familiarity with the network device structure in order to better understand the network stack. Parameters of the device, like the MTU size, which is typically 1,500 bytes for Ethernet devices, determine whether a packet should be fragmented. The net_device is a very large structure, consisting of device parameters like these:

* The IRQ number of the device.
* The MTU of the device.
* The MAC address of the device.
* The name of the device (like eth0 or eth1).
* The flags of the device (for example, whether it is up or down).
* A list of multicast addresses associated with the device.
* The promiscuity counter (discussed later in this section).
* The features that the device supports (like GSO or GRO offloading).
* An object of network device callbacks (net_device_ops object), which consists of function pointers, such as for opening and stopping a device, starting to transmit, changing the MTU of the network device, and more.
* An object of ethtool callbacks, which supports getting information about the device by running the command-line ethtool utility.
* The number of Tx and Rx queues, when the device supports multiqueues.
* The timestamp of the last transmit of a packet on this device.
* The timestamp of the last reception of a packet on this device.

The following is the definition of some of the members of the net_device structure, to give you a first impression:

```c
struct net_device {
        unsigned int            irq;    /* device IRQ number */
        ...
        const struct net_device_ops *netdev_ops;
        ...
        unsigned int            mtu;
        ...
        unsigned int            promiscuity;
        ...
        unsigned char           *dev_addr;
        ...
};
```

(include/linux/netdevice.h)

Appendix A includes a very detailed description of the net_device structure and most of its members. In that appendix you can see the irq, mtu, and other members mentioned earlier in this chapter.

When the promiscuity counter is larger than 0, the network stack does not discard packets that are not destined to the local host. This is used, for example, by packet analyzers ("sniffers") like tcpdump and wireshark, which open raw sockets in userspace and want to receive this type of traffic as well. It is a counter and not a Boolean in order to enable opening several sniffers concurrently: opening each such sniffer increments the counter by 1. When a sniffer is closed, the promiscuity counter is decremented by 1; if it reaches 0, there are no more sniffers running, and the device exits promiscuous mode.
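As a brief, hypothetical illustration of the promiscuity counter (my own sketch, not code from the book), kernel code can increment and decrement it with the dev_set_promiscuity() method; the helper names sniff_start()/sniff_stop() and the "eth0" device name below are assumptions for the example:

```c
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Hypothetical helpers; dev_set_promiscuity() should be called
 * with the RTNL lock held. */
static int sniff_start(struct net *net)
{
        struct net_device *dev = dev_get_by_name(net, "eth0");

        if (!dev)
                return -ENODEV;
        rtnl_lock();
        dev_set_promiscuity(dev, 1);  /* counter 0 -> 1: enter promiscuous mode */
        rtnl_unlock();
        dev_put(dev);
        return 0;
}

static void sniff_stop(struct net *net)
{
        struct net_device *dev = dev_get_by_name(net, "eth0");

        if (!dev)
                return;
        rtnl_lock();
        dev_set_promiscuity(dev, -1); /* counter 1 -> 0: exit promiscuous mode */
        rtnl_unlock();
        dev_put(dev);
}
```

Because the method takes an increment rather than a Boolean flag, several such "sniffers" can coexist; the device leaves promiscuous mode only when the last of them has decremented the counter back to 0.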
When browsing the kernel networking core source code, in various places you will probably encounter the term NAPI (New API), a feature that most network device drivers implement nowadays. You should know what it is and why network device drivers use it.

### New API (NAPI) in Network Devices

The old network device drivers worked in interrupt-driven mode, which means that for every received packet there was an interrupt. This proved to be inefficient in terms of performance under high traffic load. A new software technique was developed, called New API (NAPI), which is now supported by almost all Linux network device drivers. NAPI was first introduced in the 2.5/2.6 kernel and was backported to the 2.4.20 kernel. With NAPI, under high load, the network device driver works in polling mode and not in interrupt-driven mode. This means that each received packet does not trigger an interrupt. Instead the packets are buffered in the driver, and the kernel polls the driver from time to time to fetch the packets. Using NAPI improves performance under high load. For socket applications that need the lowest possible latency and are willing to pay the cost of higher CPU utilization, Linux added a capability for Busy Polling on Sockets in kernel 3.11. This technology is discussed in Chapter 14, in the "Busy Poll Sockets" section.

With your new knowledge about network devices under your belt, it is time to learn about the traversal of a packet inside the Linux Kernel Networking stack.

### Receiving and Transmitting Packets

The main tasks of the network device driver are these:

* To receive packets destined to the local host and to pass them to the network layer (L3), and from there to the transport layer (L4)
* To transmit outgoing packets generated on the local host and sent outside, or to forward packets that were received on the local host

For each packet, incoming or outgoing, a lookup in the routing subsystem is performed. The decision about whether a packet should be forwarded and on which interface it should be sent is based on the result of that lookup, which I describe in depth in Chapters 5 and 6. The lookup in the routing subsystem is not the only factor that determines the traversal of a packet in the network stack. For example, there are five points in the network stack where callbacks of the netfilter subsystem (often referred to as netfilter hooks) can be registered. The first netfilter hook point of a received packet is NF_INET_PRE_ROUTING, before a routing lookup is performed. When a packet is handled by such a callback, which is invoked by a macro named NF_HOOK(), it will continue its traversal in the networking stack according to the result of this callback (also called the verdict). For example, if the verdict is NF_DROP, the packet will be discarded, and if the verdict is NF_ACCEPT, the packet will continue its traversal as usual. Netfilter hook callbacks are registered by the nf_register_hook() method or by the nf_register_hooks() method, and you will encounter these invocations, for example, in various netfilter kernel modules; a minimal registration sketch appears below. The kernel netfilter subsystem is the infrastructure for the well-known iptables userspace package. Chapter 9 describes the netfilter subsystem and the netfilter hooks, along with the connection tracking layer of netfilter.
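To make the hook mechanism concrete, here is a minimal, hypothetical sketch of registering an NF_INET_PRE_ROUTING callback with the nf_register_hook() method, roughly as the API looks in kernel 3.9 (the hook prototype has changed in later kernel versions); the names my_pre_routing_hook and my_ops are mine, not from the kernel:

```c
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>

/* The verdict returned by the hook decides the packet's fate. */
static unsigned int my_pre_routing_hook(unsigned int hooknum,
                                        struct sk_buff *skb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
{
        /* Returning NF_DROP here would silently discard the packet. */
        return NF_ACCEPT; /* let the packet continue its traversal */
}

static struct nf_hook_ops my_ops = {
        .hook     = my_pre_routing_hook,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING, /* before the routing lookup */
        .priority = NF_IP_PRI_FIRST,
};

/* In the module init method:  nf_register_hook(&my_ops);   */
/* In the module exit method:  nf_unregister_hook(&my_ops); */
```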
Besides the netfilter hooks, the packet traversal can be influenced by the IPsec subsystem, for example, when the packet matches a configured IPsec policy. IPsec provides a network layer security solution, and it uses the ESP and the AH protocols. IPsec is mandatory according to the IPv6 specification and optional in IPv4, though most operating systems, including Linux, also implement IPsec in IPv4. IPsec has two modes of operation: transport mode and tunnel mode. It is used as a basis for many virtual private network (VPN) solutions, though there are also non-IPsec VPN solutions. You learn about the IPsec subsystem and about IPsec policies in Chapter 10, which also discusses the problems that occur when working with IPsec through NAT, and the IPsec NAT traversal solution.

Still other factors can influence the traversal of the packet—for example, the value of the ttl field in the IPv4 header of a packet being forwarded. This ttl is decremented by 1 in each forwarding device. When it reaches 0, the packet is discarded, and an ICMPv4 "Time Exceeded" message with the "TTL Count Exceeded" code is sent back. This is done to avoid an endless journey of a forwarded packet because of some error. Moreover, each time a packet is forwarded successfully and the ttl is decremented by 1, the checksum of the IPv4 header should be recalculated, because its value depends on the IPv4 header, and the ttl is one of the IPv4 header members. Chapter 4, which deals with the IPv4 subsystem, talks more about this. In IPv6 there is something similar, but the hop counter in the IPv6 header is named hop_limit and not ttl. You will learn about this in Chapter 8, which deals with the IPv6 subsystem. You will also learn about ICMP in IPv4 and in IPv6 in Chapter 3, which deals with ICMP.

A large part of the book discusses the traversal of a packet in the networking stack, whether in the receive path (Rx path, also known as ingress traffic) or the transmit path (Tx path, also known as egress traffic). This traversal is complex and has many variations: large packets could be fragmented before they are sent; on the other hand, fragmented packets should be reassembled (discussed in Chapter 4). Packets of different types are handled differently. For example, multicast packets are packets that can be processed by a group of hosts (as opposed to unicast packets, which are destined to a specified host). Multicast can be used, for example, in streaming media applications in order to consume fewer network resources. Handling IPv4 multicast traffic is discussed in Chapter 4. You will also learn how a host joins and leaves a multicast group; in IPv4, the Internet Group Management Protocol (IGMP) handles multicast membership. Yet there are cases when the host is configured as a multicast router, and multicast traffic should be forwarded and not delivered to the local host. These cases are more complex, as they should be handled in conjunction with a userspace multicast routing daemon, like the pimd daemon or the mrouted daemon. These cases, which are called multicast routing, are discussed in Chapter 6.

To better understand the packet traversal, you must learn how a packet is represented in the Linux kernel. The sk_buff structure represents an incoming or outgoing packet, including its headers (include/linux/skbuff.h). I refer to an sk_buff object as SKB in many places throughout this book, as this is the common way to denote sk_buff objects (SKB stands for socket buffer). The socket buffer (sk_buff) structure is a large structure; I will only discuss a few of its members in this chapter.
### The Socket Buffer

The sk_buff structure is described in depth in Appendix A. I recommend referring to that appendix when you need to know more about one of the SKB members or how to use the SKB API. Note that when working with SKBs, you must adhere to the SKB API. Thus, for example, when you want to advance the skb->data pointer, you do not do it directly, but with the skb_pull_inline() method or the skb_pull() method (you will see an example of this later in this section). And if you want to fetch the L4 header (transport header) from an SKB, you do it by calling the skb_transport_header() method. Likewise, if you want to fetch the L3 header (network header), you do it by calling the skb_network_header() method, and if you want to fetch the L2 header (MAC header), you do it by calling the skb_mac_header() method. These three methods get an SKB as a single parameter.

Here is the (partial) definition of the sk_buff structure:

```c
struct sk_buff {
        ...
        struct sock             *sk;
        struct net_device       *dev;
        ...
        __u8                    pkt_type:3,
        ...
        __be16                  protocol;
        ...
        sk_buff_data_t          tail;
        sk_buff_data_t          end;
        unsigned char           *head,
                                *data;
        sk_buff_data_t          transport_header;
        sk_buff_data_t          network_header;
        sk_buff_data_t          mac_header;
        ...
};
```

(include/linux/skbuff.h)

When a packet is received on the wire, an SKB is allocated by the network device driver, typically by calling the netdev_alloc_skb() method (or the dev_alloc_skb() method, which is a legacy method that calls the netdev_alloc_skb() method with NULL as the first parameter). There are cases along the packet traversal where a packet can be discarded, which is done by calling kfree_skb() or dev_kfree_skb(); both get a pointer to an SKB as a single parameter. Some members of the SKB are determined in the link layer (L2). For example, the pkt_type is determined by the eth_type_trans() method, according to the destination Ethernet address. If this address is a multicast address, pkt_type will be set to PACKET_MULTICAST; if this address is a broadcast address, pkt_type will be set to PACKET_BROADCAST; and if this address is the address of the local host, pkt_type will be set to PACKET_HOST. Most Ethernet network drivers call the eth_type_trans() method in their Rx path. The eth_type_trans() method also sets the protocol field of the SKB according to the ethertype of the Ethernet header, and it advances the data pointer of the SKB by 14 bytes (ETH_HLEN), the size of an Ethernet header, by calling the skb_pull_inline() method. The reason is that skb->data should point to the header of the layer in which the packet currently resides. When the packet was in L2, in the network device driver Rx path, skb->data pointed to the L2 (Ethernet) header; now that the packet is about to move to Layer 3, immediately after the call to the eth_type_trans() method, skb->data should point to the network (L3) header, which starts immediately after the Ethernet header (see Figure 1-3).
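To make this handoff concrete, here is a short, hypothetical driver Rx fragment (my own sketch, not code from the book); the function name my_driver_rx() is an assumption for illustration:

```c
#include <linux/etherdevice.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical fragment of a driver Rx path: eth_type_trans() sets
 * skb->pkt_type and returns the ethertype, advancing skb->data past
 * the 14-byte Ethernet header. */
static void my_driver_rx(struct sk_buff *skb, struct net_device *dev)
{
        skb->protocol = eth_type_trans(skb, dev);

        if (skb->protocol == htons(ETH_P_IP)) {
                /* skb->data now points to the L3 (IPv4) header: */
                const struct iphdr *iph = (const struct iphdr *)skb->data;

                pr_debug("received IPv4 packet, ttl=%u\n", iph->ttl);
        }
        /* Hand the packet to the networking core: */
        netif_receive_skb(skb);
}
```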
The SKB includes the packet headers (L2, L3, and L4 headers) and the packet payload. In the packet traversal in the network stack, a header can be added or removed. For example, for an IPv4 packet generated locally by a socket and transmitted outside, the network layer (IPv4) adds an IPv4 header to the SKB. The IPv4 header size is 20 bytes at a minimum; when adding IP options, it can be up to 60 bytes. IP options are described in Chapter 4, which discusses the IPv4 protocol implementation. Figure 1-3 shows an example of an IPv4 packet with L2, L3, and L4 headers. The example in Figure 1-3 is a UDPv4 packet. First is the Ethernet header (L2), of 14 bytes. Then comes the IPv4 header (L3), of a minimal size of 20 bytes up to 60 bytes, and after it the UDPv4 header (L4), of 8 bytes. Then comes the payload of the packet.

Figure 1-3. An IPv4 packet

Each SKB has a dev member, which is an instance of the net_device structure. For incoming packets it is the incoming network device, and for outgoing packets it is the outgoing network device. The network device attached to the SKB is sometimes needed to fetch information that might influence the traversal of the SKB in the Linux Kernel Networking stack. For example, the MTU of the network device may require fragmentation, as mentioned earlier. Each transmitted SKB has a sock object associated with it (sk). If the packet is a forwarded packet, sk is NULL, because the packet was not generated on the local host.

Each received packet should be handled by a matching network layer protocol handler. For example, an IPv4 packet should be handled by the ip_rcv() method, and an IPv6 packet should be handled by the ipv6_rcv() method. You will learn about the registration of the IPv4 protocol handler with the dev_add_pack() method in Chapter 4, and about the registration of the IPv6 protocol handler, also with the dev_add_pack() method, in Chapter 8. Moreover, I will follow the traversal of incoming and outgoing packets both in IPv4 and in IPv6. For example, in the ip_rcv() method, mostly sanity checks are performed, and if everything is fine the packet proceeds to an NF_INET_PRE_ROUTING hook callback, if such a callback is registered; the next step, if the packet was not discarded by such a hook, is the ip_rcv_finish() method, where a lookup in the routing subsystem is performed. A lookup in the routing subsystem builds a destination cache entry (a dst_entry object). You will learn about the dst_entry and about the input and output callback methods associated with it in Chapters 5 and 6, which describe the IPv4 routing subsystem.

In IPv4 there is a problem of limited address space, as an IPv4 address is only 32 bits. Organizations use NAT (discussed in Chapter 9) to provide local addresses to their hosts, but the IPv4 address space has still diminished over the years. One of the main reasons for developing the IPv6 protocol was that its address space is huge compared to the IPv4 address space, because the IPv6 address length is 128 bits. But the IPv6 protocol is not only about a larger address space. The IPv6 protocol includes many changes and additions as a result of the experience gained over the years with the IPv4 protocol. For example, the IPv6 header has a fixed length of 40 bytes, as opposed to the IPv4 header, which is variable in length (from a minimum of 20 bytes to 60 bytes) due to IP options, which can expand it. Processing IP options in IPv4 is complex and quite heavy in terms of performance. In IPv6, on the other hand, you cannot expand the IPv6 header at all (it is fixed in length, as mentioned). Instead there is a mechanism of extension headers, which is much more efficient than the IP options in IPv4 in terms of performance. Another notable change is with the ICMP protocol; in IPv4 it was used only for error reporting and for informative messages.
In IPv6, the ICMP protocol is used for many other purposes as well: for Neighbour Discovery (ND), for Multicast Listener Discovery (MLD), and more. Chapter 3 is dedicated to ICMP (both in IPv4 and in IPv6). The IPv6 Neighbour Discovery protocol is described in Chapter 7, and the MLD protocol is discussed in Chapter 8, which deals with the IPv6 subsystem.

As mentioned earlier, received packets are passed by the network device driver to the network layer, which is IPv4 or IPv6. If the packets are for local delivery, they will be delivered to the transport layer (L4) for handling by listening sockets. The most common transport protocols are UDP and TCP, discussed in Chapter 11, which covers Layer 4, the transport layer. That chapter also covers two newer transport protocols, the Stream Control Transmission Protocol (SCTP) and the Datagram Congestion Control Protocol (DCCP). Both SCTP and DCCP adopted some TCP features and some UDP features, as you will find out. The SCTP protocol is known to be used in conjunction with the Long Term Evolution (LTE) protocol; the DCCP protocol has not been tested so far in large-scale Internet setups.

Packets generated by the local host are created by Layer 4 sockets, for example, by TCP sockets or by UDP sockets. They are created by a userspace application with the Sockets API. There are two main types of sockets: datagram sockets and stream sockets. These two types of sockets and the POSIX-based socket API are also discussed in Chapter 11, where you will also learn about the kernel implementation of sockets (struct socket, which provides an interface to userspace, and struct sock, which provides an interface to Layer 3). The locally generated packets are passed to the network layer, L3 (described in Chapter 4, in the section "Sending IPv4 Packets"), and then are passed to the network device driver (L2) for actual transmission. There are cases when fragmentation takes place in Layer 3, the network layer, and this is also discussed in Chapter 4.

Every Layer 2 network interface has an L2 address that identifies it. In the case of Ethernet, this is a 48-bit address, the MAC address, which is assigned to each Ethernet network interface by its manufacturer and said to be unique (though you should consider that the MAC address of most network interfaces can be changed by userspace commands like ifconfig or ip). Each Ethernet packet starts with an Ethernet header, which is 14 bytes long: the destination MAC address (6 bytes), the source MAC address (6 bytes), and the Ethernet type (2 bytes). The Ethernet type value is, for example, 0x0800 for IPv4 or 0x86DD for IPv6. For each outgoing packet, an Ethernet header should be built. When a userspace socket sends a packet, it specifies the destination address (an IPv4 or an IPv6 address). This is not enough to build the packet, as the destination MAC address should also be known. Finding the MAC address of a host based on its IP address is the task of the neighbouring subsystem, discussed in Chapter 7. Neighbour Discovery is handled by the ARP protocol in IPv4 and by the NDISC protocol in IPv6. These protocols are different: the ARP protocol relies on sending broadcast requests, whereas the NDISC protocol relies on sending ICMPv6 requests, which are in fact multicast packets. Both the ARP protocol and the NDISC protocol are discussed in Chapter 7.
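For reference, the 14-byte Ethernet header just described corresponds to the ethhdr structure defined in include/uapi/linux/if_ether.h; the following is essentially that definition, with the comments abridged:

```c
#define ETH_ALEN    6       /* octets in one Ethernet address */
#define ETH_HLEN    14      /* total octets in the header     */

#define ETH_P_IP    0x0800  /* IPv4 ethertype */
#define ETH_P_IPV6  0x86DD  /* IPv6 ethertype */

struct ethhdr {
        unsigned char h_dest[ETH_ALEN];   /* destination MAC address */
        unsigned char h_source[ETH_ALEN]; /* source MAC address      */
        __be16        h_proto;            /* ethertype, big-endian   */
} __attribute__((packed));
```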
The network stack should communicate with userspace for tasks such as adding or deleting routes, configuring neighbouring tables, setting IPsec policies and states, and more. The communication between userspace and the kernel is done with netlink sockets, described in Chapter 2. The iproute2 userspace package, which is based on netlink sockets, is also discussed in Chapter 2, as are generic netlink sockets and their advantages.

The wireless subsystem is discussed in Chapter 12. This subsystem is maintained separately, as mentioned earlier; it has a git tree of its own and a mailing list of its own. There are some unique features in the wireless stack that do not exist in the ordinary network stack, such as power save mode (when a station or an access point enters a sleep state). The Linux wireless subsystem also supports special topologies, like Mesh networks, ad hoc networks, and more. These topologies sometimes require special features. For example, Mesh networking uses a routing protocol called the Hybrid Wireless Mesh Protocol (HWMP), discussed in Chapter 12. This protocol works in Layer 2 and deals with MAC addresses, as opposed to IPv4 routing. Chapter 12 also discusses the mac80211 framework, which is used by wireless device drivers. Another very interesting feature of the wireless subsystem is the block acknowledgment mechanism in IEEE 802.11n, also discussed in Chapter 12.

In recent years InfiniBand technology has gained popularity in enterprise datacenters. InfiniBand is based on a technology called Remote Direct Memory Access (RDMA). The RDMA API was introduced to the Linux kernel in version 2.6.11. In Chapter 13 you will find a good explanation of the Linux InfiniBand implementation, the RDMA API, and its fundamental data structures.

Virtualization solutions are also becoming popular, especially due to projects like Xen and KVM. Hardware improvements, like VT-x for Intel processors and AMD-V for AMD processors, have also made virtualization more efficient. There is another form of virtualization, which may be less known but has its own advantages. This virtualization is based on a different approach: process virtualization. It is implemented in Linux by namespaces. There is currently support for six namespaces in Linux, and there could be more in the future. The namespaces feature is already used by projects like Linux Containers ( http://lxc.sourceforge.net/ ) and Checkpoint/Restore In Userspace (CRIU). In order to support namespaces, two system calls were added to the kernel, unshare() and setns(), and six new flags were added to the CLONE_* flags, one for each type of namespace. I discuss namespaces, and network namespaces in particular, in Chapter 14. Chapter 14 also deals with the Bluetooth subsystem and gives a brief overview of the PCI subsystem, because many network device drivers are PCI devices. I do not delve into the PCI subsystem internals, because that is out of the scope of this book. Another interesting subsystem discussed in Chapter 14 is IEEE 802.15.4, which is for low-power and low-cost devices. These devices are sometimes mentioned in conjunction with the Internet of Things (IoT) concept, which involves connecting IP-enabled embedded devices to IP networks. It turns out that using IPv6 for these devices might be a good idea. This solution is termed IPv6 over Low Power Wireless Personal Area Networks (6LoWPAN).
It has its own challenges, such as expanding the IPv6 Neighbour Discovery protocol to be suitable for such devices, which occasionally enter sleep mode (as opposed to ordinary IPv6 networks). These changes to the IPv6 Neighbour Discovery protocol have not been implemented yet, but it is interesting to consider the theory behind them. Apart from this, Chapter 14 also includes sections about other advanced topics like NFC, cgroups, Android, and more.

To better understand the Linux Kernel Networking stack or participate in its development, you must be familiar with how its development is handled.

## The Linux Kernel Networking Development Model

The kernel networking subsystem is very complex, and its development is quite dynamic. Like any Linux kernel subsystem, development is done with git patches that are sent over a mailing list (sometimes over more than one mailing list) and are eventually accepted or rejected by the maintainer of that subsystem. Learning about the Kernel Networking Development Model is important for many reasons: to better understand the code, to debug and solve problems in Linux Kernel Networking–based projects, to implement performance improvements and optimization patches, or to implement new features. In many cases you need to learn things such as the following:

* How to apply a patch
* How to read and interpret a patch
* How to find which patches could cause a given problem
* How to revert a patch
* How to find which patches are relevant to some feature
* How to adjust a project to an older kernel version (backporting)
* How to adjust a project to a newer kernel version (upgrading)
* How to clone a git tree
* How to rebase a git tree
* How to find out in which kernel version a specified git patch was applied

There are cases when you need to work with new features that were just added, and for this you need to know how to work with the latest, bleeding-edge tree. And there are cases when you encounter some bug or want to add some new feature to the network stack, and you need to prepare a patch and submit it. The Linux Kernel Networking subsystem, like the other parts of the kernel, is managed by git, a source code management (SCM) system developed by Linus Torvalds. If you intend to send patches for the mainline kernel, or if your project is managed by git, you must learn to use the git tool.

Sometimes you may even need to install a git server for development of local projects. Even if you do not intend to send any patches, you can use the git tool to retrieve a lot of information about the code and about the history of its development. There are many available resources on the web about git; I recommend the free online book Pro Git, by Scott Chacon, available at http://git-scm.com/book . If you intend to submit your patches to the mainline, you must adhere to some strict rules for writing, checking, and submitting patches so that your patch will be applied. Your patch should conform to the kernel coding style and should be tested. You also need to be patient, as sometimes even a trivial patch can be applied only after several days. I recommend learning to configure a host for using the git send-email command to submit patches (though submitting patches can be done with other mail clients, even with the popular Gmail webmail client). There are plenty of guides on the web about how to use git to prepare and send kernel patches.
I also recommend reading Documentation/SubmittingPatches and Documentation/CodingStyle in the kernel tree before submitting your first patch.

I also recommend using the following Perl scripts:

* scripts/checkpatch.pl to check the correctness of a patch
* scripts/get_maintainer.pl to find out to which maintainers a patch should be sent

One of the most important resources of information is the Kernel Networking Development mailing list, netdev: netdev@vger.kernel.org, archived at www.spinics.net/lists/netdev . This is a high-volume list. Most of the posts are patches and Requests for Comments (RFCs) for new code, along with comments and discussions about patches. This mailing list handles the Linux Kernel Networking stack and network device drivers, except for cases when dealing with a subsystem that has a specific mailing list and a specific git repository (such as the wireless subsystem, discussed in Chapter 12). Development of the iproute2 and ethtool userspace packages is also handled on the netdev mailing list. It should be mentioned here that not every networking subsystem has a mailing list of its own; for example, the IPsec subsystem (discussed in Chapter 10) does not have a mailing list, nor does the IEEE 802.15.4 subsystem (Chapter 14). Some networking subsystems have their own specific git tree, maintainer, and mailing list, such as the wireless mailing list and the Bluetooth mailing list. From time to time the maintainers of these subsystems send a pull request for their git trees over the netdev mailing list. Another source of information is Documentation/networking in the kernel tree. It has a lot of information in many files about various networking topics, but keep in mind that what you find there is not always up to date.

The Linux Kernel Networking subsystem is maintained in two git repositories. Patches and RFCs are sent to the netdev mailing list for both repositories. Here are the two git trees:

* net: http://git.kernel.org/?p=linux/kernel/git/davem/net.git : fixes to existing code already in the mainline tree
* net-next: http://git.kernel.org/?p=linux/kernel/git/davem/net-next.git : new code for the future kernel release

From time to time the maintainer of the networking subsystem, David Miller, sends Linus pull requests for these git trees over the LKML. You should be aware that there are periods of time, during the merge with mainline, when the net-next git tree is closed and no patches should be sent. Announcements of when this period starts and when it ends are sent over the netdev mailing list.

Note

This book is based on kernel 3.9. All the code snippets are from this version, unless explicitly specified otherwise. The kernel tree is available from www.kernel.org as a tar file. Alternatively, you can download a kernel git tree with git clone (for example, using the URLs of the git net tree or the git net-next tree, mentioned earlier, or other git kernel repositories). There are plenty of guides on the Internet covering how to configure, build, and boot a Linux kernel. You can also browse various kernel versions online at http://lxr.free-electrons.com/ . This website lets you follow where each method and each variable is referenced; moreover, you can navigate easily with a click of a mouse to previous versions of the Linux kernel.
In case you are working with your own version of a Linux kernel tree, where some changes were made locally, you can install and configure a Linux Cross-Referencer (LXR) server on a local Linux machine. See http://lxr.sourceforge.net/en/index.shtml . + +## Summary + +This chapter is a short introduction to the Linux Kernel Networking subsystem. I described the benefits of using Linux, a popular open source project, and the Kernel Networking Development Model. I also described the network device structure (net_device) and the socket buffer structure (sk_buff), which are the two most fundamental structures of the networking subsystem. You should refer to Appendix A for a detailed description of almost all the members of these structures and their uses. This chapter covered other important topics related to the traversal of a packet in the kernel networking stack, such as the lookup in the routing subsystem, fragmentation and defragmentation, protocol handler registration, and more. Some of these protocols are discussed in later chapters, including IPv4, IPv6, ICMPv4 and ICMPv6, ARP, and Neighbour Discovery. Several important subsystems, including the wireless subsystem, the Bluetooth subsystem, and the IEEE 802.15.4 subsystem, are also covered in later chapters. Chapter 2 starts the journey in the kernel network stack with netlink sockets, which provide a way for bidirectional communication between userspace and the kernel, and which are discussed in several other chapters. + +# 2. Netlink Sockets + +Chapter 1 discusses the roles of the Linux kernel networking subsystem and the three layers in which it operates. The netlink socket interface appeared first in the 2.2 Linux kernel as AF_NETLINK socket. It was created as a more flexible alternative to the awkward IOCTL communication method between userspace processes and the kernel. The IOCTL handlers cannot send asynchronous messages to userspace from the kernel, whereas netlink sockets can. In order to use IOCTL, there is another level of complexity: you need to define IOCTL numbers. The operation model of netlink is quite simple: you open and register a netlink socket in userspace using the socket API, and this netlink socket handles bidirectional communication with a kernel netlink socket, usually sending messages to configure various system settings and getting responses back from the kernel. + +This chapter describes the netlink protocol implementation and API and discusses its advantages and drawbacks.
I also talk about the new generic netlink protocol, discuss its implementation and its advantages, and give some illustrative examples using the libnl library. I conclude with a discussion of the socket monitoring interface. + +## The Netlink Family + +The netlink protocol is a socket-based Inter Process Communication (IPC) mechanism, based on RFC 3549, "Linux Netlink as an IP Services Protocol." It provides a bidirectional communication channel between userspace and the kernel or among some parts of the kernel itself. Netlink is an extension of the standard socket implementation. The netlink protocol implementation resides mostly under net/netlink, where you will find the following four files: + + * af_netlink.c + + * af_netlink.h + + * genetlink.c + + * diag.c + +Apart from them, there are a few header files. In fact, the af_netlink module is the most commonly used; it provides the netlink kernel socket API, whereas the genetlink module provides a new generic netlink API that makes it easier to create netlink messages. The diag monitoring interface module (diag.c) provides an API for dumping and retrieving information about netlink sockets. I discuss the diag module later in this chapter in the section "Socket Monitoring Interface." + +I should mention here that theoretically netlink sockets can be used to communicate between two userspace processes, or more (including sending multicast messages), though this is usually not done, and was not the original goal of netlink sockets. UNIX domain sockets provide an API for IPC and are widely used for communication between two userspace processes. + +Netlink has some advantages over other ways of communication between userspace and the kernel. For example, there is no need for polling when working with netlink sockets. A userspace application opens a socket and then calls recvmsg(), and enters a blocking state if no messages are sent from the kernel; see, for example, the rtnl_listen() method of the iproute2 package (lib/libnetlink.c). Another advantage is that the kernel can be the initiator of sending asynchronous messages to userspace, without any need for userspace to trigger any action (for example, by calling some IOCTL or by writing to some sysfs entry). Yet another advantage is that netlink sockets support multicast transmission. + +You create netlink sockets from userspace with the socket() system call. Netlink sockets can be SOCK_RAW sockets or SOCK_DGRAM sockets. + +Netlink sockets can be created in the kernel or in userspace; kernel netlink sockets are created by the netlink_kernel_create() method, and userspace netlink sockets are created by the socket() system call. Creating a netlink socket from userspace or from the kernel creates a netlink_sock object. When the socket is created from userspace, it is handled by the netlink_create() method. When the socket is created in the kernel, it is handled by __netlink_kernel_create(); this method sets the NETLINK_KERNEL_SOCKET flag. Eventually both methods call __netlink_create() to allocate a socket in the common way (by calling the sk_alloc() method) and initialize it. Figure 2-1 shows how a netlink socket is created in the kernel and in userspace. + +Figure 2-1. + +Creating a netlink socket in the kernel and in userspace + +You can create a netlink socket from userspace in a very similar way to ordinary BSD-style sockets, like this, for example: socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE).
Then you should create a sockaddr_nl object (an instance of the netlink socket address structure), initialize it, and use the standard BSD sockets API (such as bind(), sendmsg(), recvmsg(), and so on). The sockaddr_nl structure represents a netlink socket address in userspace or in the kernel. + +Netlink socket libraries provide a convenient API to netlink sockets. I discuss them in the next section. + +### Netlink Sockets Libraries + +I recommend you use the libnl API to develop userspace applications that send or receive data over netlink sockets. The libnl package is a collection of libraries providing APIs to the netlink protocol-based Linux kernel interfaces. (The iproute2 package, as mentioned, has its own netlink helper library, lib/libnetlink.c.) Besides the core library (libnl), the libnl package includes support for the generic netlink family (libnl-genl), routing family (libnl-route), and netfilter family (libnl-nf). The package was developed mostly by Thomas Graf ( www.infradead.org/~tgr/libnl/ ). I should mention here also that there is a library called libmnl, which is a minimalistic userspace library oriented to netlink developers. The libmnl library was mostly written by Pablo Neira Ayuso, with contributions from Jozsef Kadlecsik and Jan Engelhardt. ( http://netfilter.org/projects/libmnl/ ). + +### The sockaddr_nl Structure + +Let's take a look at the sockaddr_nl structure, which represents a netlink socket address: + +struct sockaddr_nl { + +__kernel_sa_family_t nl_family; /* AF_NETLINK */ + +unsigned short nl_pad; /* zero */ + +__u32 nl_pid; /* port ID */ + +__u32 nl_groups; /* multicast groups mask */ + +}; + +(include/uapi/linux/netlink.h) + + * nl_family: Should always be AF_NETLINK. + + * nl_pad: Should always be 0. + + * nl_pid: The unicast address of a netlink socket. For kernel netlink sockets, it should be 0. Userspace applications sometimes set the nl_pid to be their process id (pid). In a userspace application, when you set nl_pid explicitly to 0, or don't set it at all, and afterwards call bind(), the kernel method netlink_autobind() assigns a value to nl_pid. It tries to assign the process id of the current thread. If you're creating two sockets in userspace, you are responsible for ensuring that their nl_pid values are unique in case you don't call bind(). Netlink sockets are not used only for networking; other subsystems, such as SELinux, audit, uevent, and others, use netlink sockets. The rtnetlink sockets are netlink sockets specifically used for networking; they are used for routing messages, neighbouring messages, link messages, and other networking subsystem messages. + + * nl_groups: The multicast group (or multicast group mask). + +The next section discusses the iproute2 and the older net-tools packages. The iproute2 package is based upon netlink sockets, and you'll see an example of using netlink sockets in iproute2 in the section "Adding and Deleting a Routing Entry in a Routing Table," later in this chapter. I mention the net-tools package, which is older and might be deprecated in the future, to emphasize that as an alternative to iproute2, it is less powerful and offers fewer capabilities. + +### Userspace Packages for Controlling TCP/IP Networking + +There are two userspace packages for controlling TCP/IP networking and handling network devices: net-tools and iproute2.
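Before looking at those two packages, here is a minimal userspace sketch that ties together the pieces described above: it creates an AF_NETLINK socket, initializes a sockaddr_nl object (subscribing to the legacy RTMGRP_LINK multicast group bitmask via nl_groups), binds it, and blocks until the kernel multicasts a link event. This is an illustrative sketch, not code from the kernel tree or from iproute2, and error handling is minimal:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
    char buf[8192];
    struct sockaddr_nl nl;
    struct nlmsghdr *nlh;
    ssize_t n;
    int fd;

    fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    if (fd < 0)
        return 1;

    memset(&nl, 0, sizeof(nl));
    nl.nl_family = AF_NETLINK;
    nl.nl_pid = 0;               /* let netlink_autobind() assign a port id */
    nl.nl_groups = RTMGRP_LINK;  /* legacy bitmask for link notifications */
    if (bind(fd, (struct sockaddr *)&nl, sizeof(nl)) < 0)
        return 1;

    /* Block until the kernel multicasts a link event (for example,
     * when an interface is brought up or down). */
    n = recv(fd, buf, sizeof(buf), 0);
    if (n >= (ssize_t)sizeof(struct nlmsghdr)) {
        nlh = (struct nlmsghdr *)buf;
        printf("netlink message: type %u, len %u\n",
               nlh->nlmsg_type, nlh->nlmsg_len);
    }
    close(fd);
    return 0;
}
```

Toggling an interface (for example, with ip link set eth1 down) should then wake the recv() call—this is, in essence, what the ip monitor link command shown later in this chapter does.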
The iproute2 package includes commands like the following: + + * ip: For management of network tables and network interfaces + + * tc: For traffic control management + + * ss: For dumping socket statistics + + * lnstat: For dumping Linux network statistics + + * bridge: For management of bridge addresses and devices + +The iproute2 package is based mostly on sending requests to the kernel from userspace and getting replies back over netlink sockets. There are a few exceptions where IOCTLs are used in iproute2. For example, the ip tuntap command uses IOCTLs to add/remove a TUN/TAP device. If you look at the TUN/TAP software driver code, you'll find that it defines some IOCTL handlers, but it does not use the rtnetlink sockets. The net-tools package is based on IOCTLs and includes known commands like these: + + * ifconfig + + * arp + + * route + + * netstat + + * hostname + + * rarp + +Some of the advanced functionalities of the iproute2 package are not available in the net-tools package. + +The next section discusses kernel netlink sockets—the core engine of handling communication between userspace and the kernel by exchanging netlink messages of different types. Learning about kernel netlink sockets is essential for understanding the interface that the netlink layer provides to userspace. + +### Kernel Netlink Sockets + +Several netlink sockets are created in the kernel networking stack, and each kernel socket handles messages of a different type; for example, the netlink socket that handles NETLINK_ROUTE messages is created in rtnetlink_net_init(): + +static int __net_init rtnetlink_net_init(struct net *net) { + +... + +struct netlink_kernel_cfg cfg = { + +.groups = RTNLGRP_MAX, + +.input = rtnetlink_rcv, + +.cb_mutex = &rtnl_mutex, + +.flags = NL_CFG_F_NONROOT_RECV, + +}; + +sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); + +... + +} + +Note that the rtnetlink socket is aware of network namespaces; the network namespace object (struct net) contains a member named rtnl (rtnetlink socket). In the rtnetlink_net_init() method, after the rtnetlink socket is created by calling netlink_kernel_create(), it is assigned to the rtnl pointer of the corresponding network namespace object. + +Let's look at the netlink_kernel_create() prototype: + +struct sock *netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) + + * The first parameter (net) is the network namespace. + + * The second parameter is the netlink protocol (for example, NETLINK_ROUTE for rtnetlink messages, NETLINK_XFRM for IPsec, or NETLINK_AUDIT for the audit subsystem). There are over 20 netlink protocols, but their number is limited to 32 (MAX_LINKS). This is one of the reasons for creating the generic netlink protocol, as you'll see later in this chapter. The full list of netlink protocols is in include/uapi/linux/netlink.h. + + * The third parameter is a reference to netlink_kernel_cfg, which consists of optional parameters for the netlink socket creation: + +struct netlink_kernel_cfg { + +unsigned int groups; + +unsigned int flags; + +void (*input)(struct sk_buff *skb); + +struct mutex *cb_mutex; + +void (*bind)(int group); + +}; + +(include/linux/netlink.h) + +The groups member is for specifying a multicast group (or a mask of multicast groups). It's possible to join a multicast group by setting nl_groups of the sockaddr_nl object (you can also do this with the nl_join_groups() method of libnl). However, in this way you are limited to joining only 32 groups.
Since kernel version 2.6.14, you can use the NETLINK_ADD_MEMBERSHIP/NETLINK_DROP_MEMBERSHIP socket options to join/leave a multicast group, respectively. Using these socket options enables you to join a much higher number of groups. The nl_socket_add_membership()/nl_socket_drop_membership() methods of libnl use this socket option. + +The flags member can be NL_CFG_F_NONROOT_RECV or NL_CFG_F_NONROOT_SEND. + +When NL_CFG_F_NONROOT_RECV is set, a non-superuser can bind to a multicast group; in netlink_bind() there is the following code: + +static int netlink_bind(struct socket *sock, struct sockaddr *addr, + +int addr_len) + +{ + +... + +if (nladdr->nl_groups) { + +if (!netlink_capable(sock, NL_CFG_F_NONROOT_RECV)) + +return -EPERM; + +} + +For a non-superuser, if the NL_CFG_F_NONROOT_RECV flag is not set, then when binding to a multicast group the netlink_capable() method will return 0, and you get an -EPERM error. + +When the NL_CFG_F_NONROOT_SEND flag is set, a non-superuser is allowed to send multicasts. + +The input member is for a callback; when the input member in netlink_kernel_cfg is NULL, the kernel socket won't be able to receive data from userspace (sending data from the kernel to userspace is possible, though). For the rtnetlink kernel socket, the rtnetlink_rcv() method was declared to be the input callback; as a result, data sent from userspace over the rtnetlink socket will be handled by the rtnetlink_rcv() callback. + +For uevent kernel events, you need only to send data from the kernel to userspace; so, in lib/kobject_uevent.c, you have an example of a netlink socket where the input callback is undefined: + +static int uevent_net_init(struct net *net) + +{ + +struct uevent_sock *ue_sk; + +struct netlink_kernel_cfg cfg = { + +.groups = 1, + +.flags = NL_CFG_F_NONROOT_RECV, + +}; + +... + +ue_sk->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg); + +... + +} + +(lib/kobject_uevent.c) + +The mutex (cb_mutex) in the netlink_kernel_cfg object is optional; when not defining a mutex, you use the default one, cb_def_mutex (an instance of a mutex structure; see net/netlink/af_netlink.c). In fact, most netlink kernel sockets are created without defining a mutex in the netlink_kernel_cfg object; for example, the uevent kernel netlink socket (NETLINK_KOBJECT_UEVENT) mentioned earlier, the audit kernel netlink socket (NETLINK_AUDIT), and other netlink sockets don't define a mutex. The rtnetlink socket is an exception—it uses the rtnl_mutex. Also the generic netlink socket, discussed in the next section, defines a mutex of its own: genl_mutex. + +The netlink_kernel_create() method makes an entry in a table named nl_table by calling the netlink_insert() method. Access to the nl_table is protected by a read-write lock named nl_table_lock; lookup in this table is done by the netlink_lookup() method, specifying the protocol and the port id. Registration of a callback for a specified message type is done by rtnl_register(); there are several places in the networking kernel code where you register such callbacks. For example, in rtnetlink_init() you register callbacks for some messages, like RTM_NEWLINK (creating a new link), RTM_DELLINK (deleting a link), RTM_GETROUTE (dumping the route table), and more. In net/core/neighbour.c, you register callbacks for RTM_NEWNEIGH messages (creating a new neighbour), RTM_DELNEIGH (deleting a neighbour), RTM_GETNEIGHTBL message (dumping the neighbour table), and more. I discuss these actions in depth in Chapters 5 and 7.
You also register callbacks to other types of messages in the FIB code (ip_fib_init()), in the multicast code (ip_mr_init()), in the IPv6 code, and in other places. + +The first step you should take to work with a netlink kernel socket is to register it. Let's take a look at the rtnl_register() method prototype: + +extern void rtnl_register(int protocol, int msgtype, + +rtnl_doit_func, + +rtnl_dumpit_func, + +rtnl_calcit_func); + +The first parameter is the protocol family (when you don't aim at a specific protocol, it is PF_UNSPEC); you'll find a list of all the protocol families in include/linux/socket.h. + +The second parameter is the netlink message type, like RTM_NEWLINK or RTM_NEWNEIGH. These are private netlink message types that the rtnetlink protocol added. The full list of message types is in include/uapi/linux/rtnetlink.h . + +The last three parameters are callbacks: doit, dumpit, and calcit. The callbacks are the actions you want to perform for handling the message, and you usually specify only one callback. + +The doit callback is for actions like addition/deletion/modification; the dumpit callback is for retrieving information, and the calcit callback is for calculation of buffer size. The rtnetlink module has a table named rtnl_msg_handlers. This table is indexed by protocol number. Each entry in the table is a table in itself, indexed by message type. Each element in the table is an instance of rtnl_link, which is a structure that consists of pointers for these three callbacks. When registering a callback with rtnl_register(), you add the specified callback to this table. + +Registering a callback is done like this, for example: rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL) in net/core/rtnetlink.c . This adds rtnl_newlink as the doit callback for RTM_NEWLINK messages in the corresponding rtnl_msg_handlers entry. + +Sending of rtnetlink messages is done with rtmsg_ifinfo(). For example, when a link is brought up in dev_open(), you call rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING). In the rtmsg_ifinfo() method, first the nlmsg_new() method is called to allocate an sk_buff with the proper size. Then two objects are created: the netlink message header (nlmsghdr) and an ifinfomsg object, which is located immediately after the netlink message header. These two objects are initialized by the rtnl_fill_ifinfo() method. Then rtnl_notify() is called to send the packet; sending the packet is actually done by the generic netlink method, nlmsg_notify() (in net/netlink/af_netlink.c). Figure 2-2 shows the stages of sending rtnetlink messages with the rtmsg_ifinfo() method. + +Figure 2-2. + +Sending of rtnetlink messages with the rtmsg_ifinfo() method + +The next section is about netlink messages, which are exchanged between userspace and the kernel. A netlink message always starts with a netlink message header, so your first step in learning about netlink messages will be to study the netlink message header format. + +### The Netlink Message Header + +A netlink message should obey a certain format, specified in RFC 3549, "Linux Netlink as an IP Services Protocol", section 2.2, "Message Format." A netlink message starts with a fixed size netlink header, and after it there is a payload. This section describes the Linux implementation of the netlink message header.
+ +The netlink message header is defined by struct nlmsghdr in include/uapi/linux/netlink.h: + +struct nlmsghdr + +{ + +__u32 nlmsg_len; + +__u16 nlmsg_type; + +__u16 nlmsg_flags; + +__u32 nlmsg_seq; + +__u32 nlmsg_pid; + +}; + +(include/uapi/linux/netlink.h) + +Every netlink packet starts with a netlink message header, which is represented by struct nlmsghdr . The length of nlmsghdr is 16 bytes. It contains five fields: + + * nlmsg_len is the length of the message including the header. + + * nlmsg_type is the message type; there are four basic netlink message header types: + + * NLMSG_NOOP: No operation, message must be discarded. + + * NLMSG_ERROR: Error occurred. + + * NLMSG_DONE: A multipart message is terminated. + + * NLMSG_OVERRUN: Overrun notification: error, data was lost. + +(include/uapi/linux/netlink.h) + +However, families can add netlink message header types of their own. For example, the rtnetlink protocol family adds message header types such as RTM_NEWLINK, RTM_DELLINK, RTM_NEWROUTE, and a lot more (see include/uapi/linux/rtnetlink.h). For a full list of the netlink message header types that were added by the rtnetlink family, with a detailed explanation of each, see: man 7 rtnetlink. Note that message type values smaller than NLMSG_MIN_TYPE (0x10) are reserved for control messages and may not be used. + + * nlmsg_flags field can be as follows: + + * NLM_F_REQUEST: When it's a request message. + + * NLM_F_MULTI: When it's a multipart message. Multipart messages are used for table dumps. Usually the size of messages is limited to a page (PAGE_SIZE). So large messages are divided into smaller ones, and each of them (except the last one) has the NLM_F_MULTI flag set. The last message is of type NLMSG_DONE. + + * NLM_F_ACK: When you want the receiver of the message to reply with an ACK. Netlink ACK messages are sent by the netlink_ack() method (net/netlink/af_netlink.c). + + * NLM_F_DUMP: Retrieve information about a table/entry. + + * NLM_F_ROOT: Specify the tree root. + + * NLM_F_MATCH: Return all matching entries. + + * NLM_F_ATOMIC: This flag is deprecated. + +The following flags are modifiers for creation of an entry: + + * NLM_F_REPLACE: Override existing entry. + + * NLM_F_EXCL: Do not touch entry, if it exists. + + * NLM_F_CREATE: Create entry, if it does not exist. + + * NLM_F_APPEND: Add entry to end of list. + + * NLM_F_ECHO: Echo this request. + +I've shown the most commonly used flags. For a full list, see include/uapi/linux/netlink.h. + + * nlmsg_seq is the sequence number (for message sequences). Unlike some Layer 4 transport protocols, there is no strict enforcement of the sequence number. + + * nlmsg_pid is the sending port id. When a message is sent from the kernel, the nlmsg_pid is 0. When a message is sent from userspace, the nlmsg_pid can be set to be the process id of the userspace application that sent the message. + +Figure 2-3 shows the netlink message header. + +Figure 2-3. + +nlmsg header + +After the header comes the payload. The payload of netlink messages is composed of a set of attributes that are represented in Type-Length-Value (TLV) format. With TLV, the type and length are fixed in size (typically 1–4 bytes), and the value field is of variable size. The TLV representation is also used in other places in the networking code—for example, in IPv6 (see RFC 2460). TLV provides flexibility that makes future extensions easier to implement. Attributes can be nested, which enables complex tree structures of attributes.
+ +Each netlink attribute header is defined by struct nlattr: + +struct nlattr { + +__u16 nla_len; + +__u16 nla_type; + +}; + +(include/uapi/linux/netlink.h) + + * nla_len: The size of the attribute in bytes. + + * nla_type: The attribute type. The value of nla_type can be, for example, NLA_U32 (for a 32-bit unsigned integer), NLA_STRING for a variable length string, NLA_NESTED for a nested attribute, NLA_UNSPEC for arbitrary type and length, and more. You can find the list of available types in include/net/netlink.h. + +Every netlink attribute must be aligned to a 4-byte boundary (NLA_ALIGNTO). + +Each family can define an attribute validation policy, which represents the expectations regarding the received attributes. This validation policy is represented by the nla_policy object. In fact, the nla_policy struct has exactly the same content as struct nlattr: + +struct nla_policy { + +u16 type; + +u16 len; + +}; + +(include/net/netlink.h) + +The attribute validation policy is an array of nla_policy objects; this array is indexed by the attribute number. For each attribute (except the fixed-length attributes), if the value of len in the nla_policy object is 0, no validation should be performed. If the attribute is one of the string types (such as NLA_STRING), len should be the maximum length of the string, without the terminating NULL byte. If the attribute type is NLA_UNSPEC or unknown, len should be set to the exact length of the attribute's payload. If the attribute type is NLA_FLAG, len is unused. (The reason is that the presence of the attribute itself implies a value of true, and the absence of the attribute implies a value of false.) + +Receiving a generic netlink message in the kernel is handled by genl_rcv_msg(). In case it is a dump request (when the NLM_F_DUMP flag is set), you dump the table by calling the netlink_dump_start() method. If it's not a dump request, you parse the payload with the nlmsg_parse() method. The nlmsg_parse() method performs attribute validation by calling validate_nla() (lib/nlattr.c). If there are attributes with a type exceeding maxtype, they will be silently ignored for backwards compatibility. In case validation fails, you don't continue to the next step in genl_rcv_msg() (which is running the doit() callback), and the genl_rcv_msg() method returns an error code. + +The next section describes the NETLINK_ROUTE messages, which are the most commonly used messages in the networking subsystem. + +### NETLINK_ROUTE Messages + +The rtnetlink (NETLINK_ROUTE) messages are not limited to the networking routing subsystem: there are neighbouring subsystem messages as well, interface setup messages, firewalling messages, netlink queuing messages, policy routing messages, and many other types of rtnetlink messages, as you'll see in later chapters. + +The NETLINK_ROUTE messages can be divided into families: + + * LINK (network interfaces) + + * ADDR (network addresses) + + * ROUTE (routing messages) + + * NEIGH (neighbouring subsystem messages) + + * RULE (policy routing rules) + + * QDISC (queueing discipline) + + * TCLASS (traffic classes) + + * ACTION (packet action API, see net/sched/act_api.c) + + * NEIGHTBL (neighbouring table) + + * ADDRLABEL (address labeling) + +Each of these families has three types of messages: for creation, deletion, and retrieving information. So, for routing messages, you have the RTM_NEWROUTE message type for creating a route, the RTM_DELROUTE message type for deleting a route, and the RTM_GETROUTE message type for retrieving a route.
With LINK messages, apart from the three message types for creation, deletion, and information retrieval, there is an additional message type for modifying a link: RTM_SETLINK. + +There are cases in which an error occurs, and you send an error message as a reply. The netlink error message is represented by the nlmsgerr struct: + +struct nlmsgerr { + +int error; + +struct nlmsghdr msg; + +}; + +(include/uapi/linux/netlink.h) + +In fact, as you can see in Figure 2-4, the netlink error message is built from a netlink message header and an error code. When the error code is not 0, the netlink message header of the original request that caused the error is appended after the error code field. + +Figure 2-4. + +Netlink error message + +If you send a message that was constructed erroneously (for example, the nlmsg_type is not valid), then a netlink error message is sent back, and the error code is set according to the error that occurred. For example, when the nlmsg_type is not valid (a negative value, or a value higher than the maximum value permitted) the error code is set to –EOPNOTSUPP. See the rtnetlink_rcv_msg() method in net/core/rtnetlink.c. In error messages, the sequence number is set to be the sequence number of the request that caused the error. + +The sender can request to get an ACK for a netlink message. This is done by setting the NLM_F_ACK flag in the netlink message header flags (nlmsg_flags). When the kernel sends an ACK, it uses an error message (the netlink message header type of this message is set to be NLMSG_ERROR) with an error code of 0. In this case, the original netlink header of the request is not appended to the error message. For implementation details, see the netlink_ack() method implementation in net/netlink/af_netlink.c. + +After learning about NETLINK_ROUTE messages, you're ready to look at an example of adding and deleting a routing entry in a routing table using NETLINK_ROUTE messages. + +### Adding and Deleting a Routing Entry in a Routing Table + +Let's see what happens behind the scenes in the kernel, in the context of the netlink protocol, when adding and deleting a routing entry. You can add a routing entry to the routing table by running, for example, the following: + +ip route add 192.168.2.11 via 192.168.2.20 + +This command sends a netlink message from userspace (RTM_NEWROUTE) over an rtnetlink socket for adding a routing entry. The message is received by the rtnetlink kernel socket and handled by the rtnetlink_rcv() method. Eventually, adding the routing entry is done by invoking inet_rtm_newroute() in net/ipv4/fib_frontend.c. Subsequently, insertion into the Forwarding Information Base (FIB), which is the routing database, is accomplished with the fib_table_insert() method; however, inserting into the routing table is not the only task of fib_table_insert(). You should notify all listeners who registered for RTM_NEWROUTE messages. How? When inserting a new routing entry, you call the rtmsg_fib() method with RTM_NEWROUTE. The rtmsg_fib() method builds a netlink message and sends it by calling rtnl_notify() to notify all listeners who are registered to the RTNLGRP_IPV4_ROUTE group. These RTNLGRP_IPV4_ROUTE listeners can be registered in the kernel as well as in userspace (as is done in iproute2, or in some userspace routing daemons, like xorp). You'll see shortly how the userspace daemons of iproute2 can subscribe to various rtnetlink multicast groups. + +When deleting a routing entry, something quite similar happens.
You can delete the routing entry added earlier by running the following: + +ip route del 192.168.2.11 + +That command sends a netlink message from userspace (RTM_DELROUTE) over an rtnetlink socket for deleting a routing entry. The message is again received by the rtnetlink kernel socket and handled by the rtnetlink_rcv() callback. Eventually, deleting the routing entry is done by invoking the inet_rtm_delroute() callback in net/ipv4/fib_frontend.c. Subsequently, deletion from the FIB is done with fib_table_delete(), which calls rtmsg_fib(), this time with the RTM_DELROUTE message. + +You can monitor networking events with the iproute2 ip command like this: + +ip monitor route + +For example, if you open one terminal and run ip monitor route there, and then open another terminal and run ip route add 192.168.1.10 via 192.168.2.200, on the first terminal you'll see this line: 192.168.1.10 via 192.168.2.200 dev em1. And when you run, on the second terminal, ip route del 192.168.1.10, on the first terminal the following text will appear: Deleted 192.168.1.10 via 192.168.2.200 dev em1. + +Running ip monitor route starts a daemon that opens a netlink socket and subscribes to the RTNLGRP_IPV4_ROUTE multicast group. Now, adding or deleting a route, as done in this example, will result in the message that was sent with rtnl_notify() being received by the daemon and displayed on the terminal. + +You can subscribe to other multicast groups in this way. For example, to subscribe to the RTNLGRP_LINK multicast group, run ip monitor link. This daemon receives netlink messages from the kernel—when adding/deleting a link, for example. So if you open one terminal and run ip monitor link, and then open another terminal and add a VLAN interface by vconfig add eth1 200, on the first terminal you'll see lines like this: + +4: eth1.200@eth1: mtu 1500 qdisc noop state DOWN + +link/ether 00:e0:4c:53:44:58 brd ff:ff:ff:ff:ff:ff + +And if you add a bridge on the second terminal with brctl addbr mybr, on the first terminal you'll see lines like this: + +5: mybr: mtu 1500 qdisc noop state DOWN + +link/ether a2:7c:be:62:b5:b6 brd ff:ff:ff:ff:ff:ff + +You've seen what a netlink message is and how it is created and handled. You've seen how netlink sockets are handled. Next you'll learn why the generic netlink family (introduced in kernel 2.6.15) was created, and you'll learn about its Linux implementation. + +## Generic Netlink Protocol + +One of the drawbacks of the netlink protocol is that the number of protocol families is limited to 32 (MAX_LINKS). This is one of the main reasons that the generic netlink family was created—to provide support for adding a higher number of families. It acts as a netlink multiplexer and works with a single netlink family (NETLINK_GENERIC). The generic netlink protocol is based on the netlink protocol and uses its API. + +To add a netlink protocol family, you should add a protocol family definition in include/uapi/linux/netlink.h. But with the generic netlink protocol, there is no need for that. The generic netlink protocol is also intended to be used in other subsystems besides networking, because it provides a general purpose communication channel. For example, it's also used by the ACPI subsystem (see the definition of acpi_event_genl_family in drivers/acpi/event.c), by the task stats code (see kernel/taskstats.c), by the thermal events code, and more.
+ +The generic netlink kernel socket is created by the netlink_kernel_create() method like this: + +static int __net_init genl_pernet_init(struct net *net) { + +... + +struct netlink_kernel_cfg cfg = { + +.input = genl_rcv, + +.cb_mutex = &genl_mutex, + +.flags = NL_CFG_F_NONROOT_RECV, + +}; + +net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); + +... + +} + +(net/netlink/genetlink.c) + +Note that, like the netlink sockets described earlier, the generic netlink socket is also aware of network namespaces; the network namespace object (struct net) contains a member named genl_sock (a generic netlink socket). As you can see, the network namespace genl_sock pointer is assigned in the genl_pernet_init() method. + +The genl_rcv() method is defined to be the input callback of the genl_sock object, which was created earlier by the genl_pernet_init() method. As a result, data sent from userspace over generic netlink sockets is handled in the kernel by the genl_rcv() callback. + +You can create a generic netlink userspace socket with the socket() system call, though it is better to use the libnl-genl API (discussed later in this section). + +Immediately after creating the generic netlink kernel socket, the controller family (genl_ctrl) is registered: + +static struct genl_family genl_ctrl = { + +.id = GENL_ID_CTRL, + +.name = "nlctrl", + +.version = 0x2, + +.maxattr = CTRL_ATTR_MAX, + +.netnsok = true, + +}; + +static int __net_init genl_pernet_init(struct net *net) { + +... + +err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1); + +... + +The genl_ctrl family has a fixed id of 0x10 (GENL_ID_CTRL); it is in fact the only instance of genl_family that's initialized with a fixed id; all other instances are initialized with GENL_ID_GENERATE as an id, which subsequently is replaced by a dynamically assigned value. + +There is support for registering multicast groups in generic netlink sockets by defining a genl_multicast_group object and calling genl_register_mc_group(); for example, in the Near Field Communication (NFC) subsystem, you have the following: + +static struct genl_multicast_group nfc_genl_event_mcgrp = { + +.name = NFC_GENL_MCAST_EVENT_NAME, + +}; + +int __init nfc_genl_init(void) + +{ + +... + +rc = genl_register_mc_group(&nfc_genl_family, &nfc_genl_event_mcgrp); + +... + +} + +(net/nfc/netlink.c) + +The name of a multicast group should be unique, because it is the primary key for lookups. + +The id of a multicast group is also generated dynamically when registering the group, by calling the find_first_zero_bit() method in genl_register_mc_group(). Only one multicast group, the notify_grp, has a fixed id, GENL_ID_CTRL. + +To work with generic netlink sockets in the kernel, you should do the following: + + * Create a genl_family object and register it by calling genl_register_family(). + + * Create a genl_ops object and register it by calling genl_register_ops(). + +Alternatively, you can call genl_register_family_with_ops() and pass to it a genl_family object, an array of genl_ops, and its size. This method will first call genl_register_family() and then, if successful, will call genl_register_ops() for each genl_ops element of the specified array of genl_ops. + +The genl_register_family() and genl_register_ops() methods, as well as the genl_family and genl_ops structures, are defined in include/net/genetlink.h.
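To make these registration steps concrete, here is a hypothetical sketch of a minimal kernel module that registers a generic netlink family with a single command, using the kernel 3.9 API just described. The family name "demo", the DEMO_* constants, and the demo_echo() handler are all invented for illustration; this is not code from the kernel tree:

```c
#include <linux/module.h>
#include <net/genetlink.h>

/* Invented attribute and command numbers for this illustration. */
enum { DEMO_ATTR_UNSPEC, DEMO_ATTR_MSG, __DEMO_ATTR_MAX };
#define DEMO_ATTR_MAX (__DEMO_ATTR_MAX - 1)
enum { DEMO_CMD_UNSPEC, DEMO_CMD_ECHO };

/* Validation policy: DEMO_ATTR_MSG must be a NUL-terminated string. */
static struct nla_policy demo_policy[DEMO_ATTR_MAX + 1] = {
	[DEMO_ATTR_MSG] = { .type = NLA_NUL_STRING, .len = 64 },
};

static struct genl_family demo_family = {
	.id      = GENL_ID_GENERATE, /* let the controller assign an id */
	.name    = "demo",           /* userspace resolves the id by name */
	.version = 1,
	.maxattr = DEMO_ATTR_MAX,
};

/* The doit callback: runs when userspace sends DEMO_CMD_ECHO. */
static int demo_echo(struct sk_buff *skb, struct genl_info *info)
{
	if (info->attrs[DEMO_ATTR_MSG])
		pr_info("demo: got %s\n",
			(char *)nla_data(info->attrs[DEMO_ATTR_MSG]));
	return 0;
}

static struct genl_ops demo_ops[] = {
	{
		.cmd    = DEMO_CMD_ECHO,
		.policy = demo_policy,
		.doit   = demo_echo,
	},
};

static int __init demo_init(void)
{
	return genl_register_family_with_ops(&demo_family, demo_ops,
					     ARRAY_SIZE(demo_ops));
}

static void __exit demo_exit(void)
{
	genl_unregister_family(&demo_family);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```

After loading such a module, running genl ctrl list (mentioned later in this chapter) should show the "demo" family along with its dynamically assigned id.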
+ +The wireless subsystem uses generic netlink sockets: + +int nl80211_init(void) + +{ + +int err; + +err = genl_register_family_with_ops(&nl80211_fam, + +nl80211_ops, ARRAY_SIZE(nl80211_ops)); + +... + +} + +(net/wireless/nl80211.c) + +The generic netlink protocol is used by some userspace packages, such as the hostapd package and the iw package. The hostapd package ( http://hostap.epitest.fi ) provides a userspace daemon for wireless access point and authentication servers. The iw package is for manipulating wireless devices and their configuration (see http://wireless.kernel.org/en/users/Documentation/iw ). + +The iw package is based on nl80211 and the libnl library. Chapter 12 discusses nl80211 in more detail. The old userspace wireless package is called wireless-tools and is based on sending IOCTLs. + +Here are the genl_family and genl_ops definitions in nl80211: + +static struct genl_family nl80211_fam = { + +.id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ + +.name = "nl80211", /* have users key off the name instead */ + +.hdrsize = 0, /* no private header */ + +.version = 1, /* no particular meaning now */ + +.maxattr = NL80211_ATTR_MAX, + +.netnsok = true, + +.pre_doit = nl80211_pre_doit, + +.post_doit = nl80211_post_doit, + +}; + + * name: Must be a unique name. + + * id: id is GENL_ID_GENERATE in this case, which is in fact 0. GENL_ID_GENERATE tells the generic netlink controller to assign the channel a unique channel number when you register the family with genl_register_family(). The genl_register_family() assigns an id in the range 16 (GENL_MIN_ID, which is 0x10) to 1023 (GENL_MAX_ID). + + * hdrsize: Size of a private header. + + * maxattr: NL80211_ATTR_MAX, which is the maximum number of attributes supported. The nl80211_policy validation policy array has NL80211_ATTR_MAX elements (each attribute has an entry in the array). + + * netnsok: true, which means the family can handle network namespaces. + + * pre_doit: A hook that's called before the doit() callback. + + * post_doit: A hook that can, for example, undo locking or any required private tasks after the doit() callback. + +You can add a command or several commands with the genl_ops structure. Let's take a look at the definition of the genl_ops struct and then at its usage in nl80211: + +struct genl_ops { + +u8 cmd; + +u8 internal_flags; + +unsigned int flags; + +const struct nla_policy *policy; + +int (*doit)(struct sk_buff *skb, + +struct genl_info *info); + +int (*dumpit)(struct sk_buff *skb, + +struct netlink_callback *cb); + +int (*done)(struct netlink_callback *cb); + +struct list_head ops_list; + +}; + + * cmd: Command identifier (the genl_ops struct defines a single command and its doit/dumpit handlers). + + * internal_flags: Private flags which are defined and used by the family. For example, in nl80211, there are many operations that define internal flags (such as NL80211_FLAG_NEED_NETDEV_UP, NL80211_FLAG_NEED_RTNL, and more). The nl80211 pre_doit() and post_doit() callbacks perform actions according to these flags. See net/wireless/nl80211.c. + + * flags: Operation flags. Values can be the following: + + * GENL_ADMIN_PERM: When this flag is set, it means that the operation requires the CAP_NET_ADMIN privilege; see the genl_rcv_msg() method in net/netlink/genetlink.c. + + * GENL_CMD_CAP_DO: This flag is set if the genl_ops struct implements the doit() callback. + + * GENL_CMD_CAP_DUMP: This flag is set if the genl_ops struct implements the dumpit() callback.
+ + * GENL_CMD_CAP_HASPOL: This flag is set if the genl_ops struct defines an attribute validation policy (an nla_policy array). + + * policy: Attribute validation policy; attribute validation policies were discussed earlier in this chapter, in the section about the netlink message header. + + * doit: Standard command callback. + + * dumpit: Callback for dumping. + + * done: Completion callback for dumps. + + * ops_list: Operations list. + +static struct genl_ops nl80211_ops[] = { + +... + +{ + +.cmd = NL80211_CMD_GET_SCAN, + +.policy = nl80211_policy, + +.dumpit = nl80211_dump_scan, + +}, + +... + +} + +Note that either a doit or a dumpit callback must be specified for every element of genl_ops (nl80211_ops in this case) or the function will fail with -EINVAL. + +This entry in genl_ops adds the nl80211_dump_scan() callback as a handler of the NL80211_CMD_GET_SCAN command. The nl80211_policy is an array of nla_policy objects and defines the expected datatype of the attributes and their length. + +When running a scan command from userspace, for example by iw dev wlan0 scan, you send a generic netlink message whose command is NL80211_CMD_GET_SCAN over a generic netlink socket. Messages are sent by the nl_send_auto_complete() method or by nl_send_auto() in the newer libnl versions. nl_send_auto() fills the missing bits and pieces in the netlink message header. If you don't require any of the automatic message completion functionality, you can use nl_send() directly. + +The message is handled by the nl80211_dump_scan() method, which is the dumpit callback for this command (net/wireless/nl80211.c). There are more than 50 entries in the nl80211_ops object for handling commands, including NL80211_CMD_GET_INTERFACE, NL80211_CMD_SET_INTERFACE, NL80211_CMD_START_AP, and so on. + +To send commands to the kernel, a userspace application should know the family id. The family name is known in userspace, but the family id is unknown there, because it's determined only at runtime in the kernel. To get the family id, the userspace application should send a generic netlink CTRL_CMD_GETFAMILY request to the kernel. This request is handled by the ctrl_getfamily() method. It returns the family id as well as other information, such as the operations the family supports. Then userspace can send commands to the kernel specifying the family id that it got in the reply. I discuss this more in the next section. + +### Creating and Sending Generic Netlink Messages + +A generic netlink message starts with a netlink header, followed by the generic netlink message header, and then there is an optional user specific header. Only after all that do you find the optional payload, as you can see in Figure 2-5. + +Figure 2-5. + +Generic netlink message + +This is the generic netlink message header: + +struct genlmsghdr { + +__u8 cmd; + +__u8 version; + +__u16 reserved; + +}; + +(include/uapi/linux/genetlink.h) + + * cmd is a generic netlink message type; each generic family that you register adds its own commands. For example, for the nl80211_fam family mentioned above, the commands it adds (like NL80211_CMD_GET_INTERFACE) are represented by the nl80211_commands enum. There are more than 60 commands (see include/uapi/linux/nl80211.h). + + * version can be used for versioning support. With nl80211 it is 1, with no particular meaning. The version member allows changing the format of a message without breaking backward compatibility. + + * reserved is for future use.
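To make this layout concrete, here is a hedged userspace sketch that builds such a message by hand: a CTRL_CMD_GETFAMILY request to the "nlctrl" controller, which resolves a family name to its id (essentially what libnl-genl's genl_ctrl_resolve(), shown a bit later, does for you). The helper name send_getfamily() is invented; fd is assumed to be a socket created with socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC), and the family name is assumed to be short enough to fit in the buffer:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>

/* Layout on the wire: nlmsghdr | genlmsghdr | CTRL_ATTR_FAMILY_NAME. */
static int send_getfamily(int fd, const char *name)
{
	struct {
		struct nlmsghdr   nlh;
		struct genlmsghdr genl;
		char              attrs[64];
	} req;
	struct nlattr *na;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_type  = GENL_ID_CTRL;    /* the "nlctrl" controller */
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.nlh.nlmsg_seq   = 1;
	req.genl.cmd        = CTRL_CMD_GETFAMILY;
	req.genl.version    = 1;

	/* A single attribute: the family name, as a NUL-terminated string. */
	na = (struct nlattr *)req.attrs;
	na->nla_type = CTRL_ATTR_FAMILY_NAME;
	na->nla_len  = NLA_HDRLEN + strlen(name) + 1;
	strcpy((char *)na + NLA_HDRLEN, name);

	req.nlh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN) + NLA_ALIGN(na->nla_len);
	return send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}
```

The reply carries, among other attributes, CTRL_ATTR_FAMILY_ID—the dynamically generated id discussed in the previous section.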
+ +Allocating a buffer for a generic netlink message is done by the following method: + +struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) + +This is in fact a wrapper around nlmsg_new(). + +After allocating a buffer with genlmsg_new(), the genlmsg_put() method is called to create the generic netlink header, which is an instance of genlmsghdr. You send a unicast generic netlink message with genlmsg_unicast(), which is in fact a wrapper around nlmsg_unicast(). You can send a multicast generic netlink message in two ways: + + * genlmsg_multicast(): This method sends the message to the default network namespace, init_net. + + * genlmsg_multicast_allns(): This method sends the message to all network namespaces. + +(All prototypes of the methods mentioned in this section are in include/net/genetlink.h.) + +You can create a generic netlink socket from userspace like this: socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); this call is handled in the kernel by the netlink_create() method, like an ordinary, non-generic netlink socket, as you saw in the previous section. You can use the socket API to perform further calls like bind() and sendmsg() or recvmsg(); however, using the libnl library instead is recommended. + +libnl-genl provides a generic netlink API, for management of the controller and for family and command registration. With libnl-genl, you can call genl_connect() to create a local socket file descriptor and bind the socket to the NETLINK_GENERIC netlink protocol. + +Let's take a brief look at what happens in a short typical userspace-kernel session when sending a command to the kernel via generic netlink sockets using the libnl library and the libnl-genl library. + +The iw package uses the libnl-genl library. When you run a command like iw dev wlan0 list, the following sequence occurs (omitting unimportant details): + +state->nl_sock = nl_socket_alloc() + +Allocate a socket. (Note the use here of the libnl core API, and not the generic netlink family API, libnl-genl, yet.) + +genl_connect(state->nl_sock) + +Call socket() with NETLINK_GENERIC and call bind() on this socket; genl_connect() is a method of the libnl-genl library. + +genl_ctrl_resolve(state->nl_sock, "nl80211"); + +This method resolves the generic netlink family name ("nl80211") to the corresponding numeric family identifier. The userspace application must send its subsequent messages to the kernel specifying this id. + +The genl_ctrl_resolve() method calls genl_ctrl_probe_by_name(), which in fact sends a generic netlink message to the kernel with the CTRL_CMD_GETFAMILY command. + +In the kernel, the generic netlink controller ("nlctrl") handles the CTRL_CMD_GETFAMILY command by the ctrl_getfamily() method and returns the family id to userspace. This id was generated when the family was registered. + +Note + +You can get various parameters (such as generated id, header size, max attributes, and more) of all the registered generic netlink families with the userspace tool genl (of iproute2) by running genl ctrl list. + +You're now ready to learn about the socket monitoring interface, which lets you get information about sockets. The socket monitoring interface is used in userspace tools like ss, which displays socket information and statistics for various socket types, and in other projects, as you'll see in the next section. + +### Socket Monitoring Interface + +The sock_diag netlink sockets provide a netlink-based subsystem that can be used to get information about sockets.
This feature was added to the kernel to support checkpoint/restore functionality for Linux in userspace (CRIU). To support this functionality, additional data about sockets was needed. For example, procfs doesn't say which are the peers of a UNIX domain socket (AF_UNIX), and this info is needed for checkpoint/restore support. This additional data is not exported via /proc, and making changes to procfs entries isn't always desirable because it might break userspace applications. The sock_diag netlink sockets give an API which enables access to this additional data. This API is used in the CRIU project as well as in the ss util. Without sock_diag, after checkpointing a process (saving the state of a process to the filesystem), you can't reconstruct its UNIX domain sockets because you don't know who the peers are. + +To support the monitoring interface used by the ss tool, a netlink-based kernel socket is created (NETLINK_SOCK_DIAG). The ss tool, which is part of the iproute2 package, enables you to get socket statistics in a similar way to netstat. It can display more TCP and state information than other tools. + +You create a netlink kernel socket for sock_diag like this: + +static int __net_init diag_net_init(struct net *net) + +{ + +struct netlink_kernel_cfg cfg = { + +.input = sock_diag_rcv, + +}; + +net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); + +return net->diag_nlsk == NULL ? -ENOMEM : 0; + +} + +(net/core/sock_diag.c) + +The sock_diag module has a table of sock_diag_handler objects named sock_diag_handlers. This table is indexed by the protocol number (for the list of protocol numbers, see include/linux/socket.h). + +The sock_diag_handler struct is very simple: + +struct sock_diag_handler { + +__u8 family; + +int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); + +}; + +(net/core/sock_diag.c) + +Each protocol that wants to add a socket monitoring interface entry to this table first defines a handler and then calls sock_diag_register(), specifying its handler. For example, for UNIX sockets, there is the following in net/unix/diag.c: + +The first step is definition of the handler: + +static const struct sock_diag_handler unix_diag_handler = { + +.family = AF_UNIX, + +.dump = unix_diag_handler_dump, + +}; + +The second step is registration of the handler: + +static int __init unix_diag_init(void) + +{ + +return sock_diag_register(&unix_diag_handler); + +} + +Now, with ss -x or ss --unix, you can dump the statistics that are gathered by the UNIX diag module. In quite a similar way, there are diag modules for other protocols, such as UDP (net/ipv4/udp_diag.c), TCP (net/ipv4/tcp_diag.c), DCCP (net/dccp/diag.c), and AF_PACKET (net/packet/diag.c). + +There's also a diag module for the netlink sockets themselves. The /proc/net/netlink entry provides information about the netlink socket (netlink_sock object) like the portid, groups, the inode number of the socket, and more. If you want the details, dumping /proc/net/netlink is handled by netlink_seq_show() in net/netlink/af_netlink.c. There are some netlink_sock fields which /proc/net/netlink doesn't provide—for example, dst_group or dst_portid or groups above 32. For this reason, the netlink socket monitoring interface was added (net/netlink/diag.c). You can use the ss tool of iproute2 to read netlink socket information. The netlink diag code can also be built as a kernel module.
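To round out the picture, here is a hedged userspace sketch of the request side of this interface: the kind of dump request that ss --unix sends for UNIX domain sockets. The helper name send_unix_diag_dump() is invented, fd is assumed to be a socket created with socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG), and parsing of the multipart reply (a stream of unix_diag_msg structures terminated by an NLMSG_DONE message) is omitted:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/unix_diag.h>

/* Ask the sock_diag subsystem to dump all UNIX domain sockets. */
static int send_unix_diag_dump(int fd)
{
	struct {
		struct nlmsghdr      nlh;
		struct unix_diag_req udr;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len    = sizeof(req);
	req.nlh.nlmsg_type   = SOCK_DIAG_BY_FAMILY;
	req.nlh.nlmsg_flags  = NLM_F_REQUEST | NLM_F_DUMP;
	req.udr.sdiag_family = AF_UNIX;
	req.udr.udiag_states = (__u32)-1;  /* sockets in any state */
	req.udr.udiag_show   = UDIAG_SHOW_NAME | UDIAG_SHOW_PEER;

	return send(fd, &req, sizeof(req), 0) < 0 ? -1 : 0;
}
```

In the kernel, such a request ends up in the unix_diag_handler_dump() callback registered above; the UDIAG_SHOW_PEER flag is what asks for the peer information that, as noted, /proc does not expose.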
+ +## Summary + +This chapter covered netlink sockets, which provide a mechanism for bidirectional communication between userspace and the kernel and are widely used by the networking subsystem. You've seen some examples of netlink sockets usage. I also discussed netlink messages, how they're created and handled. Another important subject the chapter dealt with is generic netlink sockets, including their advantages and their usage. The next chapter covers the ICMP protocol, including its usage and its implementation in IPv4 and IPv6. + +## Quick Reference + +I conclude this chapter with a short list of important methods of the netlink and generic netlink subsystems. Some of them were mentioned in this chapter: + +int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + +struct nlmsghdr *)) + +This method handles receiving netlink messages. It's called from the input callback of netlink families (for example, in the rtnetlink_rcv() method for the rtnetlink family, or in the sock_diag_rcv() method for the sock_diag family). The method performs sanity checks, like making sure that the length of the netlink message header does not exceed the permitted max length (NLMSG_HDRLEN). It also avoids invoking the specified callback in case the message is a control message. In case the ACK flag (NLM_F_ACK) is set, it sends an acknowledgment by invoking the netlink_ack() method. + +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + +u32 dst_portid, gfp_t gfp_mask) + +This method allocates an SKB with the specified size and gfp_mask; the other parameters (ssk, dst_portid) are used when working with memory mapped netlink IO (NETLINK_MMAP), a feature that is not discussed in this chapter. The method is located here: net/netlink/af_netlink.c. + +struct netlink_sock *nlk_sk(struct sock *sk) + +This method returns the netlink_sock object that contains the specified sk as a member, and is located here: net/netlink/af_netlink.h. + +struct sock *netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) + +This method creates a kernel netlink socket. + +struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) + +This method returns the netlink message header pointed to by skb->data. + +struct nlmsghdr *__nlmsg_put(struct sk_buff *skb, u32 portid, + +u32 seq, int type, int len, int flags) + +This method builds a netlink message header according to the specified parameters and puts it in the skb; it is located here: include/linux/netlink.h. + +struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) + +This method allocates a new netlink message with the specified message payload by calling alloc_skb(). If the specified payload is 0, alloc_skb() is called with NLMSG_HDRLEN (after alignment with the NLMSG_ALIGN macro). + +int nlmsg_msg_size(int payload) + +This method returns the length of a netlink message (message header length and payload), not including padding. + +void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, + +rtnl_calcit_func calcit) + +This method registers the specified rtnetlink message type with the three specified callbacks. + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + +This method processes an rtnetlink message.
+ +static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + +int type, u32 pid, u32 seq, u32 change, + +unsigned int flags, u32 ext_filter_mask) + +This method creates two objects: a netlink message header (nlmsghdr) and an ifinfomsg object, located immediately after the netlink message header. + +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + +struct nlmsghdr *nlh, gfp_t flags) + +This method sends an rtnetlink message. + +int genl_register_mc_group(struct genl_family *family, + +struct genl_multicast_group *grp) + +This method registers the specified multicast group, notifies the userspace, and returns 0 on success or a negative error code. The specified multicast group must have a name. The multicast group id is generated dynamically in this method by the find_first_zero_bit() method for all multicast groups, except for notify_grp, which has a fixed id of 0x10 (GENL_ID_CTRL). + +void genl_unregister_mc_group(struct genl_family *family, + +struct genl_multicast_group *grp) + +This method unregisters the specified multicast group and notifies the userspace about it. All current listeners on the group are removed. It's not necessary to unregister all multicast groups before unregistering the family—unregistering the family causes all assigned multicast groups to be unregistered automatically. + +int genl_register_ops(struct genl_family *family, struct genl_ops *ops) + +This method registers the specified operations and assigns them to the specified family. Either a doit() or a dumpit() callback must be specified or the operation will fail with -EINVAL. Only one operation structure per command identifier may be registered. It returns 0 on success or a negative error code. + +int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) + +This method unregisters the specified operations and unassigns them from the specified family. The operation blocks until the current message processing has finished and doesn't start again until the unregister process has finished. It's not necessary to unregister all operations before unregistering the family—unregistering the family causes all assigned operations to be unregistered automatically. It returns 0 on success or a negative error code. + +int genl_register_family(struct genl_family *family) + +This method registers the specified family after validating it first. Only one family may be registered with the same family name or identifier. The family id may equal GENL_ID_GENERATE, causing a unique id to be automatically generated and assigned. + +int genl_register_family_with_ops(struct genl_family *family, + +struct genl_ops *ops, size_t n_ops) + +This method registers the specified family and operations. Only one family may be registered with the same family name or identifier. The family id may equal GENL_ID_GENERATE, causing a unique id to be automatically generated and assigned. Either a doit or a dumpit callback must be specified for every registered operation or the function will fail. Only one operation structure per command identifier may be registered. This is equivalent to calling genl_register_family() followed by genl_register_ops() for every operation entry in the table, taking care to unregister the family on the error path. The method returns 0 on success or a negative error code. + +int genl_unregister_family(struct genl_family *family) + +This method unregisters the specified family and returns 0 on success or a negative error code. 
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, struct genl_family *family, int flags, u8 cmd)

This method adds a generic netlink header to a netlink message.

int genl_register_family(struct genl_family *family)

int genl_unregister_family(struct genl_family *family)

These methods register/unregister a generic netlink family.

int genl_register_ops(struct genl_family *family, struct genl_ops *ops)

int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops)

These methods register/unregister generic netlink operations.

void genl_lock(void)

void genl_unlock(void)

These methods lock/unlock the generic netlink mutex (genl_mutex). They are used, for example, in net/l2tp/l2tp_netlink.c.

# 3. Internet Control Message Protocol (ICMP)

Abstract

Chapter 2 discusses the netlink sockets implementation and how netlink sockets are used as a communication channel between the kernel and userspace. This chapter deals with the ICMP protocol, which is a Layer 4 protocol. Userspace applications can use the ICMP protocol (to send and receive ICMP packets) by using the sockets API (the best-known example is probably the ping utility). This chapter discusses how these ICMP packets are handled in the kernel and gives some examples.

The ICMP protocol is used primarily as a mandatory mechanism for sending error and control messages about the network layer (L3). The protocol enables getting feedback about problems in the communication environment by sending ICMP messages. These messages provide error handling and diagnostics. The ICMP protocol is relatively simple, but it is very important for assuring correct system behavior. The basic definition of ICMPv4 is in RFC 792, "Internet Control Message Protocol." This RFC defines the goals of the ICMPv4 protocol and the format of various ICMPv4 messages. I also mention in this chapter RFC 1122 ("Requirements for Internet Hosts -- Communication Layers"), which defines some requirements for several ICMP messages; RFC 4443, which defines the ICMPv6 protocol; and RFC 1812, which defines requirements for routers. I also describe which types of ICMPv4 and ICMPv6 messages exist, how they are sent, and how they are processed. I cover ICMP sockets, including why they were added and how they are used. Keep in mind that the ICMP protocol is also used for various security attacks; for example, the Smurf Attack is a denial-of-service attack in which large numbers of ICMP packets with the intended victim's spoofed source IP are sent as broadcasts to a computer network using an IP broadcast address.

## ICMPv4

ICMPv4 messages can be classified into two categories: error messages and information messages (they are termed "query messages" in RFC 1812). The ICMPv4 protocol is used in diagnostic tools like ping and traceroute.
The famous ping utility is in fact a userspace application (from the iputils package) which opens a raw socket, sends an ICMP_ECHO message, and should get back an ICMP_ECHOREPLY message as a response. Traceroute is a utility to find the path between a host and a given destination IP address. The traceroute utility is based on setting varying values of the Time To Live (TTL), which is a field in the IP header representing the hop count. The traceroute utility takes advantage of the fact that a forwarding machine sends back an ICMP_TIME_EXCEEDED message when the TTL of the packet reaches 0. The traceroute utility starts by sending messages with a TTL of 1, and with each received "Time Exceeded" (ICMP_TIME_EXCEEDED) reply it increases the TTL by 1 and sends again to the same destination. It uses the returned ICMP "Time Exceeded" messages to build a list of the routers that the packets traverse. Traceroute uses the UDP protocol by default, sending to an unlikely destination port, so when the destination itself is finally reached it returns an ICMP "Port Unreachable" message, which signals that the trace is complete. The ICMPv4 module is net/ipv4/icmp.c. Note that ICMPv4 cannot be built as a kernel module.

### ICMPv4 Initialization

ICMPv4 initialization is done in the inet_init() method, which is invoked during the boot phase. The inet_init() method invokes the icmp_init() method, which in turn calls the icmp_sk_init() method to create a kernel ICMP socket for sending ICMP messages and to initialize some ICMP procfs variables to their default values. (You will encounter some of these procfs variables later in this chapter.)

Registration of the ICMPv4 protocol, like registration of other IPv4 protocols, is done in inet_init():

static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
.no_policy = 1,
.netns_ok = 1,
};

(net/ipv4/af_inet.c)

 * icmp_rcv: The handler callback. This means that for incoming packets whose protocol field in the IP header equals IPPROTO_ICMP (0x1), icmp_rcv() will be invoked.

 * no_policy: This flag is set to 1, which implies that there is no need to perform IPsec policy checks; for example, the xfrm4_policy_check() method is not called in ip_local_deliver_finish() because the no_policy flag is set.

 * netns_ok: This flag is set to 1, which indicates that the protocol is aware of network namespaces. Network namespaces are described in Appendix A, in the net_device section. The inet_add_protocol() method will fail with an error of -EINVAL for protocols whose netns_ok field is 0.

static int __init inet_init(void) {
...
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
...

int __net_init icmp_sk_init(struct net *net)
{
...
for_each_possible_cpu(i) {
struct sock *sk;

err = inet_ctl_sock_create(&sk, PF_INET,
SOCK_RAW, IPPROTO_ICMP, net);
if (err < 0)
goto fail;

net->ipv4.icmp_sk[i] = sk;
...
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
}
...
}

In the icmp_sk_init() method, a raw ICMPv4 socket is created for each CPU and is kept in an array. The current sk can be accessed with the icmp_sk(struct net *net) method. These sockets are used in the icmp_push_reply() method. The ICMPv4 procfs entries are initialized in the icmp_sk_init() method; I mention them in this chapter and summarize them in the "Quick Reference" section at the end of this chapter. Every ICMP packet starts with an ICMPv4 header.
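To make the raw-socket flow concrete, here is a minimal userspace sketch (an illustration, not code from the kernel sources or from iputils) that builds an ICMP_ECHO message over struct icmphdr (described in the next section) and computes the Internet checksum (RFC 1071) the way a simple ping would. It requires CAP_NET_RAW or root, and the destination 127.0.0.1 is just a placeholder:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <arpa/inet.h>

/* Internet checksum (RFC 1071): one's complement sum of 16-bit words */
static unsigned short icmp_cksum(const void *data, int len)
{
    const unsigned short *p = data;
    unsigned long sum = 0;

    while (len > 1) {
        sum += *p++;
        len -= 2;
    }
    if (len == 1)
        sum += *(const unsigned char *)p;
    sum = (sum >> 16) + (sum & 0xffff);
    sum += (sum >> 16);
    return (unsigned short)~sum;
}

int main(void)
{
    int fd = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP); /* needs CAP_NET_RAW */
    struct icmphdr icmph = {0};
    struct sockaddr_in dst = { .sin_family = AF_INET };

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    icmph.type = ICMP_ECHO; /* the reply should arrive as ICMP_ECHOREPLY */
    icmph.un.echo.id = htons(getpid() & 0xffff);
    icmph.un.echo.sequence = htons(1);
    icmph.checksum = icmp_cksum(&icmph, sizeof(icmph)); /* checksum field was 0 */

    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); /* placeholder destination */
    if (sendto(fd, &icmph, sizeof(icmph), 0,
               (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");
    close(fd);
    return 0;
}

Note that for a raw ICMP socket without IP_HDRINCL, the kernel builds the IPv4 header; the application supplies only the ICMP message itself.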
Before discussing how ICMPv4 messages are received and transmitted, the following section describes the ICMPv4 header, so that you better understand how ICMPv4 messages are built.

### ICMPv4 Header

The ICMPv4 header consists of a type field (8 bits), a code field (8 bits), a checksum (16 bits), and a 32-bit variable part (its content varies based on the ICMPv4 type and code), as you can see in Figure 3-1. After the ICMPv4 header comes the payload, which should include the IPv4 header of the originating packet and a part of its payload. According to RFC 1812, it should contain as much of the original datagram as possible without the length of the ICMPv4 datagram exceeding 576 bytes. This size is in accordance with RFC 791, which specifies that "All hosts must be prepared to accept datagrams of up to 576 octets."

Figure 3-1. The ICMPv4 header

The ICMPv4 header is represented by struct icmphdr:

struct icmphdr {
__u8 type;
__u8 code;
__sum16 checksum;
union {
struct {
__be16 id;
__be16 sequence;
} echo;
__be32 gateway;
struct {
__be16 __unused;
__be16 mtu;
} frag;
} un;
};

(include/uapi/linux/icmp.h)

You'll find the current complete list of assigned ICMPv4 message type numbers and codes at www.iana.org/assignments/icmp-parameters/icmp-parameters.xml .

The ICMPv4 module defines an array of icmp_control objects, named icmp_pointers, which is indexed by ICMPv4 message type. Let's take a look at the icmp_control structure definition and at the icmp_pointers array:

struct icmp_control {
void (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};

static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];

NR_ICMP_TYPES is the highest ICMPv4 type, which is 18 (it is defined in include/uapi/linux/icmp.h).

The error field of the icmp_control objects of this array is 1 only for error message types, like the "Destination Unreachable" message (ICMP_DEST_UNREACH), and it is 0 (implicitly) for information messages, like echo (ICMP_ECHO). Some handlers are assigned to more than one type. Next I discuss the handlers and the ICMPv4 message types they manage.

ping_rcv() handles receiving a ping reply (ICMP_ECHOREPLY). The ping_rcv() method is implemented in the ICMP sockets code, net/ipv4/ping.c. In kernels prior to 3.0, in order to send a ping you had to create a raw socket in userspace. When receiving a reply to a ping (an ICMP_ECHOREPLY message), the raw socket that sent the ping processed it. In order to understand how this is implemented, let's take a look at ip_local_deliver_finish(), which is the method that handles incoming IPv4 packets and passes them to the sockets that should process them:

static int ip_local_deliver_finish(struct sk_buff *skb)
{
...
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;

resubmit:
raw = raw_local_deliver(skb, protocol);

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
int ret;
...
ret = ipprot->handler(skb);
...

(net/ipv4/ip_input.c)

When the ip_local_deliver_finish() method receives an ICMP_ECHOREPLY packet, it first tries to deliver it to a listening raw socket, which will process it. Because a raw socket that was opened in userspace handles the ICMP_ECHOREPLY message, there is no need to do anything further with it.
So when the ip_local_deliver_finish() method receives an ICMP_ECHOREPLY packet, the raw_local_deliver() method is invoked first so that a raw socket will process it, and afterwards ipprot->handler(skb) is invoked (this is the icmp_rcv() callback in the case of an ICMPv4 packet). And because the packet was already processed by a raw socket, there is nothing more to do with it. So the packet is discarded silently by calling the icmp_discard() method, which is the handler for ICMP_ECHOREPLY messages.

When the ICMP sockets ("ping sockets") were integrated into the Linux kernel in kernel 3.0, this was changed. Ping sockets are discussed in the "ICMP Sockets ("Ping Sockets")" section later in this chapter. In this context I should note that with ICMP sockets, the sender of a ping does not have to use a raw socket. For example, you can create a socket like this: socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP), and use it to send ping packets. This socket is not a raw socket. As a result, the echo reply is not delivered to any raw socket, because there is no corresponding raw socket listening. To avoid this problem, the ICMPv4 module handles receiving ICMP_ECHOREPLY messages with the ping_rcv() callback. The ping module is located in the IPv4 layer (net/ipv4/ping.c). Nevertheless, most of the code in net/ipv4/ping.c is dual-stack code (intended for both IPv4 and IPv6). As a result, the ping_rcv() method also handles ICMPV6_ECHO_REPLY messages for IPv6 (see icmpv6_rcv() in net/ipv6/icmp.c). I talk more about ICMP sockets later in this chapter.

icmp_discard() is an empty handler used for nonexistent message types (message types whose numbers have no corresponding declarations in the header file) and for some messages that do not need any handling, for example ICMP_TIMESTAMPREPLY. The ICMP_TIMESTAMP and ICMP_TIMESTAMPREPLY messages are used for time synchronization; the sender sends the originate timestamp in an ICMP_TIMESTAMP request, and the receiver sends an ICMP_TIMESTAMPREPLY with three timestamps: the originate timestamp which was sent by the sender of the timestamp request, as well as a receive timestamp and a transmit timestamp. There are more commonly used protocols for time synchronization than ICMPv4 timestamp messages, like the Network Time Protocol (NTP). I should also mention the Address Mask request (ICMP_ADDRESS), which is normally sent by a host to a router in order to obtain an appropriate subnet mask. Recipients should reply to this message with an address mask reply message. The ICMP_ADDRESS and ICMP_ADDRESSREPLY messages, which were handled in the past by the icmp_address() method and by the icmp_address_reply() method, are now also handled by icmp_discard(). The reason is that there are other ways to get the subnet masks, such as with DHCP.

icmp_unreach() handles ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_PARAMETERPROB, and ICMP_SOURCE_QUENCH message types.

An ICMP_DEST_UNREACH message can be sent under various conditions. Some of these conditions are described in the "Sending ICMPv4 Messages: Destination Unreachable" section in this chapter.

An ICMP_TIME_EXCEEDED message is sent in two cases:

In ip_forward(), each packet's TTL is decremented. According to RFC 1700, the recommended TTL for the IPv4 protocol is 64. If the TTL reaches 0, this is an indication that the packet should be dropped, probably because of a routing loop.
So, if the TTL reaches 0 in ip_forward(), the icmp_send() method is invoked:

icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);

(net/ipv4/ip_forward.c)

In such a case, an ICMP_TIME_EXCEEDED message with code ICMP_EXC_TTL is sent, the SKB is freed, the InHdrErrors SNMP counter (IPSTATS_MIB_INHDRERRORS) is incremented, and the method returns NET_RX_DROP.

In ip_expire(), the following is invoked when a fragment reassembly timeout occurs:

icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);

(net/ipv4/ip_fragment.c)

An ICMP_PARAMETERPROB message is sent when parsing the options of an IPv4 header fails, in the ip_options_compile() method or in the ip_options_rcv_srr() method (net/ipv4/ip_options.c). The options are an optional, variable-length field (up to 40 bytes) of the IPv4 header. IP options are discussed in Chapter 4.

The ICMP_SOURCE_QUENCH message type is in fact deprecated. According to RFC 1812, section 4.3.3.3 (Source Quench): "A router SHOULD NOT originate ICMP Source Quench messages," and also, "A router MAY ignore any ICMP Source Quench messages it receives." The Source Quench message was intended to reduce congestion, but it turned out to be an ineffective solution.

icmp_redirect() handles ICMP_REDIRECT messages. According to RFC 1122, section 3.2.2.2, hosts should not send ICMP redirect messages; redirects are to be sent only by gateways. In the past, icmp_redirect() called ip_rt_redirect(), but an ip_rt_redirect() invocation is not needed anymore, as the protocol handlers now all properly propagate the redirect back into the routing code. In fact, in kernel 3.6 the ip_rt_redirect() method was removed. So the icmp_redirect() method first performs sanity checks and then calls icmp_socket_deliver(), which delivers the packet to the raw sockets and invokes the protocol error handler (in case it exists). Chapter 6 discusses ICMP_REDIRECT messages in more depth.

icmp_echo() handles echo ("ping") requests (ICMP_ECHO) by sending echo replies (ICMP_ECHOREPLY) with icmp_reply(). In case net->ipv4.sysctl_icmp_echo_ignore_all is set, a reply will not be sent. For configuring ICMPv4 procfs entries, see the "Quick Reference" section at the end of this chapter, and also Documentation/networking/ip-sysctl.txt .

icmp_timestamp() handles ICMP Timestamp requests (ICMP_TIMESTAMP) by sending ICMP_TIMESTAMPREPLY with icmp_reply().

Before discussing how ICMP messages are sent by the icmp_reply() method and by the icmp_send() method, I should describe the icmp_bxm ("ICMP build xmit message") structure, which is used in both methods:

struct icmp_bxm {
struct sk_buff *skb;
int offset;
int data_len;

struct {
struct icmphdr icmph;
__be32 times[3];
} data;
int head_len;
struct ip_options_data replyopts;
};

 * skb: For the icmp_reply() method, this skb is the request packet; the icmp_param object (an instance of icmp_bxm) is built from it (in the icmp_echo() method and in the icmp_timestamp() method). For the icmp_send() method, this skb is the one that triggered sending an ICMPv4 message due to some condition; you will see several examples of such messages in this section.

 * offset: Difference (offset) between skb_network_header(skb) and skb->data.

 * data_len: ICMPv4 packet payload size.

 * icmph: The ICMPv4 header.

 * times[3]: An array of three timestamps, filled in icmp_timestamp().
 * head_len: Size of the ICMPv4 header (in the case of icmp_timestamp(), there are an additional 12 bytes for the timestamps).

 * replyopts: An ip_options_data object. IP options are optional fields after the IP header, up to 40 bytes. They enable advanced features like strict/loose routing, record routing, time stamping, and more. They are initialized with the ip_options_echo() method. Chapter 4 discusses IP options.

### Receiving ICMPv4 Messages

The ip_local_deliver_finish() method handles packets for the local machine. When getting an ICMP packet, the method delivers the packet to the raw sockets that registered for the ICMPv4 protocol. In the icmp_rcv() method, first the InMsgs SNMP counter (ICMP_MIB_INMSGS) is incremented. Subsequently, the checksum correctness is verified. If the checksum is not correct, two SNMP counters are incremented, InCsumErrors and InErrors (ICMP_MIB_CSUMERRORS and ICMP_MIB_INERRORS, respectively), the SKB is freed, and the method returns 0. The icmp_rcv() method does not return an error in this case; in fact, the icmp_rcv() method always returns 0. The reason for returning 0 in the case of a checksum error is that nothing special should be done when receiving an erroneous ICMP message except to discard it; when a protocol handler returns a negative error, another attempt to process the packet is performed, which is not needed in this case. For more details, refer to the implementation of the ip_local_deliver_finish() method. Then the ICMP header is examined in order to find its type; the corresponding procfs message type counter is incremented (each ICMP message type has a procfs counter), and a sanity check is performed to verify that the type is not higher than the highest permitted value (NR_ICMP_TYPES). According to section 3.2.2 of RFC 1122, if an ICMP message of unknown type is received, it must be silently discarded. So if the message type is out of range, the InErrors SNMP counter (ICMP_MIB_INERRORS) is incremented, and the SKB is freed.

In case the packet is a broadcast or a multicast, and it is an ICMP_ECHO message or an ICMP_TIMESTAMP message, there is a check whether broadcast/multicast echo requests are permitted by reading the variable net->ipv4.sysctl_icmp_echo_ignore_broadcasts. This variable can be configured via procfs by writing to /proc/sys/net/ipv4/icmp_echo_ignore_broadcasts, and by default its value is 1. If this variable is set, the packet is dropped silently. This is done according to section 3.2.2.6 of RFC 1122: "An ICMP Echo Request destined to an IP broadcast or IP multicast address MAY be silently discarded." And according to section 3.2.2.8 of this RFC, "An ICMP Timestamp Request message to an IP broadcast or IP multicast address MAY be silently discarded." Then a check is performed to detect whether the type is allowed for broadcast/multicast (ICMP_ECHO, ICMP_TIMESTAMP, ICMP_ADDRESS, and ICMP_ADDRESSREPLY). If it is not one of these message types, the packet is dropped and 0 is returned. Then, according to its type, the corresponding entry in the icmp_pointers array is fetched and the appropriate handler is called. Let's take a look at the ICMP_ECHO entry in the icmp_control dispatch table:

static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
...
[ICMP_ECHO] = {
.handler = icmp_echo,
},
...
};

So when receiving a ping (the type of the message is "Echo Request," ICMP_ECHO), it is handled by the icmp_echo() method.
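Stripped of the statistics updates and the broadcast/multicast checks just described, the table-driven dispatch at the tail of icmp_rcv() amounts to something like the following (a simplified sketch of the kernel logic, not the verbatim code; the icmp_dispatch() wrapper name is mine):

/* Simplified sketch: how icmp_rcv() selects a handler by message type.
 * SNMP counters, the checksum check, and broadcast handling are omitted. */
static void icmp_dispatch(struct sk_buff *skb)
{
const struct icmphdr *icmph = icmp_hdr(skb);

if (icmph->type > NR_ICMP_TYPES) {
kfree_skb(skb); /* unknown type: discard silently (RFC 1122, 3.2.2) */
return;
}
icmp_pointers[icmph->type].handler(skb);
}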
The icmp_echo() method changes the type in the ICMP header to ICMP_ECHOREPLY and sends a reply by calling the icmp_reply() method. Apart from ping, the only other ICMP message which requires a response is the timestamp message (ICMP_TIMESTAMP); it is handled by the icmp_timestamp() method, which, much like in the ICMP_ECHO case, changes the type to ICMP_TIMESTAMPREPLY and sends a reply by calling the icmp_reply() method. Sending is done by ip_append_data() and by ip_push_pending_frames(). Receiving a ping reply (ICMP_ECHOREPLY) is handled by the ping_rcv() method.

You can disable replying to pings with the following:

echo 1 > /proc/sys/net/ipv4/icmp_echo_ignore_all

There are some callbacks that handle more than one ICMP type. The icmp_discard() callback, for example, handles ICMPv4 packets whose type is not handled by the Linux ICMPv4 implementation, and messages like ICMP_TIMESTAMPREPLY, ICMP_INFO_REQUEST, ICMP_ADDRESSREPLY, and more.

### Sending ICMPv4 Messages: "Destination Unreachable"

There are two methods for sending an ICMPv4 message: the first is the icmp_reply() method, which sends a response to two types of ICMP requests, ICMP_ECHO and ICMP_TIMESTAMP. The second is the icmp_send() method, with which the local machine initiates sending an ICMPv4 message under certain conditions (described in this section). Both of these methods eventually invoke icmp_push_reply() to actually send the packet. The icmp_reply() method is called as a response to an ICMP_ECHO message from the icmp_echo() method, and as a response to an ICMP_TIMESTAMP message from the icmp_timestamp() method. The icmp_send() method is invoked from many places in the IPv4 network stack; for example, from netfilter, from the forwarding code (ip_forward.c), from tunnels like ipip and ip_gre, and more.

This section looks into some of the cases when a "Destination Unreachable" message is sent (the type is ICMP_DEST_UNREACH).

#### Code 2: ICMP_PROT_UNREACH (Protocol Unreachable)

When the protocol of the IP header (which is an 8-bit field) is a nonexistent protocol, an ICMP_DEST_UNREACH message with ICMP_PROT_UNREACH code is sent back to the sender, because there is no protocol handler for such a protocol (the protocol handler array is indexed by the protocol number, so for nonexistent protocols there will be no handler). By a nonexistent protocol I mean either that, because of some error, the protocol number of the IPv4 header does not appear in the protocol number list (which you can find in include/uapi/linux/in.h for IPv4), or that the kernel was built without support for that protocol, so the protocol is not registered and there is no entry for it in the protocol handlers array. Because such a packet can't be handled, an ICMPv4 "Destination Unreachable" message should be sent back to the sender; the ICMP_PROT_UNREACH code in the ICMPv4 reply signifies the cause of the error: the protocol is unreachable. See the following:

static int ip_local_deliver_finish(struct sk_buff *skb)
{
...
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;

resubmit:
raw = raw_local_deliver(skb, protocol);

ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
...
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
}
...
}

(net/ipv4/ip_input.c)

In this example, a lookup in the inet_protos array by protocol is performed, and because no entry was found, the protocol is not registered in the kernel.

#### Code 3: ICMP_PORT_UNREACH ("Port Unreachable")

When receiving UDPv4 packets, a matching UDP socket is searched for. If no matching socket is found, the checksum correctness is verified. If it is wrong, the packet is dropped silently. If it is correct, the statistics are updated and a "Destination Unreachable"/"Port Unreachable" ICMP message is sent back:

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto)
{
struct sock *sk;
...
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
...
if (sk != NULL) {
...
}

/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;

UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
...
}
...
}

(net/ipv4/udp.c)

A lookup is performed by the __udp4_lib_lookup_skb() method, and if there is no socket, the statistics are updated and an ICMP_DEST_UNREACH message with ICMP_PORT_UNREACH code is sent back.

#### Code 4: ICMP_FRAG_NEEDED

When forwarding a packet with a length larger than the MTU of the outgoing link, if the don't fragment (DF) bit in the IPv4 header (IP_DF) is set, the packet is discarded and an ICMP_DEST_UNREACH message with ICMP_FRAG_NEEDED code is sent back to the sender:

int ip_forward(struct sk_buff *skb)
{
...
struct rtable *rt; /* Route we use */
...
if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(dst_mtu(&rt->dst)));
goto drop;
}
...
}

(net/ipv4/ip_forward.c)

#### Code 5: ICMP_SR_FAILED

When forwarding a packet with the strict routing option and gatewaying set, a "Destination Unreachable" message with ICMP_SR_FAILED code is sent back, and the packet is dropped:

int ip_forward(struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
...
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
...
sr_failed:
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
}

(net/ipv4/ip_forward.c)

For a full list of all IPv4 "Destination Unreachable" codes, see Table 3-2 in the "Quick Reference" section at the end of this chapter. Note that a user can configure rules with the iptables REJECT target and the --reject-with qualifier, which can send "Destination Unreachable" messages according to the selection; there is more on this in the "Quick Reference" section at the end of this chapter.

Both the icmp_reply() and the icmp_send() methods support rate limiting; they call icmpv4_xrlim_allow(), and if the rate limiting check allows sending the packet (that is, icmpv4_xrlim_allow() returns true), they send the packet. It should be mentioned here that rate limiting is not performed automatically on all types of traffic. Here are the conditions under which the rate limiting check will not be performed:

 * The message type is unknown.

 * The packet is a PMTU discovery packet.

 * The device is a loopback device.

 * The ICMP type is not enabled in the rate mask.
If none of these conditions is met, rate limiting is performed by calling the inet_peer_xrlim_allow() method. You'll find more info about the rate mask in the "Quick Reference" section at the end of this chapter.

Let's look inside the icmp_send() method. First, this is its prototype:

void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)

skb_in is the SKB which caused the invocation of the icmp_send() method; type and code are the ICMPv4 message type and code, respectively. The last parameter, info, is used in the following cases:

 * For the ICMP_PARAMETERPROB message type, it is the offset in the IPv4 header where the parsing problem occurred.

 * For the ICMP_DEST_UNREACH message type with ICMP_FRAG_NEEDED code, it is the MTU.

 * For the ICMP_REDIRECT message type with ICMP_REDIR_HOST code, it is the IP address of the destination address in the IPv4 header of the provoking SKB.

When looking further into the icmp_send() method, first there are some sanity checks. Then multicast/broadcast packets are rejected. A check of whether the packet is a fragment is performed by inspecting the frag_off field of the IPv4 header. If the packet is fragmented, an ICMPv4 message is sent, but only for the first fragment. According to section 4.3.2.7 of RFC 1812, an ICMP error message must not be sent as the result of receiving an ICMP error message. So first a check is performed to find out whether the ICMPv4 message to be sent is an error message; if it is, another check is performed to find out whether the provoking SKB contained an error ICMPv4 message, and if so, the method returns without sending the ICMPv4 message. Also, if the type is an unknown ICMPv4 type (higher than NR_ICMP_TYPES), the method returns without sending the ICMPv4 message, though this isn't specified explicitly by the RFC. Then the source address is determined according to the value of net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr (more details in the "Quick Reference" section at the end of this chapter). Then the ip_options_echo() method is invoked to copy the IP options of the IPv4 header of the invoking SKB. An icmp_bxm object (icmp_param) is allocated and initialized, and a lookup in the routing subsystem is performed with the icmp_route_lookup() method. Then the icmp_push_reply() method is invoked.

Let's take a look at the icmp_push_reply() method, which actually sends the packet. The icmp_push_reply() method first finds the socket on which the packet should be sent by calling:

sk = icmp_sk(dev_net((*rt)->dst.dev));

The dev_net() method returns the network namespace of the outgoing network device. (The dev_net() method and network namespaces are discussed in Chapter 14 and in Appendix A.) Then the icmp_sk() method fetches the socket (because in SMP there is a socket per CPU). Then the ip_append_data() method is called to move the packet to the IP layer. If the ip_append_data() method fails, the statistics are updated by incrementing the ICMP_MIB_OUTERRORS counter, and the ip_flush_pending_frames() method is called to free the SKB. I discuss the ip_append_data() method and the ip_flush_pending_frames() method in Chapter 4.

Now that you know all about ICMPv4, it's time to move on to ICMPv6.

## ICMPv6

ICMPv6 has many similarities to ICMPv4 when it comes to reporting errors in the network layer (L3). There are additional tasks for ICMPv6 which are not performed in ICMPv4.
This section discusses the ICMPv6 protocol, its new features (which are not implemented in ICMPv4), and the features that are similar. ICMPv6 is defined in RFC 4443. If you delve into the ICMPv6 code you will probably encounter, sooner or later, comments that mention RFC 1885. In fact, RFC 1885, "Internet Control Message Protocol (ICMPv6) for the Internet Protocol Version 6 (IPv6)," is the base ICMPv6 RFC. It was obsoleted by RFC 2463, which was in turn obsoleted by RFC 4443. The ICMPv6 implementation is based upon that of ICMPv4, but it is more complicated; the changes and additions are discussed in this section.

The ICMPv6 protocol has a next header value of 58, according to RFC 4443, section 1 (Chapter 8 discusses IPv6 next headers). ICMPv6 is an integral part of IPv6 and must be fully implemented by every IPv6 node. Apart from error handling and diagnostics, ICMPv6 is used for the Neighbour Discovery (ND) protocol in IPv6, which replaces and enhances the functions of ARP in IPv4, and for the Multicast Listener Discovery (MLD) protocol, which is the counterpart of the IGMP protocol in IPv4, as shown in Figure 3-2.

Figure 3-2. ICMP in IPv4 and IPv6. The counterpart of the IGMP protocol in IPv6 is the MLD protocol, and the counterpart of the ARP protocol in IPv6 is the ND protocol

This section covers the ICMPv6 implementation. As you will see, it has many things in common with the ICMPv4 implementation in the way messages are handled and sent. There are even cases when the same methods are called in ICMPv4 and in ICMPv6 (for example, ping_rcv() and inet_peer_xrlim_allow()). There are some differences, and some topics are unique to ICMPv6. The ping6 and traceroute6 utilities are based on ICMPv6 and are the counterparts of the ping and traceroute utilities of IPv4 (mentioned in the ICMPv4 section at the beginning of this chapter). ICMPv6 is implemented in net/ipv6/icmp.c and in net/ipv6/ip6_icmp.c. As with ICMPv4, ICMPv6 cannot be built as a kernel module.

### ICMPv6 Initialization

ICMPv6 initialization is done by the icmpv6_init() method and by the icmpv6_sk_init() method. Registration of the ICMPv6 protocol is done by icmpv6_init() (net/ipv6/icmp.c):

static const struct inet6_protocol icmpv6_protocol = {
.handler = icmpv6_rcv,
.err_handler = icmpv6_err,
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};

The handler callback is icmpv6_rcv(); this means that for incoming packets whose protocol field equals IPPROTO_ICMPV6 (58), icmpv6_rcv() will be invoked.

When the INET6_PROTO_NOPOLICY flag is set, this implies that IPsec policy checks should not be performed; for example, the xfrm6_policy_check() method is not called in ip6_input_finish() because the INET6_PROTO_NOPOLICY flag is set:

int __init icmpv6_init(void)
{
int err;
...
if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
goto fail;
return 0;
}

static int __net_init icmpv6_sk_init(struct net *net)
{
struct sock *sk;
...
for_each_possible_cpu(i) {
err = inet_ctl_sock_create(&sk, PF_INET6,
SOCK_RAW, IPPROTO_ICMPV6, net);
...
net->ipv6.icmp_sk[i] = sk;
...
}

As in ICMPv4, a raw ICMPv6 socket is created for each CPU and is kept in an array. The current sk can be accessed by the icmpv6_sk() method.

### ICMPv6 Header

The ICMPv6 header consists of a type field (8 bits), a code field (8 bits), and a checksum (16 bits), as you can see in Figure 3-3.

Figure 3-3. The ICMPv6 header
The ICMPv6 header is represented by struct icmp6hdr:

struct icmp6hdr {
__u8 icmp6_type;
__u8 icmp6_code;
__sum16 icmp6_cksum;
...
}

There is not enough room to show all the fields of struct icmp6hdr because it is too large (it is defined in include/uapi/linux/icmpv6.h). When the high-order bit of the type field is 0 (values in the range from 0 to 127), it indicates an error message; when the high-order bit is 1 (values in the range from 128 to 255), it indicates an information message. Table 3-1 shows the ICMPv6 message types by their number and kernel symbol.

Table 3-1. ICMPv6 Messages

Type | Kernel Symbol | Error/Info | Description
---|---|---|---
1 | ICMPV6_DEST_UNREACH | Error | Destination Unreachable
2 | ICMPV6_PKT_TOOBIG | Error | Packet Too Big
3 | ICMPV6_TIME_EXCEED | Error | Time Exceeded
4 | ICMPV6_PARAMPROB | Error | Parameter Problem
128 | ICMPV6_ECHO_REQUEST | Info | Echo Request
129 | ICMPV6_ECHO_REPLY | Info | Echo Reply
130 | ICMPV6_MGM_QUERY | Info | Multicast group membership management query
131 | ICMPV6_MGM_REPORT | Info | Multicast group membership management report
132 | ICMPV6_MGM_REDUCTION | Info | Multicast group membership management reduction
133 | NDISC_ROUTER_SOLICITATION | Info | Router solicitation
134 | NDISC_ROUTER_ADVERTISEMENT | Info | Router advertisement
135 | NDISC_NEIGHBOUR_SOLICITATION | Info | Neighbour solicitation
136 | NDISC_NEIGHBOUR_ADVERTISEMENT | Info | Neighbour advertisement
137 | NDISC_REDIRECT | Info | Neighbour redirect

The current complete list of assigned ICMPv6 types and codes can be found at www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xml .

ICMPv6 performs some tasks that are not performed by ICMPv4. For example, Neighbour Discovery is done by ICMPv6, whereas in IPv4 it is done by the ARP/RARP protocols. Multicast group memberships are handled by ICMPv6 in conjunction with the MLD (Multicast Listener Discovery) protocol, whereas in IPv4 this is performed by IGMP (Internet Group Management Protocol). Some ICMPv6 messages are similar in meaning to ICMPv4 messages; for example, ICMPv6 has these messages: "Destination Unreachable" (ICMPV6_DEST_UNREACH), "Time Exceeded" (ICMPV6_TIME_EXCEED), "Parameter Problem" (ICMPV6_PARAMPROB), "Echo Request" (ICMPV6_ECHO_REQUEST), and more. On the other hand, some ICMPv6 messages are unique to IPv6, such as the NDISC_NEIGHBOUR_SOLICITATION message.

### Receiving ICMPv6 Messages

When getting an ICMPv6 packet, it is delivered to the icmpv6_rcv() method, which gets only an SKB as a parameter. Figure 3-4 shows the Rx path of a received ICMPv6 message.

Figure 3-4. Receive path of an ICMPv6 message

In the icmpv6_rcv() method, after some sanity checks, the InMsgs SNMP counter (ICMP6_MIB_INMSGS) is incremented. Subsequently, the checksum correctness is verified. If the checksum is not correct, the InErrors SNMP counter (ICMP6_MIB_INERRORS) is incremented, and the SKB is freed. The icmpv6_rcv() method does not return an error in this case (in fact it always returns 0, much like its IPv4 counterpart, icmp_rcv()). Then the ICMPv6 header is read in order to find its type; the corresponding procfs message type counter is incremented by the ICMP6MSGIN_INC_STATS_BH macro (each ICMPv6 message type has a procfs counter).
For example, when receiving ICMPv6 ECHO requests ("pings"), the /proc/net/snmp6/Icmp6InEchos counter is incremented, and when receiving ICMPv6 Neighbour Solicitation requests, the /proc/net/snmp6/Icmp6InNeighborSolicits counter is incremented.

In ICMPv6, there is no dispatch table like the icmp_pointers table in ICMPv4. The handlers are invoked according to the ICMPv6 message type, in a long switch(type) command:

 * "Echo Request" (ICMPV6_ECHO_REQUEST) is handled by the icmpv6_echo_reply() method.

 * "Echo Reply" (ICMPV6_ECHO_REPLY) is handled by the ping_rcv() method. The ping_rcv() method is in the IPv4 ping module (net/ipv4/ping.c); this method is a dual-stack method (it handles both IPv4 and IPv6), as discussed at the beginning of this chapter.

 * "Packet Too Big" (ICMPV6_PKT_TOOBIG) is handled as follows:

 * First a check is done to verify that the data block area (pointed to by skb->data) contains a block of data whose size is at least as big as an ICMP header. This is done by the pskb_may_pull() method. If this condition is not met, the packet is dropped.

 * Then the icmpv6_notify() method is invoked. This method eventually calls the raw6_icmp_error() method so that the registered raw sockets will handle the ICMP messages.

 * "Destination Unreachable," "Time Exceeded," and "Parameter Problem" (ICMPV6_DEST_UNREACH, ICMPV6_TIME_EXCEED, and ICMPV6_PARAMPROB, respectively) are also handled by icmpv6_notify().

 * Neighbour Discovery (ND) messages:

 * NDISC_ROUTER_SOLICITATION: Messages which are usually sent to the all-routers multicast address of FF02::2, and which are answered by router advertisements. (Special IPv6 multicast addresses are discussed in Chapter 8.)

 * NDISC_ROUTER_ADVERTISEMENT: Messages which are sent periodically by routers or as an immediate response to router solicitation requests. Router advertisements contain prefixes that are used for on-link determination and/or address configuration, a suggested hop limit value, and so on.

 * NDISC_NEIGHBOUR_SOLICITATION: The counterpart of the ARP request in IPv4.

 * NDISC_NEIGHBOUR_ADVERTISEMENT: The counterpart of the ARP reply in IPv4.

 * NDISC_REDIRECT: Used by routers to inform hosts of a better first hop for a destination.

 * All the Neighbour Discovery (ND) messages are handled by the neighbour discovery method, ndisc_rcv() (net/ipv6/ndisc.c). The ndisc_rcv() method is discussed in Chapter 7.

 * ICMPV6_MGM_QUERY (Multicast Listener Query) is handled by igmp6_event_query().

 * ICMPV6_MGM_REPORT (Multicast Listener Report) is handled by igmp6_event_report(). Note: Both ICMPV6_MGM_QUERY and ICMPV6_MGM_REPORT are discussed in more detail in Chapter 8.

 * Messages of unknown type, and the following messages, are all handled by the icmpv6_notify() method:

 * ICMPV6_MGM_REDUCTION: When a host leaves a multicast group, it sends an ICMPV6_MGM_REDUCTION message; see the igmp6_leave_group() method in net/ipv6/mcast.c.

 * ICMPV6_MLD2_REPORT: MLDv2 Multicast Listener Report packet; usually sent with a destination address of the all-MLDv2-capable-routers multicast group address (FF02::16).

 * ICMPV6_NI_QUERY: ICMP Node Information Query.

 * ICMPV6_NI_REPLY: ICMP Node Information Response.

 * ICMPV6_DHAAD_REQUEST: ICMP Home Agent Address Discovery Request message; see section 6.5 of RFC 6275, "Mobility Support in IPv6."

 * ICMPV6_DHAAD_REPLY: ICMP Home Agent Address Discovery Reply message; see section 6.6 of RFC 6275.
 * ICMPV6_MOBILE_PREFIX_SOL: ICMP Mobile Prefix Solicitation message; see section 6.7 of RFC 6275.

 * ICMPV6_MOBILE_PREFIX_ADV: ICMP Mobile Prefix Advertisement message; see section 6.8 of RFC 6275.

Notice that the switch(type) command ends like this:

default:
LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");

/* informational */
if (type & ICMPV6_INFOMSG_MASK)
break;

/*
* error of unknown type.
* must pass to upper level
*/
icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
}

Informational messages fulfill the condition (type & ICMPV6_INFOMSG_MASK), so they are discarded, whereas the other messages, which do not fulfill this condition (and therefore should be error messages), are passed to the upper layer. This is done in accordance with section 2.4 ("Message Processing Rules") of RFC 4443.

### Sending ICMPv6 Messages

The main method for sending ICMPv6 messages is the icmpv6_send() method. It is called when the local machine initiates sending an ICMPv6 message under the conditions described in this section. There is also the icmpv6_echo_reply() method, which is called only as a response to an ICMPV6_ECHO_REQUEST ("ping") message. The icmpv6_send() method is invoked from many places in the IPv6 network stack. This section looks at several examples.

#### Example: Sending "Hop Limit Time Exceeded" ICMPv6 Messages

When forwarding a packet, every machine decrements the Hop Limit counter by 1. The Hop Limit counter is a member of the IPv6 header; it is the IPv6 counterpart of Time To Live in IPv4. When the value of the Hop Limit header field reaches 0, an ICMPV6_TIME_EXCEED message with code ICMPV6_EXC_HOPLIMIT is sent by calling the icmpv6_send() method; then the statistics are updated and the packet is dropped:

int ip6_forward(struct sk_buff *skb)
{
...
if (hdr->hop_limit <= 1) {
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

kfree_skb(skb);
return -ETIMEDOUT;
}
...
}

(net/ipv6/ip6_output.c)

#### Example: Sending "Fragment Reassembly Time Exceeded" ICMPv6 Messages

When a fragment reassembly timeout occurs, an ICMPV6_TIME_EXCEED message with code ICMPV6_EXC_FRAGTIME is sent back by calling the icmpv6_send() method:

void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
struct inet_frags *frags)
{
...
icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
...
}

(net/ipv6/reassembly.c)

#### Example: Sending "Destination Unreachable"/"Port Unreachable" ICMPv6 Messages

When receiving UDPv6 packets, a matching UDPv6 socket is searched for. If no matching socket is found, the checksum correctness is verified. If it is wrong, the packet is dropped silently. If it is correct, the statistics (the UDP_MIB_NOPORTS MIB counter, which is exported to procfs by /proc/net/snmp6/Udp6NoPorts) are updated and a "Destination Unreachable"/"Port Unreachable" ICMPv6 message is sent back with icmpv6_send():

int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto)
{
...
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk != NULL) {
...
}
...
if (udp_lib_checksum_complete(skb))
goto discard;

UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
...
}

(net/ipv6/udp.c)

This case is very similar to the UDPv4 example given earlier in this chapter.

#### Example: Sending "Fragmentation Needed" ICMPv6 Messages

When forwarding a packet, if its size is larger than the MTU of the outgoing link and the local_df bit in the SKB is not set, the packet is discarded and an ICMPV6_PKT_TOOBIG message is sent back to the sender. The information in this message is used as part of the Path MTU (PMTU) discovery process.

Note that, as opposed to the parallel case in IPv4, where an ICMP_DEST_UNREACH message with ICMP_FRAG_NEEDED code is sent, in this case an ICMPV6_PKT_TOOBIG message is sent back, and not a "Destination Unreachable" (ICMPV6_DEST_UNREACH) message. The ICMPV6_PKT_TOOBIG message has a message type number of its own in ICMPv6:

int ip6_forward(struct sk_buff *skb)
{
...
if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
(IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
/* Again, force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
IP6_INC_STATS_BH(net,
ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
return -EMSGSIZE;
}
...
}

(net/ipv6/ip6_output.c)

#### Example: Sending "Parameter Problem" ICMPv6 Messages

When encountering a problem in parsing extension headers, an ICMPV6_PARAMPROB message with ICMPV6_UNK_OPTION code is sent back:

static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
{
switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) {
...
case 2: /* send ICMP PARM PROB regardless and drop packet */
icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
return false;
}

(net/ipv6/exthdrs.c)

The icmpv6_send() method supports rate limiting by calling icmpv6_xrlim_allow(). I should mention here that, as in ICMPv4, rate limiting is not performed automatically in ICMPv6 on all types of traffic. Here are the conditions under which the rate limiting check will not be performed:

 * The message is an informational message.

 * The packet is a PMTU discovery packet.

 * The device is a loopback device.

If none of these conditions is met, rate limiting is performed by calling the inet_peer_xrlim_allow() method, which is shared between ICMPv4 and ICMPv6. Note that unlike in IPv4, you can't set a rate mask in IPv6. It is not forbidden by the ICMPv6 spec, RFC 4443, but it was never implemented.

Let's look inside the icmp6_send() method. First, this is its prototype:

static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)

The parameters are similar to those of the icmp_send() method of IPv4, so I won't repeat the explanation here. When looking further into the icmp6_send() code, you find some sanity checks. Checking whether the provoking message is an ICMPv6 error message is done by calling the is_ineligible() method; if it is, the icmp6_send() method terminates. The length of the message should not exceed 1280 bytes, which is the IPv6 minimum MTU (IPV6_MIN_MTU, defined in include/linux/ipv6.h).
This is done in accordance with RFC 4443, section 2.4 (c), which says that every ICMPv6 error message must include as much of the offending (invoking) IPv6 packet (the packet that caused the error) as possible without making the error message packet exceed the minimum IPv6 MTU. Then the message is passed to the IPv6 layer by the ip6_append_data() method and by the icmpv6_push_pending_frames() method.

Now I'll turn to the icmpv6_echo_reply() method; as a reminder, this method is called as a response to an ICMPV6_ECHO_REQUEST message. The icmpv6_echo_reply() method gets only one parameter, the SKB. It builds an icmpv6_msg object and sets its type to ICMPV6_ECHO_REPLY. Then it passes the message to the IPv6 layer, by the ip6_append_data() method and by the icmpv6_push_pending_frames() method. If the ip6_append_data() method fails, an SNMP counter (ICMP6_MIB_OUTERRORS) is incremented, and ip6_flush_pending_frames() is invoked to free the SKB.

Chapters 7 and 8 also discuss ICMPv6. The next section introduces ICMP sockets and the purpose they serve.

## ICMP Sockets ("Ping Sockets")

A new type of socket (IPPROTO_ICMP) was added by a patch from the Openwall GNU/*/Linux distribution (Owl), which provides security enhancements over other distributions. The ICMP sockets enable a setuid-less "ping". For Openwall GNU/*/Linux, it was the last step on the road to a setuid-less distribution. With this patch, a new ICMPv4 ping socket (which is not a raw socket) is created with:

socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP);

instead of with:

socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);

There is also support for IPPROTO_ICMPV6 sockets, which was added later, in net/ipv6/icmp.c. A new ICMPv6 ping socket is created with:

socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6);

instead of with:

socket(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

Similar functionality (non-privileged ICMP) is implemented in Mac OS X; see www.manpagez.com/man/4/icmp/ .

Most of the code for ICMP sockets is in net/ipv4/ping.c; in fact, large parts of the code in net/ipv4/ping.c are dual-stack (IPv4 and IPv6). In net/ipv6/ping.c there are only a few IPv6-specific bits. Using ICMP sockets is disabled by default. You can enable ICMP sockets by setting the following procfs entry: /proc/sys/net/ipv4/ping_group_range. It is "1 0" by default, meaning that nobody (not even root) may create ping sockets. So, if you want to allow a user with uid and gid of 1000 to use ICMP sockets, you should run this from the command line (with root privileges): echo 1000 1000 > /proc/sys/net/ipv4/ping_group_range, and then you can ping from this user account using ICMP sockets. If you want to grant this privilege to all the groups in the system, you should run from the command line echo 0 2147483647 > /proc/sys/net/ipv4/ping_group_range. (2147483647 is the value of GID_T_MAX; see include/net/ping.h.) There are no separate security settings for IPv4 and IPv6; everything is controlled by /proc/sys/net/ipv4/ping_group_range. The ICMP sockets support only ICMP_ECHO for IPv4 or ICMPV6_ECHO_REQUEST for IPv6, and the code of the ICMP message must be 0 in both cases.
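As an illustration (a minimal userspace sketch, not code from the kernel sources), once ping_group_range permits it, an unprivileged process can send a ping like this; for ping sockets the kernel takes care of the ICMP identifier and the checksum, so the application supplies only an ICMP_ECHO header with code 0:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <arpa/inet.h>

int main(void)
{
    /* A ping socket: SOCK_DGRAM rather than SOCK_RAW, so no CAP_NET_RAW
     * is needed, provided ping_group_range covers this process's group. */
    int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP);
    struct icmphdr icmph = {0};
    struct sockaddr_in dst = { .sin_family = AF_INET };
    char buf[512];
    ssize_t n;

    if (fd < 0) {
        perror("socket"); /* EACCES here usually means ping_group_range forbids it */
        return 1;
    }
    icmph.type = ICMP_ECHO; /* only ICMP_ECHO with code 0 is accepted */
    icmph.un.echo.sequence = htons(1); /* the kernel overwrites the id and fills the checksum */

    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr); /* placeholder destination */
    if (sendto(fd, &icmph, sizeof(icmph), 0,
               (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");

    n = recv(fd, buf, sizeof(buf), 0); /* the echo reply, starting at its ICMP header */
    if (n >= 0)
        printf("received %zd bytes\n", n);
    close(fd);
    return 0;
}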
The ping_supported() helper method checks whether the parameters for building the ICMP message (both for IPv4 and IPv6) are valid. It is invoked from ping_sendmsg():

static inline int ping_supported(int family, int type, int code)
{
return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
(family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0);
}

(net/ipv4/ping.c)

ICMP sockets export the following entries to procfs: /proc/net/icmp for IPv4 and /proc/net/icmp6 for IPv6.

For more info about ICMP sockets see http://openwall.info/wiki/people/segoon/ping and http://lwn.net/Articles/420799/ .

## Summary

This chapter covered the implementation of ICMPv4 and ICMPv6. You learned about the ICMP header format of both protocols and about receiving and sending messages with both protocols. The new features of ICMPv6, which you will encounter in upcoming chapters, were also discussed. The Neighbouring Discovery protocol, which uses ICMPv6 messages, is discussed in Chapter 7, and the MLD protocol, which also uses ICMPv6 messages, is covered in Chapter 8. The next chapter, Chapter 4, talks about the implementation of the IPv4 network layer.

In the "Quick Reference" section that follows, I cover the top methods related to the topics discussed in this chapter, ordered by their context. Then the tables mentioned in the chapter, some important relevant procfs entries, and a short section about the usage of ICMP messages in iptables reject rules are all covered.

## Quick Reference

I conclude this chapter with a short list of important methods of ICMPv4 and ICMPv6, six tables, a section about procfs entries, and a short section about using the reject target in iptables and ip6tables to create ICMP "Destination Unreachable" messages.

### Methods

The following methods were covered in this chapter.

#### int icmp_rcv(struct sk_buff *skb);

This method is the main handler for processing incoming ICMPv4 packets.

#### extern void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info);

This method sends an ICMPv4 message. The parameters are the provoking SKB, the ICMPv4 message type, the ICMPv4 message code, and info (which depends on the type).

#### struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb);

This method returns the ICMPv6 header that the specified skb contains.

#### void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info);

This method sends an ICMPv6 message. The parameters are the provoking SKB, the ICMPv6 message type, the ICMPv6 message code, and info (which depends on the type).

#### void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos);

This method is a convenience wrapper around the icmp6_send() method: all it does is call icmp6_send() with ICMPV6_PARAMPROB as the type and with the other specified parameters (skb, code, and pos), and free the SKB afterwards.

### Tables

The following tables were covered in this chapter.

Table 3-2. ICMPv4 "Destination Unreachable" (ICMP_DEST_UNREACH) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMP_NET_UNREACH | Network Unreachable
1 | ICMP_HOST_UNREACH | Host Unreachable
2 | ICMP_PROT_UNREACH | Protocol Unreachable
3 | ICMP_PORT_UNREACH | Port Unreachable
4 | ICMP_FRAG_NEEDED | Fragmentation needed, but the DF flag is set
5 | ICMP_SR_FAILED | Source route failed
6 | ICMP_NET_UNKNOWN | Destination network unknown
7 | ICMP_HOST_UNKNOWN | Destination host unknown
8 | ICMP_HOST_ISOLATED | Source host isolated
9 | ICMP_NET_ANO | The destination network is administratively prohibited
10 | ICMP_HOST_ANO | The destination host is administratively prohibited
11 | ICMP_NET_UNR_TOS | The network is unreachable for this Type of Service
12 | ICMP_HOST_UNR_TOS | The host is unreachable for this Type of Service
13 | ICMP_PKT_FILTERED | Packet filtered
14 | ICMP_PREC_VIOLATION | Precedence violation
15 | ICMP_PREC_CUTOFF | Precedence cut off
16 | NR_ICMP_UNREACH | Number of unreachable codes

Table 3-3. ICMPv4 Redirect (ICMP_REDIRECT) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMP_REDIR_NET | Redirect Net
1 | ICMP_REDIR_HOST | Redirect Host
2 | ICMP_REDIR_NETTOS | Redirect Net for TOS
3 | ICMP_REDIR_HOSTTOS | Redirect Host for TOS

Table 3-4. ICMPv4 Time Exceeded (ICMP_TIME_EXCEEDED) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMP_EXC_TTL | TTL count exceeded
1 | ICMP_EXC_FRAGTIME | Fragment reassembly time exceeded

Table 3-5. ICMPv6 "Destination Unreachable" (ICMPV6_DEST_UNREACH) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMPV6_NOROUTE | No route to destination
1 | ICMPV6_ADM_PROHIBITED | Communication with destination administratively prohibited
2 | ICMPV6_NOT_NEIGHBOUR | Beyond scope of source address
3 | ICMPV6_ADDR_UNREACH | Address unreachable
4 | ICMPV6_PORT_UNREACH | Port unreachable

Note that ICMPV6_PKT_TOOBIG, which is the counterpart of the IPv4 ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED, is not a code of ICMPV6_DEST_UNREACH, but an ICMPv6 type in itself.

Table 3-6. ICMPv6 Time Exceeded (ICMPV6_TIME_EXCEED) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMPV6_EXC_HOPLIMIT | Hop limit exceeded in transit
1 | ICMPV6_EXC_FRAGTIME | Fragment reassembly time exceeded

Table 3-7. ICMPv6 Parameter Problem (ICMPV6_PARAMPROB) Codes

Code | Kernel Symbol | Description
---|---|---
0 | ICMPV6_HDR_FIELD | Erroneous header field encountered
1 | ICMPV6_UNK_NEXTHDR | Unknown Next Header type encountered
2 | ICMPV6_UNK_OPTION | Unknown IPv6 option encountered

### procfs entries

The kernel provides a way of configuring various settings for various subsystems from userspace by writing values to entries under /proc. These entries are referred to as procfs entries. All of the ICMPv4 procfs entries are represented by variables in the netns_ipv4 structure (include/net/netns/ipv4.h), which is an object in the network namespace (struct net). Network namespaces and their implementation are discussed in Chapter 14. The following are the names of the sysctl variables that correspond to the ICMPv4 netns_ipv4 elements, explanations of their usage, and the default values to which they are initialized, specifying also in which method the initialization takes place.

#### sysctl_icmp_echo_ignore_all

When icmp_echo_ignore_all is set, echo requests (ICMP_ECHO) will not be replied to.

procfs entry: /proc/sys/net/ipv4/icmp_echo_ignore_all

Initialized to 0 in icmp_sk_init()

#### sysctl_icmp_echo_ignore_broadcasts

When receiving a broadcast or multicast echo (ICMP_ECHO) message or timestamp (ICMP_TIMESTAMP) message, the kernel checks whether broadcast/multicast requests are permitted by reading sysctl_icmp_echo_ignore_broadcasts. If this variable is set, the packet is dropped and 0 is returned.

procfs entry: /proc/sys/net/ipv4/icmp_echo_ignore_broadcasts

Initialized to 1 in icmp_sk_init()

#### sysctl_icmp_ignore_bogus_error_responses

Some routers violate RFC 1122 by sending bogus responses to broadcast frames.
In the icmp_unreach() method, you check this flag. If this flag is set, the kernel will not log these warnings ("sent an invalid ICMP type...").

procfs entry: /proc/sys/net/ipv4/icmp_ignore_bogus_error_responses

Initialized to 1 in icmp_sk_init()

#### sysctl_icmp_ratelimit

Limits the maximum rate for sending ICMP packets whose type matches the ICMP rate mask (icmp_ratemask, see next) to specific targets.

A value of 0 disables rate limiting; otherwise it is the minimal space between responses, in milliseconds.

procfs entry: /proc/sys/net/ipv4/icmp_ratelimit

Initialized to 1 * HZ in icmp_sk_init()

#### sysctl_icmp_ratemask

A mask of the ICMP types for which the rate is limited. Each bit is an ICMPv4 type; the default value of 0x1818 covers ICMP_DEST_UNREACH (3), ICMP_SOURCE_QUENCH (4), ICMP_TIME_EXCEEDED (11), and ICMP_PARAMETERPROB (12).

procfs entry: /proc/sys/net/ipv4/icmp_ratemask

Initialized to 0x1818 in icmp_sk_init()

#### sysctl_icmp_errors_use_inbound_ifaddr

The value of this variable is checked in icmp_send(). When it is not set, ICMP error messages are sent with the primary address of the interface on which the packet will be sent. When it is set, the ICMP message is sent with the primary address of the interface that received the packet that caused the ICMP error.

procfs entry: /proc/sys/net/ipv4/icmp_errors_use_inbound_ifaddr

Initialized to 0 in icmp_sk_init()

Note

For more about the ICMP sysctl variables, their types, and their default values, see Documentation/networking/ip-sysctl.txt.

### Creating "Destination Unreachable" Messages with iptables

The iptables userspace tool enables us to set rules that dictate what the kernel should do with traffic matching the filters these rules define. Handling iptables rules is done in the netfilter subsystem, and is discussed in Chapter 9. One of the iptables targets is the REJECT target, which discards the packet without further processing it and sends an error message back to its originator. When setting an iptables REJECT target, the user can set a rule to send a "Destination Unreachable" ICMPv4 message with various codes, using the -j REJECT and --reject-with qualifiers. For example, the following iptables rule will discard any packet from any source while sending back an "ICMP Host Prohibited" message:

iptables -A INPUT -j REJECT --reject-with icmp-host-prohibited

These are the possible values for the --reject-with qualifier for setting the ICMPv4 message that will be sent in reply to the sending host:

icmp-net-unreachable - ICMP_NET_UNREACH

icmp-host-unreachable - ICMP_HOST_UNREACH

icmp-port-unreachable - ICMP_PORT_UNREACH

icmp-proto-unreachable - ICMP_PROT_UNREACH

icmp-net-prohibited - ICMP_NET_ANO

icmp-host-prohibited - ICMP_HOST_ANO

icmp-admin-prohibited - ICMP_PKT_FILTERED

You can also use --reject-with tcp-reset, which will send a TCP RST packet in reply to the sending host.

(net/ipv4/netfilter/ipt_REJECT.c)

With ip6tables in IPv6, there is also a REJECT target. For example:

ip6tables -A INPUT -s 2001::/64 -p ICMPv6 -j REJECT --reject-with icmp6-adm-prohibited

These are the possible values for the --reject-with qualifier for setting the ICMPv6 message that will be sent in reply to the sending host:

no-route, icmp6-no-route - ICMPV6_NOROUTE

adm-prohibited, icmp6-adm-prohibited - ICMPV6_ADM_PROHIBITED

port-unreach, icmp6-port-unreachable - ICMPV6_PORT_UNREACH

addr-unreach, icmp6-addr-unreachable - ICMPV6_ADDR_UNREACH

(net/ipv6/netfilter/ip6t_REJECT.c)
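Because these variables are exposed as plain procfs files, they can be inspected from userspace with ordinary file I/O. The following minimal sketch (not from the kernel sources; the print_sysctl helper is a name invented for this example) prints the current values of some of the ICMPv4 entries listed above:

#include <stdio.h>

/* Read a single ICMP sysctl value from procfs and print it.
 * Returns 0 on success, -1 if the entry could not be opened. */
static int print_sysctl(const char *path)
{
        FILE *f = fopen(path, "r");
        long val;

        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) == 1)
                printf("%s = %ld\n", path, val);
        fclose(f);
        return 0;
}

int main(void)
{
        print_sysctl("/proc/sys/net/ipv4/icmp_echo_ignore_all");
        print_sysctl("/proc/sys/net/ipv4/icmp_echo_ignore_broadcasts");
        print_sysctl("/proc/sys/net/ipv4/icmp_ratelimit");
        print_sysctl("/proc/sys/net/ipv4/icmp_ratemask");
        return 0;
}

Writing to the same paths (as root) changes the corresponding netns_ipv4 variables at runtime.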
# 4. IPv4

Abstract

Chapter 3 deals with the implementation of the ICMP protocol in IPv4 and in IPv6. This chapter, which deals with the IPv4 protocol, shows how ICMP messages are used for reporting Internet protocol errors under certain circumstances. The IPv4 protocol (Internet Protocol version 4) is one of the core protocols of today's standards-based Internet and routes most of the traffic on the Internet. The base definition is in RFC 791, "Internet Protocol," from 1981. The IPv4 protocol provides an end-to-end connectivity between any two hosts. Another important function of the IP layer is forwarding packets (also called routing) and managing tables that store routing information. Chapters 5 and 6 discuss IPv4 routing. This chapter describes the IPv4 Linux implementation: receiving and sending IPv4 packets, including multicast packets, IPv4 forwarding, and handling IPv4 options. There are cases when the packet to be sent is bigger than the MTU of the outgoing interface; in such cases the packet should be fragmented into smaller fragments. When fragmented packets are received, they should be assembled into one big packet, which should be identical to the packet that was sent before it was fragmented. These are also important tasks of the IPv4 protocol discussed in this chapter.

Every IPv4 packet starts with an IP header, which is at least 20 bytes long. If IP options are used, the IPv4 header can be up to 60 bytes. After the IP header, there is the transport header (TCP header or UDP header, for example), and after it is the payload data. To understand the IPv4 protocol, you must first learn how the IPv4 header is built. In Figure 4-1 you can see the IPv4 header, which consists of two parts: the first part of 20 bytes (until the beginning of the options field in the IPv4 header) is the basic IPv4 header, and after it there is the IP options part, which can be from 0 to 40 bytes in length.
## IPv4 Header

The IPv4 header consists of information that defines how a packet should be handled by the kernel network stack: the protocol being used, the source and destination addresses, the checksum, the identification (id) of the packet that is needed for fragmentation, the ttl that helps avoid packets being forwarded endlessly because of some error, and more. This information is stored in 13 members of the IPv4 header (the 14th member, IP Options, which is an extension to the IPv4 header, is optional). The various members of the IPv4 header and the various IP options are described next. The IPv4 header is represented by the iphdr structure. Its members, which appear in Figure 4-1, are described in the next section. The IP options and their use are described in the "IP Options" section later in this chapter.

Figure 4-1. IPv4 header

Figure 4-1 shows the IPv4 header. All members always exist—except for the last one, the IP options, which is optional. The content of the IPv4 header members determines how the packet will be handled in the IPv4 network stack: the packet is discarded when there is some problem (for example, if the version, which is the first member, is not 4, or if the checksum is incorrect). Each IPv4 packet starts with an IPv4 header, and after it there is the payload:

struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
        __u8    ihl:4,
                version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
        __u8    version:4,
                ihl:4;
#else
#error  "Please fix <asm/byteorder.h>"
#endif
        __u8    tos;
        __be16  tot_len;
        __be16  id;
        __be16  frag_off;
        __u8    ttl;
        __u8    protocol;
        __sum16 check;
        __be32  saddr;
        __be32  daddr;
        /*The options start here. */
};

(include/uapi/linux/ip.h)

The following is a description of the IPv4 header members:

  * ihl: This stands for Internet Header Length: the length of the IPv4 header, measured in multiples of 4 bytes. The length of the IPv4 header is not fixed, as opposed to the header of IPv6, where the length is fixed (40 bytes). The reason is that the IPv4 header can include optional, varying-length options. The minimum size of the IPv4 header is 20 bytes, when there are no options, and the maximum size is 60 bytes. The corresponding ihl values are 5 for the minimum IPv4 header size and 15 for the maximum size. The IPv4 header must be aligned to a 4-byte boundary.

  * version: Should be 4.

  * tos: The tos field of the IPv4 header was originally intended for Quality of Service (QoS) services; tos stands for Type of Service. Over the years this field took on a different meaning, as follows: RFC 2474 defines the Differentiated Services Field (DS Field) in the IPv4 and IPv6 headers, which is bits 0–5 of the tos. It is also named Differentiated Services Code Point (DSCP). RFC 3168 from 2001 defines the Explicit Congestion Notification (ECN) of the IP header; it is bits 6 and 7 of the tos field.

  * tot_len: The total length, including the header, measured in bytes. Because tot_len is a 16-bit field, it can be up to 65,535 bytes (64KB). According to RFC 791, every host must be able to accept datagrams of up to 576 bytes.

  * id: Identification of the IPv4 header. The id field is important for fragmentation: when fragmenting an SKB, the id value of all the fragments of that SKB should be the same. Reassembling fragmented packets is done according to the id of the fragments.

  * frag_off: The fragment offset, a 16-bit field. The lower 13 bits are the offset of the fragment. In the first fragment, the offset is 0. The offset is measured in units of 8 bytes. The higher 3 bits are the flags:

    * 001 is MF (More Fragments). It is set for all fragments, except the last one.

    * 010 is DF (Don't Fragment).

    * 100 is CE (Congestion).

    See the IP_MF, IP_DF, and IP_CE flags declaration in include/net/ip.h.

  * ttl: Time To Live: this is a hop counter. Each forwarding node decreases the ttl by 1. When it reaches 0, the packet is discarded, and an ICMPv4 Time Exceeded message is sent back; this prevents packets from being forwarded endlessly, for this reason or another.

  * protocol: The L4 protocol of the packet—for example, IPPROTO_TCP for TCP traffic or IPPROTO_UDP for UDP traffic (for a list of all available protocols see include/linux/in.h).

  * check: The checksum (16-bit field). The checksum is calculated only over the IPv4 header bytes.

  * saddr: Source IPv4 address, 32 bits.

  * daddr: Destination IPv4 address, 32 bits.
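To make the layout above concrete, here is a short userspace-style sketch (not kernel code, and the function name is hypothetical) that performs the same basic validation of the fixed part of the header that ip_rcv() performs: checking the version, the ihl, and the buffer length:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Validate the fixed part of an IPv4 header found at buf.
 * ihl is expressed in 4-byte words, so the header occupies ihl * 4 bytes. */
static bool ipv4_header_looks_valid(const uint8_t *buf, size_t len)
{
        uint8_t version, ihl;

        if (len < 20)                   /* must hold at least the fixed header */
                return false;
        version = buf[0] >> 4;          /* high nibble of the first byte */
        ihl = buf[0] & 0x0f;            /* low nibble, in 4-byte words */
        if (version != 4 || ihl < 5)    /* the same test ip_rcv() performs */
                return false;
        return len >= (size_t)ihl * 4;  /* options, if any, must fit too */
}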
In this section you have learned about the various IPv4 header members and their purposes. The initialization of the IPv4 protocol, which sets the callback to be invoked when an IPv4 packet is received, is discussed in the next section.

## IPv4 Initialization

IPv4 packets are packets with Ethernet type 0x0800 (the Ethernet type is stored in the last two bytes of the 14-byte Ethernet header). Each protocol should define a protocol handler, and each protocol should be initialized so that the network stack can handle packets that belong to it. So that you understand what causes received IPv4 packets to be handled by IPv4 methods, this section describes the registration of the IPv4 protocol handler:

static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .func = ip_rcv,
};

static int __init inet_init(void)
{
        ...
        dev_add_pack(&ip_packet_type);
        ...
}

(net/ipv4/af_inet.c)

The dev_add_pack() method adds the ip_rcv() method as a protocol handler for IPv4 packets. These are packets with Ethernet type 0x0800 (ETH_P_IP, defined in include/uapi/linux/if_ether.h). The inet_init() method performs various IPv4 initializations and is called during the boot phase.
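The same registration pattern can be used for any Ethernet type. Purely as an illustration (my_proto_rcv, my_packet_type, and MY_ETH_TYPE are hypothetical names, and 0x88b5 is an EtherType reserved for local experiments), a minimal kernel module that registers and unregisters its own protocol handler might look like this:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define MY_ETH_TYPE 0x88b5      /* IEEE 802 local experimental EtherType */

static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
                        struct packet_type *pt, struct net_device *orig_dev)
{
        pr_info("my_proto: received %u bytes on %s\n", skb->len, dev->name);
        kfree_skb(skb);         /* consume the packet */
        return NET_RX_SUCCESS;
}

static struct packet_type my_packet_type __read_mostly = {
        .type = cpu_to_be16(MY_ETH_TYPE),
        .func = my_proto_rcv,
};

static int __init my_proto_init(void)
{
        dev_add_pack(&my_packet_type);
        return 0;
}

static void __exit my_proto_exit(void)
{
        dev_remove_pack(&my_packet_type);
}

module_init(my_proto_init);
module_exit(my_proto_exit);
MODULE_LICENSE("GPL");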
The main functionality of the IPv4 protocol is divided into the Rx (receive) path and the Tx (transmit) path. Now that you have learned about the registration of the IPv4 protocol handler, you know which protocol handler manages IPv4 packets (the ip_rcv callback) and how this protocol handler is registered. You are now ready to learn about the IPv4 Rx path and how received IPv4 packets are handled. The Tx path is described in a later section, "Sending IPv4 Packets."

## Receiving IPv4 Packets

The main IPv4 receive method is the ip_rcv() method, which is the handler for all IPv4 packets (including multicasts and broadcasts). In fact, this method consists mostly of sanity checks. The real work is done in the ip_rcv_finish() method it invokes. Between the ip_rcv() method and the ip_rcv_finish() method is the NF_INET_PRE_ROUTING netfilter hook, invoked by calling the NF_HOOK macro (see the code snippet later in this section). In this chapter, you will encounter many invocations of the NF_HOOK macro—these are the netfilter hooks. The netfilter subsystem allows you to register callbacks at five points along the journey of a packet in the network stack. These points will be mentioned by their names shortly. The reason for adding the netfilter hooks is to enable loading the netfilter kernel modules at runtime. The NF_HOOK macro invokes the callbacks of a specified point, if such callbacks were registered. You might also encounter a variation of the NF_HOOK macro called NF_HOOK_COND. In some places in the network stack, the NF_HOOK_COND macro includes a Boolean parameter (the last parameter), which must be true for the hook to be executed (Chapter 9 discusses netfilter hooks). Note that the netfilter hooks can discard the packet, and in such a case it will not continue on its ordinary path. Figure 4-2 shows the receiving path (Rx) of a packet received by the network driver. This packet can either be delivered to the local machine or be forwarded to another host. It is the lookup in the routing table that determines which of these two options will take place.

Figure 4-2. Receiving IPv4 packets. For simplicity, the diagram does not include the fragmentation/defragmentation/options/IPsec methods

Figure 4-2 shows the paths for a received IPv4 packet. The packet is received by the IPv4 protocol handler, the ip_rcv() method (see the upper left side of the figure). First of all, a lookup in the routing subsystem is performed, immediately after the ip_rcv_finish() method is invoked. The result of the routing lookup determines whether the packet is for local delivery to the local host or is to be forwarded (routing lookup is explained in Chapter 5). If the packet is destined for the local host, it will first reach the ip_local_deliver() method, and subsequently it will reach the ip_local_deliver_finish() method. When the packet is to be forwarded, it will be handled by the ip_forward() method. Some netfilter hooks appear in the figure, like NF_INET_PRE_ROUTING and NF_INET_LOCAL_IN. Note that multicast traffic is handled by the ip_mr_input() method, discussed in the "Receiving IPv4 Multicast Packets" section later in this chapter. NF_INET_PRE_ROUTING, NF_INET_LOCAL_IN, NF_INET_FORWARD, and NF_INET_POST_ROUTING are four of the five entry points of the netfilter hooks. The fifth one, NF_INET_LOCAL_OUT, is mentioned in the "Sending IPv4 Packets" section later in this chapter. These five entry points are defined in include/uapi/linux/netfilter.h. Note that the same enum for these five hooks is also used in IPv6; for example, in the ipv6_rcv() method, a hook is registered on NF_INET_PRE_ROUTING (net/ipv6/ip6_input.c). Let's take a look at the ip_rcv() method:

int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{

First some sanity checks are performed, and I mention some of them in this section. The length of the IPv4 header (ihl) is measured in multiples of 4 bytes. The IPv4 header must be at least 20 bytes in size, which means that the ihl value must be at least 5. The version should be 4 (for IPv4). If one of these conditions is not met, the packet is dropped and the statistics (IPSTATS_MIB_INHDRERRORS) are updated.

if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

According to section 3.2.1.2 of RFC 1122, a host must verify the IPv4 header checksum on every received datagram and silently discard every datagram that has a bad checksum. This is done by calling the ip_fast_csum() method, which should return 0 on success. The IPv4 header checksum is calculated only over the IPv4 header bytes:

if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;
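The ip_fast_csum() method is a hand-optimized (often assembly) implementation of the standard Internet checksum (RFC 1071): a 16-bit one's-complement sum over the header, which yields 0 when the header, including its check field, is intact. A plain, unoptimized C sketch of the same computation (the function name is hypothetical):

#include <stddef.h>
#include <stdint.h>

/* RFC 1071 Internet checksum over an IPv4 header.
 * ihl is the header length in 4-byte words; the result is 0
 * for a header whose check field is valid. */
static uint16_t ipv4_checksum(const uint8_t *hdr, unsigned int ihl)
{
        uint32_t sum = 0;
        size_t i, len = ihl * 4;

        for (i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)(hdr[i] << 8 | hdr[i + 1]);
        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}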
Then the NF_HOOK macro is invoked:

return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
               ip_rcv_finish);

When the registered netfilter hook method returns NF_DROP, it means that the packet should be dropped, and the packet traversal does not continue. When the registered netfilter hook returns NF_STOLEN, it means that the packet was taken over by the netfilter subsystem, and the packet traversal does not continue. When the registered netfilter hook returns NF_ACCEPT, the packet continues its traversal. There are other return values (also termed verdicts) from netfilter hooks, like NF_QUEUE, NF_REPEAT, and NF_STOP, which are not discussed in this chapter. (As mentioned earlier, netfilter hooks are discussed in Chapter 9.) Let's assume for a moment that there are no netfilter callbacks registered at the NF_INET_PRE_ROUTING entry point, so the NF_HOOK macro will not invoke any netfilter callbacks and the ip_rcv_finish() method will be invoked. Let's take a look at the ip_rcv_finish() method:

static int ip_rcv_finish(struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct rtable *rt;

The skb_dst() method checks whether there is a dst object attached to the SKB; dst is an instance of dst_entry (include/net/dst.h) and represents the result of a lookup in the routing subsystem. The lookup is done according to the routing tables and the packet headers. The lookup in the routing subsystem also sets the input and/or the output callbacks of the dst. For example, if the packet is to be forwarded, the lookup in the routing subsystem will set the input callback to be ip_forward(). When the packet is destined to the local machine, the lookup in the routing subsystem will set the input callback to be ip_local_deliver(). For a multicast packet it can be ip_mr_input() under some conditions (I discuss multicast packets in the next section). The contents of the dst object determine how the packet will proceed in its journey; for example, when forwarding a packet, the decision about which input callback should be called when invoking dst_input(), or on which interface it should be transmitted, is made according to the dst. (I discuss the routing subsystem in depth in the next chapter.)

If there is no dst attached to the SKB, a lookup in the routing subsystem is performed by the ip_route_input_noref() method. If the lookup fails, the packet is dropped. Note that handling multicast packets is different from handling unicast packets (discussed in the section "Receiving IPv4 Multicast Packets" later in this chapter).

...

if (!skb_dst(skb)) {

Perform a lookup in the routing subsystem:

        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                       iph->tos, skb->dev);
        if (unlikely(err)) {
                if (err == -EXDEV)
                        NET_INC_STATS_BH(dev_net(skb->dev),
                                         LINUX_MIB_IPRPFILTER);
                goto drop;
        }
}

Note

The -EXDEV ("Cross-device link") error is returned by the __fib_validate_source() method under certain circumstances when the Reverse Path Filter (RPF) is set. The RPF can be set via an entry in the procfs (/proc/sys/net/ipv4/conf/all/rp_filter and /proc/sys/net/ipv4/conf/<deviceName>/rp_filter). In such cases the packet is dropped, the statistics (LINUX_MIB_IPRPFILTER) are updated, and the method returns NET_RX_DROP. Note that you can display the LINUX_MIB_IPRPFILTER counter by looking at the IPReversePathFilter column in the output of cat /proc/net/netstat.
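For reference, the dst_input() method itself is a one-line dispatcher; in the kernel sources of this era (include/net/dst.h) it simply jumps through the input callback that the routing lookup installed:

static inline int dst_input(struct sk_buff *skb)
{
        /* Invokes ip_local_deliver(), ip_forward(), ip_mr_input(), and so on,
         * depending on what the routing lookup stored in the dst. */
        return skb_dst(skb)->input(skb);
}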
Now a check is performed to see whether the IPv4 header includes options. Because the length of the IPv4 header (ihl) is measured in multiples of 4 bytes, if it is greater than 5, this means that the header includes options, so the ip_rcv_options() method should be invoked to handle them. Handling IP options is discussed in depth in the "IP Options" section later in this chapter. Note that the ip_rcv_options() method can fail, as you will shortly see. If it is a multicast entry or a broadcast entry, the IPSTATS_MIB_INMCAST statistics or the IPSTATS_MIB_INBCAST statistics is updated, respectively. Then the dst_input() method is invoked; as shown earlier, it simply invokes the input callback method by calling skb_dst(skb)->input(skb):

if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
                           skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
                           skb->len);

return dst_input(skb);

In this section you learned about the various stages in the reception of IPv4 packets: the sanity checks that are performed, the lookup in the routing subsystem, and the ip_rcv_finish() method, which performs the actual work. You also learned which method is called when the packet should be forwarded and which method is called when the packet is for local delivery. IPv4 multicasting is a special case. Handling the reception of IPv4 multicast packets is discussed in the next section.

## Receiving IPv4 Multicast Packets

The ip_rcv() method is also a handler for multicast packets. As mentioned earlier, after some sanity checks, it invokes the ip_rcv_finish() method, which performs a lookup in the routing subsystem by calling ip_route_input_noref(). In the ip_route_input_noref() method, first a check is performed to see whether the local machine belongs to a multicast group of the destination multicast address, by calling the ip_check_mc_rcu() method. If it does, or if the local machine is a multicast router (CONFIG_IP_MROUTE is set), the ip_route_input_mc() method is invoked; let's take a look at the code:

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        int res;

        rcu_read_lock();
        ...
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                if (in_dev) {
                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
                                                  ip_hdr(skb)->protocol);
                        if (our
#ifdef CONFIG_IP_MROUTE
                            ||
                            (!ipv4_is_local_multicast(daddr) &&
                             IN_DEV_MFORWARD(in_dev))
#endif
                           ) {
                                int res = ip_route_input_mc(skb, daddr, saddr,
                                                            tos, dev, our);
                                rcu_read_unlock();
                                return res;
                        }
                }
                ...
        }
        ...

Let's look further into the ip_route_input_mc() method. If the local machine belongs to a multicast group of the destination multicast address (the value of the variable our is 1), then the input callback of the dst is set to be ip_local_deliver. If the local host is a multicast router and IN_DEV_MFORWARD(in_dev) is set, then the input callback of the dst is set to be ip_mr_input. The ip_rcv_finish() method, which calls dst_input(skb), thus invokes either the ip_local_deliver() method or the ip_mr_input() method, according to the input callback of the dst.
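Whether the local machine belongs to a multicast group (the our check above) is ultimately driven by userspace: an application joins a group with the standard IP_ADD_MEMBERSHIP socket option. The following is a minimal userspace sketch; the group address 239.1.2.3 is an arbitrary, administratively scoped example:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct ip_mreq mreq;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&mreq, 0, sizeof(mreq));
        inet_pton(AF_INET, "239.1.2.3", &mreq.imr_multiaddr);
        mreq.imr_interface.s_addr = htonl(INADDR_ANY);
        if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
                       &mreq, sizeof(mreq)) < 0)
                perror("IP_ADD_MEMBERSHIP");
        /* ... receive group traffic here ... */
        close(fd);
        return 0;
}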
The IN_DEV_MFORWARD macro checks the procfs multicast forwarding entry. Note that the procfs multicast forwarding entry, /proc/sys/net/ipv4/conf/all/mc_forwarding, is a read-only entry (as opposed to the IPv4 unicast procfs forwarding entry), so you cannot set it simply by running from the command line: echo 1 > /proc/sys/net/ipv4/conf/all/mc_forwarding. Starting the pimd daemon, for example, sets it to 1, and stopping the daemon sets it to 0. pimd is a lightweight, standalone PIM-SM v2 multicast routing daemon. If you are interested in learning about multicast routing daemon implementation, you might want to look into the pimd source code at https://github.com/troglobit/pimd/ :

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        ...
        if (our) {
                rth->dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        ...

The multicast layer holds a data structure called the Multicast Forwarding Cache (MFC). I don't discuss the details of the MFC or of the ip_mr_input() method here (I discuss them in Chapter 6). What is important in this context is that if a valid entry is found in the MFC, the ip_mr_forward() method is called. The ip_mr_forward() method performs some checks and eventually calls the ipmr_queue_xmit() method. In the ipmr_queue_xmit() method, the ttl is decreased and the checksum is updated by calling the ip_decrease_ttl() method (the same is done in the ip_forward() method, as you will see later in this chapter). Then the ipmr_forward_finish() method is invoked by calling the NF_INET_FORWARD NF_HOOK macro (let's assume that there are no registered IPv4 netfilter hooks on NF_INET_FORWARD):

static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
                            struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        ...
        ip_decrease_ttl(ip_hdr(skb));
        ...
        NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;
}

The ipmr_forward_finish() method is very short and is shown here in its entirety. All it does is update the statistics, call the ip_forward_options() method if there are options in the IPv4 header (IP options are described in the next section), and call the dst_output() method:

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

This section discussed how receiving IPv4 multicast packets is handled. pimd was mentioned as an example of a multicast routing daemon, which interacts with the kernel in multicast packet forwarding. The next section describes the various IP options, which enable using special features of the network stack, such as tracking the route of a packet, tracking timestamps of packets, and specifying network nodes that a packet should traverse. I also discuss how these IP options are handled in the network stack.

## IP Options

The IP options field of the IPv4 header is optional and is not often used, for security reasons and because of processing overhead. Which options might be helpful?
Suppose, for example, that your packets are being dropped by a certain firewall. You may be able to specify a different route with the Strict or Loose Source Routing options. Or if you want to find out the packets' path to some destination address, you can use the Record Route option.

The IPv4 header may contain zero, one, or more options. The IPv4 header size is 20 bytes when there are no options. The length of the IP options field can be 40 bytes at most. The reason the maximum IPv4 header length is 60 bytes is that the IPv4 header length is a 4-bit field, which expresses the length in multiples of 4 bytes; hence the maximum value of the field is 15, which gives an IPv4 maximum header length of 60 bytes. When more than one option is used, the options are simply concatenated one after the other. The IPv4 header must be aligned to a 4-byte boundary, so sometimes padding is needed. The following RFCs discuss IP options: 781 (Timestamp Option), 791, 1063, 1108, 1393 (Traceroute Using an IP Option), and 2113 (IP Router Alert Option). There are two forms of IP options:

  * Single byte option (option type): The "End of Option List" and "No Operation" options are the only single byte options.

  * Multibyte option: When a multibyte option is used, after the option type byte come the following three fields:

    * Length (1 byte): Length of the option in bytes.

    * Pointer (1 byte): Offset from option start.

    * Option data: This is a space where intermediate hosts can store data, for example, timestamps or IP addresses.

The option type byte is shown in Figure 4-3.

Figure 4-3. Option type

When set, the copied flag means that the option should be copied into all fragments. When it is not set, the option should be copied only into the first fragment. The IPOPT_COPIED macro checks whether the copied flag of a specified IP option is set. It is used in the ip_options_fragment() method for detecting options that may not be copied and for inserting IPOPT_NOOP instead. The ip_options_fragment() method is discussed later in this section.

The option class can be one of the following 4 values:

  * 00: control class (IPOPT_CONTROL)

  * 01: reserved1 (IPOPT_RESERVED1)

  * 10: debugging and measurement (IPOPT_MEASUREMENT)

  * 11: reserved2 (IPOPT_RESERVED2)

In the Linux network stack, only the IPOPT_TIMESTAMP option belongs to the debugging and measurement class. All the other options are of the control class.

The Option Number specifies an option by a unique number; possible values are 0–31, but not all are used by the Linux kernel.

Table 4-1 shows all the options according to their Linux symbol, option number, class, and copied flag.

Table 4-1. Options Table

Linux Symbol | Option Number | Class | Copied Flag | Description
---|---|---|---|---
IPOPT_END | 0 | 0 | 0 | End of Option List
IPOPT_NOOP | 1 | 0 | 0 | No Operation
IPOPT_SEC | 2 | 0 | 1 | Security
IPOPT_LSRR | 3 | 0 | 1 | Loose Source Record Route
IPOPT_TIMESTAMP | 4 | 2 | 0 | Timestamp
IPOPT_CIPSO | 6 | 0 | 1 | Commercial Internet Protocol Security Option
IPOPT_RR | 7 | 0 | 0 | Record Route
IPOPT_SID | 8 | 0 | 1 | Stream ID
IPOPT_SSRR | 9 | 0 | 1 | Strict Source Record Route
IPOPT_RA | 20 | 0 | 1 | Router Alert

The option name (IPOPT_*) declarations are in include/uapi/linux/ip.h.

The Linux network stack does not include all the IP options. For a full list, see www.iana.org/assignments/ip-parameters/ip-parameters.xml .
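The option type byte is decoded with a few helper macros defined in include/uapi/linux/ip.h. The following standalone snippet reuses the same bit layout to take an option type byte apart; the value 148 is IPOPT_RA (copied flag set, class 0, option number 20):

#include <stdint.h>
#include <stdio.h>

/* The same bit layout the kernel uses in include/uapi/linux/ip.h:
 * bit 7 is the copied flag, bits 5-6 the class, bits 0-4 the number. */
#define IPOPT_COPY        0x80
#define IPOPT_CLASS_MASK  0x60
#define IPOPT_NUMBER_MASK 0x1f

#define IPOPT_COPIED(o) ((o) & IPOPT_COPY)
#define IPOPT_CLASS(o)  ((o) & IPOPT_CLASS_MASK)
#define IPOPT_NUMBER(o) ((o) & IPOPT_NUMBER_MASK)

int main(void)
{
        uint8_t ra = 148;       /* IPOPT_RA */

        printf("copied=%d class=%d number=%d\n",
               IPOPT_COPIED(ra) ? 1 : 0, IPOPT_CLASS(ra) >> 5,
               IPOPT_NUMBER(ra));
        return 0;
}

Running it prints copied=1 class=0 number=20, matching the IPOPT_RA row of Table 4-1.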
I will briefly describe the following five options, and then describe the Timestamp option and the Record Route option in depth:

  * End of Option List (IPOPT_END): A 1-byte option used to indicate the end of the options field. This is a single zero byte option (all its bits are '0'). There can be no IP options after it.

  * No Operation (IPOPT_NOOP): A 1-byte option used for internal padding, for alignment.

  * Security (IPOPT_SEC): This option provides a way for hosts to send security, handling restrictions, and TCC (closed user group) parameters. See RFC 791 and RFC 1108. It was initially intended to be used by military applications.

  * Loose Source Record Route (IPOPT_LSRR): This option specifies a list of routers that the packet should traverse. Between each two adjacent nodes in the list there can be intermediate routers that do not appear in the list, but the order should be kept.

  * Commercial Internet Protocol Security Option (IPOPT_CIPSO): CIPSO is an IETF draft that has been adopted by several vendors. It deals with a network labeling standard. CIPSO labeling of a socket means adding the CIPSO IP option to all packets leaving the system through that socket. This option is validated upon reception of the packet. For more info about the CIPSO option, see Documentation/netlabel/draft-ietf-cipso-ipsecurity-01.txt and Documentation/netlabel/cipso_ipv4.txt.

### Timestamp Option

Timestamp (IPOPT_TIMESTAMP): The Timestamp option is specified in RFC 781, "A Specification of the Internet Protocol (IP) Timestamp Option." This option stores timestamps of hosts along the packet route. The stored timestamp is a 32-bit timestamp in milliseconds since midnight UTC of the current day. In addition, it can also store the addresses of all hosts along the packet route, or timestamps of only selected hosts along the route. The maximum Timestamp option length is 40 bytes. The Timestamp option is not copied for fragments; it is carried only in the first fragment. The Timestamp option begins with three bytes of option type, length, and pointer (offset). The higher 4 bits of the fourth byte are the overflow counter, which is incremented in each hop where there is no available space to store the required data. When the overflow counter exceeds 15, an ICMP message of Parameter Problem is sent back. The lower 4 bits are the flag. The value of the flag can be one of the following:

  * 0: Timestamp only (IPOPT_TS_TSONLY)

  * 1: Timestamps and addresses (IPOPT_TS_TSANDADDR)

  * 3: Timestamps of specified hops only (IPOPT_TS_PRESPEC)

Note

You can use the command-line ping utility with the Timestamp option and with the three subtypes mentioned earlier:

ping -T tsonly (IPOPT_TS_TSONLY)

ping -T tsandaddr (IPOPT_TS_TSANDADDR)

ping -T tsprespec (IPOPT_TS_PRESPEC)

Figure 4-4 shows the Timestamp option with timestamp only (the IPOPT_TS_TSONLY flag is set). Each router on the path adds its timestamp. When there is no more space, the overflow counter is incremented.

Figure 4-4. Timestamp option (with timestamp only, flag = 0)

Figure 4-5 shows the Timestamp option with timestamps and addresses (the IPOPT_TS_TSANDADDR flag is set). Each router on the path adds its IPv4 address and its timestamp. Again, when there is no more space, the overflow counter is incremented.

Figure 4-5. Timestamp option (with timestamps and addresses, flag = 1)

Figure 4-6 shows the Timestamp option with timestamps of prespecified hops only (the IPOPT_TS_PRESPEC flag is set). Each router on the path adds its timestamp only if it is in the prespecified list. Again, when there is no more space, the overflow counter is incremented.

Figure 4-6. Timestamp option (with timestamps of specified hops only, flag = 3)

### Record Route Option

Record Route (IPOPT_RR): The route of a packet is recorded. Each router on the way adds its address (see Figure 4-7). The length is set by the sending device. The command-line utility ping -R uses the Record Route IP option. Note that the IPv4 header is only large enough for nine such routes (or even fewer, if more options are used). When the header is full and there is no room to insert an additional address, the datagram is forwarded without inserting the address into the IP options. See section 3.1 of RFC 791.

Figure 4-7. Record Route option

Though ping -R uses the Record Route IP option, in many cases, if you try it, you will not get the expected result of recording all the network nodes along the way, because for security reasons many network nodes ignore this IP option. The manpage of ping mentions this explicitly. From man ping:

...

-R

Includes the RECORD_ROUTE option in the ECHO_REQUEST packet and displays the route buffer on returned packets.

...

Many hosts ignore or discard this option.

...

  * Stream ID (IPOPT_SID): This option provides a way for the 16-bit SATNET stream identifier to be carried through networks that do not support the stream concept.

  * Strict Source Record Route (IPOPT_SSRR): This option specifies a list of routers that the packet should traverse. The order should be kept, and no changes in traversal are permitted. Many routers block the Loose Source Record Route (LSRR) and Strict Source Record Route (SSRR) options for security reasons.

  * Router Alert (IPOPT_RA): The IP Router Alert option can be used to notify transit routers to more closely examine the contents of an IP packet. This is useful, for example, for new protocols, but requires relatively complex processing in routers along the path. Specified in RFC 2113, "IP Router Alert Option."

IP options are represented in Linux by the ip_options structure:

struct ip_options {
        __be32          faddr;
        __be32          nexthop;
        unsigned char   optlen;
        unsigned char   srr;
        unsigned char   rr;
        unsigned char   ts;
        unsigned char   is_strictroute:1,
                        srr_is_hit:1,
                        is_changed:1,
                        rr_needaddr:1,
                        ts_needtime:1,
                        ts_needaddr:1;
        unsigned char   router_alert;
        unsigned char   cipso;
        unsigned char   __pad2;
        unsigned char   __data[0];
};

(include/net/inet_sock.h)

Here are short descriptions of the members of the ip_options structure:

  * faddr: Saved first hop address. Set in ip_options_compile() when handling loose and strict routing, when the method was not invoked from the Rx path (the SKB is NULL).

  * nexthop: Saved nexthop address in LSRR and SSRR.

  * optlen: The option length, in bytes. Cannot exceed 40 bytes.

  * is_strictroute: A flag specifying usage of strict source routing. The flag is set in the ip_options_compile() method when parsing the strict route option type (IPOPT_SSRR); note that it is not set for the loose route option (IPOPT_LSRR).

  * srr_is_hit: A flag specifying that the packet destination address was the local host. The srr_is_hit flag is set in ip_options_rcv_srr().

  * is_changed: The IP checksum is not valid anymore (the flag is set when one of the IP options is changed).
  * rr_needaddr: Need to record the IPv4 address of the outgoing device. The flag is set for the Record Route option (IPOPT_RR).

  * ts_needtime: Need to record a timestamp. The flag is set for these flags of the Timestamp IP option: IPOPT_TS_TSONLY, IPOPT_TS_TSANDADDR, and IPOPT_TS_PRESPEC (see the detailed explanation about the difference between these flags later in this section).

  * ts_needaddr: Need to record the IPv4 address of the outgoing device. This flag is set only when the IPOPT_TS_TSANDADDR flag is set, and it indicates that the IPv4 address of each node along the route of the packet should be added.

  * router_alert: Set in the ip_options_compile() method when parsing a Router Alert option (IPOPT_RA).

  * __data[0]: A buffer to store options that are received from userspace by setsockopt(). See ip_options_get_from_user() and ip_options_get_finish() (net/ipv4/ip_options.c).

Let's take a look at the ip_rcv_options() method:

static inline bool ip_rcv_options(struct sk_buff *skb)
{
        struct ip_options *opt;
        const struct iphdr *iph;
        struct net_device *dev = skb->dev;
        ...

Fetch the IPv4 header from the SKB:

        iph = ip_hdr(skb);

Fetch the ip_options object from the inet_skb_parm object which is associated with the SKB:

        opt = &(IPCB(skb)->opt);

Calculate the expected options length:

        opt->optlen = iph->ihl*4 - sizeof(struct iphdr);

Call the ip_options_compile() method to build an ip_options object out of the SKB:

        if (ip_options_compile(dev_net(dev), opt, skb)) {
                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
                goto drop;
        }

When the ip_options_compile() method is called in the Rx path (from the ip_rcv_options() method), it parses the IPv4 header of the specified SKB and builds an ip_options object out of it, according to the IPv4 header content, after verifying the validity of the options. The ip_options_compile() method can also be invoked from the ip_options_get_finish() method when getting options from userspace via the setsockopt() system call with IPPROTO_IP and IP_OPTIONS. In this case, the data is copied from userspace into opt->__data, and the third parameter of ip_options_compile(), the SKB, is NULL; the ip_options_compile() method builds the ip_options object in such a case from opt->__data. If some error is found while parsing the options, and it is in the Rx path (the ip_options_compile() method was invoked from ip_rcv_options()), a "Parameter Problem" ICMPv4 message (ICMP_PARAMETERPROB) is sent back. In case of error, -EINVAL is returned, regardless of how the method was invoked. Naturally, it is more convenient to work with the ip_options object than with the raw IPv4 header, because access to the IP options fields is much simpler this way. In the Rx path, the ip_options object that the ip_options_compile() method builds is stored in the control buffer (cb) of the SKB; this is done by setting the opt object to &(IPCB(skb)->opt). The IPCB(skb) macro is defined like this:

#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))

And the inet_skb_parm structure (which includes an ip_options object) is defined like this:

struct inet_skb_parm {
        struct ip_options opt;  /* Compiled IP options */
        unsigned char     flags;
        u16               frag_max_size;
};

(include/net/ip.h)

So &(IPCB(skb)->opt) points to the ip_options object inside the inet_skb_parm object. I will not delve into all the small, tedious technical details of parsing the IPv4 header in the ip_options_compile() method in this book, because there is an abundance of such details and they are self-explanatory.
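As mentioned above, options can also be handed to the kernel from userspace via setsockopt() with IPPROTO_IP and IP_OPTIONS. Purely as an illustration, this sketch attaches a Record Route option (type 7, length 39: three header bytes plus room for nine IPv4 addresses) to a UDP socket; every packet subsequently sent on the socket carries the option:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        /* Record Route option: type 7, length 39 (3 header bytes +
         * room for 9 IPv4 addresses), pointer 4 (first free slot). */
        unsigned char opts[39];
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(opts, 0, sizeof(opts));
        opts[0] = 7;    /* IPOPT_RR */
        opts[1] = 39;   /* option length */
        opts[2] = 4;    /* offset of the first address slot */
        if (setsockopt(fd, IPPROTO_IP, IP_OPTIONS, opts, sizeof(opts)) < 0)
                perror("setsockopt(IP_OPTIONS)");
        close(fd);
        return 0;
}

Behind the scenes, this is exactly the path where ip_options_get_from_user() copies the buffer into opt->__data and ip_options_compile() is invoked with a NULL SKB.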
I will discuss briefly how the ip_options_compile() method parses some single byte options, like IPOPT_END and IPOPT_NOOP, and some more complex options, like IPOPT_RR and IPOPT_TIMESTAMP, in the Rx path, and show in the following code snippet some examples of which checks are done in this method and how it is implemented:

int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb)
{
        ...
        unsigned char *pp_ptr = NULL;
        struct rtable *rt = NULL;
        unsigned char *optptr;
        unsigned char *iph;
        int optlen, l;

To start the parsing process, the optptr pointer should point to the start of the IP options, and all the options are iterated over in a loop. In the Rx path (when the ip_options_compile() method is invoked from the ip_rcv_options() method), the SKB that was received in the ip_rcv() method is passed as a parameter to ip_options_compile() and, needless to say, cannot be NULL. In such a case, the IP options start immediately after the initial fixed part (20 bytes) of the IPv4 header. When ip_options_compile() is invoked from ip_options_get_finish(), the optptr pointer is set to opt->__data, because the ip_options_get_from_user() method copied the options that were sent from userspace into opt->__data. To be accurate, I should mention that if alignment is needed, the ip_options_get_finish() method also writes into opt->__data (it writes IPOPT_END in the proper place).

if (skb != NULL) {
        rt = skb_rtable(skb);
        optptr = (unsigned char *)&(ip_hdr(skb)[1]);
} else
        optptr = opt->__data;

In this case, iph = ip_hdr(skb) cannot be used instead, because the case when the SKB is NULL should be considered. The following assignment is correct also for the non-Rx path:

iph = optptr - sizeof(struct iphdr);

The variable l is initialized to the options length (it can be 40 bytes at most). It is decremented by the length of the current option in each iteration of the following for loop:

for (l = opt->optlen; l > 0; ) {
        switch (*optptr) {

If an IPOPT_END option is encountered, it indicates that this is the end of the options list—there must be no other option after it. In such a case, IPOPT_END is written over each byte that differs from IPOPT_END, until the end of the options list. The is_changed Boolean flag should also be set, because it indicates that the IPv4 header was changed (and as a result, recalculation of the checksum is pending—there is no justification for calculating the checksum right now or inside the for loop, because there might be other changes in the IPv4 header during the loop):

        case IPOPT_END:
                for (optptr++, l--; l>0; optptr++, l--) {
                        if (*optptr != IPOPT_END) {
                                *optptr = IPOPT_END;
                                opt->is_changed = 1;
                        }
                }
                goto eol;

If an option type of No Operation (IPOPT_NOOP), which is a single byte option, is encountered, simply decrement l by 1, increment optptr by 1, and move forward to the next option type:

        case IPOPT_NOOP:
                l--;
                optptr++;
                continue;
        }

optlen is set to the length of the option being read (as optptr[1] holds the option length):

        optlen = optptr[1];

The No Operation (IPOPT_NOOP) option and the End of Option List (IPOPT_END) option are the only single byte options. All other options are multibyte options and must have at least two bytes (option type and option length). Now a check is made that there are at least two option bytes and that the option list length is not exceeded.
If some error is found, the pp_ptr pointer is set to point to the source of the problem, and the loop is exited. In the Rx path, an ICMPv4 "Parameter Problem" message is sent back, passing as a parameter the offset where the problem occurred, so that the other side can analyze the problem:

        if (optlen < 2 || optlen > l) {
                pp_ptr = optptr;
                goto error;
        }

        switch (*optptr) {
        case IPOPT_SSRR:
        case IPOPT_LSRR:
                ...
        case IPOPT_RR:

The option length of the Record Route option must be at least 3 bytes: option type, option length, and pointer (offset):

                if (optlen < 3) {
                        pp_ptr = optptr + 1;
                        goto error;
                }

The pointer (offset) of the Record Route option must be at least 4, because the space reserved for the address list must start after the three initial bytes (option type, option length, and pointer):

                if (optptr[2] < 4) {
                        pp_ptr = optptr + 2;
                        goto error;
                }

                if (optptr[2] <= optlen) {

If the offset (optptr[2]) plus 3 exceeds the option length, there is no room for another 4-byte address, and this is an error:

                        if (optptr[2]+3 > optlen) {
                                pp_ptr = optptr + 2;
                                goto error;
                        }
                        if (rt) {
                                spec_dst_fill(&spec_dst, skb);

Copy the IPv4 address into the Record Route buffer:

                                memcpy(&optptr[optptr[2]-1], &spec_dst, 4);

Set the is_changed Boolean flag, which indicates that the IPv4 header was changed (recalculation of the checksum is pending):

                                opt->is_changed = 1;
                        }

Increment the pointer (offset) by 4 to point to the next slot in the Record Route buffer (each IPv4 address is 4 bytes):

                        optptr[2] += 4;

Set the rr_needaddr flag (this flag is checked in the ip_forward_options() method):

                        opt->rr_needaddr = 1;
                }
                opt->rr = optptr - iph;
                break;

        case IPOPT_TIMESTAMP:
                ...

The option length of the Timestamp option must be at least 4 bytes: option type, option length, and pointer (offset); the fourth byte is divided into two fields: the higher 4 bits are the overflow counter, which is incremented in each hop where there is no available space to store the required data, and the lower 4 bits are the flag (timestamp only, timestamps and addresses, or timestamps of specified hops only):

                if (optlen < 4) {
                        pp_ptr = optptr + 1;
                        goto error;
                }

optptr[2] is the pointer (offset). Because, as stated earlier, each Timestamp option starts with 4 fixed bytes, the pointer (offset) must be at least 5:

                if (optptr[2] < 5) {
                        pp_ptr = optptr + 2;
                        goto error;
                }

                if (optptr[2] <= optlen) {
                        unsigned char *timeptr = NULL;

                        if (optptr[2]+3 > optptr[1]) {
                                pp_ptr = optptr + 2;
                                goto error;
                        }

In the following switch command, the value of optptr[3]&0xF is checked.
It is the flag (the 4 lower bits of the fourth byte) of the Timestamp option:

                        switch (optptr[3]&0xF) {
                        case IPOPT_TS_TSONLY:
                                if (skb)
                                        timeptr = &optptr[optptr[2]-1];
                                opt->ts_needtime = 1;

For the Timestamp option with the timestamp only flag (IPOPT_TS_TSONLY), 4 bytes are needed, so the pointer (offset) is incremented by 4:

                                optptr[2] += 4;
                                break;
                        case IPOPT_TS_TSANDADDR:
                                if (optptr[2]+7 > optptr[1]) {
                                        pp_ptr = optptr + 2;
                                        goto error;
                                }
                                if (rt) {
                                        spec_dst_fill(&spec_dst, skb);
                                        memcpy(&optptr[optptr[2]-1],
                                               &spec_dst, 4);
                                        timeptr = &optptr[optptr[2]+3];
                                }
                                opt->ts_needaddr = 1;
                                opt->ts_needtime = 1;

For the Timestamp option with the timestamps and addresses flag (IPOPT_TS_TSANDADDR), 8 bytes are needed, so the pointer (offset) is incremented by 8:

                                optptr[2] += 8;
                                break;
                        case IPOPT_TS_PRESPEC:
                                if (optptr[2]+7 > optptr[1]) {
                                        pp_ptr = optptr + 2;
                                        goto error;
                                }
                                {
                                        __be32 addr;

                                        memcpy(&addr, &optptr[optptr[2]-1], 4);
                                        if (inet_addr_type(net, addr) == RTN_UNICAST)
                                                break;
                                        if (skb)
                                                timeptr = &optptr[optptr[2]+3];
                                }
                                opt->ts_needtime = 1;

For the Timestamp option with the timestamps of prespecified hops flag (IPOPT_TS_PRESPEC), 8 bytes are needed, so the pointer (offset) is incremented by 8:

                                optptr[2] += 8;
                                break;
                        default:
                                ...
                        }
                ...

After the ip_options_compile() method has built the ip_options object, strict routing is handled. First, a check is performed to see whether the device supports source routing. This means that /proc/sys/net/ipv4/conf/all/accept_source_route is set and that /proc/sys/net/ipv4/conf/<deviceName>/accept_source_route is set. If these conditions are not met, the packet is dropped:

...
if (unlikely(opt->srr)) {
        struct in_device *in_dev = __in_dev_get_rcu(dev);

        if (in_dev) {
                if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
                        ...
                        goto drop;
                }
        }
        if (ip_options_rcv_srr(skb))
                goto drop;
}

Let's take a look at the ip_options_rcv_srr() method (again, I will focus on the important points, not the little details). The list of source route addresses is iterated over. During the parsing process, some sanity checks are made in the loop to see whether there are errors. When the first nonlocal address is encountered, the loop is exited, and the following actions take place:

  * The srr_is_hit flag of the IP options object is set (opt->srr_is_hit = 1).

  * opt->nexthop is set to the nexthop address that was found.

  * The opt->is_changed flag is set to 1.

The packet should be forwarded. When the ip_forward_finish() method is reached, the ip_forward_options() method is called. In this method, if the srr_is_hit flag of the IP options object is set, the daddr of the IPv4 header is changed to opt->nexthop, the offset is incremented by 4 (to point to the next address in the source route addresses list), and—because the IPv4 header was changed—the checksum is recalculated by calling the ip_send_check() method.

### IP Options and Fragmentation

When describing the option type at the beginning of this section, I mentioned a copied flag in the option type byte, which indicates whether or not the option should be copied into all fragments. Handling IP options in fragmentation is done by the ip_options_fragment() method, which is invoked from the method that prepares fragments, ip_fragment(). It is called only for the first fragment.
Let's take a look at the ip_options_fragment() method, which is very simple:

void ip_options_fragment(struct sk_buff *skb)
{
        unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
        struct ip_options *opt = &(IPCB(skb)->opt);
        int l = opt->optlen;
        int optlen;

The while loop simply iterates over the options, reading each option type. optptr is a pointer to the options list (which starts at the end of the first 20 bytes of the IPv4 header). l is the remaining size of the options list, which is decremented in each loop iteration:

        while (l > 0) {
                switch (*optptr) {

When the option type is IPOPT_END, which terminates the options list, reading the options is finished:

                case IPOPT_END:
                        return;
                case IPOPT_NOOP:

When the option type is IPOPT_NOOP, used for padding between options, the optptr pointer is incremented by 1, l is decremented, and the next option is processed:

                        l--;
                        optptr++;
                        continue;
                }

Perform a sanity check on the option length:

                optlen = optptr[1];
                if (optlen < 2 || optlen > l)
                        return;

Check whether the option should be copied; if not, simply put one or several IPOPT_NOOP options in its place with the memset() function. The number of IPOPT_NOOP bytes that memset() writes is the size of the option that was read, namely optlen:

                if (!IPOPT_COPIED(*optptr))
                        memset(optptr, IPOPT_NOOP, optlen);

Now move to the next option:

                l -= optlen;
                optptr += optlen;
        }

IPOPT_TIMESTAMP and IPOPT_RR are options for which the copied flag is 0 (see Table 4-1). They are replaced by IPOPT_NOOP in the loop you saw earlier, and their relevant fields in the ip_options object are reset to 0:

        opt->ts = 0;
        opt->rr = 0;
        opt->rr_needaddr = 0;
        opt->ts_needaddr = 0;
        opt->ts_needtime = 0;
}

(net/ipv4/ip_options.c)

In this section you have learned how the ip_rcv_options() method handles the reception of packets with IP options and how IP options are parsed by the ip_options_compile() method. Fragmentation with IP options was also discussed. The next section covers the process of building IP options, which involves setting the IP options of an IPv4 header based on a specified ip_options object.

### Building IP Options

The ip_options_build() method can be thought of as the reverse of the ip_options_compile() method you saw earlier in this chapter. It takes an ip_options object as an argument and writes its content to the IPv4 header.
Let's take a look at it:

void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
                      __be32 daddr, struct rtable *rt, int is_frag)
{
        unsigned char *iph = skb_network_header(skb);

        memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
        memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
        opt = &(IPCB(skb)->opt);

        if (opt->srr)
                memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);

        if (!is_frag) {
                if (opt->rr_needaddr)
                        ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
                if (opt->ts_needaddr)
                        ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
                if (opt->ts_needtime) {
                        struct timespec tv;
                        __be32 midtime;

                        getnstimeofday(&tv);
                        midtime = htonl((tv.tv_sec % 86400) *
                                        MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
                        memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
                }
                return;
        }
        if (opt->rr) {
                memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
                opt->rr = 0;
                opt->rr_needaddr = 0;
        }
        if (opt->ts) {
                memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
                opt->ts = 0;
                opt->ts_needaddr = opt->ts_needtime = 0;
        }
}

The ip_forward_options() method (net/ipv4/ip_options.c) handles the IP options of forwarded packets. In this method the Record Route and Strict Source Record Route options are handled, and the ip_send_check() method is invoked to calculate the checksum for packets whose IPv4 header was changed (the opt->is_changed flag is set) and to reset the opt->is_changed flag to 0.

My discussion of the Rx path is now finished. The next section covers the Tx path: what happens when IPv4 packets are sent.

## Sending IPv4 Packets

The IPv4 layer provides the means for the layer above it, the transport layer (L4), to send packets by passing these packets to the link layer (L2). I discuss how that is implemented in this section, and you'll see some differences between handling the transmission of TCPv4 packets and the transmission of UDPv4 packets in IPv4. There are two main methods for sending IPv4 packets from Layer 4, the transport layer. The first one is the ip_queue_xmit() method, used by transport protocols that handle fragmentation by themselves, like TCPv4. The ip_queue_xmit() method is not the only transmission method used by TCPv4, which also uses the ip_build_and_send_pkt() method, for example, to send SYN ACK messages (see the tcp_v4_send_synack() method implementation in net/ipv4/tcp_ipv4.c). The second method is the ip_append_data() method, used by transport protocols that do not handle fragmentation, like the UDPv4 protocol or the ICMPv4 protocol. The ip_append_data() method does not send any packet—it only prepares the packet. The ip_push_pending_frames() method actually starts the transmission process, and it is used by ICMPv4 or raw sockets, for example. Calling ip_push_pending_frames() starts the transmission by calling the ip_send_skb() method, which eventually calls the ip_local_out() method. The ip_push_pending_frames() method was used for carrying out the transmission in UDPv4 prior to kernel 2.6.39; with the new ip_finish_skb API in 2.6.39, the ip_send_skb() method is used instead. Both methods are implemented in net/ipv4/ip_output.c.
There are cases where the dst_output() method is called directly, without using the ip_queue_xmit() method or the ip_append_data() method; for example, when sending with a raw socket that uses the IP_HDRINCL socket option, the kernel does not need to prepare an IPv4 header, because the application builds the header on its own. Userspace applications that build the IPv4 header on their own use the IP_HDRINCL socket option. For example, the well-known ping of iputils and nping of nmap both enable the user to set the ttl of the IPv4 header, like this:

```
ping -t ttl ipDestAddress
```

or:

```
nping --ttl ttl ipDestAddress
```

Sending packets from raw sockets whose IP_HDRINCL socket option is set is done like this:

```c
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
			   void *from, size_t length,
			   struct rtable **rtp,
			   unsigned int flags)
{
	...
	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		      rt->dst.dev, dst_output);
	...
}
```

Figure 4-8 shows the paths for sending IPv4 packets from the transport layer.

Figure 4-8. Sending IPv4 packets

In Figure 4-8 you can see the different paths for transmitted packets that come from the transport layer (L4); these packets are handled by the ip_queue_xmit() method or by the ip_append_data() method.

Let's start with the ip_queue_xmit() method, which is the simpler of the two:

```c
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
	...
	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
```

The rtable object is the result of a lookup in the routing subsystem. First I discuss the case where the rtable instance is NULL and a lookup in the routing subsystem must be performed. If the strict routing option flag is set, the destination address is set to be the first address of the IP options:

```c
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;
```

Now a lookup in the routing subsystem is performed with the ip_route_output_ports() method; if the lookup fails, the packet is dropped, and an error of -EHOSTUNREACH is returned:

```c
		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);
	...
```

If the lookup succeeds, but both the is_strictroute flag in the options and the rt_uses_gateway flag in the routing entry are set, the packet is dropped, and an error of -EHOSTUNREACH is returned:

```c
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto no_route;
```

Now the IPv4 header is built. You should remember that the packet arrived from Layer 4, where skb->data pointed to the transport header. The skb->data pointer is moved back by the skb_push() method; the offset needed to move it back is the size of the IPv4 header plus the size of the IP options list (optlen), if IP options are used:
```c
	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
```

Set the L3 header (skb->network_header) to point to skb->data:

```c
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
```

The options length (optlen) is divided by 4, and the result is added to the IPv4 header length (iph->ihl), because the IPv4 header length is measured in multiples of 4 bytes. Then the ip_options_build() method is invoked to build the options in the IPv4 header based on the content of the specified IP options. The last parameter of the ip_options_build() method, is_frag, is 0, indicating that this is not a fragment. The ip_options_build() method was discussed in the "Building IP Options" section earlier in this chapter.

```c
	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}
```

Set the id in the IPv4 header:

```c
	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
```

Send the packet:

```c
	res = ip_local_out(skb);
```

Before discussing the ip_append_data() method, I want to mention a callback that is a parameter of the ip_append_data() method: the getfrag() callback. The getfrag() method is a callback to copy the actual data from userspace into the SKB. In UDPv4, the getfrag() callback is set to be the generic method ip_generic_getfrag(); in ICMPv4, it is set to a protocol-specific method, icmp_glue_bits(). Another issue I should mention here is the UDPv4 corking feature. The UDP_CORK socket option was added in kernel 2.5.44; when this option is enabled, all data output on the socket is accumulated into a single datagram that is transmitted when the option is disabled. You can enable and disable this socket option with the setsockopt() system call; see man 7 udp. In kernel 2.6.39, a lockless transmit fast path was added to the UDPv4 implementation. With this addition, when the corking feature is not used, the socket lock is not used either. So when the UDP_CORK socket option is set (with the setsockopt() system call), or the MSG_MORE flag is set, the ip_append_data() method is invoked. When the UDP_CORK socket option is not set, another path in the udp_sendmsg() method is used, which does not hold the socket lock and is faster as a result, and the ip_make_skb() method is invoked. Calling the ip_make_skb() method is like calling the ip_append_data() and the ip_push_pending_frames() methods rolled into one, except that it does not send the SKB produced. Sending the SKB is carried out by the ip_send_skb() method.
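The effect of corking is easy to observe from userspace. In the following minimal sketch (the loopback destination and the port are arbitrary choices for illustration), two sendto() calls that would normally produce two datagrams are merged into a single 11-byte datagram that is transmitted only when the cork is removed:

```c
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/udp.h>	/* UDP_CORK */
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst;
	int on = 1, off = 0;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	/* While corked, ip_append_data() accumulates the data on
	 * sk->sk_write_queue instead of transmitting it. */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	sendto(fd, "hello ", 6, 0, (struct sockaddr *)&dst, sizeof(dst));
	sendto(fd, "world", 5, 0, (struct sockaddr *)&dst, sizeof(dst));

	/* Removing the cork flushes the queue: one 11-byte datagram goes out. */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
	close(fd);
	return 0;
}
```

Passing the MSG_MORE flag to sendto() has the same accumulating effect on a per-call basis, without toggling the socket option.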
Let's take a look now at the ip_append_data() method:

```c
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;
```

If the MSG_PROBE flag is used, it means that the caller is interested only in some information (usually the MTU, for PMTU discovery), so there is no need to actually send the packet, and the method returns 0:

```c
	if (flags & MSG_PROBE)
		return 0;
```

The value of transhdrlen is used to indicate whether this is a first fragment or not. The ip_setup_cork() method creates a cork IP options object if it does not exist and copies the IP options of the specified ipc (ipcm_cookie object) to the cork IP options:

```c
	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}
```

The real work is done by the __ip_append_data() method; this is a long and complex method, and I can't delve into all its details. I will mention that there are two different ways to handle fragments in this method, according to whether the network device supports Scatter/Gather (NETIF_F_SG) or not. When the NETIF_F_SG flag is set, skb_shinfo(skb)->frags is used; when it is not set, skb_shinfo(skb)->frag_list is used. The memory allocation is also different when the MSG_MORE flag is set. The MSG_MORE flag indicates that another packet will be sent soon. Since Linux 2.6, this flag is also supported for UDP sockets.

```c
	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}
```

In this section you have learned about the Tx path: how sending IPv4 packets is implemented. When the packet length is higher than the network device MTU, the packet can't be sent as it is. The next section covers fragmentation in the Tx path and how it is handled.

## Fragmentation

The network interface has a limit on the size of a packet. Usually in 10/100/1000 Mb/s Ethernet networks it is 1500 bytes, though there are network interfaces that allow using an MTU of up to 9K (called jumbo frames). When sending a packet that is larger than the MTU of the outgoing network interface, the packet must be broken into smaller pieces. This is done in the ip_fragment() method (net/ipv4/ip_output.c). Received fragmented packets should be reassembled into one packet. This is done by the ip_defrag() method (net/ipv4/ip_fragment.c), discussed in the next section, "Defragmentation."

Let's take a look first at the ip_fragment() method. Here's its prototype:

```c
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
```

The output callback is the method of transmission to be used. When the ip_fragment() method is invoked from ip_finish_output(), the output callback is the ip_finish_output2() method. There are two paths in the ip_fragment() method: the fast path and the slow path. The fast path is for packets where the frag_list of the SKB is not NULL, and the slow path is for packets that do not meet this condition.
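Both paths rely on the same arithmetic: every fragment except the last carries a payload that is a multiple of 8 bytes, and the offset written into the lower 13 bits of frag_off is measured in 8-byte units, with the IP_MF flag set on all fragments but the last. The following standalone userspace sketch makes that arithmetic concrete (the MTU and payload size are arbitrary, and the two constants are defined locally so the sketch is self-contained):

```c
#include <stdio.h>

#define IP_MF     0x2000	/* "More Fragments" flag */
#define IP_OFFSET 0x1FFF	/* mask for the 13-bit fragment offset */

int main(void)
{
	unsigned int mtu = 1500, hlen = 20;        /* header without options */
	unsigned int payload = 4000;               /* L4 bytes to send */
	unsigned int max_frag = (mtu - hlen) & ~7; /* payload per fragment, 8-byte aligned */
	unsigned int offset = 0;

	while (payload > 0) {
		unsigned int len = payload > max_frag ? max_frag : payload;
		int last = (len == payload);
		/* frag_off as it would appear in the header (host order here) */
		unsigned short frag_off = (offset >> 3) & IP_OFFSET;

		if (!last)
			frag_off |= IP_MF;
		printf("fragment: offset=%u bytes, len=%u, frag_off=0x%04x%s\n",
		       offset, len, frag_off, last ? " (last)" : "");
		offset += len;
		payload -= len;
	}
	return 0;
}
```

For a 4000-byte payload and a 1500-byte MTU this prints three fragments: two of 1480 bytes with IP_MF set, and a final one of 1040 bytes without it. This mirrors, in simplified form, the len &= ~7 and iph->frag_off = htons(offset >> 3) logic you will see in the code below.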
First a check is performed to see whether fragmentation is permitted; if not, a "Destination Unreachable" ICMPv4 message with "Fragmentation Needed" code is sent back to the sender, the statistics (IPSTATS_MIB_FRAGFAILS) are updated, the packet is dropped, and an error code of -EMSGSIZE is returned:

```c
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	unsigned int mtu, hlen, left, len, ll_rs;
	...
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;
	...
	iph = ip_hdr(skb);

	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	...
```

The next section discusses the fast path in fragmentation and its implementation.

### Fast Path

Now let's look into the fast path. First a check is performed to see whether the packet should be handled in the fast path, by calling the skb_has_frag_list() method, which simply checks that skb_shinfo(skb)->frag_list is not NULL. If there is a frag_list, some sanity checks are made on it, and if something is not valid, the fallback to the slow path mechanism is activated (simply by calling goto slow_path). Then an IPv4 header is built for the first fragment. The frag_off of this IPv4 header is set to htons(IP_MF), which indicates that more fragments follow. The frag_off field of the IPv4 header is a 16-bit field; the lower 13 bits are the fragment offset, and the higher 3 bits are the flags. For the first fragment, the offset should be 0, and the IP_MF (More Fragments) flag should be set. For all other fragments except the last one, the IP_MF flag should be set, and the lower 13 bits should hold the fragment offset (measured in units of 8 bytes). For the last fragment, the IP_MF flag should not be set, but the lower 13 bits still hold the fragment offset.

hlen is set to the IPv4 header size in bytes:

```c
	hlen = iph->ihl * 4;
	...
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);
		...
		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
```

Set skb_shinfo(skb)->frag_list to NULL by calling skb_frag_list_init():

```c
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
```

Set the IP_MF (More Fragments) flag for the first fragment:

```c
		iph->frag_off = htons(IP_MF);
```

Because the values of some IPv4 header fields have changed, the checksum needs to be recalculated:

```c
		ip_send_check(iph);
```

Now take a look at the loop that traverses frag_list and builds the fragments:

```c
		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
```

The ip_fragment() method was invoked from the transport layer (L4), so skb->data points to the transport header.
The skb->data pointer should be moved back by hlen bytes so that it points to the IPv4 header (hlen is the size of the IPv4 header in bytes):

```c
				__skb_push(frag, hlen);
```

Set the L3 header (skb->network_header) to point to skb->data:

```c
				skb_reset_network_header(frag);
```

Copy the IPv4 header that was created into the L3 network header; in the first iteration of this for loop, it is the header that was created outside the loop for the first fragment:

```c
				memcpy(skb_network_header(frag), iph, hlen);
```

Now the IPv4 header and the tot_len of the next frag are initialized:

```c
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
```

Copy various SKB fields (like pkt_type, priority, protocol) from skb into frag:

```c
				ip_copy_metadata(frag, skb);
```

Only for the first fragment (where the offset is 0) should the ip_options_fragment() method be called:

```c
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
```

The frag_off field of the IPv4 header is measured in multiples of 8 bytes, so divide the offset by 8:

```c
				iph->frag_off = htons(offset >> 3);
```

Each fragment, except the last one, should have the IP_MF flag set:

```c
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
```

The values of some IPv4 header fields have changed, so the checksum should be recalculated:

```c
				/* Ready, complete checksum */
				ip_send_check(iph);
			}
```

Now send the fragment with the output callback. If sending succeeded, increment IPSTATS_MIB_FRAGCREATES. If there was an error, exit the loop:

```c
			err = output(skb);
			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;
```

Fetch the next SKB:

```c
			skb = frag;
			frag = skb->next;
			skb->next = NULL;
```

The following closing bracket is the end of the for loop:

```c
		}
```

The for loop has terminated, and the return value of the last call to output(skb) should be checked. If it is successful, the statistics (IPSTATS_MIB_FRAGOKS) are updated, and the method returns 0:

```c
		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}
```

If the last call to output(skb) failed in one of the loop iterations, including the last one, the SKBs are freed, the statistics (IPSTATS_MIB_FRAGFAILS) are updated, and the error code (err) is returned:

```c
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;
```

You should now have a good understanding of the fast path in fragmentation and how it is implemented.

### Slow Path

Let's now take a look at how the slow path in fragmentation is implemented:

```c
	...
	iph = ip_hdr(skb);
	left = skb->len - hlen;	/* Space per frame */
	...
```
```c
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
```

Each fragment (except the last one) should be aligned on an 8-byte boundary:

```c
		if (len < left) {
			len &= ~7;
		}
```

Allocate an SKB:

```c
		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */
```

Copy various SKB fields (like pkt_type, priority, protocol) from skb into skb2:

```c
		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
```

frag_off is measured in multiples of 8 bytes, so divide the offset by 8:

```c
		iph->frag_off = htons((offset >> 3));
		...
```

Handle options only once, for the first fragment:

```c
		if (offset == 0)
			ip_options_fragment(skb);
```

The IP_MF (More Fragments) flag should be set on every fragment but the last:

```c
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);
```

Because the values of some IPv4 header fields have changed, the checksum should be recalculated:

```c
		ip_send_check(iph);
```

Now send the fragment with the output callback. If sending succeeded, increment IPSTATS_MIB_FRAGCREATES. If there was an error, the packet is freed, the statistics (IPSTATS_MIB_FRAGFAILS) are updated, and the error code is returned:

```c
		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
```

After the while (left > 0) loop has terminated, the consume_skb() method is invoked to free the original SKB, the statistics (IPSTATS_MIB_FRAGOKS) are updated, and the value of err is returned:

```c
	consume_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;
```

This section dealt with the implementation of the slow path in fragmentation, and it ends the discussion of fragmentation in the Tx path. Remember that fragmented packets that are received on a host should be reconstructed again so that applications can handle the original packet. The next section discusses defragmentation, the opposite of fragmentation.

## Defragmentation

Defragmentation is the process of reassembling all the fragments of a packet, which all have the same id in the IPv4 header, into one buffer. The main method that handles defragmentation in the Rx path is ip_defrag() (net/ipv4/ip_fragment.c), which is called from ip_local_deliver(). There are other places where defragmentation might be needed, such as in firewalls, where the content of the packet must be known in order to inspect it. In the ip_local_deliver() method, the ip_is_fragment() method is invoked to check whether the packet is fragmented; if it is, the ip_defrag() method is invoked.
The ip_defrag() method has two arguments: the first is the SKB, and the second is a 32-bit field that indicates the point from which the method was invoked. Its value can be one of the following:

* IP_DEFRAG_LOCAL_DELIVER when it was called from ip_local_deliver().

* IP_DEFRAG_CALL_RA_CHAIN when it was called from ip_call_ra_chain().

* IP_DEFRAG_VS_IN, IP_DEFRAG_VS_FWD, or IP_DEFRAG_VS_OUT when it was called from IPVS.

For a full list of possible values for the second argument of ip_defrag(), look in the ip_defrag_users enum definition in include/net/ip.h.

Let's look at the ip_defrag() invocation in ip_local_deliver():

```c
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 *	Reassemble IP fragments.
	 */

	if (ip_is_fragment(ip_hdr(skb))) {
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
```

(net/ipv4/ip_input.c)

The ip_is_fragment() method is a simple helper that takes the IPv4 header as its sole argument and returns true when the packet is a fragment, like this:

```c
static inline bool ip_is_fragment(const struct iphdr *iph)
{
	return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}
```

(include/net/ip.h)

The ip_is_fragment() method returns true in either of two cases (or both):

* The IP_MF flag is set.

* The fragment offset is not 0.

Thus it returns true for all fragments:

* For the first fragment, where frag_off is 0 but the IP_MF flag is set.

* For the last fragment, where frag_off is not 0 but the IP_MF flag is not set.

* For all other fragments, where frag_off is not 0 and the IP_MF flag is set.

The implementation of defragmentation is based on a hash table of ipq objects. The hash function (ipqhashfn) has four arguments: the fragment id, the source address, the destination address, and the protocol:

```c
struct ipq {
	struct inet_frag_queue q;

	u32		user;
	__be32		saddr;
	__be32		daddr;
	__be16		id;
	u8		protocol;
	u8		ecn;	/* RFC3168 support */
	int		iif;
	unsigned int	rid;
	struct inet_peer *peer;
};
```

Note that the logic of IPv4 defragmentation is shared with its IPv6 counterpart. So, for example, the inet_frag_queue structure and methods like the inet_frag_find() method and the inet_frag_evictor() method are not specific to IPv4; they are also used in IPv6 (see net/ipv6/reassembly.c and net/ipv6/nf_conntrack_reasm.c).

The ip_defrag() method is quite short. First it makes sure there is enough memory, by calling the ip_evictor() method. Then it tries to find an ipq for the SKB by calling the ip_find() method; if it does not find one, it creates an ipq object. The ipq object that the ip_find() method returns is assigned to a variable named qp (a pointer to an ipq object). Then the ip_frag_queue() method is called to add the fragment to a linked list of fragments (qp->q.fragments). The addition to the list is done according to the fragment offset, because the list is sorted by fragment offset. After all the fragments of a packet have been added, the ip_frag_queue() method calls the ip_frag_reasm() method to build a new packet from all its fragments. The ip_frag_reasm() method also stops the timer (of ip_expire()) by calling the ipq_kill() method. If there was an error, and the size of the new packet exceeds the highest permitted size (which is 65535), the ip_frag_reasm() method updates the statistics (IPSTATS_MIB_REASMFAILS) and returns -E2BIG. If the call to the skb_clone() method in ip_frag_reasm() fails, it returns -ENOMEM.
The IPSTATS_MIB_REASMFAILS statistic is updated in this case as well. Constructing a packet from all its fragments must be completed within a specified time interval. If it's not completed within that interval, the ip_expire() method sends an ICMPv4 "Time Exceeded" message with "Fragment Reassembly Time Exceeded" code. The defragmentation time interval can be set by the following procfs entry: /proc/sys/net/ipv4/ipfrag_time. It is 30 seconds by default.

Let's take a look at the ip_defrag() method:

```c
int ip_defrag(struct sk_buff *skb, u32 user)
{
	struct ipq *qp;
	struct net *net;

	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);

	/* Start by cleaning up the memory. */
	ip_evictor(net);

	/* Lookup (or create) queue header */
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
		int ret;

		spin_lock(&qp->q.lock);

		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
```

Before looking at the ip_frag_queue() method, consider the following macro, which simply returns the ipfrag_skb_cb object that is associated with the specified SKB:

```c
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))
```

Now let's look at the ip_frag_queue() method. I will not describe all its details, because the method is very complicated and takes into account problems that might arise from overlapping fragments (overlapping fragments may occur due to retransmissions). In the following snippet, qp->q.len is set to the total length of the packet, including all its fragments; when the IP_MF flag is not set, this is the last fragment:

```c
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	...
	/* Determine the position of this fragment. */
	end = offset + skb->len - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto err;
		qp->q.last_in |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		...
	}
```

Now the location for adding the fragment is found, by looking for the first fragment whose offset is at or after the new fragment's offset (the linked list of fragments is ordered by offset):

```c
	...
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}
```

Now prev, if it is not NULL, points to the fragment after which the new fragment should be added. Skipping the overlap handling and some other checks, let's continue to the insertion of the fragment into the list:

```c
	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;
	...
	qp->q.meat += skb->len;
```

Note that qp->q.meat is incremented by skb->len for each fragment. As mentioned earlier, qp->q.len is the total length of all the fragments; when it is equal to qp->q.meat, all the fragments have been added, and they should be reassembled into one packet with the ip_frag_reasm() method.
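The offset-sorted insertion that ip_frag_queue() performs is easy to lose among the surrounding overlap checks, so here is the same idea reduced to a standalone userspace sketch; struct frag is a made-up stand-in for the SKB with its FRAG_CB(skb)->offset, and the overlap handling is deliberately omitted:

```c
#include <stdio.h>
#include <stdlib.h>

struct frag {
	unsigned int offset;	/* stand-in for FRAG_CB(skb)->offset */
	struct frag *next;
};

/* Insert f into a list kept sorted by ascending offset, mirroring
 * the prev/next walk in ip_frag_queue(). */
static void frag_insert(struct frag **head, struct frag *f)
{
	struct frag *prev = NULL, *next;

	for (next = *head; next != NULL; next = next->next) {
		if (next->offset >= f->offset)
			break;	/* first fragment at or past the new one */
		prev = next;
	}
	f->next = next;
	if (prev)
		prev->next = f;
	else
		*head = f;
}

int main(void)
{
	unsigned int offsets[] = { 2960, 0, 1480 };	/* arrival order */
	struct frag *head = NULL, *f;
	int i;

	for (i = 0; i < 3; i++) {
		f = malloc(sizeof(*f));
		f->offset = offsets[i];
		frag_insert(&head, f);
	}
	for (f = head; f; f = f->next)
		printf("offset %u\n", f->offset);	/* 0, 1480, 2960 */
	while (head) {
		f = head;
		head = head->next;
		free(f);
	}
	return 0;
}
```

Whatever the arrival order, the prev/next walk keeps the list sorted by ascending offset, which is what allows ip_frag_reasm() to walk qp->q.fragments front to back when building the reassembled packet.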
Now you can see how and where reassembly takes place (reassembly is done by calling the ip_frag_reasm() method):

```c
	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}
```

Let's take a look at the ip_frag_reasm() method:

```c
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	struct iphdr *iph;
	struct sk_buff *fp, *head = qp->q.fragments;
	int len;
	...
	/* Allocate a new buffer for the datagram. */
	ihlen = ip_hdrlen(head);
	len = ihlen + qp->q.len;

	err = -E2BIG;
	if (len > 65535)
		goto out_oversize;
	...
	skb_push(head, head->data - skb_network_header(head));
```

## Forwarding

The main handler for forwarding a packet is the ip_forward() method:

```c
int ip_forward(struct sk_buff *skb)
{
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options *opt	= &(IPCB(skb)->opt);
```

I should describe why Large Receive Offload (LRO) packets are dropped in forwarding. LRO is a performance-optimization technique that merges packets together into one large SKB before they are passed to higher network layers. This reduces CPU overhead and thus improves performance. Forwarding a large SKB that was built by LRO is not acceptable, because it would be larger than the outgoing MTU. Therefore, when LRO is enabled, the SKB is freed and the method returns NET_RX_DROP. The Generic Receive Offload (GRO) design included forwarding ability, but LRO did not:

```c
	if (skb_warn_if_lro(skb))
		goto drop;
```

If the router_alert option is set, the ip_call_ra_chain() method should be invoked to handle the packet. When calling setsockopt() with IP_ROUTER_ALERT on a raw socket, the socket is added to a global list named ip_ra_chain (see include/net/ip.h). The ip_call_ra_chain() method delivers the packet to all raw sockets. You might wonder why the packet is delivered to all raw sockets and not to a single raw socket. The reason is that in raw sockets there are no ports on which the sockets listen, as opposed to TCP or UDP.

In addition, if the pkt_type (which was determined by the eth_type_trans() method, which should be called from the network driver, and which is discussed in Appendix A) is not PACKET_HOST, the packet is discarded:

```c
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;
```

The ttl (Time To Live) field of the IPv4 header is a counter that is decreased by 1 by each forwarding device. If the ttl reaches 0, the packet should be dropped, and a corresponding "Time Exceeded" ICMPv4 message with "TTL Count Exceeded" code should be sent:

```c
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;
	...
too_many_hops:
	/* Tell the sender its packet died... */
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
	...
```
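The ttl decrement itself is performed further down in the function, by the ip_decrease_ttl() method, as you will see shortly. An interesting detail is that ip_decrease_ttl() does not recompute the header checksum from scratch: because ttl occupies the high byte of one 16-bit header word, decrementing it lowers that word by 0x0100, so adding 0x0100 to the stored checksum (with an end-around carry) compensates exactly. Here is a standalone userspace sketch of that trick, verified against a full recomputation; the sample header bytes are invented for illustration:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* htons */

/* Classic 16-bit one's-complement checksum over a buffer. */
static uint16_t ip_checksum(const void *data, size_t len)
{
	const uint16_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += *p++;
		len -= 2;
	}
	if (len)
		sum += *(const uint8_t *)p;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t hdr[20] = {
		0x45, 0x00, 0x00, 0x54,	/* version/ihl, tos, tot_len */
		0x12, 0x34, 0x40, 0x00,	/* id, frag_off (DF set) */
		0x40, 0x06, 0x00, 0x00,	/* ttl=64, proto=TCP, checksum=0 */
		0xc0, 0xa8, 0x01, 0x01,	/* saddr 192.168.1.1 */
		0xc0, 0xa8, 0x02, 0x01,	/* daddr 192.168.2.1 */
	};
	uint16_t check = ip_checksum(hdr, sizeof(hdr));
	uint8_t tmp[20];
	uint32_t c;

	memcpy(&hdr[10], &check, 2);

	/* Forward the packet: ttl--, then patch the checksum the way
	 * ip_decrease_ttl() does, instead of recomputing it. */
	hdr[8]--;
	memcpy(&check, &hdr[10], 2);
	c = check + htons(0x0100);
	check = (uint16_t)(c + (c >= 0xffff));	/* end-around carry */
	memcpy(&hdr[10], &check, 2);

	/* Verify against a full recomputation (checksum field zeroed). */
	memcpy(tmp, hdr, 20);
	tmp[10] = tmp[11] = 0;
	assert(check == ip_checksum(tmp, 20));
	printf("incremental checksum update: OK\n");
	return 0;
}
```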
Now a check is performed to see whether both the strict route flag (is_strictroute) and the rt_uses_gateway flag are set; in such a case, strict routing cannot be applied, and a "Destination Unreachable" ICMPv4 message with "Strict Routing Failed" code is sent back:

```c
	rt = skb_rtable(skb);

	if (opt->is_strictroute && rt->rt_uses_gateway)
		goto sr_failed;
	...
sr_failed:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	goto drop;
	...
```

Now a check is performed to see whether the length of the packet is larger than the MTU of the outgoing device. If it is, the packet is not permitted to be sent as it is. Another check is performed to see whether the DF (Don't Fragment) flag in the IPv4 header is set and whether the local_df flag in the SKB is not set. If these conditions are met, it means that when the packet reaches the ip_output() method, it will not be fragmented with the ip_fragment() method. The packet cannot be sent as is, and it also cannot be fragmented; so a "Destination Unreachable" ICMPv4 message with "Fragmentation Needed" code is sent back, the packet is dropped, and the statistics (IPSTATS_MIB_FRAGFAILS) are updated:

```c
	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->dst)));
		goto drop;
	}
```

Because the ttl and checksum of the IPv4 header are going to be changed, a private copy of the SKB should be kept:

```c
	/* We are about to mangle packet. Copy it! */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
		goto drop;
	iph = ip_hdr(skb);
```

As mentioned earlier, each node that forwards the packet should decrease the ttl. As a result of the ttl change, the checksum is also updated accordingly in the ip_decrease_ttl() method:

```c
	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);
```

Next, an ICMPv4 Redirect message may be sent back: if the RTCF_DOREDIRECT flag of the routing entry is set, a Redirect message with "Redirect To Host" code is sent (I discuss ICMPv4 Redirect messages in Chapter 5):

```c
	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
		ip_rt_send_redirect(skb);
```

The skb->priority in the Tx path is set to be the socket priority (sk->sk_priority); see, for example, the ip_queue_xmit() method. The socket priority, in turn, can be set by calling the setsockopt() system call with SOL_SOCKET and SO_PRIORITY. However, when forwarding a packet, there is no socket attached to the SKB. So, in the ip_forward() method, skb->priority is set according to a special table called ip_tos2prio. This table has 16 entries (see include/net/route.h).

```c
	skb->priority = rt_tos2priority(iph->tos);
```

Now, assuming that there are no netfilter NF_INET_FORWARD hooks, the ip_forward_finish() method is invoked:

```c
	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
		       rt->dst.dev, ip_forward_finish);
```

In ip_forward_finish(), the statistics are updated, and a check is made as to whether the IPv4 header includes IP options. If it does, the ip_forward_options() method is invoked to handle the options; in any case, the dst_output() method is then called.
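Backing up for a moment to the priority assignment mentioned above: for locally generated traffic there is a socket, and an application can set sk->sk_priority itself. A minimal hedged sketch (the priority value 6 is an arbitrary example; values 0 through 6 are allowed without CAP_NET_ADMIN):

```c
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int prio = 6;	/* 0..6 allowed without CAP_NET_ADMIN */

	if (setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)) < 0)
		perror("setsockopt(SO_PRIORITY)");
	/* SKBs sent on fd now get skb->priority == 6 in the Tx path. */
	return 0;
}
```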
All that the dst_output() method does is invoke skb_dst(skb)->output(skb). Here is ip_forward_finish():

```c
static int ip_forward_finish(struct sk_buff *skb)
{
	struct ip_options *opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}
```

In this section you learned about the methods for forwarding packets (ip_forward() and ip_forward_finish()), about cases when a packet is discarded in forwarding, about cases when an ICMP Redirect is sent, and more.

## Summary

This chapter dealt with the IPv4 protocol: how an IPv4 packet is built, the IPv4 header structure and IP options, and how they are handled. You learned how the IPv4 protocol handler is registered. You also learned about the Rx path (how the reception of IPv4 packets is handled) and about the Tx path in IPv4 (how the transmission of IPv4 packets is handled). There are cases when packets are larger than the network interface MTU, and as a result they can't be sent without being fragmented on the sender side and later defragmented on the receiver side. You learned about the implementation of fragmentation in IPv4 (including how the slow path and the fast path are implemented and when they are used) and the implementation of defragmentation in IPv4. The chapter also covered IPv4 forwarding: sending an incoming packet out on a different network interface without passing it to the upper layer. And you saw some examples of when a packet is discarded in the forwarding process and when an ICMP Redirect is sent. The next chapter discusses the IPv4 routing subsystem. The "Quick Reference" section that follows covers the top methods that are related to the topics discussed in this chapter, ordered by their context.

## Quick Reference

I conclude this chapter with a short list of important methods and macros of the IPv4 subsystem that were mentioned in this chapter.

### Methods

The following is a short list of important methods of the IPv4 layer that were mentioned in this chapter.

#### int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl);

This method moves packets from L4 (the transport layer) to L3 (the network layer); it is invoked, for example, from TCPv4.

#### int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags);

This method moves packets from L4 (the transport layer) to L3 (the network layer); it is invoked, for example, from UDPv4 when working with corked UDP sockets and from ICMPv4.

#### struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags);

This method was added in kernel 2.6.39 to enable the lockless transmit fast path in the UDPv4 implementation; it is called when the UDP_CORK socket option is not used.

#### int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);

This method is a generic method for copying data from userspace into the specified SKB.

#### static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);

This method is the ICMPv4 getfrag callback.
The ICMPv4 module calls the ip_append_data() method with icmp_glue_bits() as the getfrag callback.

#### int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb);

This method builds an ip_options object by parsing IP options.

#### void ip_options_fragment(struct sk_buff *skb);

This method fills the options whose copied flag is not set with NOOPs and resets the corresponding fields of these IP options. It is invoked only for the first fragment.

#### void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag);

This method takes the specified ip_options object and writes its content to the IPv4 header. The last parameter, is_frag, is in practice 0 in all invocations of the ip_options_build() method.

#### void ip_forward_options(struct sk_buff *skb);

This method handles IP options forwarding.

#### int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);

This method is the main Rx handler for IPv4 packets.

#### ip_rcv_options(struct sk_buff *skb);

This method is the main method for handling the reception of a packet with options.

#### int ip_options_rcv_srr(struct sk_buff *skb);

This method handles receiving a packet with the strict route option.

#### int ip_forward(struct sk_buff *skb);

This method is the main handler for forwarding IPv4 packets.

#### static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, int vifi);

This method is the multicast transmission method.

#### static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, void *from, size_t length, struct rtable **rtp, unsigned int flags);

This method is used by raw sockets for transmission when the IP_HDRINCL socket option is set. It calls the dst_output() method directly.

#### int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

This method is the main fragmentation method.

#### int ip_defrag(struct sk_buff *skb, u32 user);

This method is the main defragmentation method. It processes an incoming IP fragment. The second parameter, user, indicates where this method was invoked from. For a full list of possible values for the second parameter, look in the ip_defrag_users enum definition in include/net/ip.h.

#### bool skb_has_frag_list(const struct sk_buff *skb);

This method returns true if skb_shinfo(skb)->frag_list is not NULL. The method skb_has_frag_list() was named skb_has_frags() in the past and was renamed skb_has_frag_list() in kernel 2.6.37, because the old name was confusing. SKBs can be fragmented in two ways: via a page array (called skb_shinfo(skb)->frags[]) and via a list of SKBs (called skb_shinfo(skb)->frag_list). Because skb_has_frags() tests the latter, its name was confusing, as it sounds more like it's testing the former.

#### int ip_local_deliver(struct sk_buff *skb);

This method handles delivering packets to Layer 4.

#### int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, unsigned char __user *data, int optlen);

This method handles setting options from userspace by the setsockopt() system call with IP_OPTIONS.

#### bool ip_is_fragment(const struct iphdr *iph);

This method returns true if the packet is a fragment.
#### int ip_decrease_ttl(struct iphdr *iph);

This method decrements the ttl of the specified IPv4 header by 1 and, because one of the IPv4 header fields has changed (ttl), recalculates the IPv4 header checksum.

#### int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt);

This method is used by TCPv4 to send SYN ACK. See the tcp_v4_send_synack() method in net/ipv4/tcp_ipv4.c.

#### int ip_mr_input(struct sk_buff *skb);

This method handles incoming multicast packets.

#### int ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local);

This method forwards multicast packets.

#### bool ip_call_ra_chain(struct sk_buff *skb);

This method handles the Router Alert IP option.

### Macros

This section mentions some macros from this chapter that deal with mechanisms encountered in the IPv4 stack, such as fragmentation, netfilter hooks, and IP options.

#### IPCB(skb)

This macro returns the inet_skb_parm object that skb->cb points to. It is used to access the ip_options object stored in the inet_skb_parm object (include/net/ip.h).

#### FRAG_CB(skb)

This macro returns the ipfrag_skb_cb object that skb->cb points to (net/ipv4/ip_fragment.c).

#### int NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *))

This macro is the netfilter hook. The first parameter, pf, is the protocol family; for IPv4 it is NFPROTO_IPV4, and for IPv6 it is NFPROTO_IPV6. The second parameter is one of the five netfilter hook points in the network stack; these five points are defined in include/uapi/linux/netfilter.h and can be used both by IPv4 and by IPv6. The okfn callback is called if there is no hook registered or if the registered netfilter hook does not discard or reject the packet.

#### int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *), bool cond)

This macro is the same as the NF_HOOK() macro, but with an additional Boolean parameter, cond, which must be true for the netfilter hook to be called.

#### IPOPT_COPIED()

This macro returns the copied flag of the option type.

# 5. The IPv4 Routing Subsystem

Chapter 4 discussed the IPv4 subsystem. In this chapter and the next I discuss one of the most important Linux subsystems, the routing subsystem, and its implementation in Linux. The Linux routing subsystem is used in a wide range of routers, from home and small office routers to enterprise routers (which connect organizations or ISPs) and core high-speed routers on the Internet backbone. It is impossible to imagine the modern world without these devices. The discussion in these two chapters is limited to the IPv4 routing subsystem, which is very similar to the IPv6 implementation. This chapter is mainly an introduction and presents the main data structures that are used by the IPv4 routing subsystem, like the routing tables, the Forwarding Information Base (FIB) info and the FIB alias, the FIB TRIE, and more. (TRIE is not an acronym, by the way; it is derived from the word retrieval.) The TRIE is a data structure, a special tree that replaced the FIB hash table.
You will learn how a lookup in the routing subsystem is performed, how and when ICMP Redirect messages are generated, and about the removal of the routing cache code. Note that the discussion and the code examples in this chapter relate to kernel 3.9, except for two sections where a different kernel version is explicitly mentioned.

## Forwarding and the FIB

One of the important goals of the Linux networking stack is to forward traffic. This is especially relevant when discussing core routers, which operate on the Internet backbone. The Linux IP stack layer responsible for forwarding packets and maintaining the forwarding database is called the routing subsystem. For small networks, management of the FIB can be done by a system administrator, because most of the network topology is static. When discussing core routers, the situation is a bit different, as the topology is dynamic and there is a vast amount of ever-changing information. In this case, management of the FIB is usually done by userspace routing daemons, sometimes in conjunction with special hardware enhancements. These userspace daemons usually maintain routing tables of their own, which sometimes interact with the kernel routing tables.

Let's start with the basics: what is routing? Take a look at a very simple forwarding example: you have two Ethernet Local Area Networks, LAN1 and LAN2. On LAN1 you have a subnet of 192.168.1.0/24, and on LAN2 you have a subnet of 192.168.2.0/24. There is a machine between these two LANs, which will be called a "forwarding router." There are two Ethernet network interface cards (NICs) in the forwarding router. The network interface connected to LAN1 is eth0 and has an IP address of 192.168.1.200, and the network interface connected to LAN2 is eth1 and has an IP address of 192.168.2.200, as you can see in Figure 5-1. For the sake of simplicity, let's assume that no firewall daemon runs on the forwarding router. You start sending traffic from LAN1 that is destined to LAN2. The process of forwarding incoming packets, which are sent from LAN1 and destined to LAN2 (or vice versa), according to data structures that are called routing tables, is called routing.
I discuss this process and the routing table data structures in this chapter and in the next as well.

In Figure 5-1, packets that arrive on eth0 from LAN1 and are destined to LAN2 are forwarded via eth1 as the outgoing device. In this process, the incoming packets move from Layer 2 (the link layer) in the kernel networking stack to Layer 3, the network layer, in the forwarding router machine. As opposed to the case where the traffic is destined to the forwarding router machine ("Traffic to me"), however, there is no need to move the packets to Layer 4 (the transport layer), because this traffic is not intended to be handled by any Layer 4 transport socket. This traffic should be forwarded. Moving to Layer 4 has a performance cost, which is better avoided whenever possible. This traffic is handled in Layer 3, and, according to the routing tables configured on the forwarding router machine, packets are forwarded on eth1 as the outgoing interface (or rejected).

Figure 5-1. Forwarding packets between two LANs

Figure 5-2 shows the three network layers handled by the kernel that were mentioned earlier.

Figure 5-2. The three layers that are handled by the networking kernel stack

Two additional terms that I should mention here, which are commonly used in routing, are default gateway and default route. When you define a default gateway entry in a routing table, every packet that is not handled by the other routing entries (if there are such entries) must be forwarded to it, regardless of the destination address in the IP header of the packet. The default route is designated as 0.0.0.0/0 in Classless Inter-Domain Routing (CIDR) notation. As a simple example, you can add a machine with an IPv4 address of 192.168.2.1 as a default gateway as follows:

```
ip route add default via 192.168.2.1
```

Or, when using the route command, like this:

```
route add default gateway 192.168.2.1
```

In this section you learned what forwarding is and saw a simple example illustrating how packets are forwarded between two LANs. You also learned what a default gateway and a default route are, and how to add them. Now that you know the basic terminology and what forwarding is, let's move on and see how a lookup in the routing subsystem is performed.

## Performing a Lookup in the Routing Subsystem

A lookup in the routing subsystem is done for each packet, both in the Rx path and in the Tx path. In kernels prior to 3.6, each lookup, both in the Rx path and in the Tx path, consisted of two phases: a lookup in the routing cache and, in case of a cache miss, a lookup in the routing tables (I discuss the routing cache at the end of this chapter, in the "IPv4 Routing Cache" section). A lookup is done by the fib_lookup() method. When the fib_lookup() method finds a proper entry in the routing subsystem, it builds a fib_result object, which consists of various routing parameters, and it returns 0. I discuss the fib_result object in this section and in other sections of this chapter. Here is the fib_lookup() prototype:

```c
int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res)
```

The flowi4 object consists of fields that are important to the IPv4 routing lookup process, including the destination address, source address, Type of Service (TOS), and more. In fact, the flowi4 object defines the key for the lookup in the routing tables and should be initialized prior to performing a lookup with the fib_lookup() method.
For IPv6 there is a parallel object named flowi6; both are defined in include/net/flow.h. The fib_result object is built in the IPv4 lookup process. The fib_lookup() method first searches the local FIB table. If the lookup fails, it performs a lookup in the main FIB table (I describe these two tables in the next section, "FIB Tables"). After a lookup is successfully done, either in the Rx path or the Tx path, a dst object is built (an instance of the dst_entry structure, the destination cache, defined in include/net/dst.h). The dst object is embedded in a structure called rtable, as you will soon see. The rtable object, in fact, represents a routing entry, which can be associated with an SKB. The most important members of the dst_entry object are two callbacks named input and output. In the routing lookup process, these callbacks are assigned the proper handlers according to the routing lookup result. These two callbacks get only an SKB as a parameter:

```c
struct dst_entry {
	...
	int			(*input)(struct sk_buff *);
	int			(*output)(struct sk_buff *);
	...
}
```

The following is the rtable structure; as you can see, the dst object is the first member of this structure:

```c
struct rtable {
	struct dst_entry	dst;

	int			rt_genid;
	unsigned int		rt_flags;
	__u16			rt_type;
	__u8			rt_is_input;
	__u8			rt_uses_gateway;

	int			rt_iif;

	/* Info on neighbour */
	__be32			rt_gateway;

	/* Miscellaneous cached information */
	u32			rt_pmtu;

	struct list_head	rt_uncached;
};
```

(include/net/route.h)

The following is a description of the members of the rtable structure:

* rt_flags: The rtable object flags; some of the important flags are mentioned here:

  * RTCF_BROADCAST: When set, the destination address is a broadcast address. This flag is set in the __mkroute_output() method and in the ip_route_input_slow() method.

  * RTCF_MULTICAST: When set, the destination address is a multicast address. This flag is set in the ip_route_input_mc() method and in the __mkroute_output() method.

  * RTCF_DOREDIRECT: When set, an ICMPv4 Redirect message should be sent as a response to an incoming packet. Several conditions must be fulfilled for this flag to be set, including that the input device and the output device are the same and that the corresponding procfs send_redirects entry is set. There are more conditions, as you will see later in this chapter. This flag is set in the __mkroute_input() method.

  * RTCF_LOCAL: When set, the destination address is local. This flag is set in the following methods: ip_route_input_slow(), __mkroute_output(), ip_route_input_mc(), and __ip_route_output_key(). Some of the RTCF_XXX flags can be set simultaneously; for example, RTCF_LOCAL can be set when RTCF_BROADCAST or RTCF_MULTICAST is set. For the complete list of RTCF_XXX flags, look in include/uapi/linux/in_route.h. Note that some of them are unused.

* rt_is_input: A flag that is set to 1 when this is an input route.

* rt_uses_gateway: Gets a value according to the following: when the nexthop is a gateway, rt_uses_gateway is 1; when the nexthop is a direct route, rt_uses_gateway is 0.

* rt_iif: The ifindex of the incoming interface. (Note that the rt_oif member was removed from the rtable structure in kernel 3.6; it was set to the oif of the specified flow key but was in fact used in only one method.)

* rt_pmtu: The Path MTU (the smallest MTU along the route).
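Before continuing with these data structures, note that the whole lookup machinery can be exercised from userspace with an RTM_GETROUTE netlink request, which is what the ip route get command issues. The following is a hedged, minimal sketch (the queried destination 8.8.8.8 is an arbitrary example, and error handling is mostly omitted):

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>

int main(void)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg	rt;
		char		attrs[64];
	} req;
	char buf[8192];
	struct nlmsghdr *ans;
	struct rtattr *rta;
	ssize_t n;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nh.nlmsg_type = RTM_GETROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST;
	req.rt.rtm_family = AF_INET;
	req.rt.rtm_dst_len = 32;

	/* RTA_DST plays the role of the daddr field of the flowi4 key. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, "8.8.8.8", RTA_DATA(rta));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_LENGTH(4);

	send(fd, &req, req.nh.nlmsg_len, 0);
	n = recv(fd, buf, sizeof(buf), 0);
	ans = (struct nlmsghdr *)buf;
	if (n > 0 && ans->nlmsg_type == RTM_NEWROUTE) {
		struct rtmsg *rtm = NLMSG_DATA(ans);
		int len = RTM_PAYLOAD(ans);
		char addr[INET_ADDRSTRLEN];

		for (rta = RTM_RTA(rtm); RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
			if (rta->rta_type == RTA_GATEWAY) {
				inet_ntop(AF_INET, RTA_DATA(rta), addr, sizeof(addr));
				printf("gateway: %s\n", addr);
			} else if (rta->rta_type == RTA_OIF) {
				printf("oif: %d\n", *(int *)RTA_DATA(rta));
			}
		}
	}
	close(fd);
	return 0;
}
```

The attributes of the RTM_NEWROUTE reply (RTA_GATEWAY, RTA_OIF, and so on) correspond to routing parameters that the kernel collected into the fib_result and rtable objects described in this section.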
Note that in kernel 3.6, the fib_compute_spec_dst() method was added, which gets an SKB as a parameter. This method made the rt_spec_dst member of the rtable structure unneeded, and rt_spec_dst was removed from the rtable structure as a result. The fib_compute_spec_dst() method is needed in special cases, such as in the icmp_reply() method, when replying to the sender using its source address as the destination for the reply.

For incoming unicast packets destined to the local host, the input callback of the dst object is set to ip_local_deliver(), and for incoming unicast packets that should be forwarded, this input callback is set to ip_forward(). For a packet generated on the local machine and sent away, the output callback is set to ip_output(). For a multicast packet, the input callback can be set to ip_mr_input() (under some conditions not detailed in this chapter). There are cases when the input callback is set to ip_error(), as you will see later in the PROHIBIT rule example in this chapter. Let's take a look at the fib_result object:

```c
struct fib_result {
	unsigned char	prefixlen;
	unsigned char	nh_sel;
	unsigned char	type;
	unsigned char	scope;
	u32		tclassid;
	struct fib_info *fi;
	struct fib_table *table;
	struct list_head *fa_head;
};
```

(include/net/ip_fib.h)

* prefixlen: The prefix length, which represents the netmask. Its values are in the range 0 to 32. It is 0 when using the default route. When adding, for example, a routing entry by ip route add 192.168.2.0/24 dev eth0, the prefixlen is 24, according to the netmask that was specified when adding the entry. The prefixlen is set in the check_leaf() method (net/ipv4/fib_trie.c).

* nh_sel: The nexthop number. When working with one nexthop only, it is 0. When working with Multipath Routing, there can be more than one nexthop. The nexthop objects are stored in an array in the routing entry (inside the fib_info object), as discussed in the next section.

* type: The type of the fib_result object is the most important field, because it determines how to handle the packet: whether to forward it to a different machine, deliver it locally, discard it silently, discard it while replying with an ICMPv4 message, and so on. The type of the fib_result object is determined according to the packet content (most notably the destination address) and according to routing rules set by the administrator, routing daemons, or a Redirect message. You will see how the type of the fib_result object is determined in the lookup process later in this chapter and in the next. The two most common types of fib_result objects are the RTN_UNICAST type, which is set when the packet is to be forwarded via a gateway or a direct route, and the RTN_LOCAL type, which is set when the packet is for the local host. Other types you will encounter in this book are the RTN_BROADCAST type, for packets that should be accepted locally as broadcasts; the RTN_MULTICAST type, for multicast routes; the RTN_UNREACHABLE type, for packets which trigger sending back an ICMPv4 "Destination Unreachable" message; and more. There are 12 route types in all. For a complete list of all available route types, see include/uapi/linux/rtnetlink.h.

* fi: A pointer to a fib_info object, which represents a routing entry. The fib_info object holds a reference to the nexthop (fib_nh). I discuss the FIB info structure in the section "FIB Info" later in this chapter.

* table: A pointer to the FIB table on which the lookup is done.
In this section you learned how a lookup in the routing subsystem is performed. You also found out about important data structures that relate to the routing lookup process, like fib_result and rtable. The next section discusses how the FIB tables are organized.

## FIB Tables

The main data structure of the routing subsystem is the routing table, which is represented by the fib_table structure. A routing table can be described, in a somewhat simplified way, as a table of entries where each entry determines which nexthop should be chosen for traffic destined to a subnet (or to a specific IPv4 destination address). Each entry has other parameters, of course, discussed later in this chapter. Each routing entry contains a fib_info object (include/net/ip_fib.h), which stores the most important routing entry parameters (but not all, as you will see later in this chapter). The fib_info object is created by the fib_create_info() method (net/ipv4/fib_semantics.c) and is stored in a hash table named fib_info_hash. When the route uses prefsrc, the fib_info object is also added to a hash table named fib_info_laddrhash.

There is a global counter of fib_info objects named fib_info_cnt, which is incremented when creating a fib_info object, by the fib_create_info() method, and decremented when freeing a fib_info object, by the free_fib_info() method. The hash table is dynamically resized when it grows over some threshold. A lookup in the fib_info_hash hash table is done by the fib_find_info() method (it returns NULL when no entry is found). Serializing access to the fib_info members is done by a spinlock named fib_info_lock. Here's the fib_table structure:

```c
struct fib_table {
    struct hlist_node tb_hlist;
    u32 tb_id;
    int tb_default;
    int tb_num_default;
    unsigned long tb_data[0];
};
```

(include/net/ip_fib.h)

 * tb_id: The table identifier. For the main table, tb_id is 254 (RT_TABLE_MAIN), and for the local table, tb_id is 255 (RT_TABLE_LOCAL). I talk about the main table and the local table soon—for now, just note that when working without Policy Routing, only these two FIB tables, the main table and the local table, are created at boot.

 * tb_num_default: The number of default routes in the table. The fib_trie_table() method, which creates a table, initializes tb_num_default to 0. Adding a default route increments tb_num_default by 1, in the fib_table_insert() method. Deleting a default route decrements tb_num_default by 1, in the fib_table_delete() method.

 * tb_data[0]: A placeholder for a routing entry (trie) object.

This section covered how a FIB table is implemented. Next you will learn about the FIB info, which represents a single routing entry.

### FIB Info

A routing entry is represented by a fib_info structure.
It consists of important routing entry parameters, such as the outgoing network device (fib_dev), the priority (fib_priority), the routing protocol identifier of this route (fib_protocol), and more. Let's take a look at the fib_info structure:

```c
struct fib_info {
    struct hlist_node fib_hash;
    struct hlist_node fib_lhash;
    struct net *fib_net;
    int fib_treeref;
    atomic_t fib_clntref;
    unsigned int fib_flags;
    unsigned char fib_dead;
    unsigned char fib_protocol;
    unsigned char fib_scope;
    unsigned char fib_type;
    __be32 fib_prefsrc;
    u32 fib_priority;
    u32 *fib_metrics;
#define fib_mtu fib_metrics[RTAX_MTU-1]
#define fib_window fib_metrics[RTAX_WINDOW-1]
#define fib_rtt fib_metrics[RTAX_RTT-1]
#define fib_advmss fib_metrics[RTAX_ADVMSS-1]
    int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
    int fib_power;
#endif
    struct rcu_head rcu;
    struct fib_nh fib_nh[0];
#define fib_dev fib_nh[0].nh_dev
};
```

(include/net/ip_fib.h)

 * fib_net: The network namespace the fib_info object belongs to.

 * fib_treeref: A reference counter that represents the number of fib_alias objects which hold a reference to this fib_info object. This reference counter is incremented in the fib_create_info() method and decremented in the fib_release_info() method. Both methods are in net/ipv4/fib_semantics.c.

 * fib_clntref: A reference counter that is incremented by the fib_create_info() method (net/ipv4/fib_semantics.c) and decremented by the fib_info_put() method (include/net/ip_fib.h). If, after being decremented by 1 in the fib_info_put() method, it reaches zero, then the associated fib_info object is freed by the free_fib_info() method.

 * fib_dead: A flag that indicates whether it is permitted to free the fib_info object with the free_fib_info() method; fib_dead must be set to 1 before calling the free_fib_info() method. If the fib_dead flag is not set (its value is 0), the object is considered alive, and trying to free it with the free_fib_info() method will fail.

 * fib_protocol: The routing protocol identifier of this route. When adding a routing rule from userspace without specifying the routing protocol ID, fib_protocol is assigned RTPROT_BOOT. The administrator may add a route with the "proto static" modifier, which indicates that the route was added by an administrator; this can be done, for example, like this: ip route add proto static 192.168.5.3 via 192.168.2.1. fib_protocol can be assigned one of these flags:

   * RTPROT_UNSPEC: An error value.

   * RTPROT_REDIRECT: When set, the routing entry was created as a result of receiving an ICMP Redirect message. The RTPROT_REDIRECT protocol identifier is used only in IPv6.

   * RTPROT_KERNEL: When set, the routing entry was created by the kernel (for example, when creating the local IPv4 routing table, explained shortly).

   * RTPROT_BOOT: When set, the admin added a route without specifying the "proto static" modifier.

   * RTPROT_STATIC: A route installed by the system administrator.

   * RTPROT_RA: Don't misread this—this protocol identifier is not for Router Alert; it is for RDISC/ND Router Advertisements, and it is used in the kernel by the IPv6 subsystem only; see net/ipv6/route.c. I discuss it in Chapter 8.

A routing entry could also be added by userspace routing daemons, like ZEBRA, XORP, MROUTED, and more. It will then be assigned the corresponding value from a list of protocol identifiers (see the RTPROT_XXX definitions in include/uapi/linux/rtnetlink.h).
For example, for the XORP daemon it will be RTPROT_XORP. Note that these flags (like RTPROT_KERNEL or RTPROT_STATIC) are also used by IPv6, for the parallel field (the rt6i_protocol field in the rt6_info structure; the rt6_info object is the IPv6 parallel to the rtable object).

 * fib_scope: The scope of the destination address. In short, scopes are assigned to addresses and routes, and a scope indicates the distance of the host from other nodes. The ip address show command shows the scopes of all configured IP addresses on a host, and the ip route show command displays the scopes of all the route entries of the main table. A scope can be one of these:

   * host (RT_SCOPE_HOST): The node cannot communicate with the other network nodes. The loopback address has scope host.

   * global (RT_SCOPE_UNIVERSE): The address can be used anywhere. This is the most common case.

   * link (RT_SCOPE_LINK): This address can be accessed only from directly attached hosts.

   * site (RT_SCOPE_SITE): This is used in IPv6 only (I discuss it in Chapter 8).

   * nowhere (RT_SCOPE_NOWHERE): The destination doesn't exist.

When a route is added by an administrator without specifying a scope, the fib_scope field is assigned a value according to these rules:

   * global scope (RT_SCOPE_UNIVERSE): For all gatewayed unicast routes.

   * scope link (RT_SCOPE_LINK): For direct unicast and broadcast routes.

   * scope host (RT_SCOPE_HOST): For local routes.

 * fib_type: The type of the route. The fib_type field was added to the fib_info structure as a key to make sure there is differentiation among fib_info objects by their type. The fib_type field was added to the fib_info structure in kernel 3.7; originally this type was stored only in the fa_type field of the FIB alias object (fib_alias). You can add a rule to block traffic according to a specified category, for example, by ip route add prohibit 192.168.1.17 from 192.168.2.103:

   * The fib_type of the generated fib_info object is RTN_PROHIBIT.

   * Sending traffic from 192.168.2.103 to 192.168.1.17 results in an ICMPv4 message of "Packet Filtered" (ICMP_PKT_FILTERED).

 * fib_prefsrc: There are cases when you want to provide a specific source address to the lookup key. This is done by setting fib_prefsrc.

 * fib_priority: The priority of the route. By default it is 0, which is the highest priority; the higher the value, the lower the priority. A route with priority 3, for example, has a lower priority than a route with priority 0. You can configure it with the ip command in one of the following ways:

   * ip route add 192.168.1.10 via 192.168.2.1 metric 5

   * ip route add 192.168.1.10 via 192.168.2.1 priority 5

   * ip route add 192.168.1.10 via 192.168.2.1 preference 5

Each of these three commands sets fib_priority to 5; there is no difference at all between them. Moreover, the metric parameter of the ip route command is not related in any way to the fib_metrics field of the fib_info structure.

 * fib_mtu, fib_window, fib_rtt, and fib_advmss simply give more convenient names to commonly used elements of the fib_metrics array.

fib_metrics is an array of 15 (RTAX_MAX) elements consisting of various metrics. It is initialized to dst_default_metrics in net/core/dst.c. Many metrics are related to the TCP protocol, such as the Initial Congestion Window (initcwnd) metric. Table 5-1, at the end of the chapter, shows all the available metrics and indicates whether each is a TCP-related metric.
From userspace, the TCPv4 initcwnd metric can be set, for example, like this:

```
ip route add 192.168.1.0/24 initcwnd 35
```

There are metrics which are not TCP specific—for example, the mtu metric, which can be set from userspace like this:

```
ip route add 192.168.1.0/24 mtu 800
```

or like this:

```
ip route add 192.168.1.0/24 mtu lock 800
```

The difference between the two commands is that when the lock modifier is specified, no Path MTU discovery will be tried; without it, the MTU may be updated by the kernel due to Path MTU discovery. For more about how this is implemented, see the __ip_rt_update_pmtu() method in net/ipv4/route.c. Avoiding the Path MTU update when the mtu lock modifier was specified is achieved by calling the dst_metric_locked() method:

```c
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
    ...
    if (dst_metric_locked(dst, RTAX_MTU))
        return;
    ...
}
```

 * fib_nhs: The number of nexthops. When Multipath Routing (CONFIG_IP_ROUTE_MULTIPATH) is not set, it cannot be more than 1. The Multipath Routing feature sets multiple alternative paths for a route, possibly assigning different weights to these paths. This feature provides benefits such as fault tolerance, increased bandwidth, or improved security (I discuss it in Chapter 6).

 * fib_dev: The network device that will transmit the packet to the nexthop.

 * fib_nh[0]: The fib_nh[0] member represents the nexthop. When working with Multipath Routing, you can define more than one nexthop in a route, and in this case there is an array of nexthops. Defining two nexthop nodes can be done like this, for example: ip route add default scope global nexthop dev eth0 nexthop dev eth1.

As mentioned, when the fib_type is RTN_PROHIBIT, an ICMPv4 message of "Packet Filtered" (ICMP_PKT_FILTERED) is sent. How is this implemented? An array named fib_props, consisting of 12 (RTN_MAX + 1) elements, is defined in net/ipv4/fib_semantics.c. The index into this array is the route type. The available route types, such as RTN_PROHIBIT or RTN_UNICAST, can be found in include/uapi/linux/rtnetlink.h. Each element in the array is an instance of struct fib_prop, which is a very simple structure:

```c
struct fib_prop {
    int error;
    u8 scope;
};
```

(net/ipv4/fib_lookup.h)

For every route type, the corresponding fib_prop object contains the error and the scope for that route. For example, for the RTN_UNICAST route type (gateway or direct route), which is a very common route type, the error value is 0, which means that there is no error, and the scope is RT_SCOPE_UNIVERSE. For the RTN_PROHIBIT route type (a rule which a system administrator configures in order to block traffic), the error is -EACCES, and the scope is RT_SCOPE_UNIVERSE:

```c
const struct fib_prop fib_props[RTN_MAX + 1] = {
    ...
    [RTN_PROHIBIT] = {
        .error = -EACCES,
        .scope = RT_SCOPE_UNIVERSE,
    },
    ...
```

Table 5-2 at the end of this chapter shows all available route types, their error codes, and their scopes.

When you configure a rule like the one mentioned earlier, by ip route add prohibit 192.168.1.17 from 192.168.2.103, and a packet is sent from 192.168.2.103 to 192.168.1.17, the following happens: a lookup in the routing tables is performed in the Rx path. When a corresponding entry, which is in fact a leaf in the FIB TRIE, is found, the check_leaf() method is invoked. This method accesses the fib_props array with the route type of the packet as an index (fa->fa_type):

```c
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
                      t_key key, const struct flowi4 *flp,
                      struct fib_result *res, int fib_flags)
{
    ...
    fib_alias_accessed(fa);
    err = fib_props[fa->fa_type].error;
    if (err) {
        ...
        return err;
    }
    ...
```

Eventually, the fib_lookup() method, which initiated the lookup in the IPv4 routing subsystem, returns an error of -EACCES (in our case). It propagates all the way back from check_leaf(), via fib_table_lookup() and so on, until it returns to the method which triggered this chain, namely the fib_lookup() method. When the fib_lookup() method returns an error in the Rx path, it is handled by the ip_error() method. According to the error, an action is taken; in the case of -EACCES, an ICMPv4 destination unreachable message with code Packet Filtered (ICMP_PKT_FILTERED) is sent back, and the packet is dropped.
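The error/scope pairs in fib_props come straight from Table 5-2 at the end of this chapter. Here is a standalone userspace model of that table-driven dispatch; the fib_prop structure is redeclared locally and only a few route types are filled in, so this is an illustration rather than the kernel array:

```c
/* Userspace model of the fib_props table described above:
 * route type -> { error, scope }, with values taken from Table 5-2. */
#include <stdio.h>
#include <errno.h>
#include <linux/rtnetlink.h>    /* RTN_*, RT_SCOPE_* */

struct fib_prop {
    int error;
    unsigned char scope;
};

static const struct fib_prop fib_props_model[RTN_MAX + 1] = {
    [RTN_UNICAST]     = { .error = 0,             .scope = RT_SCOPE_UNIVERSE },
    [RTN_LOCAL]       = { .error = 0,             .scope = RT_SCOPE_HOST },
    [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE },
    [RTN_PROHIBIT]    = { .error = -EACCES,       .scope = RT_SCOPE_UNIVERSE },
};

int main(void)
{
    int err = fib_props_model[RTN_PROHIBIT].error;

    if (err)    /* as in check_leaf(): a nonzero error aborts the lookup */
        printf("RTN_PROHIBIT -> error %d (-EACCES)\n", err);
    return 0;
}
```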
This section covered the FIB info, which represents a single routing entry. The next section discusses caching in the IPv4 routing subsystem (not to be confused with the IPv4 routing cache, which was removed from the network stack and is discussed in the "IPv4 Routing Cache" section at the end of this chapter).

### Caching

Caching the results of a routing lookup is an optimization technique that improves the performance of the routing subsystem. The results of a routing lookup are usually cached in the nexthop (fib_nh) object; when the packet is not a unicast packet, or realms are used (the packet itag is not 0), the results are not cached in the nexthop. The reason is that if all types of packets were cached, the same nexthop could be used by different kinds of routes, which should be avoided. There are some minor exceptions to this which I do not discuss in this chapter. Caching in the Rx and Tx paths is performed as follows:

 * In the Rx path, caching the fib_result object in the nexthop (fib_nh) object is done by setting the nh_rth_input field of the nexthop (fib_nh) object.

 * In the Tx path, caching the fib_result object in the nexthop (fib_nh) object is done by setting the nh_pcpu_rth_output field of the nexthop (fib_nh) object.

 * Both nh_rth_input and nh_pcpu_rth_output are instances of the rtable structure.

 * Caching the fib_result is done by the rt_cache_route() method in both the Rx and Tx paths (net/ipv4/route.c).

 * Caching of Path MTU and ICMPv4 redirects is done with FIB exceptions.

For performance, nh_pcpu_rth_output is a per-CPU variable, meaning there is a copy of the output dst entry for each CPU. Caching is almost always used; the few exceptions are when an ICMPv4 Redirect message is sent, when itag (tclassid) is set, or when there is not enough memory.
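The design choice behind the per-CPU nh_pcpu_rth_output can be sketched in a few lines: giving each CPU its own cached output route means a CPU never writes to another CPU's slot, so no lock is needed on this fast path. The following is a standalone userspace model with a fixed CPU count and hypothetical *_model names, not the kernel's per-CPU infrastructure:

```c
/* Userspace model of the per-CPU output route cache idea described above:
 * one cached entry per CPU, so CPUs never contend for the same slot. */
#include <stdio.h>

#define NR_CPUS_MODEL 4

struct rtable_model { unsigned int dst; };

/* models fib_nh->nh_pcpu_rth_output */
static struct rtable_model *nh_pcpu_rth_output_model[NR_CPUS_MODEL];

static void cache_output_route(int cpu, struct rtable_model *rt)
{
    /* each CPU touches only its own slot: no locking required */
    nh_pcpu_rth_output_model[cpu] = rt;
}

int main(void)
{
    static struct rtable_model rt = { .dst = 0xc0a80207 }; /* 192.168.2.7 */

    cache_output_route(0, &rt); /* as if running on CPU 0 */
    printf("CPU0 cached dst 0x%x\n", nh_pcpu_rth_output_model[0]->dst);
    return 0;
}
```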
In this section you have learned how caching is done using the nexthop object. The next section discusses the fib_nh structure, which represents the nexthop, and the FIB nexthop exceptions.

### Nexthop (fib_nh)

The fib_nh structure represents the nexthop. It consists of information such as the outgoing nexthop network device (nh_dev), the outgoing nexthop interface index (nh_oif), the scope (nh_scope), and more. Let's take a look:

```c
struct fib_nh {
    struct net_device *nh_dev;
    struct hlist_node nh_hash;
    struct fib_info *nh_parent;
    unsigned int nh_flags;
    unsigned char nh_scope;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
    int nh_weight;
    int nh_power;
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
    __u32 nh_tclassid;
#endif
    int nh_oif;
    __be32 nh_gw;
    __be32 nh_saddr;
    int nh_saddr_genid;
    struct rtable __rcu * __percpu *nh_pcpu_rth_output;
    struct rtable __rcu *nh_rth_input;
    struct fnhe_hash_bucket *nh_exceptions;
};
```

(include/net/ip_fib.h)

The nh_dev field represents the network device (net_device object) on which traffic to the nexthop will be transmitted. When a network device associated with one or more routes is disabled, a NETDEV_DOWN notification is sent. The FIB callback for handling this event is the fib_netdev_event() method; it is the callback of the fib_netdev_notifier notifier object, which is registered in the ip_fib_init() method by calling the register_netdevice_notifier() method (notification chains are discussed in Chapter 14). The fib_netdev_event() method calls the fib_disable_ip() method upon receiving a NETDEV_DOWN notification. In the fib_disable_ip() method, the following steps are performed:

 * First, the fib_sync_down_dev() method is called (net/ipv4/fib_semantics.c). In the fib_sync_down_dev() method, the RTNH_F_DEAD flag is set in the nexthop flags (nh_flags) and in the FIB info flags (fib_flags).

 * The routes are flushed by the fib_flush() method.

 * The rt_cache_flush() method and the arp_ifdown() method are invoked. The arp_ifdown() method is not on any notifier chain.

#### FIB Nexthop Exceptions

FIB nexthop exceptions were added in kernel 3.6 to handle cases when a routing entry is changed not as a result of a userspace action, but as a result of an ICMPv4 Redirect message or as a result of Path MTU discovery. The hash key is the destination address. The FIB nexthop exceptions are based on a 2048-entry hash table; reclaiming (freeing hash entries) starts at a chain depth of 5. Each nexthop object (fib_nh) has a FIB nexthop exceptions hash table, nh_exceptions (an instance of the fnhe_hash_bucket structure). Let's take a look at the fib_nh_exception structure:

```c
struct fib_nh_exception {
    struct fib_nh_exception __rcu *fnhe_next;
    __be32 fnhe_daddr;
    u32 fnhe_pmtu;
    __be32 fnhe_gw;
    unsigned long fnhe_expires;
    struct rtable __rcu *fnhe_rth;
    unsigned long fnhe_stamp;
};
```

(include/net/ip_fib.h)

The fib_nh_exception objects are created by the update_or_create_fnhe() method (net/ipv4/route.c). Where are FIB nexthop exceptions generated? The first case is when receiving an ICMPv4 Redirect message ("Redirect to Host") in the __ip_do_redirect() method. The "Redirect to Host" message includes a new gateway. The fnhe_gw field of the fib_nh_exception is set to the new gateway when creating the FIB nexthop exception object (in the update_or_create_fnhe() method):

```c
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
    ...
    __be32 new_gw = icmp_hdr(skb)->un.gateway;
    ...
    update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, 0);
    ...
}
```

The second case of generating FIB nexthop exceptions is when the Path MTU has changed, in the __ip_rt_update_pmtu() method.
In such a case, the fnhe_pmtu field of the fib_nh_exception object is set to the new MTU when creating the FIB nexthop exception object (in the update_or_create_fnhe() method). The PMTU value expires if it is not updated within 10 minutes (ip_rt_mtu_expires). This period is checked on every dst_mtu() call, via the ipv4_mtu() method, which is a dst->ops->mtu handler. ip_rt_mtu_expires, which is 600 seconds by default, can be configured via the procfs entry /proc/sys/net/ipv4/route/mtu_expires:

```c
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
    ...
    if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
        struct fib_nh *nh = &FIB_RES_NH(res);

        update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                              jiffies + ip_rt_mtu_expires);
    }
    ...
}
```

Note

FIB nexthop exceptions are used in the Tx path. Starting with Linux 3.11, they are also used in the Rx path. As a result, instead of fnhe_rth, there are fnhe_rth_input and fnhe_rth_output.

Since kernel 2.4, Policy Routing is supported. With Policy Routing, the routing of a packet depends not only on the destination address, but also on other factors, such as the source address or the TOS. The system administrator can add up to 255 routing tables.

### Policy Routing

When working without Policy Routing (CONFIG_IP_MULTIPLE_TABLES is not set), two routing tables are created: the local table and the main table. The main table id is 254 (RT_TABLE_MAIN), and the local table id is 255 (RT_TABLE_LOCAL). The local table contains routing entries of local addresses; these routing entries can be added to the local table only by the kernel. Adding routing entries to the main table (RT_TABLE_MAIN) is done by a system administrator (via ip route add, for example). These tables are created by the fib4_rules_init() method of net/ipv4/fib_frontend.c. The tables were called ip_fib_local_table and ip_fib_main_table in kernels prior to 2.6.25, but those variables were removed in favor of unified access to the routing tables with the fib_get_table() method with the appropriate argument. By unified access, I mean that access to the routing tables is done in the same way, with the fib_get_table() method, whether Policy Routing support is enabled or disabled. The fib_get_table() method gets only two arguments: the network namespace and the table id. Note that there is a different method with the same name, fib4_rules_init(), for the Policy Routing case, in net/ipv4/fib_rules.c, which is invoked when working with Policy Routing support. When working with Policy Routing support (CONFIG_IP_MULTIPLE_TABLES is set), there are three initial tables (local, main, and default), and there can be up to 255 routing tables. I talk more about Policy Routing in Chapter 6. Access to the main routing table can be done as follows:

 * By a system administrator command (using ip route or route):

   * Adding a route by ip route add is implemented by sending an RTM_NEWROUTE message from userspace, which is handled by the inet_rtm_newroute() method. Note that a route is not necessarily a rule that permits traffic. You can also add a route that blocks traffic, for example, by ip route add prohibit 192.168.1.17 from 192.168.2.103. As a result of applying this rule, all packets sent from 192.168.2.103 to 192.168.1.17 will be blocked.

   * Deleting a route by ip route del is implemented by sending an RTM_DELROUTE message from userspace, which is handled by the inet_rtm_delroute() method.

   * Dumping a routing table by ip route show is implemented by sending an RTM_GETROUTE message from userspace, which is handled by the inet_dump_fib() method (see the sketch following this list). Note that ip route show displays the main table; for displaying the local table, you should run ip route show table local.

   * Adding a route by route add is implemented by sending a SIOCADDRT IOCTL, which is handled by the ip_rt_ioctl() method (net/ipv4/fib_frontend.c).

   * Deleting a route by route del is implemented by sending a SIOCDELRT IOCTL, which is handled by the ip_rt_ioctl() method (net/ipv4/fib_frontend.c).

 * By userspace routing daemons which implement routing protocols like BGP (Border Gateway Protocol), EGP (Exterior Gateway Protocol), or OSPF (Open Shortest Path First). These routing daemons run on core routers, which operate in the Internet backbone, and can handle hundreds of thousands of routes.
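To make the RTM_GETROUTE path above concrete, here is a minimal userspace sketch that dumps the IPv4 routing tables over a NETLINK_ROUTE socket, much as ip route show does. Error handling is trimmed and only a single recv() batch is processed, so this is an illustration rather than a robust client:

```c
/* Minimal rtnetlink dump: sends RTM_GETROUTE with NLM_F_DUMP and prints
 * the table id and destination prefix length of each returned route. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
    struct {
        struct nlmsghdr nlh;
        struct rtmsg rtm;
    } req = {
        .nlh = {
            .nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
            .nlmsg_type = RTM_GETROUTE,
            .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
        },
        .rtm = { .rtm_family = AF_INET },
    };
    char buf[16384];
    int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

    send(fd, &req, req.nlh.nlmsg_len, 0);

    int len = recv(fd, buf, sizeof(buf), 0);  /* one batch is enough here */
    for (struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
         NLMSG_OK(nlh, len) && nlh->nlmsg_type != NLMSG_DONE;
         nlh = NLMSG_NEXT(nlh, len)) {
        struct rtmsg *r = NLMSG_DATA(nlh);
        printf("table %u dst prefixlen %u\n", r->rtm_table, r->rtm_dst_len);
    }
    close(fd);
    return 0;
}
```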
I should mention here that routes that were changed as a result of an ICMPv4 Redirect message or as a result of Path MTU discovery are cached in the nexthop exception table, discussed earlier in this chapter. The next section describes the FIB alias, which helps in routing optimizations.

### FIB Alias (fib_alias)

There are cases when several routing entries to the same destination address or to the same subnet are created; these routing entries differ only in the value of their TOS. Instead of creating a fib_info for each such route, a fib_alias object is created. A fib_alias object is smaller, which reduces memory consumption. Here is a simple example of creating three fib_alias objects:

```
ip route add 192.168.1.10 via 192.168.2.1 tos 0x2
ip route add 192.168.1.10 via 192.168.2.1 tos 0x4
ip route add 192.168.1.10 via 192.168.2.1 tos 0x6
```

Let's take a look at the fib_alias structure definition:

```c
struct fib_alias {
    struct list_head fa_list;
    struct fib_info *fa_info;
    u8 fa_tos;
    u8 fa_type;
    u8 fa_state;
    struct rcu_head rcu;
};
```

(net/ipv4/fib_lookup.h)

Note that there was also a scope field in the fib_alias structure (fa_scope), but it was moved to the fib_info structure in kernel 2.6.39.

The fib_alias object stores routes to the same subnet but with different parameters. You can have one fib_info object which is shared by many fib_alias objects; in this case, the fa_info pointer in all these fib_alias objects points to the same shared fib_info object. In Figure 5-3, you can see one fib_info object which is shared by three fib_alias objects, each with a different fa_tos. Note that the reference counter value of the fib_info object is 3 (fib_treeref).

Figure 5-3. A fib_info which is shared by three fib_alias objects. Each fib_alias object has a different fa_tos value

Let's take a look at what happens when you try to add a key for which a fib_node was already added before (as in the earlier example with the three TOS values 0x2, 0x4, and 0x6); suppose you had created the first rule with a TOS of 0x2, and now you create the second rule, with a TOS of 0x4.

A fib_alias object is created by the fib_table_insert() method, which is the method that handles adding a routing entry:

```c
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
    struct trie *t = (struct trie *)tb->tb_data;
    struct fib_alias *fa, *new_fa;
    struct list_head *fa_head = NULL;
    struct fib_info *fi;
    ...
```

First, a fib_info object is created.
Note that in the fib_create_info() method, after allocating and creating a fib_info object, a lookup is performed to check whether a similar object already exists, by calling the fib_find_info() method. If such an object exists, the new object will be freed, and the reference counter of the object that was found (ofi in the code snippet you will shortly see) will be incremented by 1:

```c
    fi = fib_create_info(cfg);
```

Let's take a look at the code snippet in the fib_create_info() method mentioned earlier; when creating the second TOS rule, the fib_info object of the first rule and the fib_info object of the second rule are identical. You should remember that the TOS field exists in the fib_alias object but not in the fib_info object:

```c
struct fib_info *fib_create_info(struct fib_config *cfg)
{
    struct fib_info *fi = NULL;
    struct fib_info *ofi;
    ...
    fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh), GFP_KERNEL);
    if (fi == NULL)
        goto failure;
    ...
link_it:
    ofi = fib_find_info(fi);
```

If a similar object is found, free the newly created fib_info object and increment the fib_treeref reference counter of the object that was found:

```c
    if (ofi) {
        fi->fib_dead = 1;
        free_fib_info(fi);
        ofi->fib_treeref++;
        return ofi;
    }
    ...
}
```

Now a check is performed to find out whether there is an alias to the fib_info object; in this case there will be no alias, because the TOS of the second rule is different from the TOS of the first rule:

```c
    l = fib_find_node(t, key);
    fa = NULL;
    if (l) {
        fa_head = get_fa_head(l, plen);
        fa = fib_find_alias(fa_head, tos, fi->fib_priority);
    }
    if (fa && fa->fa_tos == tos &&
        fa->fa_info->fib_priority == fi->fib_priority) {
        ...
    }
```

Now a fib_alias is created, and its fa_info pointer is assigned to point to the fib_info of the first rule that was created:

```c
    new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
    if (new_fa == NULL)
        goto out;

    new_fa->fa_info = fi;
    ...
```

Now that I have covered the FIB alias, you are ready to look at the ICMPv4 Redirect message, which is sent when there is a suboptimal route.

## ICMPv4 Redirect Message

There are cases when a routing entry is suboptimal. In such cases, an ICMPv4 Redirect message is sent. The main criterion for a suboptimal entry is that the input device and the output device are the same, but more conditions must also be fulfilled for an ICMPv4 Redirect message to be sent, as you will see in this section. There are four codes of ICMPv4 Redirect messages:

 * ICMP_REDIR_NET: Redirect Net

 * ICMP_REDIR_HOST: Redirect Host

 * ICMP_REDIR_NETTOS: Redirect Net for TOS

 * ICMP_REDIR_HOSTTOS: Redirect Host for TOS

Figure 5-4 shows a setup where there is a suboptimal route. There are three machines in this setup, all on the same subnet (192.168.2.0/24) and all connected via a gateway (192.168.2.1). The AMD server (192.168.2.200) added the Windows server (192.168.2.10) as a gateway for accessing 192.168.2.7 (the laptop) by ip route add 192.168.2.7 via 192.168.2.10. The AMD server sends traffic to the laptop, for example, by ping 192.168.2.7. Because the gateway for that destination is 192.168.2.10, the traffic is sent to 192.168.2.10. The Windows server detects that this is a suboptimal route, because the AMD server could send the traffic directly to 192.168.2.7, and sends back to the AMD server an ICMPv4 Redirect message with the ICMP_REDIR_HOST code.

Figure 5-4. Redirect to Host (ICMP_REDIR_HOST), a simple setup
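Such a redirect, as received by the AMD server in this setup, can be observed from userspace with a raw ICMPv4 socket. The following sketch requires root and performs only minimal length checks, so it is a diagnostic toy rather than production code; it prints the advised gateway from the same un.gateway field that the kernel reads in the "Receiving an ICMPv4 Redirect Message" section below:

```c
/* Raw-socket observer for ICMPv4 Redirect (type 5) messages; prints the
 * advised new gateway. Illustration only; run as root. */
#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip.h>
#include <linux/icmp.h>

int main(void)
{
    char buf[1500];
    int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

    for (;;) {
        ssize_t n = recv(fd, buf, sizeof(buf), 0);
        if (n < (ssize_t)(sizeof(struct iphdr) + sizeof(struct icmphdr)))
            continue;
        struct iphdr *iph = (struct iphdr *)buf;
        struct icmphdr *icmph = (struct icmphdr *)(buf + iph->ihl * 4);

        if (icmph->type == ICMP_REDIRECT) {
            struct in_addr gw = { .s_addr = icmph->un.gateway };
            printf("redirect code %u, new gateway %s\n",
                   icmph->code, inet_ntoa(gw));
        }
    }
}
```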
Now that you have a better understanding of redirects, let's look at how an ICMPv4 Redirect message is generated.

### Generating an ICMPv4 Redirect Message

An ICMPv4 Redirect message is sent when there is a suboptimal route. The most notable condition for a suboptimal route is that the input device and the output device are the same, but there are some more conditions which must be met. Generating an ICMPv4 Redirect message is done in two phases:

 * In the __mkroute_input() method: Here the RTCF_DOREDIRECT flag is set if needed.

 * In the ip_forward() method: Here the ICMPv4 Redirect message is actually sent, by calling the ip_rt_send_redirect() method.

```c
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
    struct rtable *rth;
    int err;
    struct in_device *out_dev;
    unsigned int flags = 0;
    bool do_cache;
```

All of the following conditions must be met for the RTCF_DOREDIRECT flag to be set:

 * The input device and the output device are the same.

 * The procfs entry /proc/sys/net/ipv4/conf/<device>/send_redirects is set.

 * Either the outgoing device is a shared media, or the source address (saddr) and the nexthop gateway address (nh_gw) are on the same subnet:

```c
    if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
        (IN_DEV_SHARED_MEDIA(out_dev) ||
         inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
        flags |= RTCF_DOREDIRECT;
        do_cache = false;
    }
    ...
```

Setting the rtable object flags is done by:

```c
    rth->rt_flags = flags;
    ...
}
```

Sending the ICMPv4 Redirect message is done in the second phase, by the ip_forward() method:

```c
int ip_forward(struct sk_buff *skb)
{
    struct iphdr *iph;      /* Our header */
    struct rtable *rt;      /* Route we use */
    struct ip_options *opt = &(IPCB(skb)->opt);
```

Next a check is performed to see whether the RTCF_DOREDIRECT flag is set, whether an IP option of strict route does not exist (see Chapter 4), and whether it is not an IPsec packet. (With IPsec tunnels, the input device of the tunneled packet can be the same as the outgoing device of the decapsulated packet; see http://lists.openwall.net/netdev/2007/08/24/29 ):

```c
    if (rt->rt_flags & RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
        ip_rt_send_redirect(skb);
```

In the ip_rt_send_redirect() method, the ICMPv4 Redirect message is actually sent. The third parameter is the IP address of the advised new gateway, which will be 192.168.2.7 in this case (the address of the laptop):

```c
void ip_rt_send_redirect(struct sk_buff *skb)
{
    ...
    icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
              rt_nexthop(rt, ip_hdr(skb)->daddr));
    ...
}
```

(net/ipv4/route.c)

### Receiving an ICMPv4 Redirect Message

For an ICMPv4 Redirect message to be processed, it should pass some sanity checks. Handling an ICMPv4 Redirect message is done by the __ip_do_redirect() method:

```c
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
    __be32 new_gw = icmp_hdr(skb)->un.gateway;
    __be32 old_gw = ip_hdr(skb)->saddr;
    struct net_device *dev = skb->dev;
    struct in_device *in_dev;
    struct fib_result res;
    struct neighbour *n;
    struct net *net;
    ...
```

Various checks are performed, such as that the network device is set to accept redirects.
The redirect is rejected if necessary:

```c
    if (rt->rt_gateway != old_gw)
        return;

    in_dev = __in_dev_get_rcu(dev);
    if (!in_dev)
        return;

    net = dev_net(dev);
    if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
        ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
        ipv4_is_zeronet(new_gw))
        goto reject_redirect;

    if (!IN_DEV_SHARED_MEDIA(in_dev)) {
        if (!inet_addr_onlink(in_dev, new_gw, old_gw))
            goto reject_redirect;
        if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
            goto reject_redirect;
    } else {
        if (inet_addr_type(net, new_gw) != RTN_UNICAST)
            goto reject_redirect;
    }
```

A lookup in the neighbouring subsystem is performed; the key to the lookup is the address of the advised gateway, new_gw, which was extracted from the ICMPv4 message at the beginning of this method:

```c
    n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
    if (n) {
        if (!(n->nud_state & NUD_VALID)) {
            neigh_event_send(n, NULL);
        } else {
            if (fib_lookup(net, fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
```

Create or update a FIB nexthop exception, specifying the IP address of the advised gateway (new_gw):

```c
                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                      0, 0);
            }
            if (kill_route)
                rt->dst.obsolete = DST_OBSOLETE_KILL;
            call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
        }
        neigh_release(n);
    }
    return;

reject_redirect:
    ...
```

(net/ipv4/route.c)

Now that we've covered how a received ICMPv4 Redirect message is handled, we can tackle the IPv4 routing cache and the reasons for its removal.

### IPv4 Routing Cache

In kernels prior to 3.6, there was an IPv4 routing cache with a garbage collector. The IPv4 routing cache was removed in kernel 3.6 (around July 2012). For years, FIB TRIE and FIB hash were both available in the kernel, but FIB TRIE was not the default. Having the FIB TRIE made it possible to remove the IPv4 routing cache, which had Denial of Service (DoS) issues. FIB TRIE (also known as LC-trie) is a longest prefix match lookup algorithm that performs better than FIB hash for large routing tables. It consumes more memory and is more complex, but because it performs better, it made the removal of the routing cache feasible. The FIB TRIE code existed for a long time before it was merged as the default. The main reason for the removal of the IPv4 routing cache was that launching DoS attacks against it was easy, because the IPv4 routing cache created a cache entry for each unique flow; that meant that by sending packets to random destinations, you could generate an unlimited number of routing cache entries.

Merging the FIB TRIE entailed removing the routing cache, the cumbersome FIB hash tables, and the routing cache garbage collector methods. This chapter discusses the routing cache only briefly. In case you wonder why it is discussed at all, note that in the Linux-based software industry, in commercial distributions like Red Hat Enterprise Linux, the kernels are fully maintained and fully supported for a very long period of time (Red Hat, for example, gives support for its distributions for up to seven years). So it is very likely that some readers will be involved in projects based on kernels prior to 3.6, where you will find the routing cache and the FIB hash-based routing tables. Delving into the theory and implementation details of the FIB TRIE data structure is beyond the scope of this book. To learn more, I recommend the article "TRASH—A dynamic LC-trie and hash data structure," by Robert Olsson and Stefan Nilsson, www.nada.kth.se/~snilsson/publications/TRASH/trash.pdf .
Note that with the IPv4 routing cache implementation, there is a single cache, regardless of how many routing tables are used (there can be up to 255 routing tables when using Policy Routing). Note also that there was support for an IPv4 Multipath Routing cache, but it was removed in kernel 2.6.23, in 2007; in fact, it never worked very well and never got out of the experimental state.

For kernels prior to 3.6, where the FIB TRIE is not yet merged, the lookup in the IPv4 routing subsystem was different: access to the routing tables was preceded by access to the routing cache, the tables were organized differently, and there was a routing cache garbage collector, which was both asynchronous (a periodic timer) and synchronous (activated under specific conditions, for example, when the number of cache entries exceeded some threshold). The cache was basically a big hash keyed by the IP flow source address, destination address, and TOS, associated with all flow-specific information like the neighbour entry, PMTU, redirect, TCP MSS info, and so on. The benefit here was that cached entries were fast to look up and contained all the information needed by higher layers.

Note

The following two sections ("Rx Path" and "Tx Path") refer to the 2.6.38 kernel.

#### Rx Path

In the Rx path, first the ip_route_input_common() method is invoked. This method performs a lookup in the IPv4 routing cache, which is much quicker than a lookup in the IPv4 routing tables. Lookup in these routing tables is based on the Longest Prefix Match (LPM) search algorithm: of all the table entries that cover the destination address, the most specific one—the one with the longest subnet mask—wins. In case the lookup in the routing cache fails (a "cache miss"), a lookup in the routing tables is performed by calling the ip_route_input_slow() method. This method calls the fib_lookup() method to perform the actual lookup. Upon success, it calls the ip_mkroute_input() method, which (among other actions) inserts the routing entry into the routing cache by calling the rt_intern_hash() method.
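The Longest Prefix Match rule itself can be shown with a standalone sketch; the following uses a linear scan over a tiny table, whereas the kernel of course uses the LC-trie (or, here, the FIB hash), not this loop:

```c
/* Longest Prefix Match illustration: among all entries whose prefix covers
 * the destination, pick the one with the longest prefix. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct entry {
    const char *prefix;
    int plen;
    const char *via;
};

static const struct entry table[] = {
    { "0.0.0.0",      0, "default gateway" },
    { "192.168.0.0", 16, "gw A" },
    { "192.168.2.0", 24, "gw B" },
};

int main(void)
{
    struct in_addr dst;
    inet_pton(AF_INET, "192.168.2.7", &dst);

    const struct entry *best = NULL;
    for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        struct in_addr net;
        inet_pton(AF_INET, table[i].prefix, &net);
        uint32_t mask = table[i].plen ?
                htonl(~0u << (32 - table[i].plen)) : 0;
        if ((dst.s_addr & mask) == net.s_addr &&
            (!best || table[i].plen > best->plen))
            best = &table[i];   /* more specific entry wins */
    }
    if (best)
        printf("192.168.2.7 -> %s (/%d)\n", best->via, best->plen);
    return 0;
}
```

Running it prints `192.168.2.7 -> gw B (/24)`: both 192.168.0.0/16 and 192.168.2.0/24 cover the destination, and the /24 entry wins.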
#### Tx Path

In the Tx path, first the ip_route_output_key() method is invoked. This method performs a lookup in the IPv4 routing cache. In case of a cache miss, it calls the ip_route_output_slow() method, which calls the fib_lookup() method to perform a lookup in the routing subsystem. Subsequently, upon success, it calls the ip_mkroute_output() method, which (among other actions) inserts the routing entry into the routing cache by calling the rt_intern_hash() method.

## Summary

This chapter covered various topics of the IPv4 routing subsystem. The routing subsystem is essential for handling both incoming and outgoing packets. You learned about various topics like forwarding, lookup in the routing subsystem, the organization of the FIB tables, Policy Routing, and the ICMPv4 Redirect message. You also learned about the optimization gained with the FIB alias, and about the removal of the routing cache and the reasons for it. The next chapter covers advanced topics of the IPv4 routing subsystem.

## Quick Reference

I conclude this chapter with a short list of important methods, macros, and tables of the IPv4 routing subsystem, along with a short explanation about routing flags.

Note

The IPv4 routing subsystem is implemented in these modules under net/ipv4: fib_frontend.c, fib_trie.c, fib_semantics.c, and route.c.

The fib_rules.c module implements Policy Routing; it is compiled only when CONFIG_IP_MULTIPLE_TABLES is set. Among the most important header files are net/ipv4/fib_lookup.h, include/net/ip_fib.h, and include/net/route.h.

The destination cache (dst) implementation is in net/core/dst.c and include/net/dst.h.

CONFIG_IP_ROUTE_MULTIPATH should be set for Multipath Routing support.

### Methods

This section lists the methods that were mentioned in this chapter.

#### int fib_table_insert(struct fib_table *tb, struct fib_config *cfg);

This method inserts an IPv4 routing entry into the specified FIB table (fib_table object), based on the specified fib_config object.

#### int fib_table_delete(struct fib_table *tb, struct fib_config *cfg);

This method deletes an IPv4 routing entry from the specified FIB table (fib_table object), based on the specified fib_config object.

#### struct fib_info *fib_create_info(struct fib_config *cfg);

This method creates a fib_info object derived from the specified fib_config object.

#### void free_fib_info(struct fib_info *fi);

This method frees a fib_info object, on condition that it is not alive (the fib_dead flag is not 0), and decrements the global fib_info objects counter (fib_info_cnt).

#### void fib_alias_accessed(struct fib_alias *fa);

This method sets the fa_state flag of the specified fib_alias to FA_S_ACCESSED. Note that FA_S_ACCESSED is the only fa_state flag.

#### void ip_rt_send_redirect(struct sk_buff *skb);

This method sends an ICMPv4 Redirect message, as a response to a suboptimal path.

#### void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route);

This method handles receiving an ICMPv4 Redirect message.

#### void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, u32 pmtu, unsigned long expires);

This method creates a FIB nexthop exception table (fib_nh_exception) in the specified nexthop object (fib_nh), if it does not already exist, and initializes it. It is invoked when a route should be updated due to an ICMPv4 Redirect or due to PMTU discovery.

#### u32 dst_metric(const struct dst_entry *dst, int metric);

This method returns the specified metric of the specified dst object.

#### struct fib_table *fib_trie_table(u32 id);

This method allocates and initializes a FIB TRIE table.

#### struct leaf *fib_find_node(struct trie *t, u32 key);

This method performs a TRIE lookup with the specified key. It returns a leaf object upon success, or NULL in case of failure.

### Macros

This section lists macros of the IPv4 routing subsystem, some of which were mentioned in this chapter.

#### FIB_RES_GW()

This macro returns the nh_gw field (nexthop gateway address) associated with the specified fib_result object.

#### FIB_RES_DEV()

This macro returns the nh_dev field (nexthop net_device object) associated with the specified fib_result object.

#### FIB_RES_OIF()

This macro returns the nh_oif field (nexthop output interface index) associated with the specified fib_result object.

#### FIB_RES_NH()

This macro returns the nexthop (fib_nh object) of the fib_info of the specified fib_result object.
When Multipath Routing is set, there can be multiple nexthops; in that case, the value of the nh_sel field of the specified fib_result object is taken into account as an index into the array of nexthops embedded in the fib_info object.

(include/net/ip_fib.h)

#### IN_DEV_FORWARD()

This macro checks whether the specified network device (in_device object) supports IPv4 forwarding.

#### IN_DEV_RX_REDIRECTS()

This macro checks whether the specified network device (in_device object) supports accepting ICMPv4 Redirects.

#### IN_DEV_TX_REDIRECTS()

This macro checks whether the specified network device (in_device object) supports sending ICMPv4 Redirects.

#### IS_LEAF()

This macro checks whether the specified tree node is a leaf.

#### IS_TNODE()

This macro checks whether the specified tree node is an internal node (trie node, or tnode).

#### change_nexthops()

This macro iterates over the nexthops of the specified fib_info object (net/ipv4/fib_semantics.c).

### Tables

There are 15 (RTAX_MAX) metrics for routes. Some of them are TCP related, and some are general. Table 5-1 shows which of these metrics are TCP related.

Table 5-1. Route Metrics

| Linux Symbol | TCP Metric (Y/N) |
|---|---|
| RTAX_UNSPEC | N |
| RTAX_LOCK | N |
| RTAX_MTU | N |
| RTAX_WINDOW | Y |
| RTAX_RTT | Y |
| RTAX_RTTVAR | Y |
| RTAX_SSTHRESH | Y |
| RTAX_CWND | Y |
| RTAX_ADVMSS | Y |
| RTAX_REORDERING | Y |
| RTAX_HOPLIMIT | N |
| RTAX_INITCWND | Y |
| RTAX_FEATURES | N |
| RTAX_RTO_MIN | Y |
| RTAX_INITRWND | Y |

(include/uapi/linux/rtnetlink.h)

Table 5-2 shows the error value and the scope of all the route types.

Table 5-2. Route Types

| Linux Symbol | Error | Scope |
|---|---|---|
| RTN_UNSPEC | 0 | RT_SCOPE_NOWHERE |
| RTN_UNICAST | 0 | RT_SCOPE_UNIVERSE |
| RTN_LOCAL | 0 | RT_SCOPE_HOST |
| RTN_BROADCAST | 0 | RT_SCOPE_LINK |
| RTN_ANYCAST | 0 | RT_SCOPE_LINK |
| RTN_MULTICAST | 0 | RT_SCOPE_UNIVERSE |
| RTN_BLACKHOLE | -EINVAL | RT_SCOPE_UNIVERSE |
| RTN_UNREACHABLE | -EHOSTUNREACH | RT_SCOPE_UNIVERSE |
| RTN_PROHIBIT | -EACCES | RT_SCOPE_UNIVERSE |
| RTN_THROW | -EAGAIN | RT_SCOPE_UNIVERSE |
| RTN_NAT | -EINVAL | RT_SCOPE_NOWHERE |
| RTN_XRESOLVE | -EINVAL | RT_SCOPE_NOWHERE |

### Route Flags

When running the route -n command, you get output that shows the route flags. Here are the flag values, followed by a short example of route -n output:

 * U (Route is up)

 * H (Target is a host)

 * G (Use gateway)

 * R (Reinstate route for dynamic routing)

 * D (Dynamically installed by daemon or redirect)

 * M (Modified from routing daemon or redirect)

 * A (Installed by addrconf)

 * ! (Reject route)

Table 5-3 shows an example of the output of running route -n (the results are organized into table form):

Table 5-3. Kernel IP Routing Table

| Destination | Gateway | Genmask | Flags | Metric | Ref | Use | Iface |
|---|---|---|---|---|---|---|---|
| 169.254.0.0 | 0.0.0.0 | 255.255.0.0 | U | 1002 | 0 | 0 | eth0 |
| 192.168.3.0 | 192.168.2.1 | 255.255.255.0 | UG | 0 | 0 | 0 | eth1 |

# 6. Advanced Routing
Chapter 5 dealt with the IPv4 routing subsystem. This chapter continues with the routing subsystem and discusses advanced IPv4 routing topics such as Multicast Routing, Multipath Routing, Policy Routing, and more. This book deals with the Linux Kernel Networking implementation—it does not delve into the internals of userspace Multicast Routing daemon implementations, which are quite complex and beyond the scope of the book. I do, however, discuss to some extent the interaction between a userspace multicast routing daemon and the multicast layer in the kernel. I also briefly discuss the Internet Group Management Protocol (IGMP), which is the basis of multicast group membership management; adding and deleting multicast group members is done by the IGMP protocol. Some basic knowledge of IGMP is needed to understand the interaction between a multicast host and a multicast router.

Multipath Routing is the ability to add more than one nexthop to a route. Policy Routing enables configuring routing policies that are not based solely on the destination address. I start by describing Multicast Routing.

## Multicast Routing

Chapter 4 briefly mentions Multicast Routing, in the "Receiving IPv4 Multicast Packets" section. I will now discuss it in more depth. Sending multicast traffic means sending the same packet to multiple recipients. This feature can be useful in streaming media, audio/video conferencing, and more, and it has a clear advantage over unicast traffic in terms of saving network bandwidth. Multicast addresses are defined as Class D addresses; the Classless Inter-Domain Routing (CIDR) prefix of this group is 224.0.0.0/4, and the range of IPv4 multicast addresses is from 224.0.0.0 to 239.255.255.255. Handling Multicast Routing must be done in conjunction with a userspace routing daemon which interacts with the kernel; in the Linux implementation, as opposed to Unicast Routing, Multicast Routing cannot be handled by the kernel code alone, without this userspace routing daemon. There are various multicast daemons: for example, mrouted, which is based on an implementation of the Distance Vector Multicast Routing Protocol (DVMRP), or pimd, which is based on the Protocol-Independent Multicast (PIM) protocol. The DVMRP protocol is defined in RFC 1075, and it was the first multicast routing protocol; it is based on the Routing Information Protocol (RIP).

The PIM protocol has two versions, and the kernel supports both of them (CONFIG_IP_PIMSM_V1 and CONFIG_IP_PIMSM_V2). PIM has four different modes: PIM-SM (PIM Sparse Mode), PIM-DM (PIM Dense Mode), PIM Source-Specific Multicast (PIM-SSM), and Bidirectional PIM. The protocol is called protocol-independent because it does not depend on any particular routing protocol for topology discovery.
This section discusses the interaction between the userspace daemon and the kernel multicast routing layer. Delving into the internals of the PIM protocol or the DVMRP protocol (or any other Multicast Routing protocol) is beyond the scope of this book. Normally, the Multicast Routing lookup is based on the source and destination addresses. There is a "Multicast Policy Routing" kernel feature, which parallels the unicast Policy Routing kernel feature mentioned in Chapter 5 and also discussed in the course of this chapter. Multicast Policy Routing is implemented using the Policy Routing API (for example, it calls the fib_rules_lookup() method to perform a lookup, creates a fib_rules_ops object and registers it with the fib_rules_register() method, and so on). With Multicast Policy Routing, the routing can be based on additional criteria, like the ingress network interface. Moreover, you can work with more than one multicast routing table. In order to work with Multicast Policy Routing, IP_MROUTE_MULTIPLE_TABLES must be set.

Figure 6-1 shows a simple IPv4 Multicast Routing setup. The topology is very simple: the laptop, on the left, joins a multicast group (224.225.0.1) by sending an IGMP packet (IP_ADD_MEMBERSHIP). The IGMP protocol is discussed in the next section, "The IGMP Protocol." The AMD server, in the middle, is configured as a multicast router, and a userspace multicast routing daemon (like pimd or mrouted) runs on it. The Windows server, on the right, which has an IP address of 192.168.2.10, sends multicast traffic to 224.225.0.1; this traffic is forwarded to the laptop via the multicast router. Note that the Windows server itself did not join the 224.225.0.1 multicast group. Running ip route add 224.0.0.0/4 dev, followed by a network device name, tells the kernel to send all multicast traffic via that network device.

Figure 6-1. Simple Multicast Routing setup

The next section discusses the IGMP protocol, which is used for the management of multicast group membership.

### The IGMP Protocol

The IGMP protocol is an integral part of IPv4 multicast. It must be implemented on each node that supports IPv4 multicast. In IPv6, multicast management is handled by the MLD (Multicast Listener Discovery) protocol, which uses ICMPv6 messages, discussed in Chapter 8. With the IGMP protocol, multicast group memberships are established and managed. There are three versions of IGMP:

1. IGMPv1 (RFC 1112): Has two types of messages—host membership report and host membership query. When a host wants to join a multicast group, it sends a membership report message. Multicast routers send membership queries to discover which host multicast groups have members on their attached local networks. Queries are addressed to the all-hosts group address (224.0.0.1, IGMP_ALL_HOSTS) and carry a TTL of 1, so that the membership query does not travel outside of the LAN.

2. IGMPv2 (RFC 2236): This is an extension of IGMPv1. The IGMPv2 protocol adds three new messages:

   a. Membership Query (0x11): There are two sub-types of Membership Query messages: General Query, used to learn which groups have members on an attached network, and Group-Specific Query, used to learn whether a particular group has any members on an attached network.

   b. Version 2 Membership Report (0x16).

   c. Leave Group (0x17).

Note

IGMPv2 also supports the Version 1 Membership Report message, for backward compatibility with IGMPv1. See RFC 2236, section 2.1.
3. IGMPv3 (RFC 3376, updated by RFC 4604): This major revision of the protocol adds a feature called source filtering. This means that when a host joins a multicast group, it can specify a set of source addresses from which it will receive multicast traffic. The source filters can also exclude source addresses. To support the source filtering feature, the socket API was extended; see RFC 3678, "Socket Interface Extensions for Multicast Source Filters."

I should also mention that the multicast router periodically (about every two minutes) sends a membership query to 224.0.0.1, the all-hosts multicast group address. A host that receives a membership query responds with a membership report. This is implemented in the kernel by the igmp_rcv() method: getting an IGMP_HOST_MEMBERSHIP_QUERY message is handled by the igmp_heard_query() method.

Note

The kernel implementation of IPv4 IGMP is in net/ipv4/igmp.c, include/linux/igmp.h, and include/uapi/linux/igmp.h.
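Joining a group, as the laptop in Figure 6-1 does, is plain socket code. The following minimal userspace sketch uses the group address from the figure and lets the kernel pick the interface; issuing the IP_ADD_MEMBERSHIP option is what triggers the IGMP membership report described above:

```c
/* Join the multicast group from Figure 6-1; the kernel emits the IGMP
 * membership report on our behalf. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct ip_mreq mreq;

    memset(&mreq, 0, sizeof(mreq));
    inet_pton(AF_INET, "224.225.0.1", &mreq.imr_multiaddr);
    mreq.imr_interface.s_addr = htonl(INADDR_ANY);  /* let the kernel pick */

    if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
                   &mreq, sizeof(mreq)) < 0) {
        perror("IP_ADD_MEMBERSHIP");
        return 1;
    }
    printf("joined 224.225.0.1; press Enter to leave\n");
    getchar();  /* closing the socket also drops the membership */
    close(fd);
    return 0;
}
```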
Next I will look at the MFC, which is embedded in the multicast routing table object and plays an important role in Multicast Routing.

### The Multicast Forwarding Cache (MFC)

The most important data structure in the multicast routing table is the MFC, which is in fact an array of cache entries (mfc_cache objects). This array, named mfc_cache_array, is embedded in the multicast routing table (mr_table) object and has 64 (MFC_LINES) elements. The array index is a hash value (the hash function takes two parameters: the multicast group address and the source IP address; see the description of the MFC_HASH macro in the "Quick Reference" section at the end of this chapter).

Usually there is only one multicast routing table, which is an instance of the mr_table structure, and a reference to it is kept in the IPv4 network namespace (net->ipv4.mrt). The table is created by the ipmr_rules_init() method, which also assigns net->ipv4.mrt to point to the multicast routing table that was created. When working with the Multicast Policy Routing feature mentioned earlier, there can be multiple multicast policy routing tables. In both cases, you get the routing table using the same method, ipmr_fib_lookup(). The ipmr_fib_lookup() method takes three parameters: the network namespace, the flow, and a pointer to an mr_table pointer, which it should set. Normally, it simply sets the specified mr_table pointer to net->ipv4.mrt; when working with multiple tables (CONFIG_IP_MROUTE_MULTIPLE_TABLES is set), the implementation is more complex. Let's take a look at the mfc_cache structure:

```c
struct mfc_cache {
	struct list_head list;
	__be32 mfc_mcastgrp;
	__be32 mfc_origin;
	vifi_t mfc_parent;
	int mfc_flags;

	union {
		struct {
			unsigned long expires;
			struct sk_buff_head unresolved; /* Unresolved buffers */
		} unres;
		struct {
			unsigned long last_assert;
			int minvif;
			int maxvif;
			unsigned long bytes;
			unsigned long pkt;
			unsigned long wrong_if;
			unsigned char ttls[MAXVIFS]; /* TTL thresholds */
		} res;
	} mfc_un;
	struct rcu_head rcu;
};
```

(include/linux/mroute.h)

The following is a description of some members of the mfc_cache structure:

  * mfc_mcastgrp: The address of the multicast group that the entry belongs to.

  * mfc_origin: The source address of the route.

  * mfc_parent: The source interface.

  * mfc_flags: The flags of the entry. Can have one of these values:

    * MFC_STATIC: When the route was added statically and not by a multicast routing daemon.

    * MFC_NOTIFY: When the RTM_F_NOTIFY flag of the routing entry was set. See the rt_fill_info() method and the ipmr_get_route() method for more details.

  * The mfc_un union consists of two elements:

    * unres: Unresolved cache entries.

    * res: Resolved cache entries.

The first time an SKB of a certain flow reaches the kernel, it is added to the queue of unresolved entries (mfc_un.unres.unresolved), where only a few SKBs can be saved. If there are already more than three SKBs in the queue, the packet is not appended to the queue but is freed, and the ipmr_cache_unresolved() method returns -ENOBUFS ("No buffer space available"):

```c
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
				 struct sk_buff *skb)
{
	...
	if (c->mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
	...
}
```

(net/ipv4/ipmr.c)

This section described the MFC and its important members, including the queue of resolved entries and the queue of unresolved entries. The next section briefly describes what a multicast router is and how it is configured in Linux.

### Multicast Router

In order to configure a machine as a multicast router, you should set the CONFIG_IP_MROUTE kernel configuration option. You should also run some routing daemon, such as pimd or mrouted, as mentioned earlier. These routing daemons create a socket to communicate with the kernel. In pimd, for example, you create a raw IGMP socket by calling socket(AF_INET, SOCK_RAW, IPPROTO_IGMP). Calling setsockopt() on this socket triggers sending commands to the kernel, which are handled by the ip_mroute_setsockopt() method. When calling setsockopt() on this socket from the routing daemon with MRT_INIT, the kernel keeps a reference to the userspace socket in the mroute_sk field of the mr_table object that is used, and the mc_forwarding procfs entry (/proc/sys/net/ipv4/conf/all/mc_forwarding) is set by calling IPV4_DEVCONF_ALL(net, MC_FORWARDING)++. Note that the mc_forwarding procfs entry is a read-only entry and can't be set from userspace. You can't create another instance of a multicast routing daemon: when handling the MRT_INIT option, the ip_mroute_setsockopt() method checks whether the mroute_sk field of the mr_table object is initialized and returns -EADDRINUSE if so. Adding a network interface is done by calling setsockopt() on this socket with MRT_ADD_VIF, and deleting a network interface is done by calling setsockopt() on this socket with MRT_DEL_VIF. You pass the parameters of the network interface to these setsockopt() calls by passing a vifctl object as the optval parameter of the setsockopt() system call. Let's take a look at the vifctl structure:

```c
struct vifctl {
	vifi_t	vifc_vifi;		/* Index of VIF			*/
	unsigned char vifc_flags;	/* VIFF_ flags			*/
	unsigned char vifc_threshold;	/* ttl limit			*/
	unsigned int vifc_rate_limit;	/* Rate limiter values (NI)	*/
	union {
		struct in_addr vifc_lcl_addr;    /* Local interface address */
		int            vifc_lcl_ifindex; /* Local interface index   */
	};
	struct in_addr vifc_rmt_addr;	/* IPIP tunnel addr		*/
};
```

(include/uapi/linux/mroute.h)

The following is a description of some members of the vifctl structure:

  * vifc_flags can be:

    * VIFF_TUNNEL: When you want to use an IPIP tunnel.

    * VIFF_REGISTER: When you want to register the interface.

    * VIFF_USE_IFINDEX: When you want to use the local interface index and not the local interface IP address; in such a case, you will set vifc_lcl_ifindex to be the local interface index. The VIFF_USE_IFINDEX flag is available for the 2.6.33 kernel and above.

  * vifc_lcl_addr: The local interface IP address. (This is the default; no flag should be set for using it.)

  * vifc_lcl_ifindex: The local interface index. It should be set when the VIFF_USE_IFINDEX flag is set in vifc_flags.

  * vifc_rmt_addr: The address of the remote node of a tunnel.

When the multicast routing daemon is closed, the setsockopt() method is called with the MRT_DONE option. This triggers calling the mrtsock_destruct() method to nullify the mroute_sk field of the mr_table object that is used and to perform various cleanups.

This section covered what a multicast router is and how it is configured in Linux. I also examined the vifctl structure; a short userspace sketch of this daemon-side interaction follows.
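To make this userspace/kernel interaction concrete, the following is a minimal sketch of the daemon-side control socket. It is not code from pimd or mrouted; it only strings together the MRT_INIT, MRT_ADD_VIF, and MRT_DONE calls just described, with the local interface address chosen purely for illustration. It must run as root (CAP_NET_ADMIN) on a kernel built with CONFIG_IP_MROUTE, and header interactions between <netinet/in.h> and <linux/mroute.h> can vary between libc versions:

```c
/* A minimal sketch of a multicast routing daemon's control socket
 * (illustrative only; error handling is reduced to perror()).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/mroute.h>

int main(void)
{
	int opt = 1;
	struct vifctl vc;

	/* The raw IGMP socket that the kernel will keep in mroute_sk. */
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* MRT_INIT: the kernel stores this socket in mrt->mroute_sk and
	 * increments mc_forwarding; a second daemon gets -EADDRINUSE.
	 */
	if (setsockopt(fd, IPPROTO_IP, MRT_INIT, &opt, sizeof(opt)) < 0)
		perror("MRT_INIT");

	/* MRT_ADD_VIF: register one multicast virtual interface; this is
	 * handled in the kernel by the vif_add() method.
	 */
	memset(&vc, 0, sizeof(vc));
	vc.vifc_vifi = 0;					/* index in vif_table	*/
	vc.vifc_threshold = 1;					/* TTL threshold	*/
	vc.vifc_lcl_addr.s_addr = inet_addr("192.168.1.1");	/* illustrative address */
	if (setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc)) < 0)
		perror("MRT_ADD_VIF");

	/* ... a real daemon now reads IGMPMSG_* notifications from fd
	 * and installs MFC entries with MRT_ADD_MFC ...
	 */

	/* MRT_DONE: handled by mrtsock_destruct(), which nullifies mroute_sk. */
	setsockopt(fd, IPPROTO_IP, MRT_DONE, NULL, 0);
	close(fd);
	return 0;
}
```

A real daemon keeps this socket open for its whole lifetime: the IGMPMSG_NOCACHE and IGMPMSG_WRONGVIF notifications discussed later in this chapter arrive on it.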
Next, I look at the Vif device, which represents a multicast network interface.

### The Vif Device

Multicast Routing supports two modes: direct multicast, and multicast encapsulated in a unicast packet over a tunnel. In both cases, the same object is used (an instance of the vif_device structure) to represent the network interface. When working over a tunnel, the VIFF_TUNNEL flag will be set. Adding and deleting a multicast interface is done by the vif_add() method and the vif_delete() method, respectively. The vif_add() method also sets the device to support multicast by calling the dev_set_allmulti(dev, 1) method, which increments the allmulti counter of the specified network device (net_device object). The vif_delete() method calls dev_set_allmulti(dev, -1) to decrement the allmulti counter of the specified network device. For more details about the dev_set_allmulti() method, see Appendix A. Let's take a look at the vif_device structure; its members are quite self-explanatory:

```c
struct vif_device {
	struct net_device *dev;			/* Device we are using		*/
	unsigned long	bytes_in, bytes_out;
	unsigned long	pkt_in, pkt_out;	/* Statistics			*/
	unsigned long	rate_limit;		/* Traffic shaping (NI)		*/
	unsigned char	threshold;		/* TTL threshold		*/
	unsigned short	flags;			/* Control flags		*/
	__be32		local, remote;		/* Addresses (remote for tunnels) */
	int		link;			/* Physical interface index	*/
};
```

(include/linux/mroute.h)

In order to receive multicast traffic, a host must join a multicast group. This is done by creating a socket in userspace and calling setsockopt() with IPPROTO_IP and the IP_ADD_MEMBERSHIP socket option. The userspace application also creates an ip_mreq object, in which it initializes the request parameters, like the desired multicast group address and the source IP address of the host (see the netinet/in.h userspace header). The setsockopt() call is handled in the kernel by the ip_mc_join_group() method, in net/ipv4/igmp.c. Eventually, the multicast address is added by the ip_mc_join_group() method to a list of multicast addresses (mc_list), which is a member of the in_device object. A host can leave a multicast group by calling setsockopt() with IPPROTO_IP and the IP_DROP_MEMBERSHIP socket option. This is handled in the kernel by the ip_mc_leave_group() method, in net/ipv4/igmp.c. A single socket can join up to 20 multicast groups (sysctl_igmp_max_memberships); trying to join more groups with the same socket will fail with the -ENOBUFS error ("No buffer space available"). See the ip_mc_join_group() method implementation in net/ipv4/igmp.c.

### IPv4 Multicast Rx Path

Chapter 4's "Receiving IPv4 Multicast Packets" section briefly discusses how multicast packets are handled. I will now describe this in more depth. My discussion assumes that our machine is configured as a multicast router; this means, as mentioned earlier, that CONFIG_IP_MROUTE is set and a routing daemon like pimd or mrouted runs on this host. Multicast packets are handled by the ip_route_input_mc() method, in which a routing table entry (an rtable object) is allocated and initialized, and in which, when CONFIG_IP_MROUTE is set, the input callback of the dst object is set to be ip_mr_input().
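Before walking through the kernel Rx path, it may help to see the receiver side that triggers it. The following is a minimal sketch of a host joining the group from Figure 6-1 with IP_ADD_MEMBERSHIP, using the standard POSIX sockets API; the UDP port is an arbitrary choice for the example:

```c
/* Receiver-side sketch: joining 224.225.0.1, which is handled in the
 * kernel by ip_mc_join_group() (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	struct sockaddr_in addr;
	struct ip_mreq mreq;
	char buf[1500];

	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(5000);		/* illustrative port */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("bind");

	/* IP_ADD_MEMBERSHIP adds 224.225.0.1 to the mc_list of the chosen
	 * interface; this is what triggers the IGMP membership report.
	 */
	mreq.imr_multiaddr.s_addr = inet_addr("224.225.0.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
		perror("IP_ADD_MEMBERSHIP");

	/* Block until multicast traffic forwarded by the router arrives. */
	if (recv(fd, buf, sizeof(buf), 0) < 0)
		perror("recv");

	/* Leaving the group is handled by ip_mc_leave_group() in the kernel. */
	setsockopt(fd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
	close(fd);
	return 0;
}
```

On the multicast router, traffic sent to this group then enters the Rx path examined next.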
Let's take a look at the ip_mr_input() method: + +int ip_mr_input(struct sk_buff *skb) + +{ + +struct mfc_cache *cache; + +struct net *net = dev_net(skb->dev); + +First the local flag is set to true if the packet is intended for local delivery, as the ip_mr_input() method also handles local multicast packets. + +int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + +struct mr_table *mrt; + +/* Packet is looped back after forward, it should not be + +* forwarded second time, but still can be delivered locally. + +*/ + +if (IPCB(skb)->flags & IPSKB_FORWARDED) + +goto dont_forward; + +Normally, when working with a single multicast routing table, the ipmr_rt_fib_lookup() method simply returns the net->ipv4.mrt object: + +mrt = ipmr_rt_fib_lookup(net, skb); + +if (IS_ERR(mrt)) { + +kfree_skb(skb); + +return PTR_ERR(mrt); + +} + +if (!local) { + +IGMPv3 and some IGMPv2 implementations set the router alert option (IPOPT_RA) in the IPv4 header when sending JOIN or LEAVE packets. See the igmpv3_newpack() method in net/ipv4/igmp.c: + +if (IPCB(skb)->opt.router_alert) { + +The ip_call_ra_chain() method (net/ipv4/ip_input.c) calls the raw_rcv() method to pass the packet to the userspace raw socket, which listens. The ip_ra_chain object contains a reference to the multicast routing socket, which is passed as a parameter to the raw_rcv() method. For more details, look at the ip_call_ra_chain() method implementation, in net/ipv4/ip_input.c: + +if (ip_call_ra_chain(skb)) + +return 0; + +There are implementations where the router alert option is not set, as explained in the following comment; these cases must be handled as well, by calling the raw_rcv() method directly: + +} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { + +/* IGMPv1 (and broken IGMPv2 implementations sort of + +* Cisco IOS <= 11.2(8)) do not put router alert + +* option to IGMP packets destined to routable + +* groups. It is very bad, because it means + +* that we can forward NO IGMP messages. + +*/ + +struct sock *mroute_sk; + +The mrt->mroute_sk socket is a copy in the kernel of the socket that the multicast routing userspace application created: + +mroute_sk = rcu_dereference(mrt->mroute_sk); + +if (mroute_sk) { + +nf_reset(skb); + +raw_rcv(mroute_sk, skb); + +return 0; + +} + +} + +} + +First a lookup in the multicast routing cache, mfc_cache_array, is performed by calling the ipmr_cache_find() method. 
The hash key is the destination multicast group address and the source IP address of the packet, taken from the IPv4 header: + +cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + +if (cache == NULL) { + +A lookup in the virtual devices array (vif_table) is performed to see whether there is a corresponding entry which matches the incoming network device (skb->dev): + +int vif = ipmr_find_vif(mrt, skb->dev); + +The ipmr_cache_find_any() method handles the advanced feature of multicast proxy support (which is not discussed in this book): + +if (vif >= 0) + +cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + +vif); + +} + +/* + +* No usable cache entry + +*/ + +if (cache == NULL) { + +int vif; + +If the packet is destined to the local host, deliver it: + +if (local) { + +struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + +ip_local_deliver(skb); + +if (skb2 == NULL) + +return -ENOBUFS; + +skb = skb2; + +} + +read_lock(&mrt_lock); + +vif = ipmr_find_vif(mrt, skb->dev); + +if (vif >= 0) { + +The ipmr_cache_unresolved() method creates a multicast routing entry (mfc_cache object) by calling the ipmr_cache_alloc_unres() method. This method creates a cache entry (mfc_cache object) and initializes its expiration time interval (by setting mfc_un.unres.expires). Let's take a look at this very short method, ipmr_cache_alloc_unres(): + +static struct mfc_cache *ipmr_cache_alloc_unres(void) + +{ + +struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); + +if (c) { + +skb_queue_head_init(&c->mfc_un.unres.unresolved); + +Setting the expiration time interval: + +c->mfc_un.unres.expires = jiffies + 10*HZ; + +} + +return c; + +} + +If the routing daemon does not resolve the routing entry within its expiration interval, the entry is removed from the queue of the unresolved entries. When creating a multicast routing table (by the ipmr_new_table() method), its timer (ipmr_expire_timer) is set. This timer invokes the ipmr_expire_process() method periodically. The ipmr_expire_process() method iterates over all the unresolved cache entries in the queue of unresolved entries (mfc_unres_queue of the mrtable object) and removes the expired unresolved cache entries. + +After creating the unresolved cache entry, the ipmr_cache_unresolved() method adds it to the queue of unresolved entries (mfc_unres_queue of the multicast table, mrtable) and increments by 1 the unresolved queue length (cache_resolve_queue_len of the multicast table, mrtable). It also calls the ipmr_cache_report() method, which builds an IGMP message (IGMPMSG_NOCACHE) and delivers it to the userspace multicast routing daemon by calling eventually the sock_queue_rcv_skb() method. + +I mentioned that the userspace routing daemon should resolve the routing within some time interval. I will not delve into how this is implemented in userspace. 
Note, however, that once the routing daemon decides it should resolve an unresolved entry, it builds the cache entry parameters (in an mfcctl object) and calls setsockopt() with the MRT_ADD_MFC socket option, then it passes the mfcctl object embedded in the optval parameter of the setsockopt() system call; this is handled in the kernel by the ipmr_mfc_add() method: + +int err2 = ipmr_cache_unresolved(mrt, vif, skb); + +read_unlock(&mrt_lock); + +return err2; + +} + +read_unlock(&mrt_lock); + +kfree_skb(skb); + +return -ENODEV; + +} + +read_lock(&mrt_lock); + +If a cache entry was found in the MFC, call the ip_mr_forward() method to continue the packet traversal: + +ip_mr_forward(net, mrt, skb, cache, local); + +read_unlock(&mrt_lock); + +if (local) + +return ip_local_deliver(skb); + +return 0; + +dont_forward: + +if (local) + +return ip_local_deliver(skb); + +kfree_skb(skb); + +return 0; + +} + +This section detailed the IPv4 Multicast Rx path and the interaction with the routing daemon in this path. The next section describes the multicast routing forwarding method, ip_mr_forward(). + +### The ip_mr_forward() Method + +Let's take a look at the ip_mr_forward() method: + +static int ip_mr_forward(struct net *net, struct mr_table *mrt, + +struct sk_buff *skb, struct mfc_cache *cache, + +int local) + +{ + +int psend = -1; + +int vif, ct; + +int true_vifi = ipmr_find_vif(mrt, skb->dev); + +vif = cache->mfc_parent; + +Here you can see update statistics of the resolved cache object (mfc_un.res): + +cache->mfc_un.res.pkt++; + +cache->mfc_un.res.bytes += skb->len; + +if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + +struct mfc_cache *cache_proxy; + +The expression (*, G) means traffic from any source sending to the group G: + +/* For an (*,G) entry, we only check that the incomming + +* interface is part of the static tree. + +*/ + +cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + +if (cache_proxy && + +cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + +goto forward; + +} + +/* + +* Wrong interface: drop packet and (maybe) send PIM assert. + +*/ + +if (mrt->vif_table[vif].dev != skb->dev) { + +if (rt_is_output_route(skb_rtable(skb))) { + +/* It is our own packet, looped back. + +* Very complicated situation... + +* + +* The best workaround until routing daemons will be + +* fixed is not to redistribute packet, if it was + +* send through wrong interface. It means, that + +* multicast applications WILL NOT work for + +* (S,G), which have default multicast route pointing + +* to wrong oif. In any case, it is not a good + +* idea to use multicasting applications on router. + +*/ + +goto dont_forward; + +} + +cache->mfc_un.res.wrong_if++; + +if (true_vifi >= 0 && mrt->mroute_do_assert && + +/* pimsm uses asserts, when switching from RPT to SPT, + +* so that we cannot check that packet arrived on an oif. + +* It is bad, but otherwise we would need to move pretty + +* large chunk of pimd to kernel. Ough... 
--ANK + +*/ + +(mrt->mroute_do_pim || + +cache->mfc_un.res.ttls[true_vifi] < 255) && + +time_after(jiffies, + +cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + +cache->mfc_un.res.last_assert = jiffies; + +Call the ipmr_cache_report() method to build an IGMP message (IGMPMSG_WRONGVIF) and to deliver it to the userspace multicast routing daemon by calling the sock_queue_rcv_skb() method: + +ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + +} + +goto dont_forward; + +} + +The frame is now ready to be forwarded: + +forward: + +mrt->vif_table[vif].pkt_in++; + +mrt->vif_table[vif].bytes_in += skb->len; + +/* + +* Forward the frame + +*/ + +if (cache->mfc_origin == htonl(INADDR_ANY) && + +cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + +if (true_vifi >= 0 && + +true_vifi != cache->mfc_parent && + +ip_hdr(skb)->ttl > + +cache->mfc_un.res.ttls[cache->mfc_parent]) { + +/* It's an (*,*) entry and the packet is not coming from + +* the upstream: forward the packet to the upstream + +* only. + +*/ + +psend = cache->mfc_parent; + +goto last_forward; + +} + +goto dont_forward; + +} + +for (ct = cache->mfc_un.res.maxvif - 1; + +ct >= cache->mfc_un.res.minvif; ct--) { + +/* For (*,G) entry, don't forward to the incoming interface */ + +if ((cache->mfc_origin != htonl(INADDR_ANY) || + +ct != true_vifi) && + +ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + +if (psend != -1) { + +struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + +Call the ipmr_queue_xmit() method to continue with the packet forwarding: + +if (skb2) + +ipmr_queue_xmit(net, mrt, skb2, cache, + +psend); + +} + +psend = ct; + +} + +} + +last_forward: + +if (psend != -1) { + +if (local) { + +struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + +if (skb2) + +ipmr_queue_xmit(net, mrt, skb2, cache, psend); + +} else { + +ipmr_queue_xmit(net, mrt, skb, cache, psend); + +return 0; + +} + +} + +dont_forward: + +if (!local) + +kfree_skb(skb); + +return 0; + +} + +Now that I have covered the multicast routing forwarding method, ip_mr_forward(), it is time to examine the ipmr_queue_xmit() method. + +### The ipmr_queue_xmit() Method + +Let's take a look at the ipmr_queue_xmit() method: + +static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, + +struct sk_buff *skb, struct mfc_cache *c, int vifi) + +{ + +const struct iphdr *iph = ip_hdr(skb); + +struct vif_device *vif = &mrt->vif_table[vifi]; + +struct net_device *dev; + +struct rtable *rt; + +struct flowi4 fl4; + +The encap field is used when working with a tunnel: + +int encap = 0; + +if (vif->dev == NULL) + +goto out_free; + +#ifdef CONFIG_IP_PIMSM + +if (vif->flags & VIFF_REGISTER) { + +vif->pkt_out++; + +vif->bytes_out += skb->len; + +vif->dev->stats.tx_bytes += skb->len; + +vif->dev->stats.tx_packets++; + +ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); + +goto out_free; + +} + +#endif + +When working with a tunnel, a routing lookup is performed with the vif->remote and vif->local, which represent the destination and local addresses, respectively. These addresses are the end points of the tunnel. 
When working with a vif_device object that represents a physical device, a routing lookup is performed with the destination address of the IPv4 header and 0 as the source address:

if (vif->flags & VIFF_TUNNEL) {

rt = ip_route_output_ports(net, &fl4, NULL,

vif->remote, vif->local,

0, 0,

IPPROTO_IPIP,

RT_TOS(iph->tos), vif->link);

if (IS_ERR(rt))

goto out_free;

encap = sizeof(struct iphdr);

} else {

rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,

0, 0,

IPPROTO_IPIP,

RT_TOS(iph->tos), vif->link);

if (IS_ERR(rt))

goto out_free;

}

dev = rt->dst.dev;

Note that if the packet size is larger than the MTU, an ICMPv4 message is not sent (as would be done in such a case in unicast forwarding); only the statistics are updated, and the packet is discarded:

if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {

/* Do not fragment multicasts. Alas, IPv4 does not

* allow to send ICMP, so that packets will disappear

* to blackhole.

*/

IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);

ip_rt_put(rt);

goto out_free;

}

encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;

if (skb_cow(skb, encap)) {

ip_rt_put(rt);

goto out_free;

}

vif->pkt_out++;

vif->bytes_out += skb->len;

skb_dst_drop(skb);

skb_dst_set(skb, &rt->dst);

The TTL is decreased, and the IPv4 header checksum is recalculated (because the TTL is one of the IPv4 header fields) when forwarding the packet; the same is done in the ip_forward() method for unicast packets:

ip_decrease_ttl(ip_hdr(skb));

/* FIXME: forward and output firewalls used to be called here.

* What do we do with netfilter? -- RR

*/

if (vif->flags & VIFF_TUNNEL) {

ip_encap(skb, vif->local, vif->remote);

/* FIXME: extra output firewall step used to be here. --RR */

vif->dev->stats.tx_packets++;

vif->dev->stats.tx_bytes += skb->len;

}

IPCB(skb)->flags |= IPSKB_FORWARDED;

/*

* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally

* not only before forwarding, but after forwarding on all output

* interfaces. It is clear, if mrouter runs a multicasting

* program, it should receive packets not depending to what interface

* program is joined.

* If we will not make it, the program will have to join on all

* interfaces. On the other hand, multihoming host (or router, but

* not mrouter) cannot join to more than one interface - it will

* result in receiving multiple packets.

*/

Invoke the NF_INET_FORWARD hook:

NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,

ipmr_forward_finish);

return;

out_free:

kfree_skb(skb);

}

### The ipmr_forward_finish() Method

Let's take a look at the ipmr_forward_finish() method, which is a very short method; it is in fact almost identical to the ip_forward_finish() method used for unicast packets:

static inline int ipmr_forward_finish(struct sk_buff *skb)

{

struct ip_options *opt = &(IPCB(skb)->opt);

IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

Handle IPv4 options, if set (see Chapter 4):

if (unlikely(opt->optlen))

ip_forward_options(skb);

return dst_output(skb);

}

Eventually, dst_output() sends the packet via the ip_mc_output() method, which calls the ip_finish_output() method (both methods are in net/ipv4/ip_output.c).
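Having followed a forwarded packet down to dst_output(), it is worth seeing where the TTL that ipmr_queue_xmit() decrements comes from. A sender controls it with the IP_MULTICAST_TTL socket option; here is a minimal sketch (standard POSIX sockets API; the group, port, and TTL value are illustrative):

```c
/* Sender-side sketch: setting the multicast TTL before sending to the
 * group from Figure 6-1 (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	struct sockaddr_in dst;
	/* TTL 32: restricted to the same site, per the scope table in the
	 * next section.
	 */
	unsigned char ttl = 32;
	const char msg[] = "hello, group";

	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)) < 0)
		perror("IP_MULTICAST_TTL");

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_addr.s_addr = inet_addr("224.225.0.1");
	dst.sin_port = htons(5000);

	if (sendto(fd, msg, sizeof(msg), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
```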
Now that I have covered these multicast methods, let's get a better understanding of how the value of the TTL field is used in multicast traffic.

### The TTL in Multicast Traffic

The TTL field of the IPv4 header has a double meaning in multicast traffic. The first is the same as in unicast IPv4 traffic: the TTL represents a hop counter, decremented by 1 by every device that forwards the packet. When it reaches 0, the packet is discarded. This is done to avoid the endless travelling of packets due to some error. The second meaning of the TTL, which is unique to multicast traffic, is a threshold. The TTL values are divided into scopes. Routers have a TTL threshold assigned to each of their interfaces, and only packets with a TTL greater than the interface's threshold are forwarded. Here are the values of these thresholds:

  * 0: Restricted to the same host (cannot be sent out by any interface)

  * 1: Restricted to the same subnet (will not be forwarded by a router)

  * 32: Restricted to the same site

  * 64: Restricted to the same region

  * 128: Restricted to the same continent

  * 255: Unrestricted in scope (global)

See "IP Multicast Extensions for 4.3BSD UNIX and related systems," by Steve Deering, available at www.kohala.com/start/mcast.api.txt.

Note

IPv4 Multicast Routing is implemented in net/ipv4/ipmr.c, include/linux/mroute.h, and include/uapi/linux/mroute.h.

This completes my discussion of Multicast Routing. The chapter now moves on to Policy Routing, which enables you to configure routing policies that are not based solely on the destination address.

## Policy Routing

With Policy Routing, a system administrator can define up to 255 routing tables. This section discusses IPv4 Policy Routing; IPv6 Policy Routing is discussed in Chapter 8. In this section, I use the terms policy or rule for entries created by Policy Routing, in order to avoid confusing them with the ordinary routing entries discussed in Chapter 5.

### Policy Routing Management

Policy Routing management is done with the ip rule command of the iproute2 package (there is no parallel for Policy Routing management with the route command). Let's see how to add, delete, and dump all Policy Routing rules:

  * You add a rule with the ip rule add command; for example: ip rule add tos 0x04 table 252. After this rule is inserted, every packet whose IPv4 TOS field matches 0x04 will be handled according to the routing rules of table 252. You can add routing entries to this table by specifying the table number when adding a route; for example: ip route add default via 192.168.2.10 table 252. The ip rule add command is handled in the kernel by the fib_nl_newrule() method, in net/core/fib_rules.c (a sketch of the corresponding netlink message appears at the end of this section). The tos modifier in the ip rule command earlier is one of the available SELECTOR modifiers of the ip rule command; see man 8 ip rule, and also Table 6-1 in the "Quick Reference" section at the end of this chapter.

  * You delete a rule with the ip rule del command; for example: ip rule del tos 0x04 table 252. This command is handled in the kernel by the fib_nl_delrule() method in net/core/fib_rules.c.

  * You dump all the rules with the ip rule list command or the ip rule show command. Both of these commands are handled in the kernel by the fib_nl_dumprule() method in net/core/fib_rules.c.

You now have a good idea about the basics of Policy Routing management; after a short look at the netlink message behind ip rule add, let's examine the Linux implementation of Policy Routing.
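To illustrate what fib_nl_newrule() receives, here is a hedged sketch that adds the equivalent of ip rule add tos 0x04 table 252 by sending an RTM_NEWRULE message over an rtnetlink socket. The priority value is arbitrary, no acknowledgment is read back, and CAP_NET_ADMIN is required; treat it as an illustration of the message layout rather than production code:

```c
/* Sketch: RTM_NEWRULE over rtnetlink, roughly equivalent to
 * "ip rule add tos 0x04 table 252" (illustrative only).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/fib_rules.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct fib_rule_hdr frh;
		char attrs[64];
	} req;
	struct sockaddr_nl sa;
	struct rtattr *rta;
	__u32 prio = 10000;	/* arbitrary rule priority */

	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.frh));
	req.nlh.nlmsg_type = RTM_NEWRULE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;

	req.frh.family = AF_INET;
	req.frh.tos = 0x04;		/* the selector */
	req.frh.table = 252;		/* target routing table */
	req.frh.action = FR_ACT_TO_TBL;

	/* FRA_PRIORITY attribute: where the rule sits in the rule list. */
	rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = FRA_PRIORITY;
	rta->rta_len = RTA_LENGTH(sizeof(prio));
	memcpy(RTA_DATA(rta), &prio, sizeof(prio));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("RTM_NEWRULE");

	close(fd);
	return 0;
}
```

The iproute2 ip rule command builds essentially this message for you; the sketch only shows that a rule boils down to a fib_rule_hdr plus attributes such as FRA_PRIORITY.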
### Policy Routing Implementation

The core infrastructure of Policy Routing is the fib_rules module, net/core/fib_rules.c. It is used by three protocols of the kernel networking stack: IPv4 (including the multicast module, which has a Multicast Policy Routing feature, as mentioned in the "Multicast Routing" section earlier in this chapter), IPv6, and DECnet. The IPv4 Policy Routing is also implemented in a file named fib_rules.c; don't be confused by the identical name (net/ipv4/fib_rules.c). In IPv6, Policy Routing is implemented in net/ipv6/fib6_rules.c. The header file include/net/fib_rules.h contains the data structures and methods of the Policy Routing core. Here is the definition of the fib4_rule structure, which is the basis for IPv4 Policy Routing:

```c
struct fib4_rule {
	struct fib_rule common;
	u8		dst_len;
	u8		src_len;
	u8		tos;
	__be32		src;
	__be32		srcmask;
	__be32		dst;
	__be32		dstmask;
#ifdef CONFIG_IP_ROUTE_CLASSID
	u32		tclassid;
#endif
};
```

(net/ipv4/fib_rules.c)

Three policies are created by default at boot time, by calling the fib_default_rules_init() method: the local (RT_TABLE_LOCAL) table, the main (RT_TABLE_MAIN) table, and the default (RT_TABLE_DEFAULT) table. Lookup is done by the fib_lookup() method. Note that there are two different implementations of the fib_lookup() method in include/net/ip_fib.h. The first one, which is wrapped in the #ifndef CONFIG_IP_MULTIPLE_TABLES block, is for non-Policy Routing, and the second is for Policy Routing. When working with Policy Routing, the lookup is performed like this: if there were no changes to the initial Policy Routing rules (net->ipv4.fib_has_custom_rules is not set), the rule must be in one of the three initial routing tables. So first a lookup is done in the local table, then in the main table, and then in the default table. If there is no corresponding entry, a network unreachable (-ENETUNREACH) error is returned. If there was some change to the initial Policy Routing rules (net->ipv4.fib_has_custom_rules is set), the __fib_lookup() method is invoked, which is a heavier method, because it iterates over the list of rules and calls fib_rule_match() for each rule in order to decide whether it matches or not. See the implementation of the fib_rules_lookup() method in net/core/fib_rules.c. (The fib_rules_lookup() method is invoked from the __fib_lookup() method.) I should mention here that the net->ipv4.fib_has_custom_rules variable is set to false in the initialization phase, by the fib4_rules_init() method, and to true in the fib4_rule_configure() method and the fib4_rule_delete() method. Note that CONFIG_IP_MULTIPLE_TABLES should be set for working with Policy Routing.

This concludes my Policy Routing discussion. The next section talks about Multipath Routing, which is the ability to add more than one nexthop to a route.

## Multipath Routing

Multipath Routing provides the ability to add more than one nexthop to a route. Defining two nexthop nodes can be done like this, for example: ip route add default scope global nexthop dev eth0 nexthop dev eth1. A system administrator can also assign weights to each nexthop, like this, for example: ip route add 192.168.1.10 nexthop via 192.168.2.1 weight 3 nexthop via 192.168.2.10 weight 5.
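The weighted selection can be pictured with the toy userspace model below. This is illustrative only and is not the kernel's fib_select_multipath() code (which, as noted shortly, mixes in jiffies and per-nexthop state); it just demonstrates the weighted-slice idea with the weights from the example above:

```c
/* Toy model of weighted nexthop selection (illustrative only; the real
 * logic lives in fib_select_multipath()).
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct toy_nh {
	const char *via;
	int weight;	/* corresponds to nh_weight in struct fib_nh */
};

static int select_nexthop(const struct toy_nh *nhs, int n)
{
	int total = 0, point, i;

	for (i = 0; i < n; i++)
		total += nhs[i].weight;

	/* A random point in [0, total): heavier nexthops own larger slices. */
	point = rand() % total;
	for (i = 0; i < n; i++) {
		point -= nhs[i].weight;
		if (point < 0)
			return i;
	}
	return n - 1;	/* not reached */
}

int main(void)
{
	/* The example route from the text: weights 3 and 5. */
	struct toy_nh nhs[] = {
		{ "192.168.2.1", 3 },
		{ "192.168.2.10", 5 },
	};
	int hits[2] = { 0, 0 }, i;

	srand((unsigned)time(NULL));
	for (i = 0; i < 8000; i++)
		hits[select_nexthop(nhs, 2)]++;

	/* Expect roughly 3000 vs. 5000 picks. */
	printf("via %s: %d\nvia %s: %d\n",
	       nhs[0].via, hits[0], nhs[1].via, hits[1]);
	return 0;
}
```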
The fib_info structure represents an IPv4 routing entry that can have more than one FIB nexthop. The fib_nhs member of the fib_info object represents the number of FIB nexthop objects; the fib_info object contains an array of FIB nexthop objects named fib_nh. So in this case, a single fib_info object is created, with an array of two FIB nexthop objects. The kernel keeps the weight of each nexthop in the nh_weight field of the FIB nexthop object (fib_nh). If a weight was not specified when adding a multipath route, it is set to 1 by default, in the fib_create_info() method. The fib_select_multipath() method is called to determine the nexthop when working with Multipath Routing. This method is invoked from two places: from the __ip_route_output_key() method, in the Tx path, and from the ip_mkroute_input() method, in the Rx path. Note that when the output device is set in the flow, the fib_select_multipath() method is not invoked, because the output device is known:

```c
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	...
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	...
}
```

In the Rx path there is no need to check whether fl4->flowi4_oif is 0, because it is set to 0 at the beginning of that method. I won't delve into the details of the fib_select_multipath() method. I will only mention that there is an element of randomness in the method, using jiffies, to help create a fair weighted route distribution, and that the weight of each nexthop is taken into account. The FIB nexthop to use is assigned by setting the FIB nexthop selector (nh_sel) of the specified fib_result object. In contrast to Multicast Routing, which is handled by a dedicated module (net/ipv4/ipmr.c), the code of Multipath Routing appears scattered in the existing routing code, enclosed in #ifdef CONFIG_IP_ROUTE_MULTIPATH conditionals; no separate module was added to the source code to support it. As mentioned in Chapter 5, there was support for an IPv4 multipath routing cache, but it was removed in 2007, in kernel 2.6.23; in fact, it never worked very well and never got out of the experimental state. Do not confuse the removal of the multipath routing cache with the removal of the routing cache; these are two different caches. The removal of the routing cache took place five years later, in kernel 3.6 (2012).

Note

CONFIG_IP_ROUTE_MULTIPATH should be set for Multipath Routing support.

## Summary

This chapter covered advanced IPv4 routing topics, like Multicast Routing, the IGMP protocol, Policy Routing, and Multipath Routing. You learned about the fundamental structures of Multicast Routing, such as the multicast routing table (mr_table), the Multicast Forwarding Cache (MFC), the Vif device, and more. You also learned what should be done to set up a host as a multicast router, and about the use of the TTL field in Multicast Routing. Chapter 7 deals with the Linux neighbouring subsystem. The "Quick Reference" section that follows covers the top methods related to the topics discussed in this chapter, ordered by their context.

## Quick Reference

I conclude this chapter with a short list of important routing subsystem methods (some of which were mentioned in this chapter), a list of macros, and procfs multicast entries and tables.

### Methods

Let's start with the methods:

#### int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen);

This method handles setsockopt() calls from the multicast routing daemon.
The supported socket options are: MRT_INIT, MRT_DONE, MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC, MRT_ADD_MFC_PROXY, MRT_DEL_MFC_PROXY, MRT_ASSERT, MRT_PIM (when PIM support is set), and MRT_TABLE (when Multicast Policy Routing is set). + +#### int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen); + +This method handles getsockopt() calls from the multicast routing daemon. The supported socket options are MRT_VERSION, MRT_ASSERT and MRT_PIM. + +#### struct mr_table *ipmr_new_table(struct net *net, u32 id); + +This method creates a new multicast routing table. The id of the table will be the specified id. + +#### void ipmr_free_table(struct mr_table *mrt); + +This method frees the specified multicast routing table and the resources attached to it. + +#### int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr); + +This method is for joining a multicast group. The address of the multicast group to be joined is specified in the given ip_mreqn object. The method returns 0 on success. + +#### static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp); + +This method performs a lookup in the IPv4 multicast routing cache. It returns NULL when no entry is found. + +#### bool ipv4_is_multicast(__be32 addr); + +This method returns true if the address is a multicast address. + +#### int ip_mr_input(struct sk_buff *skb); + +This method is the main IPv4 multicast Rx method (net/ipv4/ipmr.c). + +#### struct mfc_cache *ipmr_cache_alloc(void); + +This method allocates a multicast forwarding cache (mfc_cache) entry. + +#### static struct mfc_cache *ipmr_cache_alloc_unres(void); + +This method allocates a multicast routing cache (mfc_cache) entry for the unresolved cache and sets the expires field of the queue of unresolved entries. + +#### void fib_select_multipath(struct fib_result *res); + +This method is called to determine the nexthop when working with Multipath Routing. + +#### int dev_set_allmulti(struct net_device *dev, int inc); + +This method increments/decrements the allmulti counter of the specified network device according to the specified increment (the increment can be a positive number or a negative number). + +#### int igmp_rcv(struct sk_buff *skb); + +This method is the receive handler for IGMP packets. + +#### static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent); + +This method adds a multicast cache entry; it is invoked by calling setsockopt() from userspace with MRT_ADD_MFC. + +#### static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent); + +This method deletes a multicast cache entry; it is invoked by calling setsockopt() from userspace with MRT_DEL_MFC. + +#### static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock); + +This method adds a multicast virtual interface; it is invoked by calling setsockopt() from userspace with MRT_ADD_VIF. + +#### static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head); + +This method deletes a multicast virtual interface; it is invoked by calling setsockopt() from userspace with MRT_DEL_VIF. + +#### static void ipmr_expire_process(unsigned long arg); + +This method removes expired entries from the queue of unresolved entries. 
#### static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert);

This method builds an IGMP packet, setting the type in the IGMP header to be the specified assert value and the code to be 0. This IGMP packet is delivered to the userspace multicast routing daemon by calling the sock_queue_rcv_skb() method. The assert parameter can be assigned one of these values: IGMPMSG_NOCACHE, used when an unresolved cache entry is added to the queue of unresolved entries and the userspace routing daemon should be notified that it needs to resolve it; IGMPMSG_WRONGVIF; and IGMPMSG_WHOLEPKT.

#### static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr);

This method is a notifier callback, registered by the register_netdevice_notifier() method. When a network device is unregistered, a NETDEV_UNREGISTER event is generated; this callback receives the event and deletes the vif_device objects in the vif_table whose device is the one that was unregistered.

#### static void mrtsock_destruct(struct sock *sk);

This method is called when the userspace routing daemon calls setsockopt() with MRT_DONE. This method nullifies the multicast routing socket (mroute_sk of the multicast routing table), decrements the mc_forwarding procfs entry, and calls the mroute_clean_tables() method to free resources.

### Macros

This section describes our macros.

#### MFC_HASH(a, b)

This macro calculates the hash value for adding entries to the MFC cache. It takes the group multicast address and the source IPv4 address as parameters.

#### VIF_EXISTS(_mrt, _idx)

This macro checks the existence of an entry in the vif_table; it returns true if the array of multicast virtual devices (vif_table) of the specified multicast routing table (mrt) has an entry with the specified index (_idx).

### Procfs Multicast Entries

The following is a description of two important procfs multicast entries:

#### /proc/net/ip_mr_vif

Lists all the multicast virtual interfaces; it displays all the vif_device objects in the multicast virtual device table (vif_table). Displaying the /proc/net/ip_mr_vif entry is handled by the ipmr_vif_seq_show() method.

#### /proc/net/ip_mr_cache

The state of the Multicast Forwarding Cache (MFC). This entry shows the following fields of all the cache entries: group multicast address (mfc_mcastgrp), source IP address (mfc_origin), input interface index (mfc_parent), forwarded packets (mfc_un.res.pkt), forwarded bytes (mfc_un.res.bytes), wrong interface index (mfc_un.res.wrong_if), the index of the forwarding interface (an index in the vif_table), and the entry in the mfc_un.res.ttls array corresponding to this index. Displaying the /proc/net/ip_mr_cache entry is handled by the ipmr_mfc_seq_show() method.

### Table

And finally, here in Table 6-1, is the table of rule selectors.

Table 6-1. IP Rule Selectors

| Linux Symbol | Selector | Member | Structure |
|---|---|---|---|
| FRA_SRC | from | src | fib4_rule |
| FRA_DST | to | dst | fib4_rule |
| FRA_IIFNAME | iif | iifname | fib_rule |
| FRA_OIFNAME | oif | oifname | fib_rule |
| FRA_FWMARK | fwmark | mark | fib_rule |
| FRA_FWMASK | fwmark/fwmask | mark_mask | fib_rule |
| FRA_PRIORITY | preference, order, priority | pref | fib_rule |
| - | tos, dsfield | tos | fib4_rule |
# 7. Linux Neighbouring Subsystem

Abstract

This chapter discusses the Linux neighbouring subsystem and its implementation in Linux. The neighbouring subsystem is responsible for the discovery of the presence of nodes on the same link and for translation of L3 (network layer) addresses to L2 (link layer) addresses. L2 addresses are needed to build the L2 header for outgoing packets, as described in the next section. The protocol that implements this translation is called the Address Resolution Protocol (ARP) in IPv4 and Neighbour Discovery protocol (NDISC or ND) in IPv6. The neighbouring subsystem provides a protocol-independent infrastructure for performing L3-to-L2 mappings. The discussion in this chapter, however, is restricted to the most common cases—namely, the neighbouring subsystem usage in IPv4 and in IPv6. Keep in mind that the ARP protocol, like the ICMP protocol discussed in Chapter 3, is subject to security threats—such as ARP poisoning attacks and ARP spoofing attacks (security aspects of the ARP protocol are beyond the scope of this book).

I first discuss the common neighbouring data structures in this chapter and some important API methods, which are used both in IPv4 and in IPv6. Then I discuss the particular implementations of the ARP protocol and NDISC protocol. You will see how a neighbour is created and how it is freed, and you will learn about the interaction between userspace and the neighbouring subsystem. You will also learn about ARP requests and ARP replies, about NDISC neighbour solicitation and NDISC neighbour advertisements, and about a mechanism called Duplicate Address Detection (DAD), which is used by the NDISC protocol to avoid duplicate IPv6 addresses.

## The Neighbouring Subsystem Core

What is the neighbouring subsystem needed for? When a packet is sent over the L2 layer, the L2 destination address is needed to build an L2 header. Using the neighbouring subsystem solicitation requests and solicitation replies, the L2 address of a host can be found out given its L3 address (or the fact that such an L3 address does not exist). In Ethernet, which is the most commonly used link layer (L2), the L2 address of a host is its MAC address. In IPv4, ARP is the neighbouring protocol, and solicitation requests and solicitation replies are called ARP requests and ARP replies, respectively.
In IPv6, the neighbouring protocol is NDISC, and solicitation requests and solicitation replies are called neighbour solicitations and neighbour advertisements, respectively.

There are cases where the destination address can be found without any help from the neighbouring subsystem—for example, when a broadcast is sent. In this case, the destination L2 address is fixed (for example, it is FF:FF:FF:FF:FF:FF in Ethernet). Or when the destination address is a multicast address, there is a fixed mapping between an L3 multicast address and its L2 address. I discuss such cases in the course of this chapter.

The basic data structure of the Linux neighbouring subsystem is the neighbour. A neighbour represents a network node that is attached to the same link (L2). It is represented by the neighbour structure. This representation is not unique to a particular protocol; as mentioned, however, the discussion of the neighbour structure will be restricted to its use in the IPv4 and IPv6 protocols. Let's take a look at the neighbour structure:

```c
struct neighbour {
	struct neighbour __rcu	*next;
	struct neigh_table	*tbl;
	struct neigh_parms	*parms;
	unsigned long		confirmed;
	unsigned long		updated;
	rwlock_t		lock;
	atomic_t		refcnt;
	struct sk_buff_head	arp_queue;
	unsigned int		arp_queue_len_bytes;
	struct timer_list	timer;
	unsigned long		used;
	atomic_t		probes;
	__u8			flags;
	__u8			nud_state;
	__u8			type;
	__u8			dead;
	seqlock_t		ha_lock;
	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
	struct hh_cache		hh;
	int			(*output)(struct neighbour *, struct sk_buff *);
	const struct neigh_ops	*ops;
	struct rcu_head		rcu;
	struct net_device	*dev;
	u8			primary_key[0];
};
```

(include/net/neighbour.h)

The following is a description of some of the important members of the neighbour structure:

  * next: A pointer to the next neighbour in the same bucket of the hash table.

  * tbl: The neighbouring table associated with this neighbour.

  * parms: The neigh_parms object associated with this neighbour. It is initialized by the constructor method of the associated neighbouring table. For example, in IPv4 the arp_constructor() method initializes parms to be the arp_parms of the associated network device. Do not confuse it with the neigh_parms object of the neighbouring table.

  * confirmed: Confirmation timestamp (discussed later in this chapter).

  * refcnt: Reference counter. Incremented by the neigh_hold() macro and decremented by the neigh_release() method. The neigh_release() method frees the neighbour object by calling the neigh_destroy() method, but only if, after decrementing the reference counter, its value is 0.

  * arp_queue: A queue of unresolved SKBs. Despite the name, this member is not unique to ARP; it is used by other protocols as well, such as the NDISC protocol.

  * timer: Every neighbour object has a timer; the timer callback is the neigh_timer_handler() method. The neigh_timer_handler() method can change the Network Unreachability Detection (NUD) state of the neighbour. When solicitation requests are sent while the state of the neighbour is NUD_INCOMPLETE or NUD_PROBE, and the number of solicitation probes is greater than or equal to neigh_max_probes(), the state of the neighbour is set to NUD_FAILED, and the neigh_invalidate() method is invoked.

  * ha_lock: Provides access protection to the neighbour hardware address (ha).
  * ha: The hardware address of the neighbour object; in the case of Ethernet, it is the MAC address of the neighbour.

  * hh: A hardware header cache of the L2 header (an hh_cache object).

  * output: A pointer to a transmit method, like the neigh_resolve_output() method or the neigh_direct_output() method. It is dependent on the NUD state and, as a result, can be assigned to different methods during the lifetime of a neighbour. When initializing the neighbour object in the neigh_alloc() method, it is set to be the neigh_blackhole() method, which discards the packet and returns -ENETDOWN.

And here are the helper methods (methods which set the output callback):

  * void neigh_connect(struct neighbour *neigh)

    Sets the output() method of the specified neighbour to be neigh->ops->connected_output.

  * void neigh_suspect(struct neighbour *neigh)

    Sets the output() method of the specified neighbour to be neigh->ops->output.

  * nud_state: The NUD state of the neighbour. The nud_state value can change dynamically during the lifetime of a neighbour object. Table 7-1 in the "Quick Reference" section at the end of this chapter describes the basic NUD states and their Linux symbols. The NUD state machine is very complex; I do not delve into all of its nuances in this book.

  * dead: A flag that marks the neighbour object as dead (no longer usable). It is initialized to 0 when creating a neighbour object, at the end of the __neigh_create() method. The neigh_destroy() method will fail for neighbour objects whose dead flag is not set. The neigh_flush_dev() method sets the dead flag to 1 but does not yet remove the neighbour entry. The removal of neighbours marked as dead (their dead flag is set) is done later, by the garbage collectors.

  * primary_key: The IP address (L3) of the neighbour. A lookup in the neighbouring tables is done with the primary_key. The primary_key length depends on the protocol used: for IPv4, for example, it should be 4 bytes; for IPv6 it should be sizeof(struct in6_addr), as the in6_addr structure represents an IPv6 address. Therefore, the primary_key is defined as an array of 0 bytes, and when allocating a neighbour, the protocol in use should be taken into account. See the explanation about entry_size and key_len later in this chapter, in the description of the neigh_table structure members.

To avoid sending solicitation requests for each new packet that is transmitted, the kernel keeps the mapping between L3 addresses and L2 addresses in a data structure called a neighbouring table. In the case of IPv4, it is the ARP table (sometimes also called the ARP cache; the two names refer to the same entity)—in contrast to what you saw in the IPv4 routing subsystem in Chapter 5, where the routing cache, before it was removed, and the routing table were two different entities, represented by two different data structures. In the case of IPv6, the neighbouring table is the NDISC table (also known as the NDISC cache). Both the ARP table (arp_tbl) and the NDISC table (nd_tbl) are instances of the neigh_table structure.
Let's take a look at the neigh_table structure:

```c
struct neigh_table {
	struct neigh_table	*next;
	int			family;
	int			entry_size;
	int			key_len;
	__u32			(*hash)(const void *pkey,
					const struct net_device *dev,
					__u32 *hash_rnd);
	int			(*constructor)(struct neighbour *);
	int			(*pconstructor)(struct pneigh_entry *);
	void			(*pdestructor)(struct pneigh_entry *);
	void			(*proxy_redo)(struct sk_buff *skb);
	char			*id;
	struct neigh_parms	parms;
	/* HACK. gc_* should follow parms without a gap! */
	int			gc_interval;
	int			gc_thresh1;
	int			gc_thresh2;
	int			gc_thresh3;
	unsigned long		last_flush;
	struct delayed_work	gc_work;
	struct timer_list	proxy_timer;
	struct sk_buff_head	proxy_queue;
	atomic_t		entries;
	rwlock_t		lock;
	unsigned long		last_rand;
	struct neigh_statistics	__percpu *stats;
	struct neigh_hash_table	__rcu *nht;
	struct pneigh_entry	**phash_buckets;
};
```

(include/net/neighbour.h)

Here are some important members of the neigh_table structure:

  * next: Each protocol creates its own neigh_table instance. There is a linked list of all the neighbouring tables in the system. The neigh_tables global variable is a pointer to the beginning of the list. The next variable points to the next item in this list.

  * family: The protocol family: AF_INET for the IPv4 neighbouring table (arp_tbl), and AF_INET6 for the IPv6 neighbouring table (nd_tbl).

  * entry_size: When allocating a neighbour entry by the neigh_alloc() method, the size for the allocation is tbl->entry_size + dev->neigh_priv_len. Usually the neigh_priv_len value is 0. Before kernel 3.3, entry_size was explicitly initialized to sizeof(struct neighbour) + 4 for ARP, and to sizeof(struct neighbour) + sizeof(struct in6_addr) for NDISC. The reason for this initialization was that when allocating a neighbour, you want to allocate space also for the primary_key[0] member. From kernel 3.3, the entry_size was removed from the static initialization of arp_tbl and nd_tbl, and the entry_size initialization is done based on the key_len in the core neighbouring layer, by the neigh_table_init_no_netlink() method.

  * key_len: The size of the lookup key; it is 4 bytes for IPv4, because the length of an IPv4 address is 4 bytes, and it is sizeof(struct in6_addr) for IPv6. The in6_addr structure represents an IPv6 address.

  * hash: The hash function for mapping a key (L3 address) to a specific hash value; for ARP it is the arp_hash() method, and for NDISC it is the ndisc_hash() method.

  * constructor: This method performs protocol-specific initialization when creating a neighbour object; for example, arp_constructor() for ARP in IPv4 and ndisc_constructor() for NDISC in IPv6. The constructor callback is invoked by the __neigh_create() method. It returns 0 on success.

  * pconstructor: A method for the creation of a neighbour proxy entry; it is not used by ARP, and it is pndisc_constructor for NDISC. This method should return 0 upon success. The pconstructor method is invoked from the pneigh_lookup() method if the lookup fails, on the condition that pneigh_lookup() was invoked with creat set to 1.

  * pdestructor: A method for destroying a neighbour proxy entry. Like the pconstructor callback, pdestructor is not used by ARP, and it is pndisc_destructor for NDISC. The pdestructor method is invoked from the pneigh_delete() method and from the pneigh_ifdown() method.

  * id: The name of the table; it is arp_cache for IPv4 and ndisc_cache for IPv6.
  * parms: A neigh_parms object: each neighbouring table has an associated neigh_parms object, which consists of various configuration settings, like reachability information, various timeouts, and more. The neigh_parms initialization is different in the ARP table and in the NDISC table.

  * gc_interval: Not used directly by the neighbouring core.

  * gc_thresh1, gc_thresh2, gc_thresh3: Thresholds on the number of neighbouring table entries. They are used as criteria for activating the synchronous garbage collector (neigh_forced_gc()) and in the neigh_periodic_work() asynchronous garbage collector handler. See the explanation about allocating a neighbour object in the "Creating and Freeing a Neighbour" section later in this chapter. In the ARP table, the default values are: gc_thresh1 is 128, gc_thresh2 is 512, and gc_thresh3 is 1024. These values can be set by procfs. The same default values are also used in the NDISC table in IPv6. The IPv4 procfs entries are:

    * /proc/sys/net/ipv4/neigh/default/gc_thresh1

    * /proc/sys/net/ipv4/neigh/default/gc_thresh2

    * /proc/sys/net/ipv4/neigh/default/gc_thresh3

    and for IPv6, these are the procfs entries:

    * /proc/sys/net/ipv6/neigh/default/gc_thresh1

    * /proc/sys/net/ipv6/neigh/default/gc_thresh2

    * /proc/sys/net/ipv6/neigh/default/gc_thresh3

  * last_flush: The most recent time when the neigh_forced_gc() method ran. It is initialized to the current time (jiffies) in the neigh_table_init_no_netlink() method.

  * gc_work: Asynchronous garbage collector handler. Set to the neigh_periodic_work() callback by the neigh_table_init_no_netlink() method. The delayed_work struct is a type of work queue. Before kernel 2.6.32, the neigh_periodic_timer() method was the asynchronous garbage collector handler; it processed only one bucket and not the entire neighbouring hash table. The neigh_periodic_work() method first checks whether the number of entries in the table is less than gc_thresh1, and if so, it exits without doing anything; then it recomputes the reachable time (the reachable_time field of parms, which is the neigh_parms object associated with the neighbouring table). Then it scans the neighbouring hash table and removes entries whose state is neither NUD_PERMANENT nor NUD_IN_TIMER, whose reference count is 1, and for which one of these conditions is met: either they are in the NUD_FAILED state, or the current time is later than their used timestamp plus gc_staletime (gc_staletime is a member of the neighbour parms object). Removal of a neighbour entry is done by setting the dead flag to 1 and calling the neigh_cleanup_and_release() method.

  * proxy_timer: When a host is configured as an ARP proxy, it is possible to avoid immediate processing of solicitation requests and to process them with some delay, because an ARP proxy host can receive a large number of solicitation requests (as opposed to a host that is not an ARP proxy, which usually receives a small number of ARP requests). Sometimes you may prefer to delay the reply to such broadcasts, so that the hosts that actually own the requested IP addresses get priority in being the first to answer. This delay is a random value up to the proxy_delay parameter. The ARP proxy timer handler is the neigh_proxy_process() method. The proxy_timer is initialized by the neigh_table_init_no_netlink() method.

  * proxy_queue: Proxy ARP queue of SKBs. SKBs are added with the pneigh_enqueue() method.
  * stats: The neighbour statistics (neigh_statistics) object; it consists of per-CPU counters, like allocs, which is the number of neighbour objects allocated by the neigh_alloc() method, or destroys, which is the number of neighbour objects freed by the neigh_destroy() method, and more. The neighbour statistics counters are incremented by the NEIGH_CACHE_STAT_INC macro. Note that because the statistics are per-CPU counters, this macro uses the this_cpu_inc() macro. You can display the ARP statistics and the NDISC statistics with cat /proc/net/stat/arp_cache and cat /proc/net/stat/ndisc_cache, respectively. In the "Quick Reference" section at the end of this chapter, there is a description of the neigh_statistics structure, specifying in which method each counter is incremented.

  * nht: The neighbour hash table (a neigh_hash_table object).

  * phash_buckets: The neighbouring proxy hash table; allocated in the neigh_table_init_no_netlink() method.

The initialization of the neighbouring table is done with the neigh_table_init() method:

  * In IPv4, the ARP module defines the ARP table (an instance of the neigh_table structure named arp_tbl) and passes it as an argument to the neigh_table_init() method (see the arp_init() method in net/ipv4/arp.c).

  * In IPv6, the NDISC module defines the NDISC table (also an instance of the neigh_table structure, named nd_tbl) and passes it as an argument to the neigh_table_init() method (see the ndisc_init() method in net/ipv6/ndisc.c).

The neigh_table_init() method also creates the neighbouring hash table (the nht object) by calling the neigh_hash_alloc() method in the neigh_table_init_no_netlink() method, allocating space for eight hash entries:

```c
static void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
	. . .
	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
	. . .
}
```

The size of the hash table is 1 << shift; when the size does not exceed PAGE_SIZE, the buckets are allocated with kzalloc(), and otherwise with __get_free_pages():

```c
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
	size_t size = (1 << shift) * sizeof(struct neighbour *);
	struct neigh_hash_table *ret;
	struct neighbour __rcu **buckets;
	int i;

	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
	if (!ret)
		return NULL;
	if (size <= PAGE_SIZE)
		buckets = kzalloc(size, GFP_ATOMIC);
	else
		buckets = (struct neighbour __rcu **)
			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
					   get_order(size));
	. . .
}
```

You may wonder why the neigh_table_init_no_netlink() method exists at all: why not perform all of the initialization in the neigh_table_init() method? The neigh_table_init_no_netlink() method performs all of the initialization of a neighbouring table except for linking it to the global linked list of neighbouring tables, neigh_tables. Originally such initialization, without linking to the neigh_tables linked list, was needed for ATM, so the neigh_table_init() method was split, and the ATM clip module called the neigh_table_init_no_netlink() method instead of the neigh_table_init() method. Over time, a different solution was found for ATM; though the ATM clip module no longer invokes the neigh_table_init_no_netlink() method, the split remained, perhaps in case it is needed in the future.
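Because the bucket count is always a power of two (1 << shift), a 32-bit hash value can be mapped to a bucket by keeping only its top hash_shift bits, which is the indexing scheme the neighbouring core uses on lookups. The following standalone userspace sketch (plain C, not kernel code; the toy_* names are invented for illustration) mimics that allocation and indexing:

```c
#include <stdint.h>
#include <stdlib.h>

struct toy_entry;                         /* stand-in for struct neighbour */

struct toy_hash_table {
	struct toy_entry **buckets;
	unsigned int hash_shift;          /* table size is 1 << hash_shift */
};

/* Allocate a table with 1 << shift zeroed buckets, as neigh_hash_alloc() does. */
static struct toy_hash_table *toy_hash_alloc(unsigned int shift)
{
	struct toy_hash_table *ret = malloc(sizeof(*ret));

	if (!ret)
		return NULL;
	ret->buckets = calloc(1u << shift, sizeof(struct toy_entry *));
	if (!ret->buckets) {
		free(ret);
		return NULL;
	}
	ret->hash_shift = shift;
	return ret;
}

/* Map a 32-bit hash value to a bucket index by keeping its top bits,
 * mirroring the hash_val >> (32 - hash_shift) indexing used by the
 * neighbouring core. */
static unsigned int toy_bucket(const struct toy_hash_table *t, uint32_t hash_val)
{
	return hash_val >> (32 - t->hash_shift);
}
```

Taking the top bits rather than a modulo keeps the operation a single shift, which pairs well with hash functions that mix their output thoroughly, and it makes growing the table (incrementing the shift) cheap.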
I should mention that each L3 protocol that uses the neighbouring subsystem also registers a protocol handler. For IPv4, the handler for ARP packets (packets whose type in their Ethernet header is 0x0806) is the arp_rcv() method:

```c
static struct packet_type arp_packet_type __read_mostly = {
	.type =	cpu_to_be16(ETH_P_ARP),
	.func =	arp_rcv,
};

void __init arp_init(void)
{
	. . .
	dev_add_pack(&arp_packet_type);
	. . .
}
```

(net/ipv4/arp.c)

For IPv6, the neighbouring messages are ICMPv6 messages, so they are handled by the icmpv6_rcv() method, which is the ICMPv6 handler. There are five ICMPv6 neighbouring messages; when each of them is received (by the icmpv6_rcv() method), the ndisc_rcv() method is invoked to handle them (see net/ipv6/icmp.c). The ndisc_rcv() method is discussed in a later section of this chapter. Each neighbour object defines a set of methods via the neigh_ops structure. This is done by its constructor method. The neigh_ops structure contains a protocol family member and four function pointers:

```c
struct neigh_ops {
	int			family;
	void			(*solicit)(struct neighbour *, struct sk_buff *);
	void			(*error_report)(struct neighbour *, struct sk_buff *);
	int			(*output)(struct neighbour *, struct sk_buff *);
	int			(*connected_output)(struct neighbour *, struct sk_buff *);
};
```

(include/net/neighbour.h)

  * family: AF_INET for IPv4 and AF_INET6 for IPv6.

  * solicit: This method is responsible for sending neighbour solicitation requests: in ARP it is the arp_solicit() method, and in NDISC it is the ndisc_solicit() method.

  * error_report: This method is called from the neigh_invalidate() method when the neighbour state is NUD_FAILED. This happens, for example, when a solicitation request has not been answered after some timeout.

  * output: When the L3 address of the next hop is known but the L2 address is not yet resolved, the output callback should be neigh_resolve_output().

  * connected_output: The output method of the neighbour is set to the connected_output callback when the neighbour state enters the NUD_CONNECTED mask (which includes NUD_REACHABLE). See the invocations of the neigh_connect() method in the neigh_update() method and in the neigh_timer_handler() method.

### Creating and Freeing a Neighbour

A neighbour is created by the __neigh_create() method:

```c
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
```

First, the __neigh_create() method allocates a neighbour object by calling the neigh_alloc() method, which also performs various initializations. In some cases the neigh_alloc() method calls the synchronous garbage collector (the neigh_forced_gc() method):

```c
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
	struct neighbour *n = NULL;
	unsigned long now = jiffies;
	int entries;

	entries = atomic_inc_return(&tbl->entries) - 1;
```

If the number of table entries is greater than gc_thresh3 (1024 by default), or if the number of table entries is greater than gc_thresh2 (512 by default) and more than 5 seconds (5 * HZ jiffies) have passed since the last flush, the synchronous garbage collector method (the neigh_forced_gc() method) is invoked.
If, after running the neigh_forced_gc() method, the number of table entries is still greater than or equal to gc_thresh3, a neighbour object is not allocated and NULL is returned:

```c
	if (entries >= tbl->gc_thresh3 ||
	    (entries >= tbl->gc_thresh2 &&
	     time_after(now, tbl->last_flush + 5 * HZ))) {
		if (!neigh_forced_gc(tbl) &&
		    entries >= tbl->gc_thresh3)
			goto out_entries;
	}
```

Then the __neigh_create() method performs the protocol-specific setup by calling the constructor method of the specified neighbouring table (arp_constructor() for ARP, ndisc_constructor() for NDISC). In the constructor method, special cases like multicast and loopback addresses are handled. In the arp_constructor() method, for example, you call the arp_mc_map() method to set the hardware address of the neighbour (ha) according to the neighbour's IPv4 primary_key address, and you set nud_state to NUD_NOARP, because multicast addresses don't need ARP. In the ndisc_constructor() method, you do something quite similar when handling multicast addresses: you call the ndisc_mc_map() method to set the hardware address of the neighbour (ha) according to the neighbour's IPv6 primary_key address, and you again set nud_state to NUD_NOARP. There is also special treatment for broadcast addresses: in the arp_constructor() method, for example, when the neighbour type is RTN_BROADCAST, you set the neighbour hardware address (ha) to the network device broadcast address (the broadcast field of the net_device object), and you set nud_state to NUD_NOARP. Note that the IPv6 protocol does not implement traditional IP broadcast, so the notion of a broadcast address is irrelevant there (there is a link-local all-nodes multicast group at address ff02::1, though). There are two special cases when additional setup needs to be done:

  * When the ndo_neigh_construct() callback of the netdev_ops is defined, it is invoked. In fact, this is done only in the classical IP over ATM code (clip); see net/atm/clip.c.

  * When the neigh_setup() callback of the neigh_parms object is defined, it is invoked. This is used, for example, in the bonding driver; see drivers/net/bonding/bond_main.c.

When trying to create a neighbour object with the __neigh_create() method, if the number of neighbour entries exceeds the hash table size, the hash table must be enlarged. This is done by calling the neigh_hash_grow() method, like this:

```c
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
{
	. . .
```

The hash table size is 1 << nht->hash_shift; the hash table must be enlarged if this is exceeded:

```c
	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
		nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
	. . .
}
```

When the want_ref parameter is true, the neighbour reference count is incremented within this method. The confirmed field of the neighbour object is also initialized:

```c
	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
```

It is initialized to be a little earlier than the current time, jiffies (for the simple reason that you want reachability confirmation to be required sooner). At the end of the __neigh_create() method, the dead flag is initialized to 0, and the neighbour object is added to the neighbour hash table.

The neigh_release() method decrements the reference counter of the neighbour and, when it reaches zero, frees the neighbour by calling the neigh_destroy() method.
The neigh_destroy() method verifies that the neighbour is marked as dead: neighbours whose dead flag is 0 will not be removed.

In this section, you learned about the kernel methods that create and free a neighbour. Next you will learn how adding and deleting a neighbour entry can be triggered from userspace, as well as how to display the neighbouring table, with the arp command for IPv4 and the ip command for IPv4/IPv6.

### Interaction Between Userspace and the Neighbouring Subsystem

Management of the ARP table is done with the ip neigh command of the iproute2 package or with the arp command of the net-tools package. Thus, you can display the ARP table by running, from the command line, one of the following commands:

  * arp: Handled by the arp_seq_show() method in net/ipv4/arp.c.

  * ip neigh show (or ip neighbour show): Handled by the neigh_dump_info() method in net/core/neighbour.c.

Note that the ip neigh show command shows the NUD states of the neighbouring table entries (like NUD_REACHABLE or NUD_STALE). Note also that the arp command can display only the IPv4 neighbouring table (the ARP table), whereas with the ip command you can display both the IPv4 ARP table and the IPv6 neighbouring table. If you want to display only the IPv6 neighbouring table, you should run ip -6 neigh show.

The ARP and NDISC modules also export data via procfs. That means you can display the ARP table by running cat /proc/net/arp (this procfs entry is handled by the arp_seq_show() method, the same method that handles the arp command, as mentioned earlier). You can display ARP statistics with cat /proc/net/stat/arp_cache, and NDISC statistics with cat /proc/net/stat/ndisc_cache (both are handled by the neigh_stat_seq_show() method).

You can add an entry with ip neigh add, which is handled by the neigh_add() method. When running ip neigh add, you can specify the state of the entry you are adding (like NUD_PERMANENT, NUD_STALE, NUD_REACHABLE, and so on). For example:

```
ip neigh add 192.168.0.121 dev eth0 lladdr 00:30:48:5b:cc:45 nud permanent
```

Deleting an entry can be done with ip neigh del, which is handled by the neigh_delete() method. For example:

```
ip neigh del 192.168.0.121 dev eth0
```

Adding an entry to the proxy ARP table can be done with ip neigh add proxy. For example:

```
ip neigh add proxy 192.168.2.11 dev eth0
```

The addition is again handled by the neigh_add() method. In this case, the NTF_PROXY flag is set in the data passed from userspace (see the ndm_flags field of the ndm object), and therefore the pneigh_lookup() method is called to perform a lookup in the proxy neighbouring hash table (phash_buckets). If the lookup fails, the pneigh_lookup() method adds an entry to the proxy neighbouring hash table.

Deleting an entry from the proxy ARP table can be done with ip neigh del proxy. For example:

```
ip neigh del proxy 192.168.2.11 dev eth0
```

The deletion is handled by the neigh_delete() method. Again, in this case the NTF_PROXY flag is set in the data passed from userspace (see the ndm_flags field of the ndm object), and therefore the pneigh_delete() method is called to delete the entry from the proxy neighbouring table.

With the ip ntable command, you can control the parameters of the neighbouring tables. For example:

  * ip ntable show: Shows the parameters of all the neighbouring tables.

  * ip ntable change: Changes the value of a parameter of a neighbouring table. Handled by the neightbl_set() method.
For example:

```
ip ntable change name arp_cache queue 20 dev eth0
```

You can also add entries to the ARP table with arp add, and it is possible to add static entries manually to the ARP table, like this: arp -s <IPAddress> <MACAddress>. Static ARP entries are not deleted by the neighbouring subsystem garbage collector, but they do not persist across reboots.

The next section briefly describes how network events are handled in the neighbouring subsystem.

### Handling Network Events

The neighbouring core does not register any events with the register_netdevice_notifier() method. On the other hand, the ARP module and the NDISC module do register network events. In ARP, the arp_netdev_event() method is registered as the callback for netdev events. It handles changes of MAC address events by calling the generic neigh_changeaddr() method and by calling the rt_cache_flush() method. From kernel 3.11, a NETDEV_CHANGE event is also handled, when there was a change of the IFF_NOARP flag, by calling the neigh_changeaddr() method. A NETDEV_CHANGE event is triggered when a device changes its flags, by the __dev_notify_flags() method, or when a device changes its state, by the netdev_state_change() method. In NDISC, the ndisc_netdev_event() method is registered as the callback for netdev events; it handles the NETDEV_CHANGEADDR, NETDEV_DOWN, and NETDEV_NOTIFY_PEERS events.

After describing the fundamental data structures common to IPv4 and IPv6, like the neighbouring table (neigh_table) and the neighbour structure, and after discussing how a neighbour object is created and freed, it is time to describe the implementation of the first neighbouring protocol, the ARP protocol.

## The ARP protocol (IPv4)

The ARP protocol is defined in RFC 826. When working with Ethernet, the addresses are called MAC addresses and are 48-bit values. MAC addresses should be unique, but you must take into account that you may encounter a non-unique MAC address. A common reason for this is that on most network interfaces, a system administrator can configure MAC addresses with userspace tools like ifconfig or ip.

When sending an IPv4 packet, you know the destination IPv4 address, and you should build an Ethernet header, which must include a destination MAC address. Finding the MAC address for a given IPv4 address is the job of the ARP protocol, as you will see shortly. If the MAC address is unknown, you send an ARP request as a broadcast. This ARP request contains the IPv4 address you are looking for. If there is a host with this IPv4 address, it sends a unicast ARP response as a reply. The ARP table (arp_tbl) is an instance of the neigh_table structure. The ARP header is represented by the arphdr structure:

```c
struct arphdr {
	__be16		ar_hrd;		/* format of hardware address	*/
	__be16		ar_pro;		/* format of protocol address	*/
	unsigned char	ar_hln;		/* length of hardware address	*/
	unsigned char	ar_pln;		/* length of protocol address	*/
	__be16		ar_op;		/* ARP opcode (command)		*/

#if 0
	/*
	 * Ethernet looks like this : This bit is variable sized however...
	 */
	unsigned char		ar_sha[ETH_ALEN];	/* sender hardware address	*/
	unsigned char		ar_sip[4];		/* sender IP address		*/
	unsigned char		ar_tha[ETH_ALEN];	/* target hardware address	*/
	unsigned char		ar_tip[4];		/* target IP address		*/
#endif
};
```

(include/uapi/linux/if_arp.h)

The following is a description of some of the important members of the arphdr structure:

  * ar_hrd is the hardware type; for Ethernet it is 0x01 (ARPHRD_ETHER).
For the full list of available ARP header hardware identifiers, see the ARPHRD_XXX definitions in include/uapi/linux/if_arp.h.

  * ar_pro is the protocol ID; for IPv4 it is 0x0800 (ETH_P_IP). For the full list of available protocol IDs, see the ETH_P_XXX definitions in include/uapi/linux/if_ether.h.

  * ar_hln is the hardware address length in bytes, which is 6 bytes for Ethernet addresses.

  * ar_pln is the length of the protocol address in bytes, which is 4 bytes for IPv4 addresses.

  * ar_op is the opcode: ARPOP_REQUEST for an ARP request and ARPOP_REPLY for an ARP reply. For the full list of available ARP header opcodes, look in include/uapi/linux/if_arp.h.

Immediately after the ar_op come the sender hardware (MAC) address and IPv4 address, and the target hardware (MAC) address and IPv4 address. These addresses are not part of the ARP header (arphdr) structure. In the arp_process() method, they are extracted by reading the corresponding offsets from the ARP header, as you can see in the explanation of the arp_process() method in the section "ARP: Receiving Solicitation Requests and Replies" later in this chapter. Figure 7-1 shows the ARP header of an ARP Ethernet packet.

Figure 7-1. ARP header (for Ethernet)

In ARP, four neigh_ops objects are defined: arp_direct_ops, arp_generic_ops, arp_hh_ops, and arp_broken_ops. The initialization of the ARP table neigh_ops object is done by the arp_constructor() method, based on the network device features:

  * If the header_ops of the net_device object is NULL, the neigh_ops object is set to arp_direct_ops. In this case, sending the packet is done with the neigh_direct_output() method, which is in fact a wrapper around dev_queue_xmit(). In most Ethernet network devices, however, the header_ops of the net_device object is initialized to eth_header_ops by the generic ether_setup() method; see net/ethernet/eth.c.

  * If the header_ops of the net_device object contains a NULL cache() callback, the neigh_ops object is set to arp_generic_ops.

  * If the header_ops of the net_device object contains a non-NULL cache() callback, the neigh_ops object is set to arp_hh_ops. When using the generic eth_header_ops object, the cache() callback is the eth_header_cache() callback.

  * For three types of devices, the neigh_ops object is set to arp_broken_ops (when the type of the net_device object is ARPHRD_ROSE, ARPHRD_AX25, or ARPHRD_NETROM).

Now that I've covered the ARP protocol and the ARP header (arphdr) structure, let's look at how ARP solicitation requests are sent.

### ARP: Sending Solicitation Requests

Where are solicitation requests sent from? The most common case is the Tx path, just before leaving the network layer (L3) and moving to the link layer (L2). In the ip_finish_output2() method, you first perform a lookup for the next hop IPv4 address in the ARP table by calling the __ipv4_neigh_lookup_noref() method, and if you don't find a matching neighbour entry, you create one by calling the __neigh_create() method:

```c
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;
	. . .

	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		int res = dst_neigh_output(dst, neigh, skb);
		. . .
}
```

Let's take a look at the dst_neigh_output() method:

```c
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *hh;

	if (dst->pending_confirm) {
		unsigned long now = jiffies;

		dst->pending_confirm = 0;
		/* avoid dirtying neighbour */
		if (n->confirmed != now)
			n->confirmed = now;
	}
```

When you reach this method for the first time in this flow, nud_state is not NUD_CONNECTED, and the output callback is the neigh_resolve_output() method:

```c
	hh = &n->hh;
	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
		return neigh_hh_output(hh, skb);
	else
		return n->output(n, skb);
}
```

(include/net/dst.h)

In the neigh_resolve_output() method, you call the neigh_event_send() method, which eventually queues the SKB in the arp_queue of the neighbour with __skb_queue_tail(&neigh->arp_queue, skb). Later, the neigh_probe() method, invoked from the neighbour timer handler, neigh_timer_handler(), sends the packet by invoking the solicit() method (neigh->ops->solicit is the arp_solicit() method in our case):

```c
static void neigh_probe(struct neighbour *neigh)
	__releases(neigh->lock)
{
	struct sk_buff *skb = skb_peek(&neigh->arp_queue);
	. . .
	neigh->ops->solicit(neigh, skb);
	atomic_inc(&neigh->probes);
	kfree_skb(skb);
}
```

Let's take a look at the arp_solicit() method, which actually sends the ARP request:

```c
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	__be32 saddr = 0;
	u8  dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
	struct net_device *dev = neigh->dev;
	__be32 target = *(__be32 *)neigh->primary_key;
	int probes = atomic_read(&neigh->probes);
	struct in_device *in_dev;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev) {
		rcu_read_unlock();
		return;
	}
```

With the arp_announce procfs entry, you can restrict which local source IP addresses may be used for the ARP packet you want to send:

  * 0: Use any local address, configured on any interface. This is the default value.

  * 1: First try to use addresses that are on the target subnet. If there are no such addresses, use level 2.

  * 2: Use the primary IP address.

Note that the maximum value of these two entries is used:

/proc/sys/net/ipv4/conf/all/arp_announce

/proc/sys/net/ipv4/conf/<netDevice>/arp_announce

See also the description of the IN_DEV_ARP_ANNOUNCE macro in the "Quick Reference" section at the end of this chapter.
```c
	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
	default:
	case 0:		/* By default announce any local IP */
		if (skb && inet_addr_type(dev_net(dev),
					  ip_hdr(skb)->saddr) == RTN_LOCAL)
			saddr = ip_hdr(skb)->saddr;
		break;
	case 1:		/* Restrict announcements of saddr in same subnet */
		if (!skb)
			break;
		saddr = ip_hdr(skb)->saddr;
		if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
```

The inet_addr_onlink() method checks whether the specified target address and the specified source address are on the same subnet:

```c
			/* saddr should be known to target */
			if (inet_addr_onlink(in_dev, target, saddr))
				break;
		}
		saddr = 0;
		break;
	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
		break;
	}
	rcu_read_unlock();

	if (!saddr)
```

The inet_select_addr() method returns the address of the first primary interface address of the specified device whose scope is smaller than the specified scope (RT_SCOPE_LINK in this case) and which is in the same subnet as the target:

```c
		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

	probes -= neigh->parms->ucast_probes;
	if (probes < 0) {
		if (!(neigh->nud_state & NUD_VALID))
			pr_debug("trying to ucast probe in NUD_INVALID\n");
		neigh_ha_snapshot(dst_ha, neigh, dev);
		dst_hw = dst_ha;
	} else {
		probes -= neigh->parms->app_probes;
		if (probes < 0) {
```

CONFIG_ARPD is set when working with a userspace ARP daemon; there are projects, like OpenNHRP, which are based on ARPD. The Next Hop Resolution Protocol (NHRP) is used to improve the efficiency of routing computer network traffic over Non-Broadcast, Multiple Access (NBMA) networks. (I don't discuss the ARPD userspace daemon in this book.)

```c
#ifdef CONFIG_ARPD
			neigh_app_ns(neigh);
#endif
			return;
		}
	}
```

Now you call the arp_send() method to send an ARP request. Note that the last parameter, target_hw, is NULL; you do not yet know the target hardware (MAC) address. When arp_send() is called with target_hw set to NULL, a broadcast ARP request is sent:

```c
	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
		 dst_hw, dev->dev_addr, NULL);
}
```

Let's take a look at the arp_send() method, which is quite short:

```c
void arp_send(int type, int ptype, __be32 dest_ip,
	      struct net_device *dev, __be32 src_ip,
	      const unsigned char *dest_hw, const unsigned char *src_hw,
	      const unsigned char *target_hw)
{
	struct sk_buff *skb;

	/*
	 *	No arp on this interface.
	 */
```

You must check whether IFF_NOARP is set on this network device. There are cases in which ARP is disabled: an administrator can disable ARP, for example, with ifconfig eth1 -arp or with ip link set eth1 arp off. Some network devices set the IFF_NOARP flag upon creation; for example, IPv4 tunnel devices, or PPP devices, which do not need ARP. See the ipip_tunnel_setup() method in net/ipv4/ipip.c or the ppp_setup() method in drivers/net/ppp_generic.c.

```c
	if (dev->flags&IFF_NOARP)
		return;
```

The arp_create() method creates an SKB with an ARP header and initializes it according to the specified parameters:

```c
	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
			 dest_hw, src_hw, target_hw);

	if (skb == NULL)
		return;
```

The only thing the arp_xmit() method does is call dev_queue_xmit() via the NF_HOOK() macro:

```c
	arp_xmit(skb);
}
```

Now it is time to learn how these ARP requests are processed and how ARP replies are processed.
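To make the wire format concrete, here is a hedged userspace sketch (not kernel code) that assembles roughly the same broadcast ARP request that arp_create() builds, using an AF_PACKET raw socket. The interface name and the two IPv4 addresses are illustrative assumptions you would adjust for your setup; running it requires CAP_NET_RAW:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/ethernet.h>
#include <netinet/if_ether.h>   /* struct ether_arp, ARPOP_REQUEST */
#include <netpacket/packet.h>   /* struct sockaddr_ll */

int main(void)
{
	const char *ifname = "eth0";   /* assumption: adjust to your device */
	unsigned char bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	unsigned char frame[ETH_HLEN + sizeof(struct ether_arp)];
	struct ether_header *eh = (struct ether_header *)frame;
	struct ether_arp *arp = (struct ether_arp *)(frame + ETH_HLEN);
	struct sockaddr_ll sll;
	struct ifreq ifr;
	int fd;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ARP));
	if (fd < 0) { perror("socket (needs CAP_NET_RAW)"); return 1; }

	/* Fetch our own MAC address to use as the sender hardware address */
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { perror("SIOCGIFHWADDR"); return 1; }

	/* Ethernet header: broadcast destination, EtherType 0x0806 (ARP) */
	memcpy(eh->ether_dhost, bcast, ETH_ALEN);
	memcpy(eh->ether_shost, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
	eh->ether_type = htons(ETH_P_ARP);

	/* ARP header: Ethernet/IPv4 request; target MAC left as zeroes,
	 * just as arp_send() is called with target_hw == NULL */
	arp->arp_hrd = htons(ARPHRD_ETHER);
	arp->arp_pro = htons(ETH_P_IP);
	arp->arp_hln = ETH_ALEN;
	arp->arp_pln = 4;
	arp->arp_op  = htons(ARPOP_REQUEST);
	memcpy(arp->arp_sha, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
	inet_pton(AF_INET, "192.168.0.1", arp->arp_spa);   /* assumption: sender IP */
	memset(arp->arp_tha, 0, ETH_ALEN);
	inet_pton(AF_INET, "192.168.0.121", arp->arp_tpa); /* assumption: target IP */

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_ifindex = if_nametoindex(ifname);
	sll.sll_halen = ETH_ALEN;
	memcpy(sll.sll_addr, bcast, ETH_ALEN);

	if (sendto(fd, frame, sizeof(frame), 0,
		   (struct sockaddr *)&sll, sizeof(sll)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
```

Note how the five fixed arphdr fields described earlier appear here, followed by the sender and target address pairs that are not part of struct arphdr itself.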
### ARP: Receiving Solicitation Requests and Replies

In IPv4, the arp_rcv() method is responsible for handling ARP packets, as mentioned earlier. Let's take a look at it:

```c
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	const struct arphdr *arp;
```

If the network device on which the ARP packet was received has the IFF_NOARP flag set, or if the packet is not destined for the local machine, or if it was received on a loopback device, the packet should be dropped. You continue with some more sanity checks, and if everything is okay, you proceed to the arp_process() method, which performs the real work of processing the ARP packet:

```c
	if (dev->flags & IFF_NOARP ||
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK)
		goto freeskb;
```

If the SKB is shared, you must clone it, because it might be changed by someone else while being processed by the arp_rcv() method. The skb_share_check() method creates a clone of the SKB if it is shared (see Appendix A):

```c
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		goto out_of_mem;

	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
		goto freeskb;

	arp = arp_hdr(skb);
```

The ar_hln of the ARP header represents the length of a hardware address, which should be 6 bytes for an Ethernet header and should be equal to the addr_len of the net_device object. The ar_pln of the ARP header represents the length of the protocol address and should be equal to the length of an IPv4 address, which is 4 bytes:

```c
	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
		goto freeskb;

	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));

	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);

freeskb:
	kfree_skb(skb);
out_of_mem:
	return 0;
}
```

Handling ARP requests is not restricted to packets that have the local host as their destination. When the local host is configured as an ARP proxy, or as a private VLAN proxy ARP (see RFC 3069), you also handle packets whose destination is not the local host. Support for private VLAN proxy ARP was added in kernel 2.6.34.

In the arp_process() method, you handle only ARP requests and ARP responses. For ARP requests, you perform a lookup in the routing subsystem with the ip_route_input_noref() method. If the ARP packet is for the local host (the rt_type of the routing entry is RTN_LOCAL), you proceed to check some conditions (described shortly). If all these checks pass, an ARP reply is sent back with the arp_send() method. If the ARP packet is not for the local host but should be forwarded (the rt_type of the routing entry is RTN_UNICAST), you check some other conditions (also described shortly), and if they are fulfilled, you perform a lookup in the proxy ARP table by calling the pneigh_lookup() method.

You will now see the implementation details of the main ARP method that handles ARP requests, the arp_process() method.
#### The arp_process() Method

Let's take a look at the arp_process() method, where the real work is done:

```c
static int arp_process(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct arphdr *arp;
	unsigned char *arp_ptr;
	struct rtable *rt;
	unsigned char *sha;
	__be32 sip, tip;
	u16 dev_type = dev->type;
	int addr_type;
	struct neighbour *n;
	struct net *net = dev_net(dev);

	/* arp_rcv below verifies the ARP header and verifies the device
	 * is ARP'able.
	 */

	if (in_dev == NULL)
		goto out;
```

Fetch the ARP header from the SKB (it is the network header; see the arp_hdr() method):

```c
	arp = arp_hdr(skb);

	switch (dev_type) {
	default:
		if (arp->ar_pro != htons(ETH_P_IP) ||
		    htons(dev_type) != arp->ar_hrd)
			goto out;
		break;
	case ARPHRD_ETHER:
		. . .
		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
		    arp->ar_pro != htons(ETH_P_IP))
			goto out;
		break;
	. . .
```

You want to handle only ARP requests and ARP responses in the arp_process() method, and discard all other packets:

```c
	/* Understand only these message types */

	if (arp->ar_op != htons(ARPOP_REPLY) &&
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out;

/*
 *	Extract fields
 */
	arp_ptr = (unsigned char *)(arp + 1);
```

#### The arp_process() Method: Extracting Headers

Immediately after the ARP header come the following fields (see the ARP header definition earlier):

  * sha: The source hardware address (the MAC address, which is 6 bytes).

  * sip: The source IPv4 address (4 bytes).

  * tha: The target hardware address (the MAC address, which is 6 bytes).

  * tip: The target IPv4 address (4 bytes).

Extract the sip and tip addresses:

```c
	sha	= arp_ptr;
	arp_ptr += dev->addr_len;
```

Set sip to the source IPv4 address after advancing arp_ptr by the corresponding offset:

```c
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4;
	switch (dev_type) {
	. . .
	default:
		arp_ptr += dev->addr_len;
	}
```

Set tip to the target IPv4 address after advancing arp_ptr by the corresponding offset:

```c
	memcpy(&tip, arp_ptr, 4);
```

Discard these two types of packets:

  * Multicast packets.

  * Packets for the loopback device, if the use of local routing with loopback addresses is disabled; see also the description of the IN_DEV_ROUTE_LOCALNET macro in the "Quick Reference" section at the end of this chapter.

```c
/*
 *	Check for bad requests for 127.x.x.x and requests for multicast
 *	addresses.  If this is one such, delete it.
 */
	if (ipv4_is_multicast(tip) ||
	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
		goto out;

	. . .
```

The source IP (sip) is 0 when you use Duplicate Address Detection (DAD). DAD lets you detect the existence of duplicate L3 addresses on different hosts on a LAN. DAD is implemented in IPv6 as an integral part of the address configuration process, but not in IPv4. However, there is support for correctly handling DAD requests in IPv4, as you will soon see. The arping utility of the iputils package is an example of using DAD in IPv4. When sending an ARP request with arping -D, you send an ARP request where the sip of the ARP header is 0
(the -D modifier tells arping to operate in DAD mode). The tip is usually the sender's IPv4 address (because you want to check whether another host on the same LAN has the same IPv4 address as yours); if there is a host whose IP address equals the tip of the DAD ARP request, it sends back an ARP reply (without adding the sender to its neighbouring table):

```c
	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
	if (sip == 0) {
		if (arp->ar_op == htons(ARPOP_REQUEST) &&
```

#### The arp_process() Method: The arp_ignore() and arp_filter() Methods

The arp_ignore procfs entry provides support for different modes of sending ARP replies in response to an ARP request. The value used is the maximum of /proc/sys/net/ipv4/conf/all/arp_ignore and /proc/sys/net/ipv4/conf/<netDevice>/arp_ignore. By default, the value of the arp_ignore procfs entry is 0, and in that case the arp_ignore() method returns 0. You reply to the ARP request with arp_send(), as you can see in the next code snippet (assuming that inet_addr_type(net, tip) returned RTN_LOCAL). The arp_ignore() method checks the value of IN_DEV_ARP_IGNORE(in_dev); for more details, see the arp_ignore() implementation in net/ipv4/arp.c and the description of the IN_DEV_ARP_IGNORE macro in the "Quick Reference" section at the end of this chapter:

```c
		    inet_addr_type(net, tip) == RTN_LOCAL &&
		    !arp_ignore(in_dev, sip, tip))
			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				 dev->dev_addr, sha);
		goto out;
	}

	if (arp->ar_op == htons(ARPOP_REQUEST) &&
	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {

		rt = skb_rtable(skb);
		addr_type = rt->rt_type;
```

When addr_type equals RTN_LOCAL, the packet is for local delivery:

```c
		if (addr_type == RTN_LOCAL) {
			int dont_send;

			dont_send = arp_ignore(in_dev, sip, tip);
```

The arp_filter() method fails (returns 1) in two cases:

  * When the lookup in the routing tables with the ip_route_output() method fails.

  * When the outgoing network device of the routing entry is different from the network device on which the ARP request was received.

In case of success, the arp_filter() method returns 0 (see also the description of the IN_DEV_ARPFILTER macro in the "Quick Reference" section at the end of this chapter):

```c
			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
				dont_send = arp_filter(sip, tip, dev);
			if (!dont_send) {
```

Before sending the ARP reply, you want to add the sender to your neighbouring table, or update the existing entry; this is done with the neigh_event_ns() method. The neigh_event_ns() method creates a new neighbouring table entry and sets its state to NUD_STALE. If such an entry already exists, it updates its state to NUD_STALE, with the neigh_update() method.
Adding entries this way is termed passive learning:

```c
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n) {
					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
						 dev, tip, sha, dev->dev_addr,
						 sha);
					neigh_release(n);
				}
			}
			goto out;
		} else if (IN_DEV_FORWARD(in_dev)) {
```

The arp_fwd_proxy() method returns 1 when the device can be used as an ARP proxy; the arp_fwd_pvlan() method returns 1 when the device can be used as an ARP VLAN proxy:

```c
			if (addr_type == RTN_UNICAST &&
			    (arp_fwd_proxy(in_dev, dev, rt) ||
			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
			     (rt->dst.dev != dev &&
			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
```

Again, the neigh_event_ns() method is called to create a neighbour entry for the sender with the NUD_STALE state, or, if such an entry exists, to update its state to NUD_STALE:

```c
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n)
					neigh_release(n);

				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
				    skb->pkt_type == PACKET_HOST ||
				    in_dev->arp_parms->proxy_delay == 0) {
					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
						 dev, tip, sha, dev->dev_addr,
						 sha);
				} else {
```

Delay sending the ARP reply by putting the SKB at the tail of the proxy_queue, by calling the pneigh_enqueue() method. Note that the delay is a random value between 0 and in_dev->arp_parms->proxy_delay:

```c
					pneigh_enqueue(&arp_tbl,
						       in_dev->arp_parms, skb);
					return 0;
				}
				goto out;
			}
		}
	}

	/* Update our ARP tables */
```

Note that the last parameter in this call to the __neigh_lookup() method is 0, which means that you only perform a lookup in the neighbouring table (and do not create a new neighbour if the lookup fails):

```c
	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
```

The IN_DEV_ARP_ACCEPT macro tells you whether the network device is set to accept ARP requests (see also the description of the IN_DEV_ARP_ACCEPT macro in the "Quick Reference" section at the end of this chapter):

```c
	if (IN_DEV_ARP_ACCEPT(in_dev)) {
		/* Unsolicited ARP is not accepted by default.
		   It is possible, that this option should be enabled for some
		   devices (strip is candidate)
		 */
```

Unsolicited ARP requests are sent only to update the neighbouring table. In such requests, tip is equal to sip (the arping utility supports sending unsolicited ARP requests with arping -U):

```c
		if (n == NULL &&
		    (arp->ar_op == htons(ARPOP_REPLY) ||
		     (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
		    inet_addr_type(net, sip) == RTN_UNICAST)
			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
	}

	if (n) {
		int state = NUD_REACHABLE;
		int override;

		/* If several different ARP replies follows back-to-back,
		   use the FIRST one. It is possible, if several proxy
		   agents are active. Taking the first reply prevents
		   arp trashing and chooses the fastest router.
		 */
		override = time_after(jiffies, n->updated + n->parms->locktime);

		/* Broadcast replies and request packets
		   do not assert neighbour reachability.
		 */
		if (arp->ar_op != htons(ARPOP_REPLY) ||
		    skb->pkt_type != PACKET_HOST)
			state = NUD_STALE;
```

Call the neigh_update() method to update the neighbouring table:

```c
		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
		neigh_release(n);
	}

out:
	consume_skb(skb);
	return 0;
}
```

Now that you know about the IPv4 ARP protocol implementation, it is time to move on to the IPv6 NDISC protocol implementation. You will soon notice some of the differences between the neighbouring subsystem implementations in IPv4 and in IPv6.
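Before moving on, here is a short userspace sketch that complements the procfs and iproute2 interfaces described earlier: it queries a single resolved entry from the ARP table with the classic SIOCGARP ioctl. The neighbour IP address and the device name are illustrative assumptions; this is a hedged example, not part of the kernel code discussed above:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if_arp.h>
#include <arpa/inet.h>

int main(void)
{
	struct arpreq req;
	struct sockaddr_in *sin;
	unsigned char *ha;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) { perror("socket"); return 1; }

	memset(&req, 0, sizeof(req));
	sin = (struct sockaddr_in *)&req.arp_pa;
	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "192.168.0.121", &sin->sin_addr); /* assumption: neighbour IP */
	strncpy(req.arp_dev, "eth0", sizeof(req.arp_dev) - 1); /* assumption: device */

	if (ioctl(fd, SIOCGARP, &req) < 0) {
		perror("SIOCGARP");  /* e.g., no such entry in the ARP table */
	} else {
		ha = (unsigned char *)req.arp_ha.sa_data;
		printf("192.168.0.121 is at %02x:%02x:%02x:%02x:%02x:%02x (flags 0x%x)\n",
		       ha[0], ha[1], ha[2], ha[3], ha[4], ha[5], req.arp_flags);
	}
	close(fd);
	return 0;
}
```

This is the same kernel-resident table that arp, ip neigh show, and cat /proc/net/arp display; the ioctl is simply an older, per-entry interface to it.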
## The NDISC Protocol (IPv6)

The Neighbour Discovery (NDISC) protocol is based on RFC 2461, "Neighbor Discovery for IP Version 6 (IPv6)," which was later obsoleted by RFC 4861 from 2007. IPv6 nodes (hosts or routers) on the same link use the Neighbour Discovery protocol to discover each other's presence, to discover routers, to determine each other's L2 addresses, and to maintain neighbour reachability information. Duplicate Address Detection (DAD) was added to avoid duplicate L3 addresses on the same LAN. I discuss DAD and the handling of NDISC Neighbour Solicitations and Neighbour Advertisements shortly.

Next you learn how the IPv6 Neighbour Discovery protocol avoids creating duplicate IPv6 addresses.

### Duplicate Address Detection (DAD)

How can you be sure that no other node on the LAN has the same IPv6 address? The chances are low, but if such an address does exist, it may cause trouble. DAD is a solution. When a host tries to configure an address, it first creates a link-local address (a link-local address starts with FE80). This address is tentative (IFA_F_TENTATIVE), which means that the host can communicate only with ND messages. Then the host starts the DAD process by calling the addrconf_dad_start() method (net/ipv6/addrconf.c). The host sends a Neighbour Solicitation DAD message. The target is its tentative address, and the source is all zeroes (the unspecified address). If there is no answer within a specified time interval, the state is changed to permanent (IFA_F_PERMANENT). When Optimistic DAD (CONFIG_IPV6_OPTIMISTIC_DAD) is set, you don't wait until DAD is completed but allow hosts to communicate with peers before DAD has finished successfully. See RFC 4429, "Optimistic Duplicate Address Detection (DAD) for IPv6," from 2006.

The neighbouring table for IPv6 is called nd_tbl:

```c
struct neigh_table nd_tbl = {
	.family =	AF_INET6,
	.key_len =	sizeof(struct in6_addr),
	.hash =		ndisc_hash,
	.constructor =	ndisc_constructor,
	.pconstructor =	pndisc_constructor,
	.pdestructor =	pndisc_destructor,
	.proxy_redo =	pndisc_redo,
	.id =		"ndisc_cache",
	.parms = {
		.tbl			= &nd_tbl,
		.base_reachable_time	= ND_REACHABLE_TIME,
		.retrans_time		= ND_RETRANS_TIMER,
		.gc_staletime		= 60 * HZ,
		.reachable_time		= ND_REACHABLE_TIME,
		.delay_probe_time	= 5 * HZ,
		.queue_len_bytes	= 64*1024,
		.ucast_probes		= 3,
		.mcast_probes		= 3,
		.anycast_delay		= 1 * HZ,
		.proxy_delay		= (8 * HZ) / 10,
		.proxy_qlen		= 64,
	},
	.gc_interval =	  30 * HZ,
	.gc_thresh1 =	 128,
	.gc_thresh2 =	 512,
	.gc_thresh3 =	1024,
};
```

(net/ipv6/ndisc.c)

Note that some of the members of the NDISC table have the same values as the corresponding members of the ARP table; for example, the garbage collector thresholds (gc_thresh1, gc_thresh2, and gc_thresh3).

The Linux IPv6 Neighbour Discovery implementation is based on ICMPv6 messages to manage the interaction between neighbouring nodes. The Neighbour Discovery protocol defines the following five ICMPv6 message types:

```c
#define NDISC_ROUTER_SOLICITATION	133
#define NDISC_ROUTER_ADVERTISEMENT	134
#define NDISC_NEIGHBOUR_SOLICITATION	135
#define NDISC_NEIGHBOUR_ADVERTISEMENT	136
#define NDISC_REDIRECT			137
```

(include/net/ndisc.h)

Note that these five ICMPv6 message types are informational messages. ICMPv6 message types whose values are in the range 0 to 127 are error messages, and ICMPv6 message types whose values are in the range 128 to 255 are informational messages. For more on this, see Chapter 3, which discusses the ICMP protocol.
This chapter discusses only the Neighbour Solicitation and Neighbour Advertisement messages.

As mentioned at the beginning of this chapter, because neighbour discovery messages are ICMPv6 messages, they are handled by the icmpv6_rcv() method, which in turn invokes the ndisc_rcv() method for ICMPv6 packets whose message type is one of the five types mentioned earlier (see net/ipv6/icmp.c).

In NDISC, there are three neigh_ops objects: ndisc_generic_ops, ndisc_hh_ops, and ndisc_direct_ops:

  * If the header_ops of the net_device object is NULL, the neigh_ops object is set to ndisc_direct_ops. As in the case of arp_direct_ops, sending the packet is done with the neigh_direct_output() method, which is in fact a wrapper around dev_queue_xmit(). Note that, as mentioned in the ARP section earlier, in most Ethernet network devices the header_ops of the net_device object is not NULL.

  * If the header_ops of the net_device object contains a NULL cache() callback, the neigh_ops object is set to ndisc_generic_ops.

  * If the header_ops of the net_device object contains a non-NULL cache() callback, the neigh_ops object is set to ndisc_hh_ops.

This section discussed the DAD mechanism and how it helps to avoid duplicate addresses. The next section describes how solicitation requests are sent.

### NDISC: Sending Solicitation Requests

Similarly to what you saw in IPv4, you perform a lookup and, if no match is found, create an entry:

```c
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;
	. . .

	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);

	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
	. . .
```

Eventually, much like in the IPv4 Tx path, the solicit method, neigh->ops->solicit(neigh, skb), is called from the neigh_probe() method. The neigh->ops->solicit in this case is the ndisc_solicit() method.
The ndisc_solicit() method is very short; it is in fact a wrapper around the ndisc_send_ns() method:

```c
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	struct in6_addr *saddr = NULL;
	struct in6_addr mcaddr;
	struct net_device *dev = neigh->dev;
	struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
	int probes = atomic_read(&neigh->probes);

	if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1))
		saddr = &ipv6_hdr(skb)->saddr;

	if ((probes -= neigh->parms->ucast_probes) < 0) {
		if (!(neigh->nud_state & NUD_VALID)) {
			ND_PRINTK(1, dbg,
				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
				  __func__, target);
		}
		ndisc_send_ns(dev, neigh, target, target, saddr);
	} else if ((probes -= neigh->parms->app_probes) < 0) {
#ifdef CONFIG_ARPD
		neigh_app_ns(neigh);
#endif
	} else {
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
	}
}
```

In order to send the solicitation request, an nd_msg object must be built:

```c
struct nd_msg {
	struct icmp6hdr	icmph;
	struct in6_addr	target;
	__u8		opt[0];
};
```

(include/net/ndisc.h)

For a solicitation request, the ICMPv6 header type should be set to NDISC_NEIGHBOUR_SOLICITATION, and for a solicitation reply, the ICMPv6 header type should be set to NDISC_NEIGHBOUR_ADVERTISEMENT. Note that with Neighbour Advertisement messages, there are cases when you need to set flags in the ICMPv6 header. The ICMPv6 header includes a structure named icmpv6_nd_advt, which includes the override, solicited, and router flags:

```c
struct icmp6hdr {
	__u8		icmp6_type;
	__u8		icmp6_code;
	__sum16		icmp6_cksum;
	union {
		. . .
		struct icmpv6_nd_advt {
#if defined(__LITTLE_ENDIAN_BITFIELD)
			__u32		reserved:5,
					override:1,
					solicited:1,
					router:1,
					reserved2:24;
			. . .
#endif
		} u_nd_advt;
	} icmp6_dataun;
	. . .
#define icmp6_router		icmp6_dataun.u_nd_advt.router
#define icmp6_solicited		icmp6_dataun.u_nd_advt.solicited
#define icmp6_override		icmp6_dataun.u_nd_advt.override
	. . .
```

(include/uapi/linux/icmpv6.h)

  * When a message is sent in response to a Neighbour Solicitation, you set the solicited flag (icmp6_solicited).

  * When you want to override a neighbouring cache entry (that is, update its L2 address), you set the override flag (icmp6_override).

  * When the host sending the Neighbour Advertisement message is a router, you set the router flag (icmp6_router).

You can see the use of these three flags in the ndisc_send_na() method that follows.
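Before diving into the kernel's ndisc_send_ns(), here is a hedged userspace sketch of the same message type: it sends a minimal Neighbour Solicitation over a raw ICMPv6 socket (the kernel computes the ICMPv6 checksum for such sockets, and RFC 4861 requires the hop limit to be 255). The interface name and target address are illustrative assumptions, and the sketch omits the source link-layer address option that a real NS normally carries:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>      /* struct nd_neighbor_solicit, ND_NEIGHBOR_SOLICIT */
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
	const char *ifname = "eth0";           /* assumption: adjust to your device */
	const char *target = "fe80::1";        /* assumption: address being resolved */
	struct nd_neighbor_solicit ns;
	struct sockaddr_in6 dst;
	int hops = 255;
	int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

	if (fd < 0) { perror("socket (needs CAP_NET_RAW)"); return 1; }

	/* ICMPv6 type 135, code 0, plus the target address (cf. struct nd_msg) */
	memset(&ns, 0, sizeof(ns));
	ns.nd_ns_type = ND_NEIGHBOR_SOLICIT;
	ns.nd_ns_code = 0;
	inet_pton(AF_INET6, target, &ns.nd_ns_target);

	/* Destination: the target's solicited-node multicast address,
	 * ff02::1:ff00:0 with the low 24 bits of the target copied in */
	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "ff02::1:ff00:0", &dst.sin6_addr);
	memcpy(&dst.sin6_addr.s6_addr[13], &ns.nd_ns_target.s6_addr[13], 3);
	dst.sin6_scope_id = if_nametoindex(ifname);

	/* RFC 4861: hop limit must be 255 so receivers can verify the
	 * packet was not forwarded */
	setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof(hops));

	if (sendto(fd, &ns, sizeof(ns), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
```

If a node on the link owns the target address, it answers with a Neighbour Advertisement whose solicited flag is set, exactly the case the ndisc_recv_na() discussion covers later in this section.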
Let's take a look at the ndisc_send_ns() method:

```c
void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
		   const struct in6_addr *solicit,
		   const struct in6_addr *daddr, const struct in6_addr *saddr)
{
	struct sk_buff *skb;
	struct in6_addr addr_buf;
	int inc_opt = dev->addr_len;
	int optlen = 0;
	struct nd_msg *msg;

	if (saddr == NULL) {
		if (ipv6_get_lladdr(dev, &addr_buf,
				    (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
			return;
		saddr = &addr_buf;
	}

	if (ipv6_addr_any(saddr))
		inc_opt = 0;
	if (inc_opt)
		optlen += ndisc_opt_addr_space(dev);

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return;
```

Build the ICMPv6 header, which is embedded in the nd_msg object:

```c
	msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
	*msg = (struct nd_msg) {
		.icmph = {
			.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
		},
		.target = *solicit,
	};

	if (inc_opt)
		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
				       dev->dev_addr);

	ndisc_send_skb(skb, daddr, saddr);
}
```

Let's take a look at the ndisc_send_na() method:

```c
static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
			  const struct in6_addr *daddr,
			  const struct in6_addr *solicited_addr,
			  bool router, bool solicited, bool override, bool inc_opt)
{
	struct sk_buff *skb;
	struct in6_addr tmpaddr;
	struct inet6_ifaddr *ifp;
	const struct in6_addr *src_addr;
	struct nd_msg *msg;
	int optlen = 0;
	. . .

	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!skb)
		return;
```

Build the ICMPv6 header, which is embedded in the nd_msg object:

```c
	msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
	*msg = (struct nd_msg) {
		.icmph = {
			.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
			.icmp6_router = router,
			.icmp6_solicited = solicited,
			.icmp6_override = override,
		},
		.target = *solicited_addr,
	};

	if (inc_opt)
		ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
				       dev->dev_addr);

	ndisc_send_skb(skb, daddr, src_addr);
}
```

This section described how solicitation requests are sent. The next section talks about how Neighbour Solicitations and Advertisements are handled.

### NDISC: Receiving Neighbour Solicitations and Advertisements

As mentioned, the ndisc_rcv() method handles all five neighbour discovery message types; let's take a look at it:

```c
int ndisc_rcv(struct sk_buff *skb)
{
	struct nd_msg *msg;

	if (skb_linearize(skb))
		return 0;

	msg = (struct nd_msg *)skb_transport_header(skb);

	__skb_push(skb, skb->data - skb_transport_header(skb));
```

According to RFC 4861, the hop limit of neighbour messages should be 255; the hop limit field is 8 bits wide, so 255 is its maximum value. A value of 255 ensures that the packet was not forwarded, which protects you from certain security attacks.
Packets that do not fulfill this requirement are discarded: + +if (ipv6_hdr(skb)->hop_limit != 255) { + +ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", + +ipv6_hdr(skb)->hop_limit); + +return 0; + +} + +According to RFC 4861, the ICMPv6 code of neighbour messages should be 0, so drop packets that do not fulfill this requirement: + +if (msg->icmph.icmp6_code != 0) { + +ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", + +msg->icmph.icmp6_code); + +return 0; + +} + +memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + +switch (msg->icmph.icmp6_type) { + +case NDISC_NEIGHBOUR_SOLICITATION: + +ndisc_recv_ns(skb); + +break; + +case NDISC_NEIGHBOUR_ADVERTISEMENT: + +ndisc_recv_na(skb); + +break; + +case NDISC_ROUTER_SOLICITATION: + +ndisc_recv_rs(skb); + +break; + +case NDISC_ROUTER_ADVERTISEMENT: + +ndisc_router_discovery(skb); + +break; + +case NDISC_REDIRECT: + +ndisc_redirect_rcv(skb); + +break; + +} + +return 0; + +} + +I do not discuss router solicitations and router advertisements in this chapter, since they are discussed in Chapter 8. Let's take a look at the ndisc_recv_ns() method: + +static void ndisc_recv_ns(struct sk_buff *skb) + +{ + +struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + +const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + +const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + +u8 *lladdr = NULL; + +u32 ndoptlen = skb->tail - (skb->transport_header + + +offsetof(struct nd_msg, opt)); + +struct ndisc_options ndopts; + +struct net_device *dev = skb->dev; + +struct inet6_ifaddr *ifp; + +struct inet6_dev *idev = NULL; + +struct neighbour *neigh; + +The ipv6_addr_any() method returns 1 when saddr is the unspecified address of all zeroes (IPV6_ADDR_ANY). When the source address is the unspecified address (all zeroes), this means that the request is DAD: + +int dad = ipv6_addr_any(saddr); + +bool inc; + +int is_router = -1; + +Perform some validity checks: + +if (skb->len < sizeof(struct nd_msg)) { + +ND_PRINTK(2, warn, "NS: packet too short\n"); + +return; + +} + +if (ipv6_addr_is_multicast(&msg->target)) { + +ND_PRINTK(2, warn, "NS: multicast target address\n"); + +return; + +} + +/* + +* RFC2461 7.1.1: + +* DAD has to be destined for solicited node multicast address. + +*/ + +if (dad && !ipv6_addr_is_solict_mult(daddr)) { + +ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); + +return; + +} + +if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + +ND_PRINTK(2, warn, "NS: invalid ND options\n"); + +return; + +} + +if (ndopts.nd_opts_src_lladdr) { + +lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); + +if (!lladdr) { + +ND_PRINTK(2, warn, + +"NS: invalid link-layer address length\n"); + +return; + +} + +/* RFC2461 7.1.1: + +* If the IP source address is the unspecified address, + +* there MUST NOT be source link-layer address option + +* in the message. + +*/ + +if (dad) { + +ND_PRINTK(2, warn, + +"NS: bad DAD packet (link-layer address option)\n"); + +return; + +} + +} + +inc = ipv6_addr_is_multicast(daddr); + +ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + +if (ifp) { + +if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { + +if (dad) { + +/* + +* We are colliding with another node + +* who is doing DAD + +* so fail our DAD process + +*/ + +addrconf_dad_failure(ifp); + +return; + +} else { + +/* + +* This is not a dad solicitation. + +* If we are an optimistic node, + +* we should respond. + +* Otherwise, we should ignore it. 
+ +*/ + +if (!(ifp->flags & IFA_F_OPTIMISTIC)) + +goto out; + +} + +} + +idev = ifp->idev; + +} else { + +struct net *net = dev_net(dev); + +idev = in6_dev_get(dev); + +if (!idev) { + +/* XXX: count this drop? */ + +return; + +} + +if (ipv6_chk_acast_addr(net, dev, &msg->target) || + +(idev->cnf.forwarding && + +(net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + +(is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { + +if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && + +skb->pkt_type != PACKET_HOST && + +inc != 0 && + +idev->nd_parms->proxy_delay != 0) { + +/* + +* for anycast or proxy, + +* sender should delay its response + +* by a random time between 0 and + +* MAX_ANYCAST_DELAY_TIME seconds. + +* (RFC2461) -- yoshfuji + +*/ + +struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); + +if (n) + +pneigh_enqueue(&nd_tbl, idev->nd_parms, n); + +goto out; + +} + +} else + +goto out; + +} + +if (is_router < 0) + +is_router = idev->cnf.forwarding; + +if (dad) { + +Send a neighbour advertisement message: + +ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + +!!is_router, false, (ifp != NULL), true); + +goto out; + +} + +if (inc) + +NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); + +else + +NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); + +/* + +* update / create cache entry + +* for the source address + +*/ + +neigh = __neigh_lookup(&nd_tbl, saddr, dev, + +!inc || lladdr || !dev->addr_len); + +if (neigh) + +Update your neighbouring table with the sender's L2 address; the nud_state will be set to be NUD_STALE: + +neigh_update(neigh, lladdr, NUD_STALE, + +NEIGH_UPDATE_F_WEAK_OVERRIDE| + +NEIGH_UPDATE_F_OVERRIDE); + +if (neigh || !dev->header_ops) { + +Send a Neighbour Advertisement message: + +ndisc_send_na(dev, neigh, saddr, &msg->target, + +!!is_router, + +true, (ifp != NULL && inc), inc); + +if (neigh) + +neigh_release(neigh); + +} + +out: + +if (ifp) + +in6_ifa_put(ifp); + +else + +in6_dev_put(idev); + +} + +Let's take a look at the method that handles Neighbour Advertisements, ndisc_recv_na(): + +static void ndisc_recv_na(struct sk_buff *skb) + +{ + +struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + +const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + +const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + +u8 *lladdr = NULL; + +u32 ndoptlen = skb->tail - (skb->transport_header + + +offsetof(struct nd_msg, opt)); + +struct ndisc_options ndopts; + +struct net_device *dev = skb->dev; + +struct inet6_ifaddr *ifp; + +struct neighbour *neigh; + +if (skb->len < sizeof(struct nd_msg)) { + +ND_PRINTK(2, warn, "NA: packet too short\n"); + +return; + +} + +if (ipv6_addr_is_multicast(&msg->target)) { + +ND_PRINTK(2, warn, "NA: target address is multicast\n"); + +return; + +} + +if (ipv6_addr_is_multicast(daddr) && + +msg->icmph.icmp6_solicited) { + +ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); + +return; + +} + +if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + +ND_PRINTK(2, warn, "NS: invalid ND option\n"); + +return; + +} + +if (ndopts.nd_opts_tgt_lladdr) { + +lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); + +if (!lladdr) { + +ND_PRINTK(2, warn, + +"NA: invalid link-layer address length\n"); + +return; + +} + +} + +ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + +if (ifp) { + +if (skb->pkt_type != PACKET_LOOPBACK + +&& (ifp->flags & IFA_F_TENTATIVE)) { + +addrconf_dad_failure(ifp); + +return; + +} + +/* What should we make now? 
		   The advertisement
		   is invalid, but ndisc specs say nothing
		   about it. It could be misconfiguration, or
		   an smart proxy agent tries to help us :-)
		   We should not print the error if NA has been
		   received from loopback - it is just our own
		   unsolicited advertisement.
		 */
		if (skb->pkt_type != PACKET_LOOPBACK)
			ND_PRINTK(1, warn,
				  "NA: someone advertises our address %pI6 on %s!\n",
				  &ifp->addr, ifp->idev->dev->name);
		in6_ifa_put(ifp);
		return;
	}

	neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
	if (neigh) {
		u8 old_flags = neigh->flags;
		struct net *net = dev_net(dev);

		if (neigh->nud_state & NUD_FAILED)
			goto out;

		/*
		 * Don't update the neighbour cache entry on a proxy NA from
		 * ourselves because either the proxied node is off link or it
		 * has already sent a NA to us.
		 */
		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
		    net->ipv6.devconf_all->forwarding &&
		    net->ipv6.devconf_all->proxy_ndp &&
		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
			/* XXX: idev->cnf.proxy_ndp */
			goto out;
		}
```

Update the neighbouring table. When the received advertisement is a response to a Neighbour Solicitation, its icmp6_solicited flag is set, so the state should be set to NUD_REACHABLE. When the icmp6_override flag is set, the override flag should be set (this means: update the L2 address with the specified lladdr, if it is different):

```c
		neigh_update(neigh, lladdr,
			     msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
			     (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));

		if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
			/*
			 * Change: router to host
			 */
			struct rt6_info *rt;
			rt = rt6_get_dflt_router(saddr, dev);
			if (rt)
				ip6_del_rt(rt);
		}

out:
		neigh_release(neigh);
	}
}
```

## Summary

This chapter described the neighbouring subsystem in IPv4 and in IPv6. First you learned about the goals of the neighbouring subsystem. Then you learned about ARP requests and ARP replies in IPv4, and about NDISC Neighbour Solicitations and NDISC Neighbour Advertisements in IPv6. You also learned how the DAD implementation avoids duplicate IPv6 addresses, and you saw various methods for handling the neighbouring subsystem requests and replies. Chapter 8 discusses the IPv6 subsystem implementation. The "Quick Reference" section that follows covers the top methods and macros related to the topics discussed in this chapter, ordered by their context. I also show the neigh_statistics structure, which represents statistics collected by the neighbouring subsystem.

## Quick Reference

The following are some important methods and macros of the neighbouring subsystem, and a description of the neigh_statistics structure.

Note

The core neighbouring code is in net/core/neighbour.c, include/net/neighbour.h, and include/uapi/linux/neighbour.h.

The ARP code (IPv4) is in net/ipv4/arp.c, include/net/arp.h, and include/uapi/linux/if_arp.h.

The NDISC code (IPv6) is in net/ipv6/ndisc.c and include/net/ndisc.h.

### Methods

Let's start by covering the methods.

#### void neigh_table_init(struct neigh_table *tbl)

This method invokes the neigh_table_init_no_netlink() method to perform the initialization of the neighbouring table, and links the table to the global neighbouring tables linked list (neigh_tables).
#### void neigh_table_init_no_netlink(struct neigh_table *tbl)

This method performs all the neighbour table initialization apart from linking the table to the global neighbouring tables linked list, which is done by the neigh_table_init() method, as mentioned earlier.

#### int neigh_table_clear(struct neigh_table *tbl)

This method frees the resources of the specified neighbouring table.

#### struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)

This method allocates a neighbour object.

#### struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)

This method allocates a neighbouring hash table.

#### struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref)

This method creates a neighbour object.

#### int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)

This method adds a neighbour entry; it is the handler for the netlink RTM_NEWNEIGH message.

#### int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)

This method deletes a neighbour entry; it is the handler for the netlink RTM_DELNEIGH message.

#### void neigh_probe(struct neighbour *neigh)

This method fetches an SKB from the neighbour arp_queue and calls the corresponding solicit() method to send it. In the case of ARP, it will be the arp_solicit() method. It increments the neighbour probes counter and frees the packet.

#### int neigh_forced_gc(struct neigh_table *tbl)

This method is a synchronous garbage collection method. It removes neighbour entries that are not in the permanent state (NUD_PERMANENT) and whose reference count equals 1. The removal and cleanup of a neighbour is done by first setting the dead flag of the neighbour to 1 and then calling the neigh_cleanup_and_release() method, which gets a neighbour object as a parameter. The neigh_forced_gc() method is invoked from the neigh_alloc() method under some conditions, as described in the "Creating and Freeing a Neighbour" section earlier in this chapter. The neigh_forced_gc() method returns 1 if at least one neighbour object was removed, and 0 otherwise.

#### void neigh_periodic_work(struct work_struct *work)

This method is the asynchronous garbage collector handler.

#### static void neigh_timer_handler(unsigned long arg)

This method is the per-neighbour periodic timer garbage collector handler.

#### struct neighbour *__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)

This method performs a lookup in the specified neighbouring table by the given key. If the creat parameter is 1 and the lookup fails, the neigh_create() method is called to create a neighbour entry in the specified neighbouring table, and that entry is returned.

#### neigh_hh_init(struct neighbour *n, struct dst_entry *dst)

This method initializes the L2 cache (hh_cache object) of the specified neighbour based on the specified routing cache entry.

#### void __init arp_init(void)

This method performs the setup for the ARP protocol: it initializes the ARP table, registers the arp_rcv() method as the handler for receiving ARP packets, initializes procfs entries, registers sysctl entries, and registers the ARP netdev notifier callback, arp_netdev_event().

#### int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)

This method is the Rx handler for ARP packets (Ethernet packets with type 0x0806).
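For reference, the registration that makes arp_rcv() the ARP Rx handler is a packet_type object; this is a slightly abridged view of net/ipv4/arp.c from the kernels discussed in this book, and it parallels the ipv6_packet_type registration shown in the next chapter:

```c
static struct packet_type arp_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_ARP),	/* 0x0806 */
	.func = arp_rcv,
};
```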
#### int arp_constructor(struct neighbour *neigh)

This method performs ARP neighbour initialization.

#### int arp_process(struct sk_buff *skb)

This method, invoked by the arp_rcv() method, handles the main processing of ARP requests and ARP responses.

#### void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)

This method sends the solicitation request (ARPOP_REQUEST), after some checks and initializations, by calling the arp_send() method.

#### void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw)

This method creates an ARP packet and initializes it with the specified parameters, by calling the arp_create() method, and sends it by calling the arp_xmit() method.

#### void arp_xmit(struct sk_buff *skb)

This method actually sends the packet, by calling the NF_HOOK macro with dev_queue_xmit().

#### struct arphdr *arp_hdr(const struct sk_buff *skb)

This method fetches the ARP header of the specified SKB.

#### int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)

This method translates an IPv4 address to an L2 (link-layer) address according to the network device type. When the device is an Ethernet device, for example, this is done with the ip_eth_mc_map() method; when the device is an Infiniband device, this is done with the ip_ib_mc_map() method.

#### static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt)

This method returns 1 if the specified device can use proxy ARP for the specified routing entry.

#### static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip)

This method returns 1 if the specified device can use proxy ARP VLAN for the specified routing entry and the specified IPv4 source and destination addresses.

#### int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)

This method is the ARP handler for netdev notification events.

#### int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)

This method is the NDISC handler for netdev notification events.

#### int ndisc_rcv(struct sk_buff *skb)

This method is the main NDISC handler for receiving one of the five types of neighbour discovery messages.

#### static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)

This method discards the packet and returns the -ENETDOWN error (network is down).

#### static void ndisc_recv_ns(struct sk_buff *skb) and static void ndisc_recv_na(struct sk_buff *skb)

These methods handle receiving Neighbour Solicitations and Neighbour Advertisements, respectively.

#### static void ndisc_recv_rs(struct sk_buff *skb) and static void ndisc_router_discovery(struct sk_buff *skb)

These methods handle receiving Router Solicitations and Router Advertisements, respectively.

#### int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)

This method translates an IPv6 address to an L2 (link-layer) address according to the network device type. In Ethernet under IPv6, this is done by the ipv6_eth_mc_map() method.

#### int ndisc_constructor(struct neighbour *neigh)

This method performs NDISC neighbour initialization.
#### void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)

This method sends the solicitation request, after some checks and initializations, by calling the ndisc_send_ns() method.

#### int icmpv6_rcv(struct sk_buff *skb)

This method is the handler for receiving ICMPv6 messages.

#### bool ipv6_addr_any(const struct in6_addr *a)

This method returns 1 when the given IPv6 address is the unspecified address of all zeroes (IPV6_ADDR_ANY).

#### int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)

This method checks whether the two specified addresses are on the same subnet.

### Macros

Now, let's look at the macros.

#### IN_DEV_PROXY_ARP(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/proxy_arp is set or if /proc/sys/net/ipv4/conf/all/proxy_arp is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_PROXY_ARP_PVLAN(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/proxy_arp_pvlan is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARPFILTER(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/arp_filter is set or if /proc/sys/net/ipv4/conf/all/arp_filter is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_ACCEPT(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/arp_accept is set or if /proc/sys/net/ipv4/conf/all/arp_accept is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_ANNOUNCE(in_dev)

This macro returns the max value of /proc/sys/net/ipv4/conf/<netDevice>/arp_announce and /proc/sys/net/ipv4/conf/all/arp_announce, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_IGNORE(in_dev)

This macro returns the max value of /proc/sys/net/ipv4/conf/<netDevice>/arp_ignore and /proc/sys/net/ipv4/conf/all/arp_ignore, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ARP_NOTIFY(in_dev)

This macro returns the max value of /proc/sys/net/ipv4/conf/<netDevice>/arp_notify and /proc/sys/net/ipv4/conf/all/arp_notify, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_SHARED_MEDIA(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/shared_media is set or if /proc/sys/net/ipv4/conf/all/shared_media is set, where netDevice is the network device associated with the specified in_dev.

#### IN_DEV_ROUTE_LOCALNET(in_dev)

This macro returns true if /proc/sys/net/ipv4/conf/<netDevice>/route_localnet is set or if /proc/sys/net/ipv4/conf/all/route_localnet is set, where netDevice is the network device associated with the specified in_dev.

#### neigh_hold()

This macro increments the reference count of the specified neighbour.

### The neigh_statistics Structure

The neigh_statistics structure is important for monitoring the neighbouring subsystem; as mentioned in the beginning of the chapter, both ARP and NDISC export this structure's members via procfs (/proc/net/stat/arp_cache and /proc/net/stat/ndisc_cache, respectively).
Following is the definition of the structure:

```c
struct neigh_statistics {
	unsigned long allocs;		/* number of allocated neighs */
	unsigned long destroys;		/* number of destroyed neighs */
	unsigned long hash_grows;	/* number of hash resizes */

	unsigned long res_failed;	/* number of failed resolutions */

	unsigned long lookups;		/* number of lookups */
	unsigned long hits;		/* number of hits (among lookups) */

	unsigned long rcv_probes_mcast;	/* number of received mcast ipv6 */
	unsigned long rcv_probes_ucast;	/* number of received ucast ipv6 */

	unsigned long periodic_gc_runs;	/* number of periodic GC runs */
	unsigned long forced_gc_runs;	/* number of forced GC runs */

	unsigned long unres_discards;	/* number of unresolved drops */
};
```

Here is a description of the members of the neigh_statistics structure, pointing out where each is incremented:

 * allocs: The number of allocated neighbours; incremented by the neigh_alloc() method.

 * destroys: The number of destroyed neighbours; incremented by the neigh_destroy() method.

 * hash_grows: The number of times a hash resize was done; incremented by the neigh_hash_grow() method.

 * res_failed: The number of failed resolutions; incremented by the neigh_invalidate() method.

 * lookups: The number of neighbour lookups that were done; incremented by the neigh_lookup() method and by the neigh_lookup_nodev() method.

 * hits: The number of hits when performing a neighbour lookup; incremented by the neigh_lookup() method and by the neigh_lookup_nodev() method when there is a hit.

 * rcv_probes_mcast: The number of received multicast probes (IPv6 only); incremented by the ndisc_recv_ns() method.

 * rcv_probes_ucast: The number of received unicast probes (IPv6 only); incremented by the ndisc_recv_ns() method.

 * periodic_gc_runs: The number of periodic GC invocations; incremented by the neigh_periodic_work() method.

 * forced_gc_runs: The number of forced GC invocations; incremented by the neigh_forced_gc() method.

 * unres_discards: The number of unresolved drops; incremented by the __neigh_event_send() method when an unresolved packet is discarded.

### Table

Here is the table that was covered in this chapter.

Table 7-1. Neighbour Unreachability Detection States

| Linux Symbol | Meaning |
|---|---|
| NUD_INCOMPLETE | Address resolution is in progress and the link-layer address of the neighbour has not yet been determined. This means that a solicitation request was sent, and you are waiting for a solicitation reply or a timeout. |
| NUD_REACHABLE | The neighbour is known to have been reachable recently. |
| NUD_STALE | More than ReachableTime milliseconds have elapsed since the last positive confirmation that the forward path was functioning properly. |
| NUD_DELAY | The neighbour is no longer known to be reachable. Probes are delayed for a short while in order to give upper-layer protocols a chance to provide reachability confirmation. |
| NUD_PROBE | The neighbour is no longer known to be reachable, and unicast Neighbour Solicitation probes are being sent to verify reachability. |
| NUD_FAILED | The neighbour is considered unreachable. When you delete a neighbour, it is set to the NUD_FAILED state. |

# 8. IPv6
In Chapter 7, I dealt with the Linux Neighbouring Subsystem and its implementation. In this chapter, I will discuss the IPv6 protocol and its implementation in Linux. IPv6 is the next-generation network layer protocol of the TCP/IP protocol stack. It was developed by the Internet Engineering Task Force (IETF), and it is intended to replace IPv4, which still carries the vast majority of Internet traffic.

In the early '90s, the IETF started an effort to develop the next generation of the IP protocol, due to the anticipated Internet growth. The first IPv6 RFC is from 1995: RFC 1883, "Internet Protocol, Version 6 (IPv6) Specification." Later, in 1998, RFC 2460 replaced it. The main problem IPv6 solves is the shortage of addresses: the length of an IPv6 address is 128 bits, so instead of 2^32 addresses in IPv4, we have 2^128 addresses in IPv6. This enlarges the address space significantly, probably far more than will be needed in the next few decades. But the extended address space is not the only advantage of IPv6, as some might think. Based on the experience gained with IPv4, many changes were made in IPv6 to improve the IP protocol. We will discuss many of these changes in this chapter.

The IPv6 protocol is now gaining momentum as an improved network layer protocol. The growing popularity of the Internet all over the globe, and the growing markets for smart mobile devices and tablets, make the exhaustion of IPv4 addresses an ever more evident problem. This gives rise to the need to transition to the IPv4 successor, the IPv6 protocol.

## IPv6 – Short Introduction

The IPv6 subsystem is undoubtedly a very broad subject, and it is growing steadily. Exciting features were added during the last decade. Some of these new features are based on IPv4, like ICMPv6 sockets, IPv6 Multicast Routing, and IPv6 NAT. IPsec is mandatory in IPv6 and optional in IPv4, though most operating systems also implement IPsec in IPv4. When we delve into the IPv6 kernel internals, we find many similarities; sometimes the names of the methods and even the names of some of the variables are similar, except for the addition of "v6" or "6". There are, however, some changes in the implementation in some places.

In this chapter, we discuss the important new features of IPv6, show some places where it differs from IPv4, and explain why a change was made. The extension headers, the Multicast Listener Discovery (MLD) protocol, and the Autoconfiguration process are some of the new features that we discuss and demonstrate with some userspace examples. We also discuss how receiving IPv6 packets works, how IPv6 forwarding works, and some points of difference when comparing them to IPv4. On the whole, it seems that the developers of IPv6 made a lot of improvements based on past experience with IPv4, and the IPv6 implementation brings a lot of benefits and advantages over IPv4. We will discuss IPv6 addresses in the following section, including multicast addresses and special addresses.
## IPv6 Addresses

The first step in learning IPv6 is to become familiar with the IPv6 Addressing Architecture, which is defined in RFC 4291. There are three types of IPv6 addresses:

 * Unicast: This address uniquely identifies an interface. A packet sent to a unicast address is delivered to the interface identified by that address.

 * Anycast: This address can be assigned to a set of interfaces (usually on different nodes). This type of address does not exist in IPv4. It is, in fact, a mixture of a unicast address and a multicast address: a packet sent to an anycast address is delivered to one of the interfaces identified by that address (the "nearest" one, according to the routing protocols).

 * Multicast: This address can be assigned to a set of interfaces (usually on different nodes). A packet sent to a multicast address is delivered to all the interfaces identified by that address. An interface can belong to any number of multicast groups.

There is no broadcast address in IPv6. To get the same result as a broadcast in IPv6, you can send a packet to the all-nodes group multicast address (ff02::1). In IPv4, a large part of the functionality of the Address Resolution Protocol (ARP) is based on broadcasts. The IPv6 subsystem uses neighbour discovery instead of ARP to map L3 addresses to L2 addresses. The IPv6 neighbour discovery protocol is based on ICMPv6, and it uses multicast addresses instead of broadcasts, as you saw in the previous chapter. You will see more examples of using multicast traffic later in this chapter.

An IPv6 address consists of 8 blocks of 16 bits, which is 128 bits in total. An IPv6 address looks like this: xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx (where each x is a hexadecimal digit). Sometimes you will encounter "::" inside an IPv6 address; this is shorthand for one or more consecutive groups of zeroes.

In IPv6, address prefixes are used. Prefixes are, in fact, the parallel of IPv4 subnet masks. IPv6 prefixes are described in RFC 4291, "IP Version 6 Addressing Architecture." An IPv6 address prefix is represented by the following notation: ipv6-address/prefix-length.

The prefix-length is a decimal value specifying how many of the leftmost contiguous bits of the address comprise the prefix. We use "/n" to denote a prefix n bits long. For example, for all IPv6 addresses that begin with the 32 bits 2001:0da7, the following prefix is used: 2001:da7::/32.

Now that you have learned about the types of IPv6 addresses, you will learn in the following section about some special IPv6 addresses and their usage.

### Special Addresses

In this section, I describe some special IPv6 addresses and their usage. It is recommended that you become familiar with these special addresses, because you will encounter some of them later in this chapter (like the unspecified address of all zeroes that is used in DAD, or Duplicate Address Detection) and while browsing the code. The following list contains special IPv6 addresses and explanations about their usage:

 * There should be at least one link-local unicast address on each interface. The link-local address allows communication with other nodes in the same physical network; it is required for neighbour discovery, automatic address configuration, and more. Routers must not forward any packets with link-local source or destination addresses. Link-local addresses are assigned with the prefix fe80::/64.
 * The Global Unicast Address general format is as follows: the first n bits are the global routing prefix, the next m bits are the subnet ID, and the remaining 128-n-m bits are the interface ID.

    * global routing prefix: A value assigned to a site. It represents the network ID or prefix of the address.

    * subnet ID: An identifier of a subnet within the site.

    * interface ID: An identifier whose value must be unique within the subnet. This is defined in RFC 3513, section 2.5.1.

The Global Unicast Address is described in RFC 3587, "IPv6 Global Unicast Address Format." The assignable Global Unicast Address space is defined in RFC 4291.

 * The IPv6 loopback address is 0:0:0:0:0:0:0:1, or ::1 in short notation.

 * The address of all zeroes (0:0:0:0:0:0:0:0) is called the unspecified address. It is used in DAD (Duplicate Address Detection), as you saw in the previous chapter. It should not be used as a destination address. You cannot assign the unspecified address to an interface by using userspace tools like the ip command or the ifconfig command.

 * IPv4-mapped IPv6 addresses are addresses that start with 80 bits of zero. The next 16 bits are all ones, and the remaining 32 bits are the IPv4 address. For example, ::ffff:192.0.2.128 represents the IPv4 address 192.0.2.128. For usage of these addresses, see RFC 4038, "Application Aspects of IPv6 Transition."

 * The IPv4-compatible format is deprecated; in this format, the IPv4 address is in the lower 32 bits of the IPv6 address and all remaining bits are 0. The address mentioned earlier would be ::192.0.2.128 in this format. See RFC 4291, section 2.5.5.1.

 * Site-local addresses were originally designed to be used for addressing inside a site without the need for a global prefix, but they were deprecated in RFC 3879, "Deprecating Site Local Addresses," in 2004.

An IPv6 address is represented in Linux by the in6_addr structure; using a union with three arrays (with 8-, 16-, and 32-bit elements) in the in6_addr structure helps in bit-manipulation operations:

```c
struct in6_addr {
	union {
		__u8	u6_addr8[16];
		__be16	u6_addr16[8];
		__be32	u6_addr32[4];
	} in6_u;
#define s6_addr		in6_u.u6_addr8
#define s6_addr16	in6_u.u6_addr16
#define s6_addr32	in6_u.u6_addr32
};
```
(include/uapi/linux/in6.h)

Multicast plays an important role in IPv6, especially for ICMPv6-based protocols like NDISC (which I discussed in Chapter 7, which dealt with the Linux Neighbouring Subsystem) and MLD (which is discussed later in this chapter). I will discuss multicast addresses in IPv6 in the next section.

### Multicast Addresses

Multicast addresses provide a way to define a multicast group; a node can belong to one or more multicast groups. Packets whose destination is a multicast address should be delivered to every node that belongs to that multicast group. In IPv6, all multicast addresses start with FF (the first 8 bits). Following that are 4 bits of flags and 4 bits of scope. Finally, the last 112 bits are the group ID. The 4 bits of the flags field have this meaning:

 * Bit 0: Reserved for future use.

 * Bit 1: A value of 1 indicates that a Rendezvous Point is embedded in the address. Discussion of Rendezvous Points is more related to userspace daemons and is not within the scope of this book. For more details, see RFC 3956, "Embedding the Rendezvous Point (RP) Address in an IPv6 Multicast Address." This bit is sometimes referred to as the R-flag (R for Rendezvous Point).
 * Bit 2: A value of 1 indicates a multicast address that is assigned based on the network prefix. (See RFC 3306.) This bit is sometimes referred to as the P-flag (P for Prefix information).

 * Bit 3: A value of 0 indicates a permanently-assigned ("well-known") multicast address, assigned by the Internet Assigned Numbers Authority (IANA). A value of 1 indicates a non-permanently-assigned ("transient") multicast address. This bit is sometimes referred to as the T-flag (T for Temporary).

The scope can be one of the entries in Table 8-1, which shows the various IPv6 scopes by their value and their Linux symbol.

Table 8-1. IPv6 Scopes

| Hex Value | Description | Linux Symbol |
|---|---|---|
| 0x01 | node local | IPV6_ADDR_SCOPE_NODELOCAL |
| 0x02 | link local | IPV6_ADDR_SCOPE_LINKLOCAL |
| 0x05 | site local | IPV6_ADDR_SCOPE_SITELOCAL |
| 0x08 | organization | IPV6_ADDR_SCOPE_ORGLOCAL |
| 0x0e | global | IPV6_ADDR_SCOPE_GLOBAL |

Now that you've learned about IPv6 multicast addresses, you will learn about some special multicast addresses in the next section.

#### Special Multicast Addresses

There are some special multicast addresses that I will mention in this chapter. Section 2.7.1 of RFC 4291 defines these special multicast addresses:

 * All Nodes Multicast Address group: ff01::1, ff02::1

 * All Routers Multicast Address group: ff01::2, ff02::2, ff05::2

RFC 3810 defines another special address: the All MLDv2-capable Routers Multicast Group, which is ff02::16. Version 2 Multicast Listener Reports are sent to this special address; I will discuss it in the "Multicast Listener Discovery (MLD)" section later in this chapter.

A node is required to compute and join (on the appropriate interface) the associated Solicited-Node multicast addresses for all unicast and anycast addresses that have been configured for the node's interfaces (manually or automatically). Solicited-Node multicast addresses are computed based on the node's unicast and anycast addresses: a Solicited-Node multicast address is formed by taking the low-order 24 bits of an address (unicast or anycast) and appending those bits to the prefix ff02:0:0:0:0:1:ff00::/104, resulting in a multicast address in the range ff02:0:0:0:0:1:ff00:0000 to ff02:0:0:0:0:1:ffff:ffff. See RFC 4291.

The method addrconf_addr_solict_mult() computes a link-local, solicited-node multicast address (include/net/addrconf.h). The method addrconf_join_solict() joins a solicited-node address multicast group (net/ipv6/addrconf.c).

In the previous chapter, you saw that a neighbour advertisement message is sent by the ndisc_send_na() method to the link-local all-nodes address (ff02::1). You will see more examples of using special addresses, like the all-nodes multicast group address or the all-routers multicast group address, in later sections of this chapter. In this section, you have seen some multicast addresses, which you will encounter later in this chapter and while browsing the IPv6 source code. I will now discuss the IPv6 header in the following section.

## IPv6 Header

Each IPv6 packet starts with an IPv6 header, and it is important to learn about its structure in order to fully understand the IPv6 Linux implementation. The IPv6 header has a fixed length of 40 bytes; for this reason, there is no field specifying the IPv6 header length (as opposed to IPv4, where the ihl member of the IPv4 header represents the header length).
Note that there is also no checksum field in the IPv6 header, which will be explained later in this chapter. In IPv6, there is no IP options mechanism as in IPv4; the IP options processing mechanism in IPv4 has a performance cost. Instead, IPv6 has a much more efficient mechanism of extension headers, which are discussed in the next section, "Extension Headers." Figure 8-1 shows the IPv6 header and its fields.

Figure 8-1. IPv6 header

Note that in the original IPv6 standard, RFC 2460, the priority (Traffic Class) field is 8 bits and the flow label is 20 bits. In the definition of the ipv6hdr structure, the priority (Traffic Class) field size is 4 bits; in fact, in the Linux IPv6 implementation, the first 4 bits of flow_lbl are glued to the priority (Traffic Class) field in order to form a "class." Figure 8-1 reflects the Linux definition of the ipv6hdr structure, which is shown here:

```c
struct ipv6hdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8			priority:4,
				version:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u8			version:4,
				priority:4;
#else
#error	"Please fix <asm/byteorder.h>"
#endif
	__u8			flow_lbl[3];

	__be16			payload_len;
	__u8			nexthdr;
	__u8			hop_limit;

	struct in6_addr		saddr;
	struct in6_addr		daddr;
};
```
(include/uapi/linux/ipv6.h)

The following is a description of the members of the ipv6hdr structure:

 * version: A 4-bit field. It should be set to 6.

 * priority: Indicates the traffic class or priority of the IPv6 packet. RFC 2460, the base of IPv6, does not define specific traffic class or priority values.

 * flow_lbl: The flow label field was regarded as experimental when the base IPv6 standard was written (RFC 2460). It provides a way to label sequences of packets of a particular flow; this labeling can be used by upper layers for various purposes. RFC 6437, "IPv6 Flow Label Specification," from 2011, suggests using flow labeling to detect address spoofing.

 * payload_len: A 16-bit field. The size of the packet, without the IPv6 header, can be up to 65,535 bytes. I will discuss larger packets ("jumbo frames") in the next section, when presenting the Hop-by-Hop Options header.

 * nexthdr: When there are no extension headers, this is the upper-layer protocol number, like IPPROTO_UDP (17) for UDP or IPPROTO_TCP (6) for TCP. The list of available protocols is in include/uapi/linux/in.h. When extension headers are used, this is the type of the next header immediately following the IPv6 header. I will discuss extension headers in the next section.

 * hop_limit: A one-byte field. Every forwarding device decrements the hop_limit counter by one; when it reaches zero, an ICMPv6 message is sent back and the packet is discarded. This parallels the TTL member in the IPv4 header. See the ip6_forward() method in net/ipv6/ip6_output.c.

 * saddr: The IPv6 source address (128 bits).

 * daddr: The IPv6 destination address (128 bits). This is possibly not the final packet destination if a Routing header is used.

Note that, as opposed to the IPv4 header, there is no checksum in the IPv6 header. Checksumming is assumed to be assured by both Layer 2 and Layer 4. UDP in IPv4 permits a checksum of 0, indicating no checksum; UDP in IPv6 normally requires its own checksum. There are some special cases in IPv6 where a zero UDP checksum is allowed, for IPv6 UDP tunnels; see RFC 6935, "IPv6 and UDP Checksums for Tunneled Packets."
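To make the header layout concrete, here is a minimal userspace sketch (not from the book's sources) that hand-builds an ipv6hdr and demonstrates its fixed 40-byte size; it assumes the kernel UAPI header <linux/ipv6.h> and glibc's <endian.h> helpers are available, as on any recent Linux system. The daddr assignment also uses the byte-wise s6_addr view of the in6_addr union shown earlier:

```c
#include <stdio.h>
#include <string.h>
#include <endian.h>		/* htobe16()/be16toh() */
#include <linux/ipv6.h>		/* struct ipv6hdr (kernel UAPI header) */

int main(void)
{
	struct ipv6hdr hdr;

	/* Hand-build a header: version 6, 8-byte payload, UDP, hop limit 64 */
	memset(&hdr, 0, sizeof(hdr));
	hdr.version = 6;
	hdr.payload_len = htobe16(8);	/* network byte order, like on the wire */
	hdr.nexthdr = 17;		/* IPPROTO_UDP */
	hdr.hop_limit = 64;

	/* Destination ff02::1 (all-nodes), set via the byte view of in6_addr */
	hdr.daddr.s6_addr[0] = 0xff;
	hdr.daddr.s6_addr[1] = 0x02;
	hdr.daddr.s6_addr[15] = 0x01;

	printf("header size: %zu bytes\n", sizeof(hdr));	/* always 40 */
	printf("version=%u payload_len=%u nexthdr=%u hop_limit=%u\n",
	       hdr.version, be16toh(hdr.payload_len),
	       hdr.nexthdr, hdr.hop_limit);
	return 0;
}
```

The 16- and 32-bit views of the union (s6_addr16, s6_addr32) allow the same kind of manipulation one halfword or word at a time, which is why the kernel code you will see in this chapter frequently uses them.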
In Chapter 4, which deals with the IPv4 subsystem, you saw that when forwarding a packet, the ip_decrease_ttl() method is invoked. This method recomputes the checksum of the IPv4 header, because the value of the ttl was changed. In IPv6, there is no such need to recompute the checksum when forwarding a packet, because there is no checksum at all in the IPv6 header. This results in a performance improvement in software-based routers.

In this section, you have seen how the IPv6 header is built, and you saw some differences between the IPv4 header and the IPv6 header; for example, the IPv6 header has no checksum field and no header-length field. The next section discusses the IPv6 extension headers, which are the counterpart of IPv4 options.

## Extension Headers

The IPv4 header can include IP options, which can extend the IPv4 header from a minimum size of 20 bytes to 60 bytes. In IPv6, we have optional extension headers instead. With one exception (the Hop-by-Hop Options header), extension headers are not processed by any node along a packet's delivery path until the packet reaches its final destination; this improves the performance of the forwarding process significantly. The base IPv6 standard defines extension headers. An IPv6 packet can include zero or more extension headers, placed between the IPv6 header and the upper-layer header in a packet. The nexthdr field of the IPv6 header is the number of the next header immediately after the IPv6 header. These extension headers are chained: every extension header has a Next Header field, and in the last extension header, the Next Header field indicates the upper-layer protocol (such as TCP, UDP, or ICMPv6). Another advantage of extension headers is that adding new extension headers in the future is easy and does not require any changes in the IPv6 header.

Extension headers must be processed strictly in the order they appear in the packet. Each extension header should occur at most once, except for the Destination Options header, which should occur at most twice. (See more detail later in this section in the description of the Destination Options header.) The Hop-by-Hop Options header must appear immediately after the IPv6 header; all other extension headers can appear in any order. Section 4.1 of RFC 2460 ("Extension Header Order") states a recommended order in which extension headers should appear, but this is not mandatory. When an unknown Next Header number is encountered while processing a packet, an ICMPv6 "Parameter Problem" message with a code of "unknown Next Header" (ICMPV6_UNK_NEXTHDR) is sent back to the sender by calling the icmpv6_param_prob() method. A description of the available ICMPv6 "Parameter Problem" codes appears in Table 8-4 in the "Quick Reference" section at the end of this chapter.

Each extension header must be aligned on an 8-byte boundary. Extension headers of variable size have a Header Extension Length field, and they use padding, if needed, to ensure that they are aligned on an 8-byte boundary. The numbers of all Linux IPv6 extension headers and their Linux Kernel symbol representation are displayed in Table 8-2, "IPv6 extension headers," in the "Quick Reference" section at the end of this chapter.

A protocol handler is registered for each of the extension headers (except the Hop-by-Hop Options header) with the inet6_add_protocol() method.
The reason for not registering a protocol handler for the Hop-by-Hop Options header is that there is a special method for parsing it, the ipv6_parse_hopopts() method, which is invoked before calling the protocol handlers. (See the ipv6_rcv() method, net/ipv6/ip6_input.c.) As mentioned before, the Hop-by-Hop Options header must be the first one, immediately following the IPv6 header. This is, for example, how the protocol handler for the Fragment extension header is registered:

```c
static const struct inet6_protocol frag_protocol = {
	.handler	= ipv6_frag_rcv,
	.flags		= INET6_PROTO_NOPOLICY,
};

int __init ipv6_frag_init(void)
{
	int ret;

	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
	...
```
(net/ipv6/reassembly.c)

Here is a description of all the IPv6 extension headers:

 * Hop-by-Hop Options header: The Hop-by-Hop Options header must be processed on each node, and it must appear immediately after the IPv6 header. It is parsed by the ipv6_parse_hopopts() method (net/ipv6/exthdrs.c). It is used, for example, by the Multicast Listener Discovery protocol, as you will see in the "Multicast Listener Discovery (MLD)" section later in this chapter. The Hop-by-Hop Options header includes a variable-length option field, whose first byte is its type, which can be one of the following:

    * Router Alert (Linux Kernel symbol: IPV6_TLV_ROUTERALERT, value: 5). See RFC 6398, "IP Router Alert Considerations and Usage."

    * Jumbo (Linux Kernel symbol: IPV6_TLV_JUMBO, value: 194). The IPv6 packet payload normally can be up to 65,535 bytes long. With the jumbo option, it can be up to 2^32 bytes. See RFC 2675, "IPv6 Jumbograms."

    * Pad1 (Linux Kernel symbol: IPV6_TLV_PAD1, value: 0). The Pad1 option is used to insert one byte of padding. When more than one padding byte is needed, the PadN option (see next) should be used (and not multiple Pad1 options). See section 4.2 of RFC 2460.

    * PadN (Linux Kernel symbol: IPV6_TLV_PADN, value: 1). The PadN option is used to insert two or more octets of padding into the Options area of a header.

 * Routing Options header: This parallels the IPv4 Loose Source Record Route (IPOPT_LSRR) option, which is discussed in the "IP Options" section in Chapter 4. It provides the ability to specify one or more routers that should be visited along the packet's route to its final destination.

 * Fragment Options header: As opposed to IPv4, fragmentation in IPv6 can occur only on the host that sends the packet, not on any of the intermediate nodes. Fragmentation is implemented by the ip6_fragment() method, which is invoked from the ip6_finish_output() method. In the ip6_fragment() method, there is a slow path and a fast path, much the same as in IPv4 fragmentation. The implementation of IPv6 fragmentation is in net/ipv6/ip6_output.c, and the implementation of IPv6 defragmentation is in net/ipv6/reassembly.c.

 * Authentication Header: The Authentication Header (AH) provides data authentication, data integrity, and anti-replay protection. It is described in RFC 4302, "IP Authentication Header," which makes RFC 2402 obsolete.

 * Encapsulating Security Payload Options header: This header is described in RFC 4303, "IP Encapsulating Security Payload (ESP)," which makes RFC 2406 obsolete. Note: the Encapsulating Security Payload (ESP) protocol is discussed in Chapter 10, which covers the IPsec subsystem.
 * Destination Options header: The Destination Options header can appear twice in a packet: once before a Routing Options header and once after it. When it appears before the Routing Options header, it includes information that should be processed by the routers specified by the Routing Options header. When it appears after the Routing Options header, it includes information that should be processed by the final destination.

In the next section, you will see how the IPv6 protocol handler, which is the ipv6_rcv() method, is associated with IPv6 packets.

## IPv6 Initialization

The inet6_init() method performs various IPv6 initializations (like procfs initializations and registration of protocol handlers for TCPv6, UDPv6, and other protocols), initialization of IPv6 subsystems (like IPv6 neighbour discovery, IPv6 Multicast Routing, and the IPv6 routing subsystem), and more. For more details, look in net/ipv6/af_inet6.c. The ipv6_rcv() method is registered as the protocol handler for IPv6 packets by defining a packet_type object for IPv6 and registering it with the dev_add_pack() method, quite similarly to what is done in IPv4:

```c
static struct packet_type ipv6_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IPV6),
	.func = ipv6_rcv,
};

static int __init ipv6_packet_init(void)
{
	dev_add_pack(&ipv6_packet_type);
	return 0;
}
```
(net/ipv6/af_inet6.c)

As a result of the registration just shown, each Ethernet packet whose ethertype is ETH_P_IPV6 (0x86DD) will be handled by the ipv6_rcv() method. Next, I will discuss the IPv6 Autoconfiguration mechanism for setting IPv6 addresses.

## Autoconfiguration

Autoconfiguration is a mechanism that allows a host to obtain or create a unique address for each of its interfaces. The IPv6 autoconfiguration process is initiated at system startup: nodes (both hosts and routers) generate a link-local address for their interfaces. This address is regarded as "tentative" (the interface flag IFA_F_TENTATIVE is set); this means that the interface can communicate only with neighbour discovery messages. It must be verified that this address is not already in use by another node on the link. This is done with the DAD (Duplicate Address Detection) mechanism, which was described in the previous chapter, which deals with the Linux Neighbouring Subsystem. If the address is not unique, the autoconfiguration process stops and manual configuration is needed. If the address is unique, the autoconfiguration process continues. The next phase of autoconfiguration on hosts involves sending one or more Router Solicitations to the all-routers multicast group address (ff02::2). This is done by calling the ndisc_send_rs() method from the addrconf_dad_completed() method. Routers reply with a Router Advertisement message, which is sent to the all-hosts address, ff02::1. Both the Router Solicitation and the Router Advertisement use the Neighbour Discovery Protocol via ICMPv6 messages; the Router Solicitation ICMPv6 type is NDISC_ROUTER_SOLICITATION (133), and the Router Advertisement ICMPv6 type is NDISC_ROUTER_ADVERTISEMENT (134).

The radvd daemon is an example of an open source Router Advertisement daemon that is used for stateless autoconfiguration (http://www.litech.org/radvd/). You can set a prefix in the radvd configuration file, which will be sent in Router Advertisement messages; a sketch of such a configuration follows. The radvd daemon sends Router Advertisements periodically.
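For illustration, a minimal radvd.conf along these lines could look as follows. The interface name, prefix, and lifetime values here are made-up; consult the radvd.conf(5) man page for the authoritative option list:

```
interface eth0
{
	AdvSendAdvert on;
	prefix 2001:db8:1::/64
	{
		AdvOnLink on;
		AdvAutonomous on;
		AdvPreferredLifetime 3600;
		AdvValidLifetime 7200;
	};
};
```

With a configuration like this, radvd advertises the 64-bit prefix 2001:db8:1::/64, and hosts on the link derive their addresses from it, as described next; the two lifetime options correspond to the preferred and valid lifetimes discussed below.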
Apart from sending periodic Router Advertisements, radvd also listens to Router Solicitation (RS) requests and answers them with Router Advertisement (RA) reply messages. These Router Advertisement (RA) messages include a prefix field, which plays an important role in the autoconfiguration process, as you will immediately see. The prefix must be 64 bits long. When a host receives a Router Advertisement (RA) message, it configures its IP address based on this prefix and its own MAC address. If the Privacy Extensions feature (CONFIG_IPV6_PRIVACY) is set, an element of randomness is also added to the IPv6 address creation. The Privacy Extensions mechanism prevents deriving details about the identity of a machine from its IPv6 address (which is normally generated from its MAC address and a prefix) by adding randomness, as mentioned. For more details on Privacy Extensions, see RFC 4941, "Privacy Extensions for Stateless Address Autoconfiguration in IPv6."

When a host receives a Router Advertisement message, it can automatically configure its address and some other parameters. It can also choose a default router based on these advertisements. It is also possible to set a preferred lifetime and a valid lifetime for the addresses that are configured automatically on the hosts. The preferred lifetime value specifies the length of time, in seconds, that an address generated from the prefix via stateless address autoconfiguration remains in a preferred state; when the preferred lifetime expires, the address becomes deprecated and should not be used for new communication. The valid lifetime value specifies the length of time, in seconds, that the address is valid (that is, applications already using it can keep using it); when this time is over, the address is removed. The preferred lifetime and the valid lifetime are represented in the kernel by the prefered_lft and the valid_lft fields of the inet6_ifaddr object, respectively (include/net/if_inet6.h).

Renumbering is the process of replacing an old prefix with a new prefix, and changing the IPv6 addresses of hosts according to the new prefix. Renumbering can be done quite easily with radvd, by adding a new prefix to its configuration settings, setting a preferred lifetime and a valid lifetime, and restarting the radvd daemon. See also RFC 4192, "Procedures for Renumbering an IPv6 Network without a Flag Day," and RFCs 5887, 6866, and 6879.

The Dynamic Host Configuration Protocol version 6 (DHCPv6) is an example of stateful address configuration; in the stateful autoconfiguration model, hosts obtain interface addresses and/or configuration information and parameters from a server. Servers maintain a database that keeps track of which addresses have been assigned to which hosts. I will not delve into the details of the DHCPv6 protocol in this book. The DHCPv6 protocol is specified by RFC 3315, "Dynamic Host Configuration Protocol for IPv6 (DHCPv6)." The IPv6 Stateless Autoconfiguration standard is described in RFC 4862, "IPv6 Stateless Address Autoconfiguration."

You have learned in this section about the autoconfiguration process, and you saw how easy it is to replace an old prefix with a new one by configuring and restarting radvd. The next section discusses how the ipv6_rcv() method, which is the IPv6 protocol handler, handles the reception of IPv6 packets, in a somewhat similar way to what you saw in IPv4.
## Receiving IPv6 Packets

The main IPv6 receive method is the ipv6_rcv() method, which is the handler for all IPv6 packets (including multicasts; there are no broadcasts in IPv6, as mentioned before). There are many similarities between the Rx path in IPv4 and in IPv6. As in IPv4, we first make some sanity checks, like checking that the version of the IPv6 header is 6 and that the source address is not a multicast address (according to section 2.7 of RFC 4291, this is forbidden). If there is a Hop-by-Hop Options header, it must be the first one: if the value of the nexthdr field of the IPv6 header is 0, this indicates a Hop-by-Hop Options header, and it is parsed by calling the ipv6_parse_hopopts() method. The real work is done by the ip6_rcv_finish() method, which is invoked by calling the NF_HOOK() macro. If there is a netfilter callback registered at this point (NF_INET_PRE_ROUTING), it will be invoked. I will discuss netfilter hooks in the next chapter. Let's take a look at the ipv6_rcv() method:

```c
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
	     struct net_device *orig_dev)
{
	const struct ipv6hdr *hdr;
	u32 pkt_len;
	struct inet6_dev *idev;
```

Fetch the network namespace from the network device that is associated with the Socket Buffer (SKB):

```c
	struct net *net = dev_net(skb->dev);
	...
```

Fetch the IPv6 header from the SKB:

```c
	hdr = ipv6_hdr(skb);
```

Perform some sanity checks, and discard the SKB if necessary:

```c
	if (hdr->version != 6)
		goto err;

	/*
	 * RFC4291 2.5.3
	 * A packet received on an interface with a destination address
	 * of loopback must be dropped.
	 */
	if (!(dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_loopback(&hdr->daddr))
		goto err;
	...
	/*
	 * RFC4291 2.7
	 * Multicast addresses must not be used as source addresses in IPv6
	 * packets or appear in any Routing header.
	 */
	if (ipv6_addr_is_multicast(&hdr->saddr))
		goto err;
	...
	if (hdr->nexthdr == NEXTHDR_HOP) {
		if (ipv6_parse_hopopts(skb) < 0) {
			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
			rcu_read_unlock();
			return NET_RX_DROP;
		}
	}
	...
	return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip6_rcv_finish);
err:
	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NET_RX_DROP;
}
```
(net/ipv6/ip6_input.c)

The ip6_rcv_finish() method first performs a lookup in the routing subsystem, by calling the ip6_route_input() method, in case there is no dst attached to the SKB. The ip6_route_input() method eventually invokes the fib6_rule_lookup() method:

```c
int ip6_rcv_finish(struct sk_buff *skb)
{
	...
	if (!skb_dst(skb))
		ip6_route_input(skb);
```

Invoke the input callback of the dst attached to the SKB:

```c
	return dst_input(skb);
}
```
(net/ipv6/ip6_input.c)

Note

There are two different implementations of the fib6_rule_lookup() method: one when Policy Routing (CONFIG_IPV6_MULTIPLE_TABLES) is set, in net/ipv6/fib6_rules.c, and one when Policy Routing is not set, in net/ipv6/ip6_fib.c.

As you saw in Chapter 5, which dealt with advanced topics of the IPv4 routing subsystem, the lookup in the routing subsystem builds a dst object and sets its input and output callbacks; in IPv6, similar tasks are performed.
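The dst_input() call at the end of ip6_rcv_finish() is just a thin inline wrapper; in the kernels this chapter is based on, it is essentially the following (a simplified view of include/net/dst.h):

```c
static inline int dst_input(struct sk_buff *skb)
{
	/* Invoke the input callback that the routing lookup set on the dst */
	return skb_dst(skb)->input(skb);
}
```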
After the ip6_rcv_finish() method performs the lookup in the routing subsystem, it calls the dst_input() method, which invokes the input callback of the dst object that is associated with the packet.

Figure 8-2 shows the receive path (Rx) of a packet that is received by the network driver. This packet can either be delivered to the local machine or be forwarded to another host; it is the result of the lookup in the routing tables that determines which of these two options takes place.

Figure 8-2. Receiving IPv6 packets

Note

For simplicity, the diagram does not include fragmentation/defragmentation, parsing of extension headers, or the IPsec methods.

The lookup in the IPv6 routing subsystem sets the input callback of the destination cache (dst) to be:

 * ip6_input() when the packet is destined to the local machine.

 * ip6_forward() when the packet is to be forwarded.

 * ip6_mc_input() when the packet is destined to a multicast address.

 * ip6_pkt_discard() when the packet is to be discarded. The ip6_pkt_discard() method drops the packet and replies to the sender with a destination unreachable (ICMPV6_DEST_UNREACH) ICMPv6 message.

Incoming IPv6 packets can be locally delivered or forwarded; in the next section, you will learn about local delivery of IPv6 packets.

### Local Delivery

Let's look first at the local delivery case. The ip6_input() method is a very short method:

```c
int ip6_input(struct sk_buff *skb)
{
	return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip6_input_finish);
}
```
(net/ipv6/ip6_input.c)

If there is a netfilter hook registered at this point (NF_INET_LOCAL_IN), it will be invoked. Otherwise, we proceed to the ip6_input_finish() method:

```c
static int ip6_input_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb_dst(skb)->dev);
	const struct inet6_protocol *ipprot;
```

The inet6_dev structure (include/net/if_inet6.h) is the IPv6 parallel of the IPv4 in_device structure. It contains IPv6-related configuration, such as the network interface unicast address list (addr_list) and the network interface multicast address list (mc_list). This IPv6-related configuration can be set by the user with the ip command or with the ifconfig command.

```c
	struct inet6_dev *idev;
	unsigned int nhoff;
	int nexthdr;
	bool raw;

	/*
	 * Parse extension headers
	 */
	rcu_read_lock();
resubmit:
	idev = ip6_dst_idev(skb_dst(skb));
	if (!pskb_pull(skb, skb_transport_offset(skb)))
		goto discard;
	nhoff = IP6CB(skb)->nhoff;
```

Fetch the next header number from the SKB:

```c
	nexthdr = skb_network_header(skb)[nhoff];
```

First, in the case of a raw socket packet, we try to deliver it to a raw socket:

```c
	raw = raw6_local_deliver(skb, nexthdr);
```

Every extension header (except the Hop-by-Hop Options header) has a protocol handler that was registered by the inet6_add_protocol() method; this method in fact adds an entry to the global inet6_protos array (see net/ipv6/protocol.c):

```c
	if ((ipprot = rcu_dereference(inet6_protos[nexthdr])) != NULL) {
		int ret;

		if (ipprot->flags & INET6_PROTO_FINAL) {
			const struct ipv6hdr *hdr;

			/* Free reference early: we don't need it any more,
			   and it may hold ip_conntrack module loaded
			   indefinitely.
			 */
			nf_reset(skb);

			skb_postpull_rcsum(skb, skb_network_header(skb),
					   skb_network_header_len(skb));
			hdr = ipv6_hdr(skb);
```

RFC 3810, which is the MLDv2 specification, says: "Note that MLDv2 messages are not subject to source filtering and must always be processed by hosts and routers." We do not want to discard MLD multicast packets due to source filtering, since these MLD packets should always be processed, according to the RFC. Therefore, before discarding the packet, we make sure that if the destination address of the packet is a multicast address, the packet is not an MLD packet. This is done by calling the ipv6_is_mld() method before discarding it; if this method indicates that the packet is an MLD packet, it is not discarded. You can see more about this in the "Multicast Listener Discovery (MLD)" section later in this chapter.

```c
			if (ipv6_addr_is_multicast(&hdr->daddr) &&
			    !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
						 &hdr->saddr) &&
			    !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
				goto discard;
		}
```

When the INET6_PROTO_NOPOLICY flag is set, this indicates that there is no need to perform IPsec policy checks for this protocol:

```c
		if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto discard;

		ret = ipprot->handler(skb);
		if (ret > 0)
			goto resubmit;
		else if (ret == 0)
			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
	} else {
		if (!raw) {
			if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				IP6_INC_STATS_BH(net, idev,
						 IPSTATS_MIB_INUNKNOWNPROTOS);
				icmpv6_send(skb, ICMPV6_PARAMPROB,
					    ICMPV6_UNK_NEXTHDR, nhoff);
			}
			kfree_skb(skb);
		} else {
```

Everything went fine, so increment the INDELIVERS SNMP MIB counter (/proc/net/snmp6/Ip6InDelivers) and free the packet with the consume_skb() method:

```c
			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
			consume_skb(skb);
		}
	}
	rcu_read_unlock();
	return 0;

discard:
	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
	rcu_read_unlock();
	kfree_skb(skb);
	return 0;
}
```
(net/ipv6/ip6_input.c)

You have seen the implementation details of local delivery, which is performed by the ip6_input() and ip6_input_finish() methods. Now it is time to turn to the implementation details of forwarding in IPv6. Here, too, there are many similarities between forwarding in IPv4 and forwarding in IPv6.

### Forwarding

Forwarding in IPv6 is very similar to forwarding in IPv4, with some slight changes. For example, in IPv6 a checksum is not calculated when forwarding a packet (there is no checksum field at all in the IPv6 header, as was mentioned before). Let's take a look at the ip6_forward() method:

```c
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;
```

The IPv6 procfs forwarding entry (/proc/sys/net/ipv6/conf/all/forwarding) must be set:

```c
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;
```

When working with Large Receive Offload (LRO), the packet length will exceed the Maximum Transmission Unit (MTU).
As in IPv4, when LRO is enabled, the SKB is freed and an error of –EINVAL is returned: + +if (skb_warn_if_lro(skb)) + +goto drop; + +if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + +IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + +goto drop; + +} + +Drop packets that are not destined to go to the local host. The pkt_type associated with an SKB is determined according to the destination MAC address in the Ethernet header of an incoming packet. This is done by the eth_type_trans() method, which is typically called in the network device driver when handling an incoming packet. See the eth_type_trans() method, net/ethernet/eth.c. + +if (skb->pkt_type != PACKET_HOST) + +goto drop; + +skb_forward_csum(skb); + +/* + +* We DO NOT make any processing on + +* RA packets, pushing them to user level AS IS + +* without any WARRANTY that application will be able + +* to interpret them. The reason is that we + +* cannot make anything clever here. + +* + +* We are not end-node, so that if packet contains + +* AH/ESP, we cannot make anything. + +* Defragmentation also would be mistake, RA packets + +* cannot be fragmented, because there is no warranty + +* that different fragments will go along one path. --ANK + +*/ + +if (opt->ra) { + +u8 *ptr = skb_network_header(skb) + opt->ra; + +We should try to deliver the packet to sockets that had the IPV6_ROUTER_ALERT socket option set by setsockopt(). This is done by calling the ip6_call_ra_chain() method; if the delivery in ip6_call_ra_chain() succeeded, the ip6_forward() method returns 0 and the packet is not forwarded. See the implementation of the ip6_call_ra_chain() method in net/ipv6/ip6_output.c. + +if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + +return 0; + +} + +/* + +* check and decrement ttl + +*/ + +if (hdr->hop_limit <= 1) { + +/* Force OUTPUT device used as source address */ + +skb->dev = dst->dev; + +Send back an ICMP error message when the Hop Limit is 1 (or less), much like what we have in IPv4 when forwarding a packet and the TTL reaches 0. In this case, the packet is discarded: + +icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + +IP6_INC_STATS_BH(net, + +ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + +kfree_skb(skb); + +return -ETIMEDOUT; + +} + +/* XXX: idev->cnf.proxy_ndp? */ + +if (net->ipv6.devconf_all->proxy_ndp && + +pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + +int proxied = ip6_forward_proxy_check(skb); + +if (proxied > 0) + +return ip6_input(skb); + +else if (proxied < 0) { + +IP6_INC_STATS(net, ip6_dst_idev(dst), + +IPSTATS_MIB_INDISCARDS); + +goto drop; + +} + +} + +if (!xfrm6_route_forward(skb)) { + +IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + +goto drop; + +} + +dst = skb_dst(skb); + +/* IPv6 specs say nothing about it, but it is clear that we cannot + +send redirects to source routed frames. + +We don't send redirects to frames decapsulated from IPsec. + +*/ + +if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { + +struct in6_addr *target = NULL; + +struct inet_peer *peer; + +struct rt6_info *rt; + +/* + +* incoming and outgoing devices are the same + +* send a redirect. 
         */
        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_GATEWAY)
                target = &rt->rt6i_gateway;
        else
                target = &hdr->daddr;

        peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

        /* Limit redirects both by destination (here)
           and by source (inside ndisc_send_redirect)
         */
        if (inet_peer_xrlim_allow(peer, 1*HZ))
                ndisc_send_redirect(skb, target);
        if (peer)
                inet_putpeer(peer);
} else {
        int addrtype = ipv6_addr_type(&hdr->saddr);

        /* This check is security critical. */
        if (addrtype == IPV6_ADDR_ANY ||
            addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                goto error;
        if (addrtype & IPV6_ADDR_LINKLOCAL) {
                icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                            ICMPV6_NOT_NEIGHBOUR, 0);
                goto error;
        }
}

Note that IPV6_MIN_MTU is 1280 bytes, according to section 5, "Packet Size Issues," of the base IPv6 standard, RFC 2460.

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
            (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;

Reply back to the sender with an ICMPv6 message of "Packet Too Big," and free the SKB; the ip6_forward() method returns -EMSGSIZE in this case:

                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

The packet is to be forwarded, so decrement the hop_limit of the IPv6 header:

        /* Mangling hops number delayed to point after skb COW */
        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

(net/ipv6/ip6_output.c)

The ip6_forward_finish() method is a one-line method, which simply invokes the destination cache (dst) output callback:

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

(net/ipv6/ip6_output.c)

You have seen in this section how the reception of IPv6 packets is handled, either by local delivery or by forwarding. You have also seen some differences between receiving IPv6 packets and receiving IPv4 packets. In the next section, I will discuss the Rx path for multicast traffic.

## Receiving IPv6 Multicast Packets

The ipv6_rcv() method is the IPv6 handler for both unicast packets and multicast packets. As mentioned above, after some sanity checks, it invokes the ip6_rcv_finish() method, which performs a lookup in the routing subsystem by calling the ip6_route_input() method. In the ip6_route_input() method, the input callback is set to be the ip6_mc_input() method when a multicast packet is received.
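As a reminder of how this dispatch takes place, the dst_input() method mentioned at the beginning of this discussion is essentially the following one-liner (include/net/dst.h, slightly simplified):

/* Simplified sketch of dst_input(): the routing lookup installed
 * skb_dst(skb)->input, which is one of ip6_input(), ip6_forward(),
 * ip6_mc_input(), or ip6_pkt_discard().
 */
static inline int dst_input(struct sk_buff *skb)
{
        return skb_dst(skb)->input(skb);
}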
Let's take a look at the ip6_mc_input() method:

int ip6_mc_input(struct sk_buff *skb)
{
        const struct ipv6hdr *hdr;
        bool deliver;

        IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev),
                            ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
                            skb->len);

        hdr = ipv6_hdr(skb);

The ipv6_chk_mcast_addr() method (net/ipv6/mcast.c) checks whether the multicast address list (mc_list) of the specified network device contains the specified multicast address (which is the destination address in the IPv6 header in this case, hdr->daddr). Note that because the third parameter is NULL, we do not check in this invocation whether there are any source filters for the source address; handling source filtering is discussed later in this chapter.

        deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);

If the local machine is a multicast router (that is, CONFIG_IPV6_MROUTE is set), we proceed, after some checks, to the ip6_mr_input() method. The IPv6 multicast routing implementation is very similar to the IPv4 multicast routing implementation, which was discussed in Chapter 6, so I will not discuss it in this book. The IPv6 multicast routing implementation is in net/ipv6/ip6mr.c. Support for IPv6 Multicast Routing was added in kernel 2.6.26 (2008), based on a patch by Mickael Hoerdt.

#ifdef CONFIG_IPV6_MROUTE
        . . .
        if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
            !(ipv6_addr_type(&hdr->daddr) &
              (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
            likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
                /*
                 * Okay, we try to forward - split and duplicate
                 * packets.
                 */
                struct sk_buff *skb2;

                if (deliver)
                        skb2 = skb_clone(skb, GFP_ATOMIC);
                else {
                        skb2 = skb;
                        skb = NULL;
                }

                if (skb2) {

Continue to the IPv6 Multicast Routing code, via the ip6_mr_input() method (net/ipv6/ip6mr.c):

                        ip6_mr_input(skb2);
                }
        }
#endif

        if (likely(deliver))
                ip6_input(skb);
        else {
                /* discard */
                kfree_skb(skb);
        }

        return 0;
}

(net/ipv6/ip6_input.c)

When the multicast packet is not destined to be forwarded by multicast routing (for example, when CONFIG_IPV6_MROUTE is not set), we will continue to the ip6_input() method, which is in fact a wrapper around the ip6_input_finish() method, as you already saw. In the ip6_input_finish() method, we again call the ipv6_chk_mcast_addr() method, but this time the third parameter is not NULL; it is the source address from the IPv6 header. This time we do check in the ipv6_chk_mcast_addr() method whether source filtering is set, and we handle the packet accordingly. Source filtering is discussed in the "Multicast Source Filtering (MSF)" section later in this chapter. Next, I will describe the Multicast Listener Discovery protocol, which parallels the IPv4 IGMPv3 protocol.

## Multicast Listener Discovery (MLD)

The MLD protocol is used to exchange group information between multicast hosts and routers. The MLD protocol is an asymmetric protocol; it specifies different behavior for Multicast Routers and for Multicast Listeners. In IPv4, multicast group management is handled by the Internet Group Management Protocol (IGMP), as you saw in Chapter 6. In IPv6, multicast group management is handled by the MLDv2 protocol, which is specified in RFC 3810, from 2004. The MLDv2 protocol is derived from the IGMPv3 protocol, which is used by IPv4.
However, as opposed to the IGMPv3 protocol, MLDv2 is part of the ICMPv6 protocol, while IGMPv3 is a standalone protocol that does not use any of the ICMPv4 services; this is the main reason why the IGMPv3 protocol is not used in IPv6. Note that you might encounter the term GMP (Group Management Protocol), which is used to refer to both IGMP and MLD.

The earlier version of the Multicast Listener Discovery protocol is MLDv1, which is specified in RFC 2710; it is derived from IGMPv2. MLDv1 is based on the Any-Source Multicast (ASM) model; this means that you do not specify interest in receiving multicast traffic from a single source address or from a set of addresses. MLDv2 extends MLDv1 by adding support for Source Specific Multicast (SSM); this means the ability of a node to specify interest in including or excluding packets from specific unicast source addresses. This feature is referred to as source filtering. Later in this section, I will show a detailed userspace example of how to use source filtering. See more in RFC 4604, "Using Internet Group Management Protocol Version 3 (IGMPv3) and Multicast Listener Discovery Protocol Version 2 (MLDv2) for Source-Specific Multicast."

The MLDv2 protocol is based on Multicast Listener Reports and Multicast Listener Queries. An MLDv2 Router (which is also sometimes termed "Querier") periodically sends Multicast Listener Queries in order to learn about the state of multicast groups of nodes. If there are several MLDv2 Routers on the same link, only one of them is selected to be the Querier, and all the other routers are set to be in a Non-Querier state. This is done by a Querier Election mechanism, as described in section 7.6.2 of RFC 3810. Nodes respond to these queries with Multicast Listener Reports, in which they provide information about multicast groups to which they belong. When a listener wants to stop listening on some multicast group, it informs the Querier about it, and the Querier must query for other listeners of that multicast group address before deleting it from its Multicast Address Listener state. An MLDv2 router can provide state information about listeners to multicast routing protocols.

Now that you have learned generally what the MLD protocol is, I will turn your attention in the following section to how joining and leaving a multicast group is handled.

### Joining and Leaving a Multicast Group

There are two ways to join or leave a multicast group in IPv6. The first one is from within the kernel, by calling the ipv6_dev_mc_inc() method, which gets as parameters a network device object and a multicast group address. For example, when registering a network device, the ipv6_add_dev() method is invoked; each device should join the interface-local all-nodes multicast group (ff01::1) and the link-local all-nodes multicast group (ff02::1):

static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
        . . .
        /* Join interface-local all-node multicast group */
        ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);

        /* Join all-node multicast group */
        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
        . . .
}

(net/ipv6/addrconf.c)

Routers are devices that have their procfs forwarding entry, /proc/sys/net/ipv6/conf/all/forwarding, set. Routers join three multicast address groups, in addition to the two multicast groups that each host joins and that were mentioned earlier.
These are the link-local all-routers multicast group (ff02::2), the interface-local all-routers multicast group (ff01::2), and the site-local all-routers multicast group (ff05::2).

Note that setting the IPv6 procfs forwarding entry value is handled by the addrconf_fixup_forwarding() method, which eventually calls the dev_forward_change() method, which causes the specified network interface to join or leave these three multicast address groups according to the value of the procfs entry (which is represented by idev->cnf.forwarding, as you can see in the following code snippet):

static void dev_forward_change(struct inet6_dev *idev)
{
        struct net_device *dev;
        struct inet6_ifaddr *ifa;
        . . .
        dev = idev->dev;
        . . .
        if (dev->flags & IFF_MULTICAST) {
                if (idev->cnf.forwarding) {
                        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters);
                } else {
                        ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters);
                }
        }
        . . .
}

(net/ipv6/addrconf.c)

To leave a multicast group from within the kernel, you should call the ipv6_dev_mc_dec() method. The second way of joining a multicast group is by opening an IPv6 socket in userspace, creating a multicast request (ipv6_mreq object), setting the ipv6mr_multiaddr of the request to be the multicast group address that this host wants to join, and setting the ipv6mr_interface to the ifindex of the network interface it wants to use. Then it should call setsockopt() with the IPV6_JOIN_GROUP socket option:

int sockd;
struct ipv6_mreq mcgroup;
struct addrinfo *results;
. . .
/* read an IPv6 multicast group address that we want to join */
/* into the address info object (results) */
. . .

Set the network interface that we want to use (by its ifindex value):

mcgroup.ipv6mr_interface = 3;

Set the multicast group address for the group that we want to join in the request (ipv6mr_multiaddr):

memcpy(&(mcgroup.ipv6mr_multiaddr),
       &(((struct sockaddr_in6 *) results->ai_addr)->sin6_addr),
       sizeof(struct in6_addr));

sockd = socket(AF_INET6, SOCK_DGRAM, 0);

Call setsockopt() with IPV6_JOIN_GROUP to join the multicast group; this call is handled in the kernel by the ipv6_sock_mc_join() method (net/ipv6/mcast.c):

status = setsockopt(sockd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
                    &mcgroup, sizeof(mcgroup));
. . .

The IPV6_ADD_MEMBERSHIP socket option can be used instead of IPV6_JOIN_GROUP. (They are equivalent.) Note that we can set the same multicast group address on more than one network device by setting different values of network interfaces to mcgroup.ipv6mr_interface. The value of mcgroup.ipv6mr_interface is passed as the ifindex parameter to the ipv6_sock_mc_join() method. In such a case, the kernel builds and sends an MLDv2 Multicast Listener Report packet (ICMPV6_MLD2_REPORT), where the destination address is ff02::16 (the all MLDv2-capable routers Multicast Group Address). According to section 5.2.14 in RFC 3810, all MLDv2-capable multicast routers should listen to this multicast address. The number of Multicast Address Records in the MLDv2 header (shown in Figure 8-3) will be 1, because only one Multicast Address Record is used, containing the address of the multicast group that we want to join.
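For convenience, here is a complete, minimal version of the userspace join example above. It is a sketch only: the multicast group address (ff15::abcd) and the interface name (eth0) are example values that you should adapt to your setup.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>

int main(void)
{
        struct ipv6_mreq mcgroup;
        int sockd;

        memset(&mcgroup, 0, sizeof(mcgroup));
        /* Example group address and interface name; adapt as needed.
           if_nametoindex() returns 0 if the interface does not exist. */
        if (inet_pton(AF_INET6, "ff15::abcd", &mcgroup.ipv6mr_multiaddr) != 1)
                return 1;
        mcgroup.ipv6mr_interface = if_nametoindex("eth0");

        sockd = socket(AF_INET6, SOCK_DGRAM, 0);
        if (sockd < 0) {
                perror("socket");
                return 1;
        }

        /* This call triggers the MLDv2 Multicast Listener Report
           described in the text */
        if (setsockopt(sockd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
                       &mcgroup, sizeof(mcgroup)) < 0) {
                perror("setsockopt(IPV6_JOIN_GROUP)");
                return 1;
        }

        sleep(60); /* stay joined for a minute */

        /* Closing the socket (or IPV6_LEAVE_GROUP) leaves the group */
        close(sockd);
        return 0;
}

While this program sleeps, you can observe the MLDv2 report sent to ff02::16 with a sniffer such as tcpdump.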
The multicast group address that a host wants to join is part of the ICMPv6 header. The Hop-by-Hop Options header with Router Alert is set in this packet. MLD packets contain a Hop-by-Hop Options header, which in turn contains a Router Alert options header; the next header of the Hop-by-Hop extension header is IPPROTO_ICMPV6 (58), because following the Hop-by-Hop header is the ICMPv6 packet, which contains the MLDv2 message. + +Figure 8-3. + +MLDv2 Multicast Listener Report + +A host can leave a multicast group by calling setsockopt() with the IPV6_DROP_MEMBERSHIP socket option, which is handled in the kernel by calling the ipv6_sock_mc_drop() method or by closing the socket. Note that IPV6_LEAVE_GROUP is equivalent to IPV6_DROP_MEMBERSHIP. + +After talking about how joining and leaving a multicast group is handled, it is time to see what an MLDv2 Multicast Listener Report is. + +### MLDv2 Multicast Listener Report + +The MLDv2 Multicast Listener Report is represented in the kernel by the mld2_report structure: + +struct mld2_report { + +struct icmp6hdr mld2r_hdr; + +struct mld2_grec mld2r_grec[0]; + +}; + +(include/net/mld.h) + +The first member of the mld2_report structure is the mld2r_hdr, which is an ICMPv6 header; its icmp6_type should be set to ICMPV6_MLD2_REPORT (143). The second member of the mld2_report structure is the mld2r_grec[0], an instance of the mld2_grec structure, which represents the MLDv2 group record. (This is the Multicast Address Record in Figure 8-3.) Following is the definition of the mld2_grec structure: + +struct mld2_grec { + +__u8 grec_type; + +__u8 grec_auxwords; + +__be16 grec_nsrcs; + +struct in6_addr grec_mca; + +struct in6_addr grec_src[0]; + +}; + +(include/net/mld.h) + +The following is a description of the members of the mld2_grec structure: + + * grec_type: Specifies the type of the Multicast Address Record. See Table 8-3, "Multicast Address Record (record types)" in the "Quick Reference" section at the end of this chapter. + + * grec_auxwords: The length of the Auxiliary Data (aux data len in Figure 8-3). The Auxiliary Data field, if present, contains additional information that pertains to this Multicast Address Record. Usually it is 0. See also section 5.2.10 in RFC 3810. + + * grec_nsrcs: The number of source addresses. + + * grec_mca: The multicast address to which this Multicast Address Record pertains. + + * grec_src[0]: A unicast source address (or an array of unicast source addresses). These are addresses that we want to filter (block or allow). + +In the next section, I will discuss the Multicast Source Filtering (MSF) feature. You will find in it detailed examples of how a Multicast Address Record is used in source filtering. + +### Multicast Source Filtering (MSF) + +With Multicast Source Filtering, the kernel will drop the multicast traffic from sources other than the expected ones. This feature, which is also known as Source-Specific Multicast (SSM) was not part of MLDv1. It was introduced in MLDv2; see RFC 3810. It is the opposite of Any-Source Multicast (ASM), where a receiver expresses interest in a destination multicast address. To understand better what Multicast Source Filtering is all about, I will show here an example of a userspace application demonstrating how to join and leave a multicast group with source filtering. 
#### Joining and Leaving a Multicast Group with Source Filtering

A host can join a multicast group with source filtering by opening an IPv6 socket in userspace, creating a multicast group source request (group_source_req object), and setting three parameters in the request:

  * gsr_group: The multicast group address that this host wants to join

  * gsr_source: The multicast group source address that it wants to allow

  * gsr_interface: The ifindex of the network interface it wants to use

Then it should call setsockopt() with the MCAST_JOIN_SOURCE_GROUP socket option. Following is a code snippet of a userspace application demonstrating this (checks of the return values of the system calls were removed, for brevity):

int sockd;
struct group_source_req mreq;
struct addrinfo *results1;
struct addrinfo *results2;

/* read an IPv6 multicast group address that we want to join into results1 */
/* read an IPv6 unicast source address that we want to allow into results2 */

memcpy(&(mreq.gsr_group), results1->ai_addr, sizeof(struct sockaddr_in6));
memcpy(&(mreq.gsr_source), results2->ai_addr, sizeof(struct sockaddr_in6));
mreq.gsr_interface = 3;

sockd = socket(AF_INET6, SOCK_DGRAM, 0);
setsockopt(sockd, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP, &mreq, sizeof(mreq));

This request is handled in the kernel first by the ipv6_sock_mc_join() method, and then by the ip6_mc_source() method. To leave the group, you should call setsockopt() with the MCAST_LEAVE_SOURCE_GROUP socket option or close the socket that you opened.

You can set another address that you want to allow and call setsockopt() again on this socket, this time with the MCAST_UNBLOCK_SOURCE socket option. This will add additional addresses to the source filter list. Each such call to setsockopt() will trigger sending an MLDv2 Multicast Listener Report message with one Multicast Address Record; the Record Type will be 5 ("Allow new sources"), and the number of sources will be 1 (the unicast address that you want to unblock). I will show now an example of using the MCAST_MSFILTER socket option for source filtering.

#### Example: Using MCAST_MSFILTER for Source Filtering

You can also block or permit multicast traffic from several source addresses in one setsockopt() call, using MCAST_MSFILTER and a group_filter object. First, let's take a look at the definition of the group_filter structure in userspace, which is quite self-explanatory:

struct group_filter
{
        /* Interface index.  */
        uint32_t gf_interface;

        /* Group address.  */
        struct sockaddr_storage gf_group;

        /* Filter mode.  */
        uint32_t gf_fmode;

        /* Number of source addresses.  */
        uint32_t gf_numsrc;

        /* Source addresses.  */
        struct sockaddr_storage gf_slist[1];
};

(include/netinet/in.h)

The filter mode (gf_fmode) can be MCAST_INCLUDE (when you want to allow multicast traffic from some unicast address) or MCAST_EXCLUDE (when you want to disallow multicast traffic from some unicast address). Following are two examples of this; the first will allow multicast traffic from three sources, and the second will disallow multicast traffic from two sources:

struct ipv6_mreq mcgroup;
struct group_filter filter;
struct sockaddr_in6 *psin6;
int sockd[2];

Set the multicast group address that we want to join, ffff::9.
inet_pton(AF_INET6, "ffff::9", &mcgroup.ipv6mr_multiaddr);

Set the network interface that we want to use by its ifindex (here, we use eth0, which has an ifindex value of 2):

mcgroup.ipv6mr_interface = 2;

Set the filter parameters: use the same ifindex (2), use MCAST_INCLUDE to set the filter to allow traffic from the sources that are specified by the filter, and set gf_numsrc to 3, because we want to prepare a filter of 3 unicast addresses:

filter.gf_interface = 2;

We want to prepare two filters: the first one will allow traffic from a set of three unicast source addresses, and the second one will block traffic from a set of two unicast source addresses. First, set the filter mode to MCAST_INCLUDE, which means to allow traffic from the sources in this filter:

filter.gf_fmode = MCAST_INCLUDE;

Set the number of source addresses of the filter (gf_numsrc) to be 3:

filter.gf_numsrc = 3;

Set the group address of the filter (gf_group) to be the same one that we used for mcgroup earlier, ffff::9:

psin6 = (struct sockaddr_in6 *)&filter.gf_group;
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "ffff::9", &psin6->sin6_addr);

The three unicast addresses that we want to allow are 2000::1, 2000::2, and 2000::3. Set filter.gf_slist[0], filter.gf_slist[1], and filter.gf_slist[2] accordingly:

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[0];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2000::1", &psin6->sin6_addr);

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[1];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2000::2", &psin6->sin6_addr);

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[2];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2000::3", &psin6->sin6_addr);

Create a socket, and join the multicast group:

sockd[0] = socket(AF_INET6, SOCK_DGRAM, 0);
status = setsockopt(sockd[0], IPPROTO_IPV6, IPV6_JOIN_GROUP,
                    &mcgroup, sizeof(mcgroup));

Activate the filter we created:

status = setsockopt(sockd[0], IPPROTO_IPV6, MCAST_MSFILTER, &filter,
                    GROUP_FILTER_SIZE(filter.gf_numsrc));

This will trigger the sending of an MLDv2 Multicast Listener Report (ICMPV6_MLD2_REPORT) to all MLDv2 routers (ff02::16), with a Multicast Address Record object (mld2_grec) embedded in it. (See the description of the mld2_report structure and Figure 8-3 earlier.) The values of the fields of mld2_grec will be as follows:

  * grec_type will be MLD2_CHANGE_TO_INCLUDE (3).

  * grec_auxwords will be 0. (We do not use Auxiliary Data.)

  * grec_nsrcs is 3 (because we want to use a filter with 3 source addresses and we set gf_numsrc to 3).

  * grec_mca will be ffff::9; this is the multicast group address that the Multicast Address Record pertains to.

The Multicast Address Record will contain the following three unicast source addresses:

  * grec_src[0] is 2000::1

  * grec_src[1] is 2000::2

  * grec_src[2] is 2000::3

Now we want to create a filter of 2 unicast source addresses that we want to exclude.
So first create a new userspace socket:

sockd[1] = socket(AF_INET6, SOCK_DGRAM, 0);

Set the filter mode to MCAST_EXCLUDE, and set the number of sources of the filter to be 2:

filter.gf_fmode = MCAST_EXCLUDE;
filter.gf_numsrc = 2;

Set the two addresses we want to exclude, 2001::1 and 2001::2:

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[0];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2001::1", &psin6->sin6_addr);

psin6 = (struct sockaddr_in6 *)&filter.gf_slist[1];
psin6->sin6_family = AF_INET6;
inet_pton(PF_INET6, "2001::2", &psin6->sin6_addr);

Join the multicast group with the new socket:

status = setsockopt(sockd[1], IPPROTO_IPV6, IPV6_JOIN_GROUP,
                    &mcgroup, sizeof(mcgroup));

Activate the filter:

status = setsockopt(sockd[1], IPPROTO_IPV6, MCAST_MSFILTER, &filter,
                    GROUP_FILTER_SIZE(filter.gf_numsrc));

This again will trigger the sending of an MLDv2 Multicast Listener Report (ICMPV6_MLD2_REPORT) to all MLDv2 routers (ff02::16). This time the content of the Multicast Address Record object (mld2_grec) will be different:

  * grec_type will be MLD2_CHANGE_TO_EXCLUDE (4).

  * grec_auxwords will be 0. (We do not use Auxiliary Data.)

  * grec_nsrcs is 2 (because we want to use 2 source addresses and we set gf_numsrc to 2).

  * grec_mca will be ffff::9, as before; this is the multicast group address that the Multicast Address Record pertains to.

The Multicast Address Record will contain the following two unicast source addresses:

  * grec_src[0] is 2001::1

  * grec_src[1] is 2001::2

Note

We can display the source filtering mapping that we created by running cat /proc/net/mcfilter6; this is handled in the kernel by the igmp6_mcf_seq_show() method.

For example, the first three entries in this mapping will show that for the ffff::9 multicast address, we permit (INCLUDE) multicast traffic from 2000::1, 2000::2, and 2000::3. Note that for the first three entries the value in the INC (Include) column is 1. For the fourth and fifth entries, we disallow traffic from 2001::1 and 2001::2. Note that the value in the EXC (Exclude) column is 1 for the fourth and fifth entries.

cat /proc/net/mcfilter6
Idx Device Multicast Address                Source Address                   INC EXC
2   eth0   ffff0000000000000000000000000009 20000000000000000000000000000001 1   0
2   eth0   ffff0000000000000000000000000009 20000000000000000000000000000002 1   0
2   eth0   ffff0000000000000000000000000009 20000000000000000000000000000003 1   0
2   eth0   ffff0000000000000000000000000009 20010000000000000000000000000001 0   1
2   eth0   ffff0000000000000000000000000009 20010000000000000000000000000002 0   1

Note

Creating filters by calling the setsockopt() method with MCAST_MSFILTER is handled in the kernel by the ip6_mc_msfilter() method, in net/ipv6/mcast.c.

An MLD router (which is also sometimes known as the "Querier") joins the all MLDv2-capable routers Multicast Group (ff02::16) when it is started. It periodically sends Multicast Listener Query packets in order to learn which hosts belong to which multicast groups. These are ICMPv6 packets whose type is ICMPV6_MGM_QUERY. The destination address of these query packets is the all-hosts multicast group (ff02::1). When a host receives an ICMPv6 Multicast Listener Query packet, the ICMPv6 Rx handler (the icmpv6_rcv() method) calls the igmp6_event_query() method to handle that query. Note that the igmp6_event_query() method handles both MLDv2 queries and MLDv1 queries (because both use ICMPV6_MGM_QUERY as the ICMPv6 type).
The igmp6_event_query() method finds out whether the message is MLDv1 or MLDv2 by checking its length; in MLDv1 the length is 24 bytes, and in MLDv2 it is at least 28 bytes. MLDv1 and MLDv2 messages are handled differently: for MLDv2, we should support source filtering, as was mentioned before in this section, while this feature is not available in MLDv1. The host sends back a Multicast Listener Report by calling the igmp6_send() method. The Multicast Listener Report packet is an ICMPv6 packet.

An example of an IPv6 MLD router is the mld6igmp daemon of the open source XORP project: http://www.xorp.org. The MLD router keeps information about the multicast address groups of network nodes (MLD listeners) and updates this information dynamically. This information can be provided to Multicast Routing daemons. Delving into the implementation of MLDv2 routing daemons like the mld6igmp daemon, or into the implementation of other Multicast Routing daemons, is beyond the scope of this book, because they are implemented in userspace.

According to RFC 3810, MLDv2 should be interoperable with nodes that implement MLDv1; an implementation of MLDv2 must support the following two MLDv1 message types:

  * MLDv1 Multicast Listener Report (ICMPV6_MGM_REPORT, decimal 131)

  * MLDv1 Multicast Listener Done (ICMPV6_MGM_REDUCTION, decimal 132)

We can use the MLDv1 protocol for Multicast Listener messages instead of MLDv2; this can be done by using the following:

echo 1 > /proc/sys/net/ipv6/conf/all/force_mld_version

In such a case, when a host joins a multicast group, a Multicast Listener Report message will be sent by the igmp6_send() method. This message will use ICMPV6_MGM_REPORT (131) of MLDv1 as the ICMPv6 type, not ICMPV6_MLD2_REPORT (143) as in MLDv2. Note that in this case you cannot use a source filtering request for this message, as MLDv1 does not support it. The host joins the multicast group by calling the igmp6_join_group() method. When you leave the multicast group, a Multicast Listener Done message will be sent. In this message, the ICMPv6 type is ICMPV6_MGM_REDUCTION (132).

In the next section, I will very briefly talk about the IPv6 Tx path, which is quite similar to the IPv4 Tx path, and which I do not cover in depth in this chapter.

## Sending IPv6 Packets

The IPv6 Tx path is very similar to the IPv4 Tx path; even the names of the methods are very similar. Also in IPv6, there are two main methods for sending IPv6 packets from Layer 4, the transport layer: the first is the ip6_xmit() method, which is used by the TCP, Stream Control Transmission Protocol (SCTP), and Datagram Congestion Control Protocol (DCCP) protocols. The second method is the ip6_append_data() method, which is used, for example, by UDP and Raw sockets. Packets that are created on the local host are sent out by the ip6_local_out() method. The ip6_output() method is set to be the output callback of the protocol-independent dst_entry; it first calls the NF_HOOK() macro for the NF_INET_POST_ROUTING hook, and then it calls the ip6_finish_output() method. If fragmentation is needed, the ip6_finish_output() method calls the ip6_fragment() method to handle it; otherwise, it calls the ip6_finish_output2() method, which eventually sends the packet. For implementation details, look in the IPv6 Tx path code; it is mostly in net/ipv6/ip6_output.c.

In the next section, I will very briefly talk about IPv6 routing, which is, again, quite similar to IPv4 routing, and which I do not cover in depth in this chapter. But first, the short sketch below shows the fragment-or-send decision just described.
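This is the gist of the ip6_finish_output() method, lightly simplified from net/ipv6/ip6_output.c (for the kernel version covered in this book):

static int ip6_finish_output(struct sk_buff *skb)
{
        /* Fragment when the packet exceeds the path MTU and is not GSO
           (or when the route requires fragmenting); otherwise, hand the
           packet to ip6_finish_output2() for transmission */
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}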
+ +## IPv6 Routing + +The implementation of IPv6 routing is very similar to the IPv4 routing implementation that was discussed in Chapter 5, which dealt with the IPv4 routing subsystem. Like in the IPv4 routing subsystem, Policy routing is also supported in IPv6 (when CONFIG_IPV6_MULTIPLE_TABLES is set). A routing entry is represented in IPv6 by the rt6_info structure (include/net/ip6_fib.h). The rt6_info object parallels the IPv4 rtable structure, and the flowi6 structure (include/net/flow.h) parallels the IPv4 flowi4 structure. (In fact, they both have as their first member the same flowi_common object.) For implementation details, look in the IPv6 routing modules: net/ipv6/route.c, net/ipv6/ip6_fib.c, and the policy routing module, net/ipv6/fib6_rules.c. + +## Summary + +I dealt with the IPv6 subsystem and its implementation in this chapter. I discussed various IPv6 topics, like IPv6 addresses (including Special Addresses and Multicast Addresses), how the IPv6 header is built, what the IPv6 extension headers are, the autoconfiguration process, the Rx path in IPv6, and the MLD protocol. In the next chapter, we will continue our journey into the kernel networking internals and discuss the netfilter subsystem and its implementation. In the "Quick Reference" section that follows, we will cover the top methods related to the topics we discussed in this chapter, ordered by their context. + +## Quick Reference + +I conclude this chapter with a short list of important methods of the IPv6 subsystem. Some of them were mentioned in this chapter. Subsequently, there are three tables and two short sections about IPv6 Special Addresses and about the management of routing tables in IPv6. + +### Methods + +Let's start with the methods. + +#### bool ipv6_addr_any(const struct in6_addr *a); + +This method returns true if the specified address is the all-zeroes address ("unspecified address"). + +#### bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2); + +This method returns true if the two specified IPv6 addresses are equal. + +#### static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4); + +This method sets the IPv6 address according to the four 32-bit input parameters. + +#### bool ipv6_addr_is_multicast(const struct in6_addr *addr); + +This method returns true if the specified address is a multicast address. + +#### bool ipv6_ext_hdr(u8 nexthdr); + +This method returns true if the specified nexthdr is a well-known extension header. + +#### struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb); + +This method returns the IPv6 header (ipv6hdr) of the specified skb. + +#### struct inet6_dev *in6_dev_get(const struct net_device *dev); + +This method returns the inet6_dev object associated with the specified device. + +#### bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset); + +This method returns true if the specified nexthdr is ICMPv6 (IPPROTO_ICMPV6) and the type of the ICMPv6 header located at the specified offset is an MLD type. It should be one of the following: + + * ICMPV6_MGM_QUERY + + * ICMPV6_MGM_REPORT + + * ICMPV6_MGM_REDUCTION + + * ICMPV6_MLD2_REPORT + +#### bool raw6_local_deliver(struct sk_buff *, int); + +This method tries to deliver the packet to a raw socket. It returns true on success. + +#### int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); + +This method is the main Rx handler for IPv6 packets. 
+ +#### bool ipv6_accept_ra(struct inet6_dev *idev); + +This method returns true if a host is configured to accept Router Advertisements, in these cases: + + * If forwarding is enabled, the special hybrid mode should be set, which means that /proc/sys/net/ipv6/conf//accept_ra is 2. + + * If forwarding is not enabled, /proc/sys/net/ipv6/conf//accept_ra should be 1. + +#### void ip6_route_input(struct sk_buff *skb); + +This method is the main IPv6 routing subsystem lookup method in the Rx path. It sets the dst entry of the specified skb according to the results of the lookup in the routing subsystem. + +#### int ip6_forward(struct sk_buff *skb); + +This method is the main forwarding method. + +#### struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6); + +This method is the main IPv6 routing subsystem lookup method in the Tx path. The return value is the destination cache entry (dst). + +Note + +Both the ip6_route_input() method and the ip6_route_output() method eventually perform the lookup by calling the fib6_lookup() method. + +#### void in6_dev_hold(struct inet6_dev *idev); and void __in6_dev_put(struct inet6_dev *idev); + +This method increments and decrements the reference counter of the specified idev object, respectively. + +#### int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); + +This method handles a setsockopt() call with MCAST_MSFILTER. + +#### int ip6_mc_input(struct sk_buff *skb); + +This method is the main Rx handler for multicast packets. + +#### int ip6_mr_input(struct sk_buff *skb); + +This method is the main Rx handler for multicast packets that are to be forwarded. + +#### int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); + +This method adds the specified device to a multicast group specified by addr, or creates such a group if not found. + +#### int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); + +This method removes the specified device from the specified address group. + +#### bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); + +This method checks if the specified network device belongs to the specified multicast address group. If the third parameter is not NULL, it will also check whether source filtering permits receiving multicast traffic from the specified address (src_addr) that is destined to the specified multicast address group. + +#### inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) + +This method computes link-local solicited-node multicast addresses. + +#### void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); + +This method joins to a solicited address multicast group. + +#### int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); + +This method handles socket join on a multicast group. + +#### int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); + +This method handles socket leave on a multicast group. + +#### int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol); + +This method registers an IPv6 protocol handler. It's used with L4 protocol registration (UDPv6, TCPv6, and more) and also with extension headers (like the Fragment Extension Header). + +#### int ipv6_parse_hopopts(struct sk_buff *skb); + +This method parses the Hop-by-Hop Options header, which must be the first extension header immediately after the IPv6 header. 
#### int ip6_local_out(struct sk_buff *skb);

This method sends out packets that were generated on the local host.

#### int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

This method handles IPv6 fragmentation. It is called from the ip6_finish_output() method.

#### void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos);

This method sends an ICMPv6 parameter problem (ICMPV6_PARAMPROB) error. It is called when there is some problem in parsing extension headers or in the defragmentation process.

#### int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags);

These methods are the generic IPv6 handlers for calling the setsockopt() and getsockopt() methods on IPv6 sockets, respectively (net/ipv6/ipv6_sockglue.c).

#### int igmp6_event_query(struct sk_buff *skb);

This method handles MLDv2 and MLDv1 queries.

#### void ip6_route_input(struct sk_buff *skb);

This method performs a routing lookup by building a flowi6 object based on the specified skb and invoking the ip6_route_input_lookup() method.

### Macros

And here are the macros.

#### IPV6_ADDR_MC_SCOPE()

This macro returns the scope of the specified IPv6 Multicast address, which is located in bits 11-14 of the multicast address.

#### IPV6_ADDR_MC_FLAG_TRANSIENT()

This macro returns 1 if the T bit of the flags of the specified multicast address is set.

#### IPV6_ADDR_MC_FLAG_PREFIX()

This macro returns 1 if the P bit of the flags of the specified multicast address is set.

#### IPV6_ADDR_MC_FLAG_RENDEZVOUS()

This macro returns 1 if the R bit of the flags of the specified multicast address is set.

### Tables

Here are the tables.

Table 8-2 shows the IPv6 extension headers by their Linux symbol, value, and description. You can find more details in the "extension headers" section of this chapter.

Table 8-2.

IPv6 extension headers

Linux Symbol | Value | Description
---|---|---
NEXTHDR_HOP | 0 | Hop-by-Hop Options header.
NEXTHDR_TCP | 6 | TCP segment.
NEXTHDR_UDP | 17 | UDP message.
NEXTHDR_IPV6 | 41 | IPv6 in IPv6.
NEXTHDR_ROUTING | 43 | Routing header.
NEXTHDR_FRAGMENT | 44 | Fragmentation/reassembly header.
NEXTHDR_GRE | 47 | GRE header.
NEXTHDR_ESP | 50 | Encapsulating security payload.
NEXTHDR_AUTH | 51 | Authentication header.
NEXTHDR_ICMP | 58 | ICMP for IPv6.
NEXTHDR_NONE | 59 | No next header.
NEXTHDR_DEST | 60 | Destination options header.
NEXTHDR_MOBILITY | 135 | Mobility header.

Table 8-3 shows the Multicast Address Record types by their Linux symbol and value. For more details see the "MLDv2 Multicast Listener Report" section in this chapter.

Table 8-3.

Multicast Address Record (record types)

Linux Symbol | Value
---|---
MLD2_MODE_IS_INCLUDE | 1
MLD2_MODE_IS_EXCLUDE | 2
MLD2_CHANGE_TO_INCLUDE | 3
MLD2_CHANGE_TO_EXCLUDE | 4
MLD2_ALLOW_NEW_SOURCES | 5
MLD2_BLOCK_OLD_SOURCES | 6

(include/uapi/linux/icmpv6.h)

Table 8-4 shows the codes of the ICMPv6 "Parameter Problem" message by their Linux symbol, value, and description. These codes give more information about the type of problem that occurred.

Table 8-4.
ICMPv6 Parameter Problem codes

Linux Symbol | Value | Description
---|---|---
ICMPV6_HDR_FIELD | 0 | Erroneous header field encountered
ICMPV6_UNK_NEXTHDR | 1 | Unknown next header type encountered
ICMPV6_UNK_OPTION | 2 | Unknown IPv6 option encountered

### Special Addresses

All of the following variables are instances of the in6_addr structure:

  * in6addr_any: Represents the unspecified address of all zeroes (::).

  * in6addr_loopback: Represents the loopback address (::1).

  * in6addr_linklocal_allnodes: Represents the link-local all-nodes multicast address (ff02::1).

  * in6addr_linklocal_allrouters: Represents the link-local all-routers multicast address (ff02::2).

  * in6addr_interfacelocal_allnodes: Represents the interface-local all-nodes multicast address (ff01::1).

  * in6addr_interfacelocal_allrouters: Represents the interface-local all-routers multicast address (ff01::2).

  * in6addr_sitelocal_allrouters: Represents the site-local all-routers multicast address (ff05::2).

(include/linux/in6.h)

### Routing Tables Management in IPv6

As in IPv4, we can manage adding and deleting routing entries and displaying the routing tables with the ip route command of iproute2 and with the route command of net-tools:

  * Adding a route by ip -6 route add is handled by the inet6_rtm_newroute() method by invoking the ip6_route_add() method.

  * Deleting a route by ip -6 route del is handled by the inet6_rtm_delroute() method by invoking the ip6_route_del() method.

  * Displaying the routing table by ip -6 route show is handled by the inet6_dump_fib() method.

  * Adding a route by route -A inet6 add is implemented by sending an SIOCADDRT IOCTL, which is handled by the ipv6_route_ioctl() method by invoking the ip6_route_add() method.

  * Deleting a route by route -A inet6 del is implemented by sending an SIOCDELRT IOCTL, which is handled by the ipv6_route_ioctl() method by invoking the ip6_route_del() method.

# 9. Netfilter

Chapter 8 discusses the IPv6 subsystem implementation. This chapter discusses the netfilter subsystem. The netfilter framework was started in 1998 by Rusty Russell, one of the most widely known Linux kernel developers, as an improvement of the older implementations of ipchains (Linux 2.2.x) and ipfwadm (Linux 2.0.x). The netfilter subsystem provides a framework that enables registering callbacks at various points (netfilter hooks) in the packet traversal in the network stack and performing various operations on packets, such as changing addresses or ports, dropping packets, logging, and more.
These netfilter hooks provide the infrastructure for netfilter kernel modules that register callbacks in order to perform various tasks of the netfilter subsystem.

## Netfilter Frameworks

The netfilter subsystem provides the following functionalities, discussed in this chapter:

  * Packet selection (iptables)

  * Packet filtering

  * Network Address Translation (NAT)

  * Packet mangling (modifying the contents of packet headers before or after routing)

  * Connection tracking

  * Gathering network statistics

Here are some common frameworks that are based on the Linux kernel netfilter subsystem:

  * IPVS (IP Virtual Server): A transport-layer load-balancing solution (net/netfilter/ipvs). There is support for IPv4 IPVS from very early kernels, and support for IPVS in IPv6 has been included since kernel 2.6.28. The IPv6 kernel support for IPVS was developed by Julius Volz and Vince Busam from Google. For more details, see the IPVS official website, www.linuxvirtualserver.org.

  * IP sets: A framework that consists of a userspace tool called ipset and a kernel part (net/netfilter/ipset). An IP set is basically a set of IP addresses. The IP sets framework was developed by Jozsef Kadlecsik. For more details, see http://ipset.netfilter.org.

  * iptables: Probably the most popular Linux firewall, iptables is the front end of netfilter, and it provides a management layer for netfilter: for example, adding and deleting netfilter rules, displaying statistics, adding a table, zeroing the counters of a table, and more.

There are different iptables implementations in the kernel, according to the protocol:

  * iptables for IPv4: (net/ipv4/netfilter/ip_tables.c)

  * ip6tables for IPv6: (net/ipv6/netfilter/ip6_tables.c)

  * arptables for ARP: (net/ipv4/netfilter/arp_tables.c)

  * ebtables for Ethernet: (net/bridge/netfilter/ebtables.c)

In userspace, you have the iptables and the ip6tables command-line tools, which are used to set up, maintain, and inspect the IPv4 and IPv6 tables, respectively. See man 8 iptables and man 8 ip6tables. Both iptables and ip6tables use the setsockopt()/getsockopt() system calls to communicate with the kernel from userspace. I should mention here two interesting ongoing netfilter projects. The xtables2 project, which is being developed primarily by Jan Engelhardt and is a work in progress as of this writing, uses a netlink-based interface to communicate with the kernel netfilter subsystem. See more details on the project website, http://xtables.de. The second project, the nftables project, is a new packet filtering engine that is a candidate to replace iptables. The nftables solution is based on using a virtual machine and a single unified implementation instead of the four iptables objects mentioned earlier (iptables, ip6tables, arptables, and ebtables). The nftables project was first presented in a netfilter workshop in 2008 by Patrick McHardy. The kernel infrastructure and userspace utility have been developed by Patrick McHardy and Pablo Neira Ayuso. For more details, see http://netfilter.org/projects/nftables and "Nftables: a new packet filtering engine" at http://lwn.net/Articles/324989/.

There are a lot of netfilter modules that extend the functionality of the core netfilter subsystem; apart from some examples, I do not describe these modules here in depth. There are a lot of information resources about these netfilter extensions, from the administration perspective, on the web and in various administration guides.
See also the official netfilter project website: www.netfilter.org.

## Netfilter Hooks

There are five points in the network stack where netfilter hooks can be registered; you have encountered these points in previous chapters' discussions of the Rx and Tx paths in IPv4 and in IPv6. Note that the names of the hooks are common to IPv4 and IPv6:

  * NF_INET_PRE_ROUTING: This hook is in the ip_rcv() method in IPv4, and in the ipv6_rcv() method in IPv6. The ip_rcv() method is the protocol handler of IPv4, and the ipv6_rcv() method is the protocol handler of IPv6. It is the first hook point that all incoming packets reach, before performing a lookup in the routing subsystem.

  * NF_INET_LOCAL_IN: This hook is in the ip_local_deliver() method in IPv4, and in the ip6_input() method in IPv6. All incoming packets addressed to the local host reach this hook point after first passing via the NF_INET_PRE_ROUTING hook point and after performing a lookup in the routing subsystem.

  * NF_INET_FORWARD: This hook is in the ip_forward() method in IPv4, and in the ip6_forward() method in IPv6. All forwarded packets reach this hook point after first passing via the NF_INET_PRE_ROUTING hook point and after performing a lookup in the routing subsystem.

  * NF_INET_POST_ROUTING: This hook is in the ip_output() method in IPv4, and in the ip6_finish_output2() method in IPv6. Packets that are forwarded reach this hook point after passing the NF_INET_FORWARD hook point. Packets that are created on the local machine and sent out also arrive at NF_INET_POST_ROUTING, after passing the NF_INET_LOCAL_OUT hook point.

  * NF_INET_LOCAL_OUT: This hook is in the __ip_local_out() method in IPv4, and in the __ip6_local_out() method in IPv6. All outgoing packets that were created on the local host reach this point before reaching the NF_INET_POST_ROUTING hook point.

(include/uapi/linux/netfilter.h)

The NF_HOOK macro, mentioned in previous chapters, is called at some distinct points along the packet traversal in the kernel network stack; it is defined in include/linux/netfilter.h:

static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
                          struct net_device *in, struct net_device *out,
                          int (*okfn)(struct sk_buff *))
{
        return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);
}

The parameters of the NF_HOOK() macro are as follows:

  * pf: Protocol family. NFPROTO_IPV4 for IPv4 and NFPROTO_IPV6 for IPv6.

  * hook: One of the five netfilter hooks mentioned earlier (for example, NF_INET_PRE_ROUTING or NF_INET_LOCAL_OUT).

  * skb: The SKB object, which represents the packet that is being processed.

  * in: The input network device (net_device object).

  * out: The output network device (net_device object). There are cases when the output device is NULL, as it is yet unknown; for example, in the ip_rcv() method (net/ipv4/ip_input.c), which is called before a routing lookup is performed, and you don't know yet which is the output device; the NF_HOOK() macro is invoked in this method with a NULL output device.

  * okfn: A pointer to a continuation function that will be called when the hook terminates. It gets one argument, the SKB.

The return value from a netfilter hook must be one of the following values (which are also termed netfilter verdicts):

  * NF_DROP (0): Discard the packet silently.

  * NF_ACCEPT (1): The packet continues its traversal in the kernel network stack as usual.

  * NF_STOLEN (2): Do not continue traversal.
The packet is processed by the hook method. + + * NF_QUEUE (3): Queue the packet for user space. + + * NF_REPEAT (4): The hook function should be called again. + +(include/uapi/linux/netfilter.h) + +Now that you know about the various netfilter hooks, the next section covers how netfilter hooks are registered. + +### Registration of Netfilter Hooks + +To register a hook callback at one of the five hook points mentioned earlier, you first define an nf_hook_ops object (or an array of nf_hook_ops objects) and then register it; the nf_hook_ops structure is defined in include/linux/netfilter.h: + +struct nf_hook_ops { + +struct list_head list; + +/* User fills in from here down. */ + +nf_hookfn *hook; + +struct module *owner; + +u_int8_t pf; + +unsigned int hooknum; + +/* Hooks are ordered in ascending priority. */ + +int priority; + +}; + +The following introduces some of the important members of the nf_hook_ops structure: + + * hook: The hook callback you want to register. Its prototype is: + +unsigned int nf_hookfn(unsigned int hooknum, + +struct sk_buff *skb, + +const struct net_device *in, + +const struct net_device *out, + +int (*okfn)(struct sk_buff *)); + + * pf: The protocol family (NFPROTO_IPV4 for IPv4 and NFPROTO_IPV6 for IPv6). + + * hooknum: One of the five netfilter hooks mentioned earlier. + + * priority: More than one hook callback can be registered on the same hook. Hook callbacks with lower priorities are called first. The nf_ip_hook_priorities enum defines possible values for IPv4 hook priorities (include/uapi/linux/netfilter_ipv4.h). See also Table 9-4 in the "Quick Reference" section at the end of this chapter. + +There are two methods to register netfilter hooks: + + * int nf_register_hook(struct nf_hook_ops *reg): Registers a single nf_hook_ops object. + + * int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n): Registers an array of n nf_hook_ops objects; the second parameter is the number of the elements in the array. + +You will see two examples of registration of an array of nf_hook_ops objects in the next two sections. Figure 9-1 in the next section illustrates the use of priorities when registering more than one hook callback on the same hook point. + +## Connection Tracking + +It is not enough to filter traffic only according to the L4 and L3 headers in modern networks. You should also take into account cases when the traffic is based on sessions, such as an FTP session or a SIP session. By FTP session, I mean this sequence of events, for example: the client first creates a TCP control connection on TCP port 21, which is the default FTP port. Commands sent from the FTP client (such as listing the contents of a directory) to the server are sent on this control port. The FTP server opens a data socket on port 20, where the destination port on the client side is dynamically allocated. Traffic should be filtered according to other parameters, such as the state of a connection or timeout. This is one of the main reasons for using the Connection Tracking layer. + +Connection Tracking allows the kernel to keep track of sessions. The Connection Tracking layer's primary goal is to serve as the basis of NAT. The IPv4 NAT module (net/ipv4/netfilter/iptable_nat.c) cannot be built if CONFIG_NF_CONNTRACK_IPV4 is not set. Similarly, the IPv6 NAT module (net/ipv6/netfilter/ip6table_nat.c) cannot be built if the CONFIG_NF_CONNTRACK_IPV6 is not set. 
However, Connection Tracking does not depend on NAT; you can run the Connection Tracking module without activating any NAT rule. The IPv4 and IPv6 NAT modules are discussed later in this chapter.

Note

There are some userspace tools (conntrack-tools) for Connection Tracking administration, mentioned in the "Quick Reference" section at the end of this chapter. These tools may help you better understand the Connection Tracking layer.

### Connection Tracking Initialization

An array of nf_hook_ops objects, called ipv4_conntrack_ops, is defined as follows:

static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
    {
        .hook     = ipv4_conntrack_in,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook     = ipv4_conntrack_local,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_CONNTRACK,
    },
    {
        .hook     = ipv4_helper,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook     = ipv4_confirm,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
    {
        .hook     = ipv4_helper,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_CONNTRACK_HELPER,
    },
    {
        .hook     = ipv4_confirm,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
    },
};

(net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c)

The two most important Connection Tracking hooks you register are the NF_INET_PRE_ROUTING hook, handled by the ipv4_conntrack_in() method, and the NF_INET_LOCAL_OUT hook, handled by the ipv4_conntrack_local() method. These two hooks have a priority of NF_IP_PRI_CONNTRACK (-200). The other hooks in the ipv4_conntrack_ops array have a priority of NF_IP_PRI_CONNTRACK_HELPER (300) or NF_IP_PRI_CONNTRACK_CONFIRM (INT_MAX, which is 2^31 - 1). In netfilter hooks, a callback with a lower priority value is executed first. (The nf_ip_hook_priorities enum in include/uapi/linux/netfilter_ipv4.h represents the possible priority values for IPv4 hooks.) Both the ipv4_conntrack_local() method and the ipv4_conntrack_in() method invoke the nf_conntrack_in() method, passing the corresponding hooknum as a parameter. The nf_conntrack_in() method belongs to the protocol-independent Connection Tracking core (net/netfilter/nf_conntrack_core.c) and is used both in IPv4 Connection Tracking and in IPv6 Connection Tracking; its second parameter is the protocol family, specifying whether it is IPv4 (PF_INET) or IPv6 (PF_INET6). I start the discussion with the nf_conntrack_in() method. The other hook callbacks, ipv4_confirm() and ipv4_helper(), are discussed later in this section.

Note

When the kernel is built with Connection Tracking support (CONFIG_NF_CONNTRACK is set), the Connection Tracking hook callbacks are called even if no iptables rules are activated. Naturally, this has some performance cost. If performance is very important, and you know beforehand that the device will not use the netfilter subsystem, consider building the kernel without Connection Tracking support, or building Connection Tracking as a kernel module and not loading it.
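Before moving on to how the Connection Tracking hooks are registered, here is a minimal, self-contained sketch of the registration API described in the previous section. The module and callback names (udp_count_*) are hypothetical; the callback follows the 3.x-era nf_hookfn prototype shown earlier, merely logs incoming UDP packets at the NF_INET_PRE_ROUTING point, and accepts all traffic:

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

/* Hypothetical hook callback: log UDP packets, accept everything. */
static unsigned int udp_count_hook(unsigned int hooknum,
                                   struct sk_buff *skb,
                                   const struct net_device *in,
                                   const struct net_device *out,
                                   int (*okfn)(struct sk_buff *))
{
    if (ip_hdr(skb)->protocol == IPPROTO_UDP)
        pr_debug("udp_count: UDP packet on %s\n",
                 in ? in->name : "?");

    return NF_ACCEPT; /* let the packet continue its traversal */
}

static struct nf_hook_ops udp_count_ops __read_mostly = {
    .hook     = udp_count_hook,
    .owner    = THIS_MODULE,
    .pf       = NFPROTO_IPV4,
    .hooknum  = NF_INET_PRE_ROUTING,
    .priority = NF_IP_PRI_FIRST, /* run before conntrack and NAT */
};

static int __init udp_count_init(void)
{
    return nf_register_hook(&udp_count_ops);
}

static void __exit udp_count_exit(void)
{
    nf_unregister_hook(&udp_count_ops);
}

module_init(udp_count_init);
module_exit(udp_count_exit);
MODULE_LICENSE("GPL");

Registering at NF_IP_PRI_FIRST (INT_MIN) means this callback runs before the Connection Tracking callbacks described next; returning NF_ACCEPT hands the packet on to them unchanged.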
Registration of the IPv4 Connection Tracking hooks is done by calling the nf_register_hooks() method in the nf_conntrack_l3proto_ipv4_init() method (net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c):

static int __init nf_conntrack_l3proto_ipv4_init(void)
{
    . . .
    ret = nf_register_hooks(ipv4_conntrack_ops,
                            ARRAY_SIZE(ipv4_conntrack_ops));
    . . .
}

In Figure 9-1, you can see the Connection Tracking callbacks (ipv4_conntrack_in(), ipv4_conntrack_local(), ipv4_helper(), and ipv4_confirm()), according to the hook points where they are registered.

Figure 9-1. Connection Tracking hooks (IPv4)

Note

For the sake of simplicity, Figure 9-1 does not include more complex scenarios, such as using IPsec, fragmentation, or multicasting. It also omits the functions that are called for packets generated on the local host and sent out (like the ip_queue_xmit() method or the ip_build_and_send_pkt() method).

The basic element of Connection Tracking is the nf_conntrack_tuple structure:

struct nf_conntrack_tuple {
    struct nf_conntrack_man src;

    /* These are the parts of the tuple which are fixed. */
    struct {
        union nf_inet_addr u3;
        union {
            /* Add other protocols here. */
            __be16 all;

            struct {
                __be16 port;
            } tcp;
            struct {
                __be16 port;
            } udp;
            struct {
                u_int8_t type, code;
            } icmp;
            struct {
                __be16 port;
            } dccp;
            struct {
                __be16 port;
            } sctp;
            struct {
                __be16 key;
            } gre;
        } u;

        /* The protocol. */
        u_int8_t protonum;

        /* The direction (for tuplehash) */
        u_int8_t dir;
    } dst;
};

(include/net/netfilter/nf_conntrack_tuple.h)

The nf_conntrack_tuple structure represents a flow in one direction. The union inside the dst structure includes various protocol objects (like TCP, UDP, ICMP, and more). For each transport layer (L4) protocol, there is a Connection Tracking module that implements the protocol-specific part. Thus, for example, there is net/netfilter/nf_conntrack_proto_tcp.c for the TCP protocol, net/netfilter/nf_conntrack_proto_udp.c for the UDP protocol, net/netfilter/nf_conntrack_ftp.c for the FTP protocol, and more; these modules support both IPv4 and IPv6. You will see examples of how protocol-specific implementations of Connection Tracking modules differ later in this section.

### Connection Tracking Entries

The nf_conn structure represents a Connection Tracking entry:

struct nf_conn {
    /* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
       plus 1 for any connection(s) we are `master' for */
    struct nf_conntrack ct_general;

    spinlock_t lock;

    /* XXX should I move this to the tail ? - Y.K */
    /* These are my tuples; original and reply */
    struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];

    /* Have we seen traffic both ways yet? (bitset) */
    unsigned long status;

    /* If we were expected by an expectation, this will be it */
    struct nf_conn *master;

    /* Timer function; drops refcnt when it goes off. */
    struct timer_list timeout;
    . . .
    /* Extensions */
    struct nf_ct_ext *ext;
#ifdef CONFIG_NET_NS
    struct net *ct_net;
#endif
    /* Storage reserved for other modules, must be the last member */
    union nf_conntrack_proto proto;
};

(include/net/netfilter/nf_conntrack.h)

The following is a description of some of the important members of the nf_conn structure:

* ct_general: A reference count.
* tuplehash: There are two tuplehash objects: tuplehash[0] is the original direction, and tuplehash[1] is the reply. They are usually referred to as tuplehash[IP_CT_DIR_ORIGINAL] and tuplehash[IP_CT_DIR_REPLY], respectively.

* status: The status of the entry. When you start to track a connection entry, it is IP_CT_NEW; later, when the connection is established, it becomes IP_CT_ESTABLISHED. See the ip_conntrack_info enum in include/uapi/linux/netfilter/nf_conntrack_common.h.

* master: An expected connection. It is set by the init_conntrack() method when an expected packet arrives (that is, when the nf_ct_find_expectation() method, which is invoked by the init_conntrack() method, finds an expectation). See also the "Connection Tracking Helpers and Expectations" section later in this chapter.

* timeout: The timer of the connection entry. Each connection entry expires after some time interval in which there is no traffic; the interval is determined according to the protocol. When an nf_conn object is allocated with the __nf_conntrack_alloc() method, the timeout timer is set to the death_by_timeout() method.

Now that you know about the nf_conn structure and some of its members, let's take a look at the nf_conntrack_in() method:

unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
                             struct sk_buff *skb)
{
    struct nf_conn *ct, *tmpl = NULL;
    enum ip_conntrack_info ctinfo;
    struct nf_conntrack_l3proto *l3proto;
    struct nf_conntrack_l4proto *l4proto;
    unsigned int *timeouts;
    unsigned int dataoff;
    u_int8_t protonum;
    int set_reply = 0;
    int ret;

    if (skb->nfct) {
        /* Previously seen (loopback or untracked)?  Ignore. */
        tmpl = (struct nf_conn *)skb->nfct;
        if (!nf_ct_is_template(tmpl)) {
            NF_CT_STAT_INC_ATOMIC(net, ignore);
            return NF_ACCEPT;
        }
        skb->nfct = NULL;
    }

First you find out whether the network layer (L3) protocol can be tracked:

    l3proto = __nf_ct_l3proto_find(pf);

Now you find out whether the transport layer (L4) protocol can be tracked. For IPv4, this is done by the ipv4_get_l4proto() method (net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c):

    ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
                               &dataoff, &protonum);
    if (ret <= 0) {
        . . .
        ret = -ret;
        goto out;
    }

    l4proto = __nf_ct_l4proto_find(pf, protonum);

    /* It may be an special packet, error, unclean...
     * inverse of the return code tells to the netfilter
     * core what to do with the packet. */

Now you check protocol-specific error conditions (see, for example, the udp_error() method in net/netfilter/nf_conntrack_proto_udp.c, which checks for malformed packets, packets with an invalid checksum, and more, or the tcp_error() method in net/netfilter/nf_conntrack_proto_tcp.c):

    if (l4proto->error != NULL) {
        ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
                             pf, hooknum);
        if (ret <= 0) {
            NF_CT_STAT_INC_ATOMIC(net, error);
            NF_CT_STAT_INC_ATOMIC(net, invalid);
            ret = -ret;
            goto out;
        }
        /* ICMP[v6] protocol trackers may assign one conntrack. */
        if (skb->nfct)
            goto out;
    }

The resolve_normal_ct() method, which is invoked immediately hereafter, performs the following:

* Calculates the hash of the tuple by calling the hash_conntrack_raw() method.

* Performs a lookup for a tuple match by calling the __nf_conntrack_find_get() method, passing the hash as a parameter.
* If no match is found, it creates a new nf_conntrack_tuple_hash object by calling the init_conntrack() method. This nf_conntrack_tuple_hash object is added to the list of unconfirmed tuplehash objects. This list is embedded in the network namespace object: the net structure contains a netns_ct object, which consists of network-namespace-specific Connection Tracking information, and one of its members, unconfirmed, is a list of unconfirmed tuplehash objects (see include/net/netns/conntrack.h). Later, in the __nf_conntrack_confirm() method, the object is removed from the unconfirmed list. I discuss the __nf_conntrack_confirm() method later in this section.

* Each SKB has a member called nfctinfo, which represents the connection state (for example, IP_CT_NEW for new connections), and a member called nfct (an instance of the nf_conntrack structure), which is in fact a reference counter. The resolve_normal_ct() method initializes both of them.

    ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
                           l3proto, l4proto, &set_reply, &ctinfo);
    if (!ct) {
        /* Not valid part of a connection */
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        ret = NF_ACCEPT;
        goto out;
    }
    if (IS_ERR(ct)) {
        /* Too stressed to deal. */
        NF_CT_STAT_INC_ATOMIC(net, drop);
        ret = NF_DROP;
        goto out;
    }

    NF_CT_ASSERT(skb->nfct);

You now call the nf_ct_timeout_lookup() method to decide which timeout policy to apply to this flow. For example, for UDP the timeout is 30 seconds for unidirectional connections and 180 seconds for bidirectional connections; see the definition of the udp_timeouts array in net/netfilter/nf_conntrack_proto_udp.c. For TCP, which is a much more complex protocol, there are 11 entries in the tcp_timeouts array (net/netfilter/nf_conntrack_proto_tcp.c):

    /* Decide what timeout policy we want to apply to this flow. */
    timeouts = nf_ct_timeout_lookup(net, ct, l4proto);

You now call the protocol-specific packet() method (for example, udp_packet() for UDP or tcp_packet() for TCP). The udp_packet() method extends the timeout according to the status of the connection by calling the nf_ct_refresh_acct() method: for unreplied connections (where the IPS_SEEN_REPLY_BIT flag is not set), the timeout is set to 30 seconds, and for replied connections, it is set to 180 seconds. Again, in the case of TCP, the tcp_packet() method is much more complex, due to the advanced TCP state machine. Moreover, the udp_packet() method always returns a verdict of NF_ACCEPT, whereas the tcp_packet() method may sometimes fail:

    ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
    if (ret <= 0) {
        /* Invalid: inverse of the return code tells
         * the netfilter core what to do */
        pr_debug("nf_conntrack_in: Can't track with proto module\n");
        nf_conntrack_put(skb->nfct);
        skb->nfct = NULL;
        NF_CT_STAT_INC_ATOMIC(net, invalid);
        if (ret == -NF_DROP)
            NF_CT_STAT_INC_ATOMIC(net, drop);
        ret = -ret;
        goto out;
    }

    if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
        nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
    if (tmpl) {
        /* Special case: we have to repeat this hook, assign the
         * template again to this packet. We assume that this packet
         * has no conntrack assigned. This is used by nf_ct_tcp.
         */
        if (ret == NF_REPEAT)
            skb->nfct = (struct nf_conntrack *)tmpl;
        else
            nf_ct_put(tmpl);
    }

    return ret;
}

The ipv4_confirm() method, which is called in the NF_INET_POST_ROUTING hook and in the NF_INET_LOCAL_IN hook, normally calls the __nf_conntrack_confirm() method, which removes the tuple from the unconfirmed list.

### Connection Tracking Helpers and Expectations

Some protocols have different flows for data and for control: for example, FTP, the File Transfer Protocol, and SIP, the Session Initiation Protocol, which is a VoIP protocol. Usually in these protocols, the control channel negotiates some configuration setup with the other side and agrees with it on which parameters to use for the data flow. These protocols are more difficult for the netfilter subsystem to handle, because the netfilter subsystem needs to be aware that the flows are related to each other. To support these types of protocols, the netfilter subsystem provides Connection Tracking Helpers, which extend the basic Connection Tracking functionality. These modules create expectations (nf_conntrack_expect objects), and these expectations tell the kernel that it should expect some traffic on a specified connection and that two connections are related. Knowing that two connections are related lets you define rules on the master connection that also pertain to the related connections. You can use a simple iptables rule based on the Connection Tracking state to accept packets whose Connection Tracking state is RELATED:

iptables -A INPUT -m conntrack --ctstate RELATED -j ACCEPT

Note

Connections can be related not only as a result of an expectation. For example, an ICMPv4 error packet such as "ICMP fragmentation needed" is related if netfilter finds a conntrack entry that matches the tuple in the ICMP-embedded L3/L4 header. See the icmp_error_message() method (net/ipv4/netfilter/nf_conntrack_proto_icmp.c) for more details.

Connection Tracking Helpers are represented by the nf_conntrack_helper structure (include/net/netfilter/nf_conntrack_helper.h). They are registered and unregistered by the nf_conntrack_helper_register() method and the nf_conntrack_helper_unregister() method, respectively. Thus, for example, the nf_conntrack_helper_register() method is invoked by nf_conntrack_ftp_init() (net/netfilter/nf_conntrack_ftp.c) to register the FTP Connection Tracking Helpers. The Connection Tracking Helpers are kept in a hash table (nf_ct_helper_hash). The ipv4_helper() hook callback is registered at two hook points, NF_INET_POST_ROUTING and NF_INET_LOCAL_IN (see the definition of the ipv4_conntrack_ops array in the "Connection Tracking Initialization" section earlier). Because of this, when an FTP packet reaches the NF_INET_POST_ROUTING callback, ip_output(), or the NF_INET_LOCAL_IN callback, ip_local_deliver(), the ipv4_helper() method is invoked, and this method eventually calls the callbacks of the registered Connection Tracking Helpers. In the case of FTP, the registered helper method is the help() method in net/netfilter/nf_conntrack_ftp.c. This method looks for FTP-specific patterns, like the "PORT" FTP command; see the invocation of the find_pattern() method in the help() method, in the following code snippet (net/netfilter/nf_conntrack_ftp.c).
If there is a match, an nf_conntrack_expect object is created by calling the nf_ct_expect_init() method:

static int help(struct sk_buff *skb,
                unsigned int protoff,
                struct nf_conn *ct,
                enum ip_conntrack_info ctinfo)
{
    struct nf_conntrack_expect *exp;
    . . .
    for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
        found = find_pattern(fb_ptr, datalen,
                             search[dir][i].pattern,
                             search[dir][i].plen,
                             search[dir][i].skip,
                             search[dir][i].term,
                             &matchoff, &matchlen,
                             &cmd,
                             search[dir][i].getnum);
        if (found) break;
    }

    if (found == -1) {
        /* We don't usually drop packets.  After all, this is
           connection tracking, not packet filtering.
           However, it is necessary for accurate tracking in
           this case. */
        nf_ct_helper_log(skb, ct, "partial matching of `%s'",
                         search[dir][i].pattern);

Note

Normally, Connection Tracking does not drop packets. In some cases, due to an error or an abnormal situation, packets are dropped. The preceding code is an example of such a case: the invocation of find_pattern() returned -1, which means that there is only a partial match, and the packet is dropped because a full pattern match was not found.

        ret = NF_DROP;
        goto out;
    } else if (found == 0) { /* No match */
        ret = NF_ACCEPT;
        goto out_update_nl;
    }

    pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
             matchlen, fb_ptr + matchoff,
             matchlen, ntohl(th->seq) + matchoff);

    exp = nf_ct_expect_alloc(ct);
    . . .
    nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
                      &ct->tuplehash[!dir].tuple.src.u3, daddr,
                      IPPROTO_TCP, NULL, &cmd.u.tcp.port);
    . . .
}

(net/netfilter/nf_conntrack_ftp.c)

Later, when a new connection is created by the init_conntrack() method, you check whether it has an expectation, and if it does, you set the IPS_EXPECTED_BIT flag and set the master of the connection (ct->master) to refer to the connection that created the expectation:

static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
               const struct nf_conntrack_tuple *tuple,
               struct nf_conntrack_l3proto *l3proto,
               struct nf_conntrack_l4proto *l4proto,
               struct sk_buff *skb,
               unsigned int dataoff, u32 hash)
{
    struct nf_conn *ct;
    struct nf_conn_help *help;
    struct nf_conntrack_tuple repl_tuple;
    struct nf_conntrack_ecache *ecache;
    struct nf_conntrack_expect *exp;
    u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
    struct nf_conn_timeout *timeout_ext;
    unsigned int *timeouts;
    . . .
    ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
                              hash);
    . . .
    exp = nf_ct_find_expectation(net, zone, tuple);
    if (exp) {
        pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
                 ct, exp);
        /* Welcome, Mr. Bond.  We've been expecting you... */
        __set_bit(IPS_EXPECTED_BIT, &ct->status);
        ct->master = exp->master;
        if (exp->helper) {
            help = nf_ct_helper_ext_add(ct, exp->helper,
                                        GFP_ATOMIC);
            if (help)
                rcu_assign_pointer(help->helper, exp->helper);
        }
    . . .

Note that helpers listen on a predefined port. For example, the FTP Connection Tracking Helper listens on port 21 (see the FTP_PORT definition in include/linux/netfilter/nf_conntrack_ftp.h).
You can set a different port (or ports) in one of two ways. The first way is by a module parameter: you can override the default port value by supplying a single port or a comma-separated list of ports to the modprobe command:

modprobe nf_conntrack_ftp ports=2121
modprobe nf_conntrack_ftp ports=2022,2023,2024

The second way is by using the CT target:

iptables -A PREROUTING -t raw -p tcp --dport 8888 -j CT --helper ftp

Note that the CT target (net/netfilter/xt_CT.c) was added in kernel 2.6.34.

Note

Xtables target extensions are represented by the xt_target structure and are registered by the xt_register_target() method for a single target, or by the xt_register_targets() method for an array of targets. Xtables match extensions are represented by the xt_match structure and are registered by the xt_register_match() method, or by the xt_register_matches() method for an array of matches. Match extensions inspect a packet according to some criterion defined by the match extension module; thus, for example, the xt_length match module (net/netfilter/xt_length.c) inspects packets according to their length (the tot_len of the SKB in the case of an IPv4 packet), and the xt_connlimit module (net/netfilter/xt_connlimit.c) limits the number of parallel TCP connections per IP address. A minimal sketch of such a match extension appears below.
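To make the Xtables extension API concrete, here is a minimal, hypothetical sketch of a match extension in the spirit of xt_length. The module name ("evenlen") and its criterion (matching IPv4 packets whose total length is an even number of bytes) are invented for illustration, and a real match would also need a corresponding userspace libxt_ plugin so that the iptables command can parse it:

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>

/* Hypothetical match callback: true for even-length IPv4 packets. */
static bool evenlen_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
    return (ntohs(ip_hdr(skb)->tot_len) & 1) == 0;
}

static struct xt_match evenlen_mt_reg __read_mostly = {
    .name     = "evenlen",
    .revision = 0,
    .family   = NFPROTO_IPV4,
    .match    = evenlen_mt,
    .me       = THIS_MODULE,
};

static int __init evenlen_mt_init(void)
{
    return xt_register_match(&evenlen_mt_reg);
}

static void __exit evenlen_mt_exit(void)
{
    xt_unregister_match(&evenlen_mt_reg);
}

module_init(evenlen_mt_init);
module_exit(evenlen_mt_exit);
MODULE_LICENSE("GPL");

With the userspace plugin in place, such a match could then be used like any other, for example with iptables -A INPUT -m evenlen -j LOG.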
This section detailed the Connection Tracking initialization. The next section deals with iptables, which is probably the best-known part of the netfilter framework.

### IPTables

There are two parts to iptables: the kernel part, whose core is net/ipv4/netfilter/ip_tables.c for IPv4 and net/ipv6/netfilter/ip6_tables.c for IPv6, and the userspace part, which provides a front end for accessing the kernel iptables layer (for example, adding and deleting rules with the iptables command). Each table is represented by the xt_table structure (defined in include/linux/netfilter/x_tables.h). Registration and unregistration of a table are done by the ipt_register_table() and ipt_unregister_table() methods, respectively; these methods are implemented in net/ipv4/netfilter/ip_tables.c. In IPv6, you also use the xt_table structure for creating tables, but registration and unregistration of a table are done by the ip6t_register_table() method and the ip6t_unregister_table() method, respectively.

The network namespace object contains IPv4- and IPv6-specific objects (netns_ipv4 and netns_ipv6, respectively). The netns_ipv4 and netns_ipv6 objects, in turn, contain pointers to xt_table objects. For IPv4, in struct netns_ipv4 you have, for example, iptable_filter, iptable_mangle, nat_table, and more (include/net/netns/ipv4.h). In struct netns_ipv6 you have, for example, ip6table_filter, ip6table_mangle, ip6table_nat, and more (include/net/netns/ipv6.h). For a full list of the IPv4 and IPv6 network namespace netfilter tables and the corresponding kernel modules, see Tables 9-2 and 9-3 in the "Quick Reference" section at the end of this chapter.

To understand how iptables works, let's look at a real example with the filter table. For the sake of simplicity, let's assume that the filter table is the only one that is built and that the LOG target is supported; the only rule I am using is for logging, as you will shortly see.

First, let's take a look at the definition of the filter table:

#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
                            (1 << NF_INET_FORWARD) | \
                            (1 << NF_INET_LOCAL_OUT))

static const struct xt_table packet_filter = {
    .name        = "filter",
    .valid_hooks = FILTER_VALID_HOOKS,
    .me          = THIS_MODULE,
    .af          = NFPROTO_IPV4,
    .priority    = NF_IP_PRI_FILTER,
};

(net/ipv4/netfilter/iptable_filter.c)

Initialization of the table is done by first calling the xt_hook_link() method, which sets the iptable_filter_hook() method as the hook callback of the nf_hook_ops objects of the packet_filter table:

static struct nf_hook_ops *filter_ops __read_mostly;

static int __init iptable_filter_init(void)
{
    . . .
    filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
    . . .
}

Then you call the ipt_register_table() method (note that the IPv4 netns object, net->ipv4, keeps a pointer to the filter table, iptable_filter):

static int __net_init iptable_filter_net_init(struct net *net)
{
    . . .
    net->ipv4.iptable_filter =
        ipt_register_table(net, &packet_filter, repl);
    . . .
    return PTR_RET(net->ipv4.iptable_filter);
}

(net/ipv4/netfilter/iptable_filter.c)

Note that there are three hooks in the filter table:

* NF_INET_LOCAL_IN

* NF_INET_FORWARD

* NF_INET_LOCAL_OUT

For this example, you set the following rule, using the iptables command line:

iptables -A INPUT -p udp --dport=5001 -j LOG --log-level 1

The meaning of this rule is that incoming UDP packets with destination port 5001 are dumped to the syslog. The log-level modifier is the standard syslog level in the range 0 through 7; 0 is emergency and 7 is debug. Note that when running an iptables command, you should specify the table you want to use with the -t modifier; for example, iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE adds a rule to the NAT table. When you do not specify a table name with the -t modifier, the filter table is used by default. So by running iptables -A INPUT -p udp --dport=5001 -j LOG --log-level 1, you add a rule to the filter table.

Note

You can set targets on iptables rules; usually these are targets from the Linux netfilter subsystem (see the earlier example of using the LOG target). You can also write your own targets and extend the iptables userspace code to support them. See "Writing Netfilter modules," by Jan Engelhardt and Nicolas Bouliane: http://inai.de/documents/Netfilter_Modules.pdf .

Note that CONFIG_NETFILTER_XT_TARGET_LOG must be set in order to use the LOG target in an iptables rule, as shown in the earlier example. You can refer to the code of net/netfilter/xt_LOG.c as an example of an iptables target module.

When a UDP packet with destination port 5001 reaches the network driver and goes up to the network layer (L3), the first hook it encounters is the NF_INET_PRE_ROUTING hook; the filter table does not register a callback at NF_INET_PRE_ROUTING, though. It has only three hooks: NF_INET_LOCAL_IN, NF_INET_FORWARD, and NF_INET_LOCAL_OUT, as mentioned earlier. So you continue to the ip_rcv_finish() method and perform a lookup in the routing subsystem. Now there are two cases: the packet is intended to be delivered to the local host, or it is intended to be forwarded (let's ignore cases when the packet is to be discarded). In Figure 9-2, you can see the packet traversal in both cases.

Figure 9-2. Traffic for me and Forwarded Traffic with a Filter table rule
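As a quick sanity check of the LOG rule above, before tracing the code paths, you could send a matching packet from a neighboring host and inspect the kernel log. This is a hypothetical session; the address 192.168.1.9 is illustrative:

echo hello | nc -u 192.168.1.9 5001

dmesg | tail

The LOG target prints the packet headers (IN=, SRC=, DST=, PROTO=UDP, DPT=5001, and so on) to the kernel log, so you can confirm that the rule matched before following the ipt_do_table() path described next.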
### Delivery to the Local Host

First you reach the ip_local_deliver() method; take a short look at this method:

int ip_local_deliver(struct sk_buff *skb)
{
    . . .
    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
                   ip_local_deliver_finish);
}

As you can see, the NF_INET_LOCAL_IN hook is in this method, and as mentioned earlier, NF_INET_LOCAL_IN is one of the filter table hooks, so the NF_HOOK() macro invokes the iptable_filter_hook() method. Now take a look at the iptable_filter_hook() method:

static unsigned int iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
{
    const struct net *net;
    . . .
    net = dev_net((in != NULL) ? in : out);
    . . .
    return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
}

(net/ipv4/netfilter/iptable_filter.c)

The ipt_do_table() method in fact invokes the LOG target callback, ipt_log_packet(), which writes the packet headers to the syslog. If there were more rules, they would be evaluated at this point. Because there are no more rules, you continue to the ip_local_deliver_finish() method, and the packet continues its traversal to the transport layer (L4), to be handled by a corresponding socket.

### Forwarding the Packet

In the second case, after a lookup in the routing subsystem, you find that the packet is to be forwarded, so the ip_forward() method is called:

int ip_forward(struct sk_buff *skb)
{
    . . .
    return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
                   rt->dst.dev, ip_forward_finish);
    . . .

Because the filter table has a registered hook callback at NF_INET_FORWARD, as mentioned, you again invoke the iptable_filter_hook() method. Consequently, as before, you again call the ipt_do_table() method, which in turn calls the ipt_log_packet() method again. You then continue to the ip_forward_finish() method (note that ip_forward_finish is the last argument of the NF_HOOK macro above, representing the continuation method). Then you call the ip_output() method, and because the filter table has no NF_INET_POST_ROUTING hook, you continue to the ip_finish_output() method.

Note

You can filter packets according to their Connection Tracking state. The following rule dumps to the syslog packets whose Connection Tracking state is ESTABLISHED:

iptables -A INPUT -p tcp -m conntrack --ctstate ESTABLISHED -j LOG --log-level 1

### Network Address Translation (NAT)

The Network Address Translation (NAT) module deals mostly with IP address translation, as the name implies, and with port manipulation. One of the most common uses of NAT is to enable a group of hosts with private IP addresses on a Local Area Network to access the Internet via some residential gateway: a NAT rule installed on the gateway gives the hosts the ability to access the Web (an example rule for this scenario appears just below). The netfilter subsystem has NAT implementations for IPv4 and for IPv6. The IPv6 NAT implementation is mainly based on the IPv4 implementation and provides, from a user perspective, an interface similar to IPv4's. IPv6 NAT support was merged in kernel 3.7. It provides some features, like an easy solution for load balancing (by setting a DNAT on incoming traffic), and more. The IPv6 NAT module is in net/ipv6/netfilter/ip6table_nat.c.
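To make the residential-gateway scenario concrete, a typical configuration on the gateway could look like the following two commands; this is a hypothetical setup, in which the LAN subnet 192.168.1.0/24 and the external interface eth0 are assumptions for illustration:

sysctl -w net.ipv4.ip_forward=1

iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -o eth0 -j MASQUERADE

The first command enables IPv4 forwarding on the gateway; the second adds a MASQUERADE rule to the NAT table. MASQUERADE is a form of source NAT (SNAT, described next) that always uses the current address of the outgoing interface, which is convenient when the gateway's external address is assigned dynamically.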
There are many types of NAT setups, and there is a lot of documentation on the Web about NAT administration. I talk about two common configurations: SNAT, source NAT, where the source IP address is changed, and DNAT, destination NAT, where the destination IP address is changed. You use the -j flag to select SNAT or DNAT. The implementation of both DNAT and SNAT is in net/netfilter/xt_nat.c. The next section discusses NAT initialization.

#### NAT initialization

The NAT table, like the filter table of the previous section, is also an xt_table object. It is registered at all hook points except the NF_INET_FORWARD hook:

static const struct xt_table nf_nat_ipv4_table = {
    .name        = "nat",
    .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
                   (1 << NF_INET_POST_ROUTING) |
                   (1 << NF_INET_LOCAL_OUT) |
                   (1 << NF_INET_LOCAL_IN),
    .me          = THIS_MODULE,
    .af          = NFPROTO_IPV4,
};

(net/ipv4/netfilter/iptable_nat.c)

Registration and unregistration of the NAT table are done by calling the ipt_register_table() and ipt_unregister_table() methods, respectively (net/ipv4/netfilter/iptable_nat.c). The network namespace (struct net) includes an IPv4-specific object (netns_ipv4), which includes a pointer to the IPv4 NAT table (nat_table), as mentioned in the earlier "IPTables" section. The xt_table object created by the ipt_register_table() method is assigned to this nat_table pointer. You also define an array of nf_hook_ops objects and register it:

static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
    /* Before packet filtering, change destination */
    {
        .hook     = nf_nat_ipv4_in,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_NAT_DST,
    },
    /* After packet filtering, change source */
    {
        .hook     = nf_nat_ipv4_out,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_NAT_SRC,
    },
    /* Before packet filtering, change destination */
    {
        .hook     = nf_nat_ipv4_local_fn,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_OUT,
        .priority = NF_IP_PRI_NAT_DST,
    },
    /* After packet filtering, change source */
    {
        .hook     = nf_nat_ipv4_fn,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = NF_IP_PRI_NAT_SRC,
    },
};

Registration of the nf_nat_ipv4_ops array is done in the iptable_nat_init() method:

static int __init iptable_nat_init(void)
{
    int err;
    . . .
    err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
    if (err < 0)
        goto err2;
    return 0;
    . . .
}

(net/ipv4/netfilter/iptable_nat.c)

### NAT Hook Callbacks and Connection Tracking Hook Callbacks

On some hooks, both NAT callbacks and Connection Tracking callbacks are registered. For example, on the NF_INET_PRE_ROUTING hook (the first hook an incoming packet arrives at), there are two registered callbacks: the Connection Tracking callback, ipv4_conntrack_in(), and the NAT callback, nf_nat_ipv4_in(). The priority of the Connection Tracking callback, ipv4_conntrack_in(), is NF_IP_PRI_CONNTRACK (-200), and the priority of the NAT callback, nf_nat_ipv4_in(), is NF_IP_PRI_NAT_DST (-100). Because callbacks on the same hook with lower priorities are invoked first, the Connection Tracking ipv4_conntrack_in() callback, which has a priority of -200, is invoked before the NAT nf_nat_ipv4_in() callback, which has a priority of -100.
See Figure 9-1 for the location of the ipv4_conntrack_in() method and Figure 9-4 for the location of nf_nat_ipv4_in(); both are at the same place, the NF_INET_PRE_ROUTING point. The reason for this ordering is that NAT performs a lookup in the Connection Tracking layer, and if it does not find an entry, NAT does not perform any address translation:

static unsigned int nf_nat_ipv4_fn(unsigned int hooknum,
                                   struct sk_buff *skb,
                                   const struct net_device *in,
                                   const struct net_device *out,
                                   int (*okfn)(struct sk_buff *))
{
    struct nf_conn *ct;
    . . .
    /* Don't try to NAT if this packet is not conntracked */
    if (nf_ct_is_untracked(ct))
        return NF_ACCEPT;
    . . .
}

(net/ipv4/netfilter/iptable_nat.c)

Note

The nf_nat_ipv4_fn() method is called from the NAT PRE_ROUTING callback, nf_nat_ipv4_in().

On the NF_INET_POST_ROUTING hook, there are two registered Connection Tracking callbacks: the ipv4_helper() callback (with a priority of NF_IP_PRI_CONNTRACK_HELPER, which is 300) and the ipv4_confirm() callback (with a priority of NF_IP_PRI_CONNTRACK_CONFIRM, which is INT_MAX, the highest possible priority value). There is also a registered NAT hook callback, nf_nat_ipv4_out(), with a priority of NF_IP_PRI_NAT_SRC, which is 100. As a result, when the NF_INET_POST_ROUTING hook is reached, first the NAT callback, nf_nat_ipv4_out(), is called; then the ipv4_helper() method is called; and the ipv4_confirm() callback is the last to be called. See Figure 9-4.

Let's take a look at a simple DNAT rule and see the traversal of a forwarded packet and the order in which the Connection Tracking callbacks and the NAT callbacks are called (for the sake of simplicity, assume that the filter table is not built in this kernel image). In the setup shown in Figure 9-3, the middle host (the AMD server) runs this DNAT rule:

iptables -t nat -A PREROUTING -j DNAT -p udp --dport 9999 --to-destination 192.168.1.8

The meaning of this DNAT rule is that incoming UDP packets sent to UDP destination port 9999 get their destination IP address changed to 192.168.1.8. The machine on the right (the Linux desktop) sends UDP packets to 192.168.1.9 with a UDP destination port of 9999. On the AMD server, the destination IPv4 address is changed to 192.168.1.8 by the DNAT rule, and the packets are sent to the laptop on the left.

Figure 9-3. A simple setup with a DNAT rule

In Figure 9-4, you can see the traversal of the first UDP packet sent in the setup just described.

Figure 9-4. NAT and netfilter hooks

The generic NAT module is net/netfilter/nf_nat_core.c. The basic elements of the NAT implementation are the nf_nat_l4proto structure (include/net/netfilter/nf_nat_l4proto.h) and the nf_nat_l3proto structure. In kernels prior to 3.7, you will encounter the nf_nat_protocol structure instead, which these two structures replaced as part of adding IPv6 NAT support. These two structures provide protocol-independent NAT core support.

Both of these structures contain a manip_pkt() function pointer that changes the packet headers.
Let's look at an example of the manip_pkt() implementation for the TCP protocol, in net/netfilter/nf_nat_proto_tcp.c:

static bool tcp_manip_pkt(struct sk_buff *skb,
                          const struct nf_nat_l3proto *l3proto,
                          unsigned int iphdroff, unsigned int hdroff,
                          const struct nf_conntrack_tuple *tuple,
                          enum nf_nat_manip_type maniptype)
{
    struct tcphdr *hdr;
    __be16 *portptr, newport, oldport;
    int hdrsize = 8; /* TCP connection tracking guarantees this much */

    /* this could be an inner header returned in icmp packet; in such
       cases we cannot update the checksum field since it is outside of
       the 8 bytes of transport layer headers we are guaranteed */
    if (skb->len >= hdroff + sizeof(struct tcphdr))
        hdrsize = sizeof(struct tcphdr);

    if (!skb_make_writable(skb, hdroff + hdrsize))
        return false;

    hdr = (struct tcphdr *)(skb->data + hdroff);

Set newport according to maniptype:

* If you need to change the source port, maniptype is NF_NAT_MANIP_SRC, so you extract the port from tuple->src.

* If you need to change the destination port, maniptype is NF_NAT_MANIP_DST, so you extract the port from tuple->dst:

    if (maniptype == NF_NAT_MANIP_SRC) {
        /* Get rid of src port */
        newport = tuple->src.u.tcp.port;
        portptr = &hdr->source;
    } else {
        /* Get rid of dst port */
        newport = tuple->dst.u.tcp.port;
        portptr = &hdr->dest;
    }

You are going to change the source port (when maniptype is NF_NAT_MANIP_SRC) or the destination port (when maniptype is NF_NAT_MANIP_DST) of the TCP header, so you need to recalculate the checksum. You must keep the old port for the checksum recalculation, which is done immediately afterward by calling the csum_update() method and the inet_proto_csum_replace2() method:

    oldport = *portptr;
    *portptr = newport;

    if (hdrsize < sizeof(*hdr))
        return true;

Recalculate the checksum:

    l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
    inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
    return true;
}

### NAT Hook Callbacks

The protocol-specific NAT module is net/ipv4/netfilter/iptable_nat.c for the IPv4 protocol and net/ipv6/netfilter/ip6table_nat.c for the IPv6 protocol. These two NAT modules have four hook callbacks each, shown in Table 9-1.

Table 9-1. IPv4 and IPv6 NAT Callbacks

Hook | Hook Callback (IPv4) | Hook Callback (IPv6)
---|---|---
NF_INET_PRE_ROUTING | nf_nat_ipv4_in | nf_nat_ipv6_in
NF_INET_POST_ROUTING | nf_nat_ipv4_out | nf_nat_ipv6_out
NF_INET_LOCAL_OUT | nf_nat_ipv4_local_fn | nf_nat_ipv6_local_fn
NF_INET_LOCAL_IN | nf_nat_ipv4_fn | nf_nat_ipv6_fn

The nf_nat_ipv4_fn() method is the most important of these methods (for IPv4). The other three methods, nf_nat_ipv4_in(), nf_nat_ipv4_out(), and nf_nat_ipv4_local_fn(), all invoke the nf_nat_ipv4_fn() method. Let's take a look at the nf_nat_ipv4_fn() method:

static unsigned int nf_nat_ipv4_fn(unsigned int hooknum,
                                   struct sk_buff *skb,
                                   const struct net_device *in,
                                   const struct net_device *out,
                                   int (*okfn)(struct sk_buff *))
{
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;
    struct nf_conn_nat *nat;
    /* maniptype == SRC for postrouting. */
    enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);

    /* We never see fragments: conntrack defrags on pre-routing
     * and local-out, and nf_nat_out protects post-routing.
     */
    NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));

    ct = nf_ct_get(skb, &ctinfo);

    /* Can't track?
     * It's not due to stress, or conntrack would
     * have dropped it. Hence it's the user's responsibilty to
     * packet filter it out, or implement conntrack/NAT for that
     * protocol. 8) --RR
     */
    if (!ct)
        return NF_ACCEPT;

    /* Don't try to NAT if this packet is not conntracked */
    if (nf_ct_is_untracked(ct))
        return NF_ACCEPT;

    nat = nfct_nat(ct);
    if (!nat) {
        /* NAT module was loaded late. */
        if (nf_ct_is_confirmed(ct))
            return NF_ACCEPT;
        nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
        if (nat == NULL) {
            pr_debug("failed to add NAT extension\n");
            return NF_ACCEPT;
        }
    }

    switch (ctinfo) {
    case IP_CT_RELATED:
    case IP_CT_RELATED_REPLY:
        if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
            if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
                                               hooknum))
                return NF_DROP;
            else
                return NF_ACCEPT;
        }
        /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
    case IP_CT_NEW:
        /* Seen it before?  This can happen for loopback, retrans,
         * or local packets.
         */
        if (!nf_nat_initialized(ct, maniptype)) {
            unsigned int ret;

The nf_nat_rule_find() method calls the ipt_do_table() method, which iterates through all the matches of an entry in the specified table, and if there is a match, calls the target callback:

            ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
            if (ret != NF_ACCEPT)
                return ret;
        } else {
            pr_debug("Already setup manip %s for ct %p\n",
                     maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
                     ct);
            if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
                goto oif_changed;
        }
        break;
    default:
        /* ESTABLISHED */
        NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
                     ctinfo == IP_CT_ESTABLISHED_REPLY);
        if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
            goto oif_changed;
    }

    return nf_nat_packet(ct, ctinfo, hooknum, skb);

oif_changed:
    nf_ct_kill_acct(ct, ctinfo, skb);
    return NF_DROP;
}

### Connection Tracking Extensions

Connection Tracking (CT) Extensions were added in kernel 2.6.23. The main point of Connection Tracking Extensions is to allocate only what is required: for example, if the NAT module is not loaded, the extra memory needed for NAT in the Connection Tracking layer is not allocated. Some extensions are enabled by sysctls, or even depending on whether certain iptables rules are used (for example, -m connlabel). Each Connection Tracking Extension module should define an nf_ct_ext_type object and register it with the nf_ct_extend_register() method (unregistration is done by the nf_ct_extend_unregister() method). Each extension should define a method that attaches its Connection Tracking Extension to a connection (nf_conn) object; this method should be called from the init_conntrack() method. Thus, for example, there is the nf_ct_tstamp_ext_add() method for the timestamp CT Extension and the nf_ct_labels_ext_add() method for the labels CT Extension. The Connection Tracking Extensions infrastructure is implemented in net/netfilter/nf_conntrack_extend.c. These are the Connection Tracking Extensions modules as of this writing (all under net/netfilter):

* nf_conntrack_timestamp.c

* nf_conntrack_timeout.c

* nf_conntrack_acct.c

* nf_conntrack_ecache.c

* nf_conntrack_labels.c

* nf_conntrack_helper.c

## Summary

This chapter described the netfilter subsystem implementation. I covered the netfilter hooks and how they are registered. I also discussed important subjects such as the Connection Tracking mechanism, iptables, and NAT.
Chapter 10 deals with the IPsec subsystem and its implementation.

## Quick Reference

This section covers the top methods related to the topics discussed in this chapter, ordered by their context, followed by three tables and a short section about tools and libraries.

### Methods

The following is a short list of important methods of the netfilter subsystem. Some of them were mentioned in this chapter.

#### struct xt_table *ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl);

This method registers a table in the netfilter subsystem.

#### void ipt_unregister_table(struct net *net, struct xt_table *table);

This method unregisters a table in the netfilter subsystem.

#### int nf_register_hook(struct nf_hook_ops *reg);

This method registers a single nf_hook_ops object.

#### int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);

This method registers an array of nf_hook_ops objects; the second parameter is the number of elements in the array.

#### void nf_unregister_hook(struct nf_hook_ops *reg);

This method unregisters a single nf_hook_ops object.

#### void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);

This method unregisters an array of nf_hook_ops objects; the second parameter is the number of elements in the array.

#### static inline void nf_conntrack_get(struct nf_conntrack *nfct);

This method increments the reference count of the associated nf_conntrack object.

#### static inline void nf_conntrack_put(struct nf_conntrack *nfct);

This method decrements the reference count of the associated nf_conntrack object. If it reaches 0, the nf_conntrack_destroy() method is called.

#### int nf_conntrack_helper_register(struct nf_conntrack_helper *me);

This method registers an nf_conntrack_helper object.

#### static inline struct nf_conn *resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, int *set_reply, enum ip_conntrack_info *ctinfo);

This method tries to find an nf_conntrack_tuple_hash object according to the specified SKB by calling the __nf_conntrack_find_get() method, and if it does not find such an entry, it creates one by calling the init_conntrack() method. The resolve_normal_ct() method is called from the nf_conntrack_in() method (net/netfilter/nf_conntrack_core.c).

#### struct nf_conntrack_tuple_hash *init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash);

This method allocates a Connection Tracking nf_conntrack_tuple_hash object. Invoked from the resolve_normal_ct() method, it tries to find an expectation for this connection by calling the nf_ct_find_expectation() method.

#### static struct nf_conn *__nf_conntrack_alloc(struct net *net, u16 zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash);

This method allocates an nf_conn object. It sets the timeout timer of the nf_conn object to the death_by_timeout() method.

#### int xt_register_target(struct xt_target *target);

This method registers an Xtables target extension.

#### void xt_unregister_target(struct xt_target *target);

This method unregisters an Xtables target extension.
#### int xt_register_targets(struct xt_target *target, unsigned int n);

This method registers an array of Xtables target extensions; n is the number of targets.

#### void xt_unregister_targets(struct xt_target *target, unsigned int n);

This method unregisters an array of Xtables target extensions; n is the number of targets.

#### int xt_register_match(struct xt_match *target);

This method registers an Xtables match extension.

#### void xt_unregister_match(struct xt_match *target);

This method unregisters an Xtables match extension.

#### int xt_register_matches(struct xt_match *match, unsigned int n);

This method registers an array of Xtables match extensions; n is the number of matches.

#### void xt_unregister_matches(struct xt_match *match, unsigned int n);

This method unregisters an array of Xtables match extensions; n is the number of matches.

#### int nf_ct_extend_register(struct nf_ct_ext_type *type);

This method registers a Connection Tracking Extension object.

#### void nf_ct_extend_unregister(struct nf_ct_ext_type *type);

This method unregisters a Connection Tracking Extension object.

#### int __init iptable_nat_init(void);

This method initializes the IPv4 NAT table.

#### int __init nf_conntrack_ftp_init(void);

This method initializes the Connection Tracking FTP Helper. It calls the nf_conntrack_helper_register() method to register the FTP helpers.

### MACRO

Let's look at the macro used in this chapter.

#### NF_CT_DIRECTION(hash)

This macro gets an nf_conntrack_tuple_hash object as a parameter and returns the direction (IP_CT_DIR_ORIGINAL, which is 0, or IP_CT_DIR_REPLY, which is 1) of the destination (the dst object) of the associated tuple (include/net/netfilter/nf_conntrack_tuple.h).

### Tables

And here are the tables, showing the netfilter tables in the IPv4 and IPv6 network namespaces and the netfilter hook priorities.

Table 9-2. IPv4 Network Namespace (netns_ipv4) Tables (xt_table Objects)

Linux Symbol (netns_ipv4) | Linux Module
---|---
iptable_filter | net/ipv4/netfilter/iptable_filter.c
iptable_mangle | net/ipv4/netfilter/iptable_mangle.c
iptable_raw | net/ipv4/netfilter/iptable_raw.c
arptable_filter | net/ipv4/netfilter/arp_tables.c
nat_table | net/ipv4/netfilter/iptable_nat.c
iptable_security | net/ipv4/netfilter/iptable_security.c (Note: CONFIG_SECURITY should be set.)

Table 9-3. IPv6 Network Namespace (netns_ipv6) Tables (xt_table Objects)

Linux Symbol (netns_ipv6) | Linux Module
---|---
ip6table_filter | net/ipv6/netfilter/ip6table_filter.c
ip6table_mangle | net/ipv6/netfilter/ip6table_mangle.c
ip6table_raw | net/ipv6/netfilter/ip6table_raw.c
ip6table_nat | net/ipv6/netfilter/ip6table_nat.c
ip6table_security | net/ipv6/netfilter/ip6table_security.c (Note: CONFIG_SECURITY should be set.)

Table 9-4. Netfilter Hook Priorities

Linux Symbol | Value
---|---
NF_IP_PRI_FIRST | INT_MIN
NF_IP_PRI_CONNTRACK_DEFRAG | -400
NF_IP_PRI_RAW | -300
NF_IP_PRI_SELINUX_FIRST | -225
NF_IP_PRI_CONNTRACK | -200
NF_IP_PRI_MANGLE | -150
NF_IP_PRI_NAT_DST | -100
NF_IP_PRI_FILTER | 0
NF_IP_PRI_SECURITY | 50
NF_IP_PRI_NAT_SRC | 100
NF_IP_PRI_SELINUX_LAST | 225
NF_IP_PRI_CONNTRACK_HELPER | 300
NF_IP_PRI_CONNTRACK_CONFIRM | INT_MAX
NF_IP_PRI_LAST | INT_MAX

See the nf_ip_hook_priorities enum definition in include/uapi/linux/netfilter_ipv4.h.
#### Tools and Libraries

The conntrack-tools consist of a userspace daemon, conntrackd, and a command line tool, conntrack. They provide a tool with which system administrators can interact with the netfilter Connection Tracking layer. See http://conntrack-tools.netfilter.org/ .

Some libraries developed by the netfilter project allow you to perform various userspace tasks; these libraries are prefixed with "libnetfilter", for example, libnetfilter_conntrack, libnetfilter_log, and libnetfilter_queue. For more details, see the official netfilter website, www.netfilter.org .

# 10. IPsec

Rami Rosen

Haifa, Israel

Abstract

Chapter 9 dealt with the netfilter subsystem and its kernel implementation. This chapter discusses the Internet Protocol Security (IPsec) subsystem. IPsec is a group of protocols for securing IP traffic by authenticating and encrypting each IP packet in a communication session. Most security services are provided by two major IPsec protocols: the Authentication Header (AH) protocol and the Encapsulating Security Payload (ESP) protocol. Moreover, IPsec provides protection against replay attacks, in which an attacker eavesdrops on packets and sends them again. IPsec is mandatory according to the IPv6 specification and optional in IPv4; nevertheless, most modern operating systems, including Linux, support IPsec in both IPv4 and IPv6. The first IPsec protocols were defined in 1995 (RFCs 1825-1829). In 1998, these RFCs were deprecated by RFCs 2401-2412, which were in turn updated in 2005 by RFCs 4301-4309.

The IPsec subsystem is very complex, perhaps the most complex part of the Linux kernel network stack. Its importance is paramount considering the growing security requirements of organizations and of private citizens. This chapter gives you a basis for delving into this complex subsystem.

## General

IPsec has become the standard for most IP Virtual Private Network (VPN) technology in the world. That said, there are also VPNs based on different technologies, such as Secure Sockets Layer (SSL) and PPTP (tunneling a PPP connection over the GRE protocol). Among IPsec's several modes of operation, the most important are transport mode and tunnel mode. In transport mode, only the payload of the IP packet is encrypted, whereas in tunnel mode, the entire IP packet is encrypted and inserted into a new IP packet with a new IP header.
When using a VPN with IPsec, you usually work in tunnel mode, although there are cases in which you work in transport mode (L2TP/IPsec, for example).

I start with a short discussion of the Internet Key Exchange (IKE) userspace daemons and of cryptography in IPsec. These topics are mostly not part of the kernel networking stack, but they are related to IPsec operation and are needed for a better understanding of the kernel IPsec subsystem. I follow with a discussion of the XFRM framework, which is the configuration and monitoring interface between the IPsec userspace part and the IPsec kernel components, and explain the traversal of IPsec packets in the Tx and Rx paths. I conclude the chapter with a short section about NAT traversal in IPsec, which is an important and interesting feature, and a "Quick Reference" section. The next section begins the discussion with the IKE protocol.

## IKE (Internet Key Exchange)

The most popular open source userspace Linux IPsec solutions are Openswan (and libreswan, which forked from Openswan), strongSwan, and racoon (part of ipsec-tools). Racoon is part of the Kame project, which aimed to provide a free IPv6 and IPsec protocol stack implementation for variants of BSD.

To establish an IPsec connection, you need to set up a Security Association (SA). You do that with the help of the userspace projects just mentioned. An SA is defined by two parameters: a source address and a 32-bit Security Parameter Index (SPI). The two sides (called the initiator and the responder in IPsec terminology) should agree on parameters such as a key (or more than one key); the authentication, encryption, data integrity, and key exchange algorithms; and other parameters such as key lifetime (IKEv1 only). Keys can be distributed in two different ways: by manual key exchange, which is rarely used since it is less secure, or by the IKE protocol. The Openswan and strongSwan implementations provide an IKE daemon (pluto in Openswan and charon in strongSwan) that uses UDP port 500 (both source and destination) to send and receive IKE messages. Both use the XFRM netlink interface to communicate with the native IPsec stack of the Linux kernel. The strongSwan project is the only complete open source implementation of RFC 5996, "Internet Key Exchange Protocol Version 2 (IKEv2)," whereas the Openswan project implements only a small mandatory subset.

You can use IKEv1 Aggressive Mode in Openswan and in strongSwan 5.x (in strongSwan, it must be explicitly configured, and the charon daemon is renamed weakSwan in this case), but this option is regarded as unsafe. IKEv1 is still used by Apple operating systems (iOS and Mac OS X) because of the built-in racoon legacy client. Though many implementations use IKEv1, IKEv2 brings many improvements and advantages. I mention some of them very briefly. In IKEv1, more messages are needed to establish an SA than in IKEv2. IKEv1 is very complex, whereas IKEv2 is considerably simpler and more robust, mainly because each IKEv2 request message must be acknowledged by an IKEv2 response message. In IKEv1, there are no acknowledgements, but there is a backoff algorithm that, in case of packet loss, keeps trying forever. However, in IKEv1 there can be a race when the two sides perform retransmission, whereas in IKEv2 that can't happen, because the responsibility for retransmission rests with the initiator only.
Among the other important IKEv2 features are integrated NAT traversal support; automatic narrowing of Traffic Selectors (left|rightsubnet on both sides don't have to match exactly, but one proposal can be a subset of the other); the IKEv2 configuration payload, which allows assigning virtual IPv4/IPv6 addresses and internal DNS information (a replacement for IKEv1 Mode Config); and, finally, IKEv2 EAP authentication (a replacement for the dangerous IKEv1 XAUTH protocol), which solves the problem of potentially weak PSKs by requesting a VPN server certificate and digital signature first, before the client uses a potentially weak EAP authentication algorithm (for example, EAP-MSCHAPv2).

There are two phases in IKE. The first is called Main Mode. In this stage, each side verifies the identity of the other side, and a common session key is established using the Diffie-Hellman key exchange algorithm. This mutual authentication is based on RSA or ECDSA certificates or on pre-shared secrets (pre-shared keys, PSKs), which are password based and assumed to be weaker. Other parameters, like the encryption algorithm and the authentication method to be used, are also negotiated. If this phase completes successfully, the two peers are said to have established an ISAKMP SA (Internet Security Association Key Management Protocol Security Association). The second phase is called Quick Mode. In this phase, both sides agree on the cryptographic algorithms to use. The IKEv2 protocol does not differentiate between phase 1 and phase 2 but establishes the first CHILD_SA as part of the IKE_AUTH message exchange. The CREATE_CHILD_SA message exchange is used only to establish additional CHILD_SAs or for the periodic rekeying of the IKE and IPsec SAs. This is why IKEv1 needs nine messages to establish a single IPsec SA, whereas IKEv2 does the same in just four messages.

The next section briefly discusses cryptography in the context of IPsec (a fuller treatment of the subject would be beyond the scope of this book).

## IPsec and Cryptography

There are two widely used IPsec stacks for Linux: the native Netkey stack (developed by Alexey Kuznetsov and David S. Miller), introduced with the 2.6 kernel, and the KLIPS stack, originally written for the 2.0 kernel (it predates netfilter!). Netkey uses the Linux kernel Crypto API, whereas KLIPS might support more crypto hardware through the Open Cryptography Framework (OCF). OCF's advantage is that it enables using asynchronous calls to encrypt/decrypt data. In the Linux kernel, most of the Crypto API performs synchronous calls. I should mention the acrypto kernel code, which is the asynchronous crypto layer of the Linux kernel. There are asynchronous implementations for all algorithm types. A lot of hardware crypto accelerators use the asynchronous crypto interface for crypto request offloading, simply because they can't block until the crypto job is done; they have to use the asynchronous API.

It is also possible to use software-implemented algorithms with the asynchronous API. For example, the cryptd crypto template can run arbitrary algorithms in asynchronous mode. And you can use the pcrypt crypto template when working in a multicore environment. This template parallelizes the crypto layer by sending incoming crypto requests to a configurable set of CPUs. It also takes care of the order of the crypto requests, so it does not introduce packet reordering when used with IPsec. The use of pcrypt can speed up IPsec by magnitudes in some situations.
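To make this more concrete, the following is a minimal, illustrative sketch (not taken from the kernel sources; the example_alloc_gcm() helper name is hypothetical) of how kernel code might allocate an AEAD transform through the crypto API. Users of this API get an asynchronous completion model; requesting a name such as "pcrypt(rfc4106(gcm(aes)))" instead would wrap the same algorithm with the pcrypt template described above:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/crypto.h>
#include <crypto/aead.h>

/* Illustrative only: allocate and key an AEAD transform (AES-GCM, as
 * used by ESP). Error handling is trimmed for brevity. */
static struct crypto_aead *example_alloc_gcm(const u8 *key, unsigned int keylen)
{
        struct crypto_aead *tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);

        if (IS_ERR(tfm))
                return tfm;
        if (crypto_aead_setkey(tfm, key, keylen)) {
                crypto_free_aead(tfm);
                return ERR_PTR(-EINVAL);
        }
        return tfm;
}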
The crypto layer has a userspace management API, which is used by the crconf ( http://sourceforge.net/projects/crconf/ ) tool to configure the crypto layer, so asynchronous crypto algorithms can be configured whenever needed. With the Linux 2.6.25 kernel, released in 2008, the XFRM framework started to offer support for the very efficient AEAD (Authenticated Encryption with Associated Data) algorithms (for example, AES-GCM); especially when the Intel AES-NI instruction set is available, data integrity comes nearly for free. Delving deeply into the details of cryptography in IPsec is beyond the scope of this book. For further information, I suggest reading the relevant chapters in Network Security Essentials, Fifth Edition by William Stallings (Prentice Hall, 2013).

The next section discusses the XFRM framework, which is the infrastructure of IPsec.

## The XFRM Framework

IPsec is implemented by the XFRM (pronounced "transform") framework, which originated in the USAGI project, an effort to provide a production-quality IPv6 and IPsec protocol stack. The term transform refers to an incoming packet or an outgoing packet being transformed in the kernel stack according to some IPsec rule. The XFRM framework was introduced in kernel 2.5. The XFRM infrastructure is protocol-family independent, which means that there is a generic part common to both IPv4 and IPv6, located under net/xfrm. Both IPv4 and IPv6 have their own implementations of ESP, AH, and IPCOMP. For example, the IPv4 ESP module is net/ipv4/esp4.c, and the IPv6 ESP module is net/ipv6/esp6.c. Apart from that, IPv4 and IPv6 implement some protocol-specific modules for supporting the XFRM infrastructure, such as net/ipv4/xfrm4_policy.c and net/ipv6/xfrm6_policy.c.

The XFRM framework supports network namespaces, a form of lightweight process virtualization that enables a single process or a group of processes to have their own network stack (I discuss network namespaces in Chapter 14). Each network namespace (instance of struct net) includes a member called xfrm, which is an instance of the netns_xfrm structure. This object includes many data structures and variables that you will encounter in this chapter, such as the hash tables of XFRM policies and the hash tables of XFRM states, sysctl parameters, the XFRM state garbage collector, counters, and more:

struct netns_xfrm {
        struct hlist_head *state_bydst;
        struct hlist_head *state_bysrc;
        struct hlist_head *state_byspi;
        . . .
        unsigned int state_num;
        . . .
        struct work_struct state_gc_work;
        . . .
        u32 sysctl_aevent_etime;
        u32 sysctl_aevent_rseqth;
        int sysctl_larval_drop;
        u32 sysctl_acq_expires;
};

(include/net/netns/xfrm.h)

### XFRM Initialization

In IPv4, XFRM initialization is done by calling the xfrm_init() method and the xfrm4_init() method from the ip_rt_init() method in net/ipv4/route.c. In IPv6, the xfrm6_init() method is invoked from the ip6_route_init() method to perform XFRM initialization. Communication between the userspace and the kernel is done by creating a NETLINK_XFRM netlink socket and sending and receiving netlink messages. The NETLINK_XFRM netlink kernel socket is created in the following method:

static int __net_init xfrm_user_net_init(struct net *net)
{
        struct sock *nlsk;
        struct netlink_kernel_cfg cfg = {
                .groups = XFRMNLGRP_MAX,
                .input  = xfrm_netlink_rcv,
        };

        nlsk = netlink_kernel_create(net, NETLINK_XFRM, &cfg);
        . . .
        return 0;
}

Messages sent from userspace (like XFRM_MSG_NEWPOLICY for creating a new Security Policy or XFRM_MSG_NEWSA for creating a new Security Association) are handled by the xfrm_netlink_rcv() method (net/xfrm/xfrm_user.c), which in turn calls the xfrm_user_rcv_msg() method (I discuss netlink sockets in Chapter 2).

The XFRM policy and the XFRM state are the fundamental data structures of the XFRM framework. I start by describing what an XFRM policy is, and subsequently I describe what an XFRM state is.

### XFRM Policies

A Security Policy is a rule that tells IPsec whether a certain flow should be processed or whether it can bypass IPsec processing. The xfrm_policy structure represents an IPsec policy. A policy includes a selector (an xfrm_selector object). A policy is applied when its selector matches a flow. The XFRM selector consists of fields like source and destination addresses, source and destination ports, protocol, and more, which can identify a flow:

struct xfrm_selector {
        xfrm_address_t daddr;
        xfrm_address_t saddr;
        __be16 dport;
        __be16 dport_mask;
        __be16 sport;
        __be16 sport_mask;
        __u16 family;
        __u8 prefixlen_d;
        __u8 prefixlen_s;
        __u8 proto;
        int ifindex;
        __kernel_uid32_t user;
};

(include/uapi/linux/xfrm.h)

The xfrm_selector_match() method, which gets an XFRM selector, a flow, and a family (AF_INET for IPv4 or AF_INET6 for IPv6) as parameters, returns true when the specified flow matches the specified XFRM selector. Note that the xfrm_selector structure is also used in XFRM states, as you will see later in this section. A Security Policy is represented by the xfrm_policy structure:

struct xfrm_policy {
        . . .
        struct hlist_node bydst;
        struct hlist_node byidx;

        /* This lock only affects elements except for entry. */
        rwlock_t lock;
        atomic_t refcnt;
        struct timer_list timer;

        struct flow_cache_object flo;
        atomic_t genid;
        u32 priority;
        u32 index;
        struct xfrm_mark mark;
        struct xfrm_selector selector;
        struct xfrm_lifetime_cfg lft;
        struct xfrm_lifetime_cur curlft;
        struct xfrm_policy_walk_entry walk;
        struct xfrm_policy_queue polq;
        u8 type;
        u8 action;
        u8 flags;
        u8 xfrm_nr;
        u16 family;
        struct xfrm_sec_ctx *security;
        struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
};

(include/net/xfrm.h)

The following description covers the important members of the xfrm_policy structure:

  * refcnt: The XFRM policy reference counter; initialized to 1 in the xfrm_policy_alloc() method, incremented by the xfrm_pol_hold() method, and decremented by the xfrm_pol_put() method.

  * timer: Per-policy timer; the timer callback is set to xfrm_policy_timer() in the xfrm_policy_alloc() method. The xfrm_policy_timer() method handles policy expiration: it is responsible for deleting an expired policy by calling the xfrm_policy_delete() method, and for sending an event (XFRM_MSG_POLEXPIRE) to all registered Key Managers by calling the km_policy_expired() method.

  * lft: The XFRM policy lifetime (xfrm_lifetime_cfg object). Every XFRM policy has a lifetime, which is a time interval (expressed as a time or a byte count).

You can set XFRM policy lifetime values with the ip command and the limit parameter—for example:

ip xfrm policy add src 172.16.2.0/24 dst 172.16.1.0/24 limit byte-soft 6000 ...

sets the soft_byte_limit of the XFRM policy lifetime (lft) to 6000; see man 8 ip xfrm.
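As an illustration of how the soft and hard limits relate, here is a simplified, hypothetical sketch (the helper name and the enum are mine; the real handling lives in the xfrm_policy_timer() method) of comparing a policy's current usage (curlft) against its configured lifetime (lft):

/* Illustrative sketch only: a soft limit should trigger a warning to the
 * key managers (so rekeying can start), whereas a hard limit means the
 * policy must expire. */
enum example_lft_state { LFT_OK, LFT_SOFT_EXPIRED, LFT_HARD_EXPIRED };

static enum example_lft_state example_check_lifetime(const struct xfrm_lifetime_cfg *lft,
                                                     const struct xfrm_lifetime_cur *cur)
{
        if ((lft->hard_byte_limit && cur->bytes >= lft->hard_byte_limit) ||
            (lft->hard_packet_limit && cur->packets >= lft->hard_packet_limit))
                return LFT_HARD_EXPIRED;
        if ((lft->soft_byte_limit && cur->bytes >= lft->soft_byte_limit) ||
            (lft->soft_packet_limit && cur->packets >= lft->soft_packet_limit))
                return LFT_SOFT_EXPIRED;
        return LFT_OK;
}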
You can display the lifetime (lft) of an XFRM policy by inspecting the lifetime configuration entry when running ip -stat xfrm policy show.

  * curlft: The XFRM policy current lifetime, which reflects the current status of the policy in the context of lifetime. The curlft is an xfrm_lifetime_cur object. It consists of four members (all of them unsigned 64-bit fields):

  * bytes: The number of bytes that were processed by the IPsec subsystem, incremented in the Tx path by the xfrm_output_one() method and in the Rx path by the xfrm_input() method.

  * packets: The number of packets that were processed by the IPsec subsystem, incremented in the Tx path by the xfrm_output_one() method and in the Rx path by the xfrm_input() method.

  * add_time: The timestamp of adding the policy, initialized when adding a policy, in the xfrm_policy_insert() method and in the xfrm_sk_policy_insert() method.

  * use_time: The timestamp of the last access to the policy. The use_time timestamp is updated, for example, in the xfrm_lookup() method or in the __xfrm_policy_check() method. Initialized to 0 when adding the XFRM policy, in the xfrm_policy_insert() method and in the xfrm_sk_policy_insert() method.

Note

You can display the current lifetime (curlft) object of an XFRM policy by inspecting the lifetime current entry when running ip -stat xfrm policy show.

  * polq: A queue to hold packets that are sent while there are still no XFRM states associated with the policy. By default, such packets are discarded by calling the make_blackhole() method. When setting the xfrm_larval_drop sysctl entry to 0 (/proc/sys/net/core/xfrm_larval_drop), these packets are kept in a queue (polq.hold_queue) of SKBs; up to 100 packets (XFRM_MAX_QUEUE_LEN) can be kept in this queue. This is done by creating a dummy XFRM bundle with the xfrm_create_dummy_bundle() method (see more in the "XFRM Lookup" section later in this chapter). By default, the xfrm_larval_drop sysctl entry is set to 1 (see the __xfrm_sysctl_init() method in net/xfrm/xfrm_sysctl.c).

  * type: Usually the type is XFRM_POLICY_TYPE_MAIN (0). When the kernel has support for subpolicies (CONFIG_XFRM_SUB_POLICY is set), two policies can be applied to the same packet, and you can use the XFRM_POLICY_TYPE_SUB (1) type. A policy that lives a shorter time in the kernel should be a subpolicy. This feature is usually needed only for developers/debugging and for Mobile IPv6, because you might apply one policy for IPsec and one for Mobile IPv6. The IPsec policy is usually the main policy, with a longer lifetime than the Mobile IPv6 (sub) policy.

  * action: Can have one of these two values:

  * XFRM_POLICY_ALLOW (0): Permit the traffic.

  * XFRM_POLICY_BLOCK (1): Disallow the traffic (for example, when using type=reject or type=drop in /etc/ipsec.conf).

  * xfrm_nr: Number of templates associated with the policy—can be up to six templates (XFRM_MAX_DEPTH). The xfrm_tmpl structure is an intermediate structure between the XFRM state and the XFRM policy. It is initialized in the copy_templates() method, net/xfrm/xfrm_user.c.

  * family: IPv4 or IPv6.

  * security: A security context (xfrm_sec_ctx object) that allows the XFRM subsystem to restrict the sockets that can send or receive packets via Security Associations (XFRM states). For more details, see http://lwn.net/Articles/156604/ .

  * xfrm_vec: An array of XFRM templates (xfrm_tmpl objects).

The kernel stores the IPsec Security Policies in the Security Policy Database (SPD).
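Conceptually, an SPD lookup walks the policies and returns the first one whose selector matches the flow. The following helper is purely hypothetical (the kernel's real lookup is hash-based and honors policy priorities), but it sketches the idea using the xfrm_selector_match() method described earlier:

/* Illustrative sketch only: a naive SPD lookup over one hash chain. */
static struct xfrm_policy *example_spd_lookup(struct hlist_head *chain,
                                              const struct flowi *fl,
                                              unsigned short family)
{
        struct xfrm_policy *pol;

        hlist_for_each_entry(pol, chain, bydst)
                if (xfrm_selector_match(&pol->selector, fl, family))
                        return pol;   /* first matching policy wins here */
        return NULL;                  /* no policy: traffic bypasses IPsec */
}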
Management of the SPD is done by sending messages from a userspace socket. For example:

  * Adding an XFRM policy (XFRM_MSG_NEWPOLICY) is handled by the xfrm_add_policy() method.

  * Deleting an XFRM policy (XFRM_MSG_DELPOLICY) is handled by the xfrm_get_policy() method.

  * Displaying the SPD (XFRM_MSG_GETPOLICY) is handled by the xfrm_dump_policy() method.

  * Flushing the SPD (XFRM_MSG_FLUSHPOLICY) is handled by the xfrm_flush_policy() method.

The next section describes what an XFRM state is.

### XFRM States (Security Associations)

The xfrm_state structure represents an IPsec Security Association (SA) (include/net/xfrm.h). It represents unidirectional traffic and includes information such as cryptographic keys, flags, request id, statistics, replay parameters, and more. You add XFRM states by sending a request (XFRM_MSG_NEWSA) from a userspace socket; it is handled in the kernel by the xfrm_state_add() method (net/xfrm/xfrm_user.c). Likewise, you delete a state by sending an XFRM_MSG_DELSA message, which is handled in the kernel by the xfrm_del_sa() method:

struct xfrm_state {
        . . .
        union {
                struct hlist_node gclist;
                struct hlist_node bydst;
        };
        struct hlist_node bysrc;
        struct hlist_node byspi;

        atomic_t refcnt;
        spinlock_t lock;

        struct xfrm_id id;
        struct xfrm_selector sel;
        struct xfrm_mark mark;
        u32 tfcpad;

        u32 genid;

        /* Key manager bits */
        struct xfrm_state_walk km;

        /* Parameters of this state. */
        struct {
                u32 reqid;
                u8 mode;
                u8 replay_window;
                u8 aalgo, ealgo, calgo;
                u8 flags;
                u16 family;
                xfrm_address_t saddr;
                int header_len;
                int trailer_len;
        } props;

        struct xfrm_lifetime_cfg lft;

        /* Data for transformer */
        struct xfrm_algo_auth *aalg;
        struct xfrm_algo *ealg;
        struct xfrm_algo *calg;
        struct xfrm_algo_aead *aead;

        /* Data for encapsulator */
        struct xfrm_encap_tmpl *encap;

        /* Data for care-of address */
        xfrm_address_t *coaddr;

        /* IPComp needs an IPIP tunnel for handling uncompressed packets */
        struct xfrm_state *tunnel;

        /* If a tunnel, number of users + 1 */
        atomic_t tunnel_users;

        /* State for replay detection */
        struct xfrm_replay_state replay;
        struct xfrm_replay_state_esn *replay_esn;

        /* Replay detection state at the time we sent the last notification */
        struct xfrm_replay_state preplay;
        struct xfrm_replay_state_esn *preplay_esn;

        /* The functions for replay detection. */
        struct xfrm_replay *repl;

        /* internal flag that only holds state for delayed aevent at the
         * moment
         */
        u32 xflags;

        /* Replay detection notification settings */
        u32 replay_maxage;
        u32 replay_maxdiff;

        /* Replay detection notification timer */
        struct timer_list rtimer;

        /* Statistics */
        struct xfrm_stats stats;

        struct xfrm_lifetime_cur curlft;
        struct tasklet_hrtimer mtimer;

        /* used to fix curlft->add_time when changing date */
        long saved_tmo;

        /* Last used time */
        unsigned long lastused;

        /* Reference to data common to all the instances of this
         * transformer. */
        const struct xfrm_type *type;
        struct xfrm_mode *inner_mode;
        struct xfrm_mode *inner_mode_iaf;
        struct xfrm_mode *outer_mode;

        /* Security context */
        struct xfrm_sec_ctx *security;

        /* Private data of this transformer, format is opaque,
         * interpreted by xfrm_type methods.
         */
        void *data;
};

(include/net/xfrm.h)

The following description details some of the important members of the xfrm_state structure:

  * refcnt: A reference counter, incremented by the xfrm_state_hold() method and decremented by the __xfrm_state_put() method or by the xfrm_state_put() method (the latter also releases the XFRM state by calling the __xfrm_state_destroy() method when the reference counter reaches 0).

  * id: The id (xfrm_id object) consists of three fields, which uniquely define the state: destination address, spi, and security protocol (AH, ESP, or IPCOMP).

  * props: The properties of the XFRM state. For example:

  * mode: Can be one of five modes (for example, XFRM_MODE_TRANSPORT for transport mode or XFRM_MODE_TUNNEL for tunnel mode; see include/uapi/linux/xfrm.h).

  * flags: For example, XFRM_STATE_ICMP. These flags are available in include/uapi/linux/xfrm.h. They can be set from userspace, for example, with the ip command and the flag option: ip xfrm state add flag icmp ...

  * family: IPv4 or IPv6.

  * saddr: The source address of the XFRM state.

  * lft: The XFRM state lifetime (xfrm_lifetime_cfg object).

  * stats: An xfrm_stats object, representing XFRM state statistics. You can display the XFRM state statistics with ip -stat xfrm state show.

The kernel stores the IPsec Security Associations in the Security Associations Database (SAD). The xfrm_state objects are stored in three hash tables in netns_xfrm (the XFRM namespace, discussed earlier): state_bydst, state_bysrc, and state_byspi. The keys to these tables are computed by the xfrm_dst_hash(), xfrm_src_hash(), and xfrm_spi_hash() methods, respectively. When an xfrm_state object is added, it is inserted into these three hash tables. If the value of the spi is 0 (the value 0 is normally not used for an spi—I will shortly mention when it is 0), the xfrm_state object is not added to the state_byspi hash table (see the __xfrm_state_insert() method in net/xfrm/xfrm_state.c).

Note

An spi with a value of 0 is used only for acquire states. The kernel sends an acquire message to the key manager and adds a temporary acquire state with spi 0 if traffic matches a policy but the state is not yet resolved. The kernel does not bother to send a further acquire as long as the acquire state exists; the lifetime can be configured at net->xfrm.sysctl_acq_expires. If the state gets resolved, this acquire state is replaced by the actual state.

Lookup in the SAD can be done by the following:

  * xfrm_state_lookup() method: In the state_byspi hash table.

  * xfrm_state_lookup_byaddr() method: In the state_bysrc hash table.

  * xfrm_state_find() method: In the state_bydst hash table.

The ESP protocol is the most commonly used IPsec protocol; it supports both encryption and authentication. The next section discusses the IPv4 ESP implementation.

## ESP Implementation (IPv4)

The ESP protocol is specified in RFC 4303; it supports both encryption and authentication. Though it also supports encryption-only and authentication-only modes, it is usually used with both encryption and authentication, because that is safer. I should also mention here the newer AEAD (Authenticated Encryption with Associated Data) methods, like AES-GCM, which can do the encryption and data integrity computations in a single pass and can be highly parallelized on multiple cores, so that with the Intel AES-NI instruction set, an IPsec throughput of several Gbit/s can be achieved.
The ESP protocol supports both tunnel mode and transport mode; the protocol identifier is 50 (IPPROTO_ESP). ESP adds a new header and a trailer to each packet. According to the ESP format, illustrated in Figure 10-1, it contains the following fields:

  * SPI: A 32-bit Security Parameter Index. Together with the destination address, it identifies an SA.

  * Sequence Number: 32 bits, incremented by 1 for each transmitted packet in order to protect against replay attacks.

  * Payload Data: A variable-size encrypted data block.

  * Padding: Padding for the encrypted data block in order to satisfy alignment requirements (0–255 bytes).

  * Pad Length: The size of the padding in bytes (1 byte).

  * Next Header: The type of the next header (1 byte).

  * Authentication Data: The Integrity Check Value (ICV).

Figure 10-1.

ESP format

The next section discusses IPv4 ESP initialization.

### IPv4 ESP Initialization

We first define an esp_type (xfrm_type object) and an esp4_protocol (net_protocol object) and register them thus:

static const struct xfrm_type esp_type = {
        .description    = "ESP4",
        .owner          = THIS_MODULE,
        .proto          = IPPROTO_ESP,
        .flags          = XFRM_TYPE_REPLAY_PROT,
        .init_state     = esp_init_state,
        .destructor     = esp_destroy,
        .get_mtu        = esp4_get_mtu,
        .input          = esp_input,
        .output         = esp_output
};

static const struct net_protocol esp4_protocol = {
        .handler        = xfrm4_rcv,
        .err_handler    = esp4_err,
        .no_policy      = 1,
        .netns_ok       = 1,
};

static int __init esp4_init(void)
{

Each protocol family has an instance of an xfrm_state_afinfo object, which includes protocol-family-specific state methods; thus there is xfrm4_state_afinfo for IPv4 (net/ipv4/xfrm4_state.c) and xfrm6_state_afinfo for IPv6. This object includes an array of xfrm_type objects called type_map. Registering an XFRM type by calling the xfrm_register_type() method sets the specified xfrm_type as an element in this array:

        if (xfrm_register_type(&esp_type, AF_INET) < 0) {
                pr_info("%s: can't add xfrm type\n", __func__);
                return -EAGAIN;
        }

Registering the IPv4 ESP protocol is done like registering any other IPv4 protocol, by calling the inet_add_protocol() method. Note that the protocol handler used by IPv4 ESP, namely the xfrm4_rcv() method, is also used by the IPv4 AH protocol (net/ipv4/ah4.c) and by the IPv4 IPCOMP (IP Payload Compression Protocol) protocol (net/ipv4/ipcomp.c):

        if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                xfrm_unregister_type(&esp_type, AF_INET);
                return -EAGAIN;
        }
        return 0;
}

(net/ipv4/esp4.c)

## Receiving an IPsec Packet (Transport Mode)

Suppose you work in transport mode in IPv4, and you receive an ESP packet that is destined to the local host. ESP in transport mode does not encrypt the IP header, only the IP payload. Figure 10-2 shows the traversal of an incoming IPv4 ESP packet, and its stages are described in this section. The packet passes all the usual stages of local delivery, starting with the ip_rcv() method, until it reaches the ip_local_deliver_finish() method. Because the value of the protocol field in the IPv4 header is ESP (50), its handler, the xfrm4_rcv() method, is invoked, as you saw earlier. The xfrm4_rcv() method further calls the generic xfrm_input() method, which performs a lookup in the SAD by calling the xfrm_state_lookup() method. If the lookup fails, the packet is dropped.
In case of a lookup hit, the input callback method of the corresponding IPsec protocol is invoked:

int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
        struct xfrm_state *x;
        do {
        . . .

Perform a lookup in the state_byspi hash table:

                x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family);

Drop the packet silently if the lookup failed:

                if (x == NULL) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
                        xfrm_audit_state_notfound(skb, family, spi, seq);
                        goto drop;
                }

In this case of IPv4 ESP incoming traffic, the XFRM type associated with the state (x->type) is the ESP XFRM type (esp_type); its input callback was set to esp_input(), as mentioned earlier in the "IPv4 ESP Initialization" section.

In the following line, calling x->type->input() invokes the esp_input() method; this method returns the protocol number of the original packet, before it was encrypted by ESP:

                nexthdr = x->type->input(x, skb);
                . . .

The original protocol number is kept in the control buffer (cb) of the SKB by using the XFRM_MODE_SKB_CB macro; it will be used later for modifying the IPv4 header of the packet, as you will see:

                XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;

After the esp_input() method terminates, the xfrm4_transport_finish() method is invoked. This method modifies various fields of the IPv4 header. Take a look at the xfrm4_transport_finish() method:

int xfrm4_transport_finish(struct sk_buff *skb, int async)
{
        struct iphdr *iph = ip_hdr(skb);

The protocol of the IPv4 header (iph->protocol) is 50 (ESP) at this point; you should set it to the protocol number of the original packet (before it was encrypted by ESP) so that it will be processed by L4 sockets. The protocol number of the original packet was kept in XFRM_MODE_SKB_CB(skb)->protocol, as you saw earlier in this section:

        iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
        . . .
        __skb_push(skb, skb->data - skb_network_header(skb));
        iph->tot_len = htons(skb->len);

Recalculate the checksum, since the IPv4 header was modified:

        ip_send_check(iph);

Invoke any netfilter NF_INET_PRE_ROUTING hook callback and then call the xfrm4_rcv_encap_finish() method:

        NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
                xfrm4_rcv_encap_finish);
        return 0;
}

The xfrm4_rcv_encap_finish() method calls the ip_local_deliver() method. Now the value of the protocol member in the IPv4 header is the original transport protocol (UDPv4, TCPv4, and so on), so from now on you proceed with the usual packet traversal, and the packet is passed to the transport layer (L4).

Figure 10-2.

Receiving an IPv4 ESP packet, local delivery, transport mode. Note: The figure describes an IPv4 ESP packet. For IPv4 AH packets, the ah_input() method is invoked instead of the esp_input() method; likewise, for IPv4 IPCOMP packets, the ipcomp_input() method is invoked instead of the esp_input() method

## Sending an IPsec Packet (Transport Mode)

Figure 10-3 shows the Tx path of an outgoing packet sent via IPv4 ESP in transport mode. The first step, after performing a lookup in the routing subsystem (by calling the ip_route_output_flow() method), is to perform a lookup for an XFRM policy that can be applied to this flow. You do that by calling the xfrm_lookup() method (I discuss the internals of this method later in this section).
If there is a lookup hit, you continue to the ip_local_out() method, and then, after calling several methods, as you can see in Figure 10-3, you eventually reach the esp_output() method, which encrypts the packet and then sends it out by calling the ip_output() method.

Figure 10-3.

Transmitting an IPv4 ESP packet, transport mode. For the sake of simplicity, the case of creating a dummy bundle (when there are no XFRM states) and some other details are omitted

The following section discusses how a lookup is performed in XFRM.

## XFRM Lookup

The xfrm_lookup() method is called for each packet that is sent out of the system. You want this lookup to be as efficient as possible. To achieve this goal, bundles are used. Bundles let you cache important information such as the route, the policies, the number of policies, and more; these bundles, which are instances of the xfrm_dst structure, are stored by using the flow cache. When the first packet of some flow arrives, you create an entry in the generic flow cache and subsequently create a bundle (xfrm_dst object). The bundle creation is done after a lookup for this bundle fails, because it is the first packet of this flow. When subsequent packets of this flow arrive, you will get a hit when performing a flow cache lookup:

struct xfrm_dst {
        union {
                struct dst_entry dst;
                struct rtable rt;
                struct rt6_info rt6;
        } u;
        struct dst_entry *route;
        struct flow_cache_object flo;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols, num_xfrms;
#ifdef CONFIG_XFRM_SUB_POLICY
        struct flowi *origin;
        struct xfrm_selector *partner;
#endif
        u32 xfrm_genid;
        u32 policy_genid;
        u32 route_mtu_cached;
        u32 child_mtu_cached;
        u32 route_cookie;
        u32 path_cookie;
};

(include/net/xfrm.h)

The xfrm_lookup() method is a very complex method. I discuss its important parts but don't delve into all its nuances. Figure 10-4 shows a block diagram of the internals of the xfrm_lookup() method.

Figure 10-4.

xfrm_lookup() internals

Let's take a look at the xfrm_lookup() method:

struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, struct sock *sk, int flags)
{

The xfrm_lookup() method handles only the Tx path, so the flow direction (dir) is set to FLOW_DIR_OUT by:

        u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);

If a policy is associated with this socket, you perform a lookup with the xfrm_sk_policy_lookup() method, which checks whether the packet flow matches the policy selector. Note that if the packet is to be forwarded, the xfrm_lookup() method was invoked from the __xfrm_route_forward() method, and there is no socket associated with the packet, because it was not generated on the local host; in this case, the specified sk argument is NULL:

        if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
                num_pols = 1;
                pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
                . . .
        }

If there is no policy associated with this socket, you perform a lookup in the generic flow cache by calling the flow_cache_lookup() method, passing as an argument a function pointer to the xfrm_bundle_lookup() method (the resolver callback). The key to the lookup is the flow object (the specified fl parameter). If you don't find an entry in the flow cache, you allocate a new flow cache entry. If you find an entry with the same genid, you call the xfrm_bundle_flo_get() method by invoking flo->ops->get(flo).
Eventually, you call the xfrm_bundle_lookup() method by invoking the resolver callback, which gets the flow object as a parameter (oldflo). See the flow_cache_lookup() method implementation in net/core/flow.c:

        flo = flow_cache_lookup(net, fl, family, dir, xfrm_bundle_lookup, dst_orig);

Fetch the bundle (xfrm_dst object) that contains the flow cache object as a member:

        xdst = container_of(flo, struct xfrm_dst, flo);

Fetch the cached data, like the number of policies, the number of templates, the policies, and the route:

        num_pols = xdst->num_pols;
        num_xfrms = xdst->num_xfrms;
        memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
        route = xdst->route;
        }

        dst = &xdst->u.dst;

Next comes the handling of a dummy bundle. A dummy bundle is a bundle whose route member is NULL. It is created in the XFRM bundle lookup process (by the xfrm_bundle_lookup() method), by calling the xfrm_create_dummy_bundle() method, when no XFRM states were found. In such a case, one of two options is available, according to the value of sysctl_larval_drop (/proc/sys/net/core/xfrm_larval_drop):

  * If sysctl_larval_drop is set (which means its value is 1—it is so by default, as mentioned earlier in this chapter), the packet should be discarded.

  * If sysctl_larval_drop is not set (its value is 0), the packets are kept in a per-policy queue (polq.hold_queue), which can contain up to 100 (XFRM_MAX_QUEUE_LEN) SKBs; this is implemented by the xdst_queue_output() method. These packets are kept until the XFRM states are resolved or until some timeout elapses. Once the states are resolved, the packets are sent out of the queue. If the XFRM states are not resolved after some time interval (the timeout of the xfrm_policy_queue object), the queue is flushed by the xfrm_queue_purge() method:

        if (route == NULL && num_xfrms > 0) {
                /* The only case when xfrm_bundle_lookup() returns a
                 * bundle with null route, is when the template could
                 * not be resolved. It means policies are there, but
                 * bundle could not be created, since we don't yet
                 * have the xfrm_state's. We need to wait for KM to
                 * negotiate new SA's or bail out with error.*/
                if (net->xfrm.sysctl_larval_drop) {

For IPv4, the make_blackhole() method calls the ipv4_blackhole_route() method. For IPv6, it calls the ip6_blackhole_route() method:

                        return make_blackhole(net, family, dst_orig);
                }

The next section covers one of the most important features of IPsec—NAT traversal—and explains what it is and why it is needed.

## NAT Traversal in IPsec

Why don't NAT devices allow IPsec traffic to pass? NAT changes the IP addresses and sometimes also the port numbers of the packet. As a result, it recalculates the checksum of the TCP or the UDP header. The transport layer checksum calculation takes into account the source and destination IP addresses. So even if only the IP addresses were changed, the TCP or UDP checksum must be recalculated. However, with ESP encryption in transport mode, the NAT device can't update the checksum, because the TCP or UDP headers are encrypted with ESP. There are protocols where the checksum does not cover the IP header (like SCTP), so this problem does not occur there. To solve these problems, the NAT traversal standard for IPsec was developed (or, as officially termed in RFC 3948, "UDP Encapsulation of IPsec ESP Packets"). UDP Encapsulation can be applied to IPv4 packets as well as to IPv6 packets.
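It helps to visualize the encapsulation itself. The following struct is purely illustrative (the kernel builds such packets field by field rather than with a single struct): it shows the RFC 3948 layout, in which a NAT device can rewrite the outer IP and UDP headers and fix the UDP checksum without touching the encrypted ESP payload:

#include <linux/ip.h>
#include <linux/udp.h>

/* Illustrative layout only: a UDP-encapsulated ESP packet (RFC 3948). */
struct example_udp_encap_esp {
        struct iphdr      ip;   /* outer IPv4 header, protocol = IPPROTO_UDP */
        struct udphdr     udp;  /* source and destination ports are 4500 */
        struct ip_esp_hdr esp;  /* SPI and sequence number */
        /* followed by the encrypted payload, padding, pad length,
         * next header, and the ICV (Authentication Data) */
};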
NAT traversal solutions are not limited to IPsec traffic; these techniques are typically required for client-to-client networking applications, especially for peer-to-peer and Voice over Internet Protocol (VoIP) applications.

There are some partial solutions for VoIP NAT traversal, such as STUN, TURN, ICE, and more. I should mention here that strongSwan implements the IKEv2 Mediation Extension service ( http://tools.ietf.org/html/draft-brunner-ikev2-mediation-00 ), which allows two VPN endpoints located behind a NAT router each to establish a direct peer-to-peer IPsec tunnel using a mechanism similar to TURN and ICE. STUN, for example, is used in the open source VoIP client Ekiga (formerly GnomeMeeting). The problem with these solutions is that there are NAT devices they don't cope with. Devices called SBCs (session border controllers) provide a full solution for NAT traversal in VoIP. SBCs can be implemented in hardware (Juniper Networks, for example, provides a router-integrated SBC solution) or in software. These SBC solutions perform NAT traversal of the media traffic, which is sent by the Real-time Transport Protocol (RTP), and sometimes also of the signaling traffic, which is sent by the Session Initiation Protocol (SIP). NAT traversal is optional in IKEv2. Openswan, strongSwan, and racoon support NAT traversal, but Openswan and racoon support NAT-T only with IKEv1, whereas strongSwan supports NAT traversal in both IKEv1 and IKEv2.

### NAT-T Mode of Operation

How does NAT traversal work? First, keep in mind that NAT-T is a good solution only for ESP traffic and not for AH. Another restriction is that NAT-T can't be used with manual keying, but only with IKEv1 and IKEv2. This is because NAT-T is tied to the exchange of IKEv1/IKEv2 messages. First, you must tell the userspace daemon (pluto) that you want to use the NAT traversal feature, because it is not activated by default. You do that in Openswan by adding nat_traversal=yes to the connection parameters in /etc/ipsec.conf. Clients not behind a NAT are not affected by the addition of this entry. In strongSwan, the IKEv2 charon daemon always supports NAT traversal, and this feature cannot be deactivated. In the first phase of IKE (Main Mode), you check whether both peers support NAT-T. In IKEv1, when a peer supports NAT-T, one of the ISAKMP header members (the vendor ID) announces NAT-T support. In IKEv2, NAT-T is part of the standard and does not have to be announced. If this condition is met, you check whether there are one or more NAT devices in the path between the two IPsec peers by sending NAT-D payload messages. If this condition is also met, NAT-T protects the original IPsec encoded packet by inserting a UDP header between the IP header and the ESP header. Both the source and destination ports in the UDP header are 4500. In addition, NAT-T sends keep-alive messages every 20 seconds so that the NAT retains its mapping. Keep-alive messages are also sent on UDP port 4500 and are recognized by their content and value (a single byte, 0xFF). When this packet reaches the IPsec peer, after going through the NAT, the kernel strips the UDP header and decrypts the ESP payload. See the xfrm4_udp_encap_rcv() method in net/ipv4/xfrm4_input.c.

## Summary

This chapter covered IPsec and the XFRM framework, which is the infrastructure of IPsec, as well as XFRM policies and states, which are the fundamental data structures of the XFRM framework. I also discussed IKE, the ESP4 implementation, the Rx/Tx path of ESP4 in transport mode, and NAT traversal in IPsec.
Chapter 11 deals with the following transport layer (L4) protocols: UDP, TCP, SCTP, and DCCP. The "Quick Reference" section that follows covers the top methods related to the topics discussed in this chapter, ordered by their context.

## Quick Reference

I conclude this chapter with a short list of important methods of IPsec. Some of them were mentioned in this chapter. Afterward, I include a table of XFRM SNMP MIB counters.

### Methods

Let's start with the methods.

#### bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family);

This method returns true when the specified flow matches the specified XFRM selector. It invokes the __xfrm4_selector_match() method for IPv4 or the __xfrm6_selector_match() method for IPv6.

#### int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, int dir);

This method returns 0 if the specified policy can be applied to the specified flow; otherwise, it returns an -errno value.

#### struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);

This method allocates and initializes an XFRM policy. It sets its reference counter to 1, initializes the read-write lock, assigns the policy namespace (xp_net) to be the specified network namespace, sets its timer callback to be xfrm_policy_timer(), and sets its state resolution packet queue timer (policy->polq.hold_timer) callback to be xfrm_policy_queue_process().

#### void xfrm_policy_destroy(struct xfrm_policy *policy);

This method removes the timer of the specified XFRM policy object and releases the specified XFRM policy memory.

#### void xfrm_pol_hold(struct xfrm_policy *policy);

This method increments by 1 the reference count of the specified XFRM policy.

#### static inline void xfrm_pol_put(struct xfrm_policy *policy);

This method decrements by 1 the reference count of the specified XFRM policy. If the reference count reaches 0, the xfrm_policy_destroy() method is called.

#### struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);

This method returns the xfrm_state_afinfo object associated with the specified protocol family.

#### struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, const struct flowi *fl, struct dst_entry *dst);

This method creates an XFRM bundle. It is called from the xfrm_resolve_and_create_bundle() method.

#### int policy_to_flow_dir(int dir);

This method returns the flow direction according to the specified policy direction. For example, it returns FLOW_DIR_IN when the specified direction is XFRM_POLICY_IN, and so on.

#### static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct dst_entry *dst, const struct flowi *fl, int num_xfrms, u16 family);

This method creates a dummy bundle. It is called from the xfrm_bundle_lookup() method when policies were found but there are no matching states.

#### struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family);

This method allocates an XFRM bundle object. It is called from the xfrm_bundle_create() method and from the xfrm_create_dummy_bundle() method.

#### int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);

This method adds an XFRM policy to the SPD. It is invoked from the xfrm_add_policy() method (net/xfrm/xfrm_user.c) or from the pfkey_spdadd() method (net/key/af_key.c).

#### int xfrm_policy_delete(struct xfrm_policy *pol, int dir);

This method releases the resources of the specified XFRM policy object.
The direction argument (dir) is needed to decrement by 1 the corresponding XFRM policy counter (policy_count) in the per-namespace netns_xfrm object.

#### int xfrm_state_add(struct xfrm_state *x);

This method adds the specified XFRM state to the SAD.

#### int xfrm_state_delete(struct xfrm_state *x);

This method deletes the specified XFRM state from the SAD.

#### void __xfrm_state_destroy(struct xfrm_state *x);

This method releases the resources of an XFRM state by adding it to the XFRM states garbage list and activating the XFRM state garbage collector.

#### int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *data);

This method iterates over all XFRM states (net->xfrm.state_all) and invokes the specified func callback.

#### struct xfrm_state *xfrm_state_alloc(struct net *net);

This method allocates and initializes an XFRM state.

#### void xfrm_queue_purge(struct sk_buff_head *list);

This method flushes the state resolution per-policy queue (polq.hold_queue).

#### int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);

This method is the main Rx IPsec handler.

#### static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig);

This method is invoked from the xfrm_lookup() method when there are no resolved states and sysctl_larval_drop is set. For IPv4, the make_blackhole() method calls the ipv4_blackhole_route() method; for IPv6, it calls the ip6_blackhole_route() method.

#### int xdst_queue_output(struct sk_buff *skb);

This method handles adding packets to the per-policy state resolution packet queue (polq.hold_queue). This queue can contain up to 100 (XFRM_MAX_QUEUE_LEN) packets.

#### struct net *xs_net(struct xfrm_state *x);

This method returns the namespace object (xs_net) associated with the specified xfrm_state object.

#### struct net *xp_net(const struct xfrm_policy *xp);

This method returns the namespace object (xp_net) associated with the specified xfrm_policy object.

#### int xfrm_policy_id2dir(u32 index);

This method returns the direction of the policy according to the specified index.

#### int esp_input(struct xfrm_state *x, struct sk_buff *skb);

This method is the main IPv4 ESP protocol handler.

#### struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb);

This method returns the ESP header associated with the specified SKB.

#### int verify_newpolicy_info(struct xfrm_userpolicy_info *p);

This method verifies that the specified xfrm_userpolicy_info object contains valid values (xfrm_userpolicy_info is the object that is passed from userspace). It returns 0 if it is a valid object, and -EINVAL or -EAFNOSUPPORT if not.

### Table

Table 10-1 lists the XFRM SNMP MIB counters.

Table 10-1.
XFRM SNMP MIB counters

Linux Symbol | SNMP (procfs) Symbol | Methods in Which the Counter Might Be Incremented
---|---|---
LINUX_MIB_XFRMINERROR | XfrmInError | xfrm_input()
LINUX_MIB_XFRMINBUFFERERROR | XfrmInBufferError | xfrm_input(), __xfrm_policy_check()
LINUX_MIB_XFRMINHDRERROR | XfrmInHdrError | xfrm_input(), __xfrm_policy_check()
LINUX_MIB_XFRMINNOSTATES | XfrmInNoStates | xfrm_input()
LINUX_MIB_XFRMINSTATEPROTOERROR | XfrmInStateProtoError | xfrm_input()
LINUX_MIB_XFRMINSTATEMODEERROR | XfrmInStateModeError | xfrm_input()
LINUX_MIB_XFRMINSTATESEQERROR | XfrmInStateSeqError | xfrm_input()
LINUX_MIB_XFRMINSTATEEXPIRED | XfrmInStateExpired | xfrm_input()
LINUX_MIB_XFRMINSTATEMISMATCH | XfrmInStateMismatch | xfrm_input(), __xfrm_policy_check()
LINUX_MIB_XFRMINSTATEINVALID | XfrmInStateInvalid | xfrm_input()
LINUX_MIB_XFRMINTMPLMISMATCH | XfrmInTmplMismatch | __xfrm_policy_check()
LINUX_MIB_XFRMINNOPOLS | XfrmInNoPols | __xfrm_policy_check()
LINUX_MIB_XFRMINPOLBLOCK | XfrmInPolBlock | __xfrm_policy_check()
LINUX_MIB_XFRMINPOLERROR | XfrmInPolError | __xfrm_policy_check()
LINUX_MIB_XFRMOUTERROR | XfrmOutError | xfrm_output_one(), xfrm_output()
LINUX_MIB_XFRMOUTBUNDLEGENERROR | XfrmOutBundleGenError | xfrm_resolve_and_create_bundle()
LINUX_MIB_XFRMOUTBUNDLECHECKERROR | XfrmOutBundleCheckError | xfrm_resolve_and_create_bundle()
LINUX_MIB_XFRMOUTNOSTATES | XfrmOutNoStates | xfrm_lookup()
LINUX_MIB_XFRMOUTSTATEPROTOERROR | XfrmOutStateProtoError | xfrm_output_one()
LINUX_MIB_XFRMOUTSTATEMODEERROR | XfrmOutStateModeError | xfrm_output_one()
LINUX_MIB_XFRMOUTSTATESEQERROR | XfrmOutStateSeqError | xfrm_output_one()
LINUX_MIB_XFRMOUTSTATEEXPIRED | XfrmOutStateExpired | xfrm_output_one()
LINUX_MIB_XFRMOUTPOLBLOCK | XfrmOutPolBlock | xfrm_lookup()
LINUX_MIB_XFRMOUTPOLDEAD | XfrmOutPolDead | n/a
LINUX_MIB_XFRMOUTPOLERROR | XfrmOutPolError | xfrm_bundle_lookup(), xfrm_resolve_and_create_bundle()
LINUX_MIB_XFRMFWDHDRERROR | XfrmFwdHdrError | __xfrm_route_forward()
LINUX_MIB_XFRMOUTSTATEINVALID | XfrmOutStateInvalid | xfrm_output_one()

Note

The IPsec git tree: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git. The ipsec tree is for fixes for the IPsec networking subsystem; the development in this tree is done against David Miller's net git tree.

The ipsec-next git tree: git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git. The ipsec-next tree is for changes for IPsec with linux-next as the target; the development in this tree is done against David Miller's net-next git tree.

The IPsec subsystem maintainers are Steffen Klassert, Herbert Xu, and David S. Miller.

# 11. Layer 4 Protocols

Abstract

Chapter 10 discussed the Linux IPsec subsystem and its implementation. In this chapter, I will discuss four transport layer (L4) protocols. I will start our discussion with the two most commonly used transport layer (L4) protocols, the User Datagram Protocol (UDP) and the Transmission Control Protocol (TCP), which have been used for many years. Subsequently, I will discuss the newer Stream Control Transmission Protocol (SCTP) and Datagram Congestion Control Protocol (DCCP) protocols, which combine features of TCP and UDP.
I will start the chapter by describing the sockets API, which is the interface between the transport layer (L4) and the userspace. I will discuss how sockets are implemented in the kernel and how data flows from the userspace to the transport layer and from the transport layer to the userspace. I will also deal with passing packets from the network layer (L3) to the transport layer (L4) when working with these protocols. I will discuss here mainly the IPv4 implementation of these four protocols, though some of the code is common to IPv4 and IPv6.

## Sockets

Every operating system has to provide an entry point and an API to its networking subsystems. The Linux kernel networking subsystem provides an interface to the userspace via the standard POSIX sockets API, which was specified by the IEEE (IEEE Std 1003.1g-2000, describing networking APIs, also known as POSIX.1g). This API is based on the Berkeley sockets API (also known as BSD sockets), which originated in the 4.2BSD Unix operating system and is an industry standard in several operating systems. In Linux, everything above the transport layer belongs to the userspace. Conforming to the Unix paradigm that "everything is a file," sockets are associated with files, as you will see later in this chapter. Using the uniform sockets API makes porting applications easier. These are the available socket types:

  * Stream sockets (SOCK_STREAM): Provide a reliable, byte-stream communication channel. TCP sockets are an example of stream sockets.

  * Datagram sockets (SOCK_DGRAM): Provide for the exchange of messages (called datagrams). Datagram sockets provide an unreliable communication channel, because packets can be discarded, arrive out of order, or be duplicated. UDP sockets are an example of datagram sockets.

  * Raw sockets (SOCK_RAW): Use direct access to the IP layer and allow sending or receiving traffic without any protocol-specific, transport-layer formatting.

  * Reliably delivered message (SOCK_RDM): Used by the Transparent Inter-Process Communication (TIPC) protocol, which was originally developed at Ericsson from 1996–2005 and was used in cluster applications. See http://tipc.sourceforge.net .

  * Sequenced packet stream (SOCK_SEQPACKET): This socket type is similar to the SOCK_STREAM type and is also connection-oriented. The only difference between these types is that record boundaries are maintained when using the SOCK_SEQPACKET type.
Record boundaries are visible to the receiver via the MSG_EOR (End of Record) flag. The sequenced packet stream type is not discussed in this chapter.

  * DCCP sockets (SOCK_DCCP): The Datagram Congestion Control Protocol is a transport protocol that provides a congestion-controlled flow of unreliable datagrams. It combines features of both TCP and UDP. It is discussed in a later section of this chapter.

  * Data link sockets (SOCK_PACKET): The SOCK_PACKET type is considered obsolete in the AF_INET family. See the __sock_create() method in net/socket.c.

The following is a description of some of the methods that the sockets API provides (all the kernel methods that appear in the following list are implemented in net/socket.c):

  * socket(): Creates a new socket; it will be discussed in the subsection "Creating Sockets."

  * bind(): Associates a socket with a local port and an IP address; implemented in the kernel by the sys_bind() method.

  * send(): Sends a message; implemented in the kernel by the sys_send() method.

  * recv(): Receives a message; implemented in the kernel by the sys_recv() method.

  * listen(): Allows a socket to receive connections from other sockets; implemented in the kernel by the sys_listen() method. Not relevant to datagram sockets.

  * accept(): Accepts a connection on a socket; implemented in the kernel by the sys_accept() method. Relevant only to connection-based socket types (SOCK_STREAM, SOCK_SEQPACKET).

  * connect(): Establishes a connection to a peer socket; implemented in the kernel by the sys_connect() method. Relevant to connection-based socket types (SOCK_STREAM or SOCK_SEQPACKET) as well as to connectionless socket types (SOCK_DGRAM).

This book focuses on the kernel network implementation, so I will not delve into the details of the userspace socket API. If you want more information, I recommend the following books:

  * Unix Network Programming, Volume 1: The Sockets Networking API (3rd Edition) by W. Richard Stevens, Bill Fenner, and Andrew M. Rudoff (Addison-Wesley Professional, 2003).

  * The Linux Programming Interface by Michael Kerrisk (No Starch Press, 2010).

Note

All the socket API calls are handled by the socketcall() method, in net/socket.c.

Now that you have learned about some socket types, you will learn what happens in the kernel when a socket is created. In the next section, I will introduce the two structures that implement sockets, struct socket and struct sock, and describe the difference between them; I will also describe the msghdr struct and its members.

## Creating Sockets

Two structures represent a socket in the kernel. The first is struct socket, which provides an interface to the userspace and is created by the sys_socket() method; I will discuss the sys_socket() method later in this section. The second is struct sock, which provides an interface to the network layer (L3). Since the sock structure resides in the network layer, it is a protocol-agnostic structure. I will also discuss the sock structure later in this section. The socket structure is short:

struct socket {
        socket_state state;

        kmemcheck_bitfield_begin(type);
        short type;
        kmemcheck_bitfield_end(type);

        unsigned long flags;
        . . .
        struct file *file;
        struct sock *sk;
        const struct proto_ops *ops;
};

(include/linux/net.h)

The following is a description of the members of the socket structure:

  * state: A socket can be in one of several states, like SS_UNCONNECTED, SS_CONNECTED, and more. When an INET socket is created, its state is SS_UNCONNECTED; see the inet_create() method. After a stream socket connects successfully to another host, its state is SS_CONNECTED. See the socket_state enum in include/uapi/linux/net.h.

  * type: The type of the socket, like SOCK_STREAM or SOCK_RAW; see the sock_type enum in include/linux/net.h.

  * flags: The socket flags; for example, the SOCK_EXTERNALLY_ALLOCATED flag is set in the TUN device when allocating a socket, not by the socket() system call. See the tun_chr_open() method in drivers/net/tun.c. The socket flags are defined in include/linux/net.h.

  * file: The file associated with the socket.

  * sk: The sock object associated with the socket. The sock object represents the interface to the network layer (L3). When creating a socket, the associated sk object is created. For example, in IPv4, the inet_create() method, which is invoked when creating a socket, allocates a sock object, sk, and associates it with the specified socket object.

  * ops: This object (an instance of the proto_ops object) consists mostly of callbacks for this socket, like connect(), listen(), sendmsg(), recvmsg(), and more. These callbacks are the interface to the userspace. The sendmsg() callback implements several library-level routines, such as write(), send(), sendto(), and sendmsg(). Quite similarly, the recvmsg() callback implements several library-level routines, such as read(), recv(), recvfrom(), and recvmsg(). Each protocol defines a proto_ops object of its own according to the protocol requirements. Thus, for TCP, its proto_ops object includes a listen callback, inet_listen(), and an accept callback, inet_accept(). On the other hand, the UDP protocol, which does not work in the client-server model, defines the listen() callback to be the sock_no_listen() method, and it defines the accept() callback to be the sock_no_accept() method. The only thing that both these methods do is return an error of -EOPNOTSUPP. See Table 11-1 in the "Quick Reference" section at the end of this chapter for the definitions of the TCP and UDP proto_ops objects. The proto_ops structure is defined in include/linux/net.h.

The sock structure is the network-layer representation of sockets; it is quite long, so the following shows only some of its fields that are important for our discussion:

struct sock {
        struct sk_buff_head sk_receive_queue;
        int sk_rcvbuf;
        unsigned long sk_flags;
        int sk_sndbuf;
        struct sk_buff_head sk_write_queue;
        . . .
        unsigned int sk_shutdown : 2,
                     sk_no_check : 2,
                     sk_protocol : 8,
                     sk_type     : 16;
        . . .
        void (*sk_data_ready)(struct sock *sk, int bytes);
        void (*sk_write_space)(struct sock *sk);
};

(include/net/sock.h)

The following is a description of the members of the sock structure:

  * sk_receive_queue: A queue for incoming packets.

  * sk_rcvbuf: The size of the receive buffer in bytes.

  * sk_flags: Various flags, like SOCK_DEAD or SOCK_DBG; see the sock_flags enum definition in include/net/sock.h.

  * sk_sndbuf: The size of the send buffer in bytes.

  * sk_write_queue: A queue for outgoing packets.
Note

You will see later, in the "TCP Socket Initialization" section, how sk_rcvbuf and sk_sndbuf are initialized, and how this can be changed by writing to procfs entries.

* sk_no_check: Disable checksum flag. Can be set with the SO_NO_CHECK socket option.

* sk_protocol: The protocol identifier, which is set according to the third parameter (protocol) of the socket() system call.

* sk_type: The type of the socket, like SOCK_STREAM or SOCK_RAW; see the enum sock_type in include/linux/net.h.

* sk_data_ready: A callback to notify the socket that new data has arrived.

* sk_write_space: A callback to indicate that there is free memory available to proceed with data transmission.

Creating sockets is done by calling the socket() system call from userspace:

sockfd = socket(int socket_family, int socket_type, int protocol);

The following is a description of the parameters of the socket() system call:

* socket_family: Can be, for example, AF_INET for IPv4, AF_INET6 for IPv6, or AF_UNIX for UNIX domain sockets, and so on. (UNIX domain sockets are a form of Inter-Process Communication (IPC) that allows communication between processes running on the same host.)

* socket_type: Can be, for example, SOCK_STREAM for stream sockets, SOCK_DGRAM for datagram sockets, or SOCK_RAW for raw sockets, and so on.

* protocol: Can be any of the following:

  * 0 or IPPROTO_TCP for TCP sockets.

  * 0 or IPPROTO_UDP for UDP sockets.

  * A valid IP protocol identifier (like IPPROTO_TCP or IPPROTO_ICMP) for raw sockets; see RFC 1700, "Assigned Numbers."

The return value of the socket() system call (sockfd) is the file descriptor that should be passed as a parameter to subsequent calls with this socket. The socket() system call is handled in the kernel by the sys_socket() method. Let's take a look at the implementation of the socket() system call:

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;
    . . .
    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;
    . . .
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
    if (retval < 0)
        goto out_release;
out:
    . . .
    return retval;
}

(net/socket.c)

The sock_create() method calls the address-family-specific socket creation method, create(); in the case of IPv4, it is the inet_create() method. (See the inet_family_ops definition in net/ipv4/af_inet.c.) The inet_create() method creates the sock object (sk) that is associated with the socket; the sock object represents the network layer socket interface. The sock_map_fd() method returns an fd (file descriptor) that is associated with the socket; normally, the socket() system call returns this fd.

Sending data from a userspace socket, or receiving data in a userspace socket from the transport layer, is handled in the kernel by the sendmsg() and recvmsg() methods, respectively, which get a msghdr object as a parameter. The msghdr object includes the data blocks to send or to fill, as well as some other parameters.
struct msghdr {
    void *msg_name;                 /* Socket name */
    int msg_namelen;                /* Length of name */
    struct iovec *msg_iov;          /* Data blocks */
    __kernel_size_t msg_iovlen;     /* Number of blocks */
    void *msg_control;              /* Per protocol magic (eg BSD file descriptor passing) */
    __kernel_size_t msg_controllen; /* Length of cmsg list */
    unsigned int msg_flags;
};

(include/linux/socket.h)

The following is a description of some of the important members of the msghdr structure:

* msg_name: The destination socket address. To get the destination socket, you usually cast the msg_name opaque pointer to a struct sockaddr_in pointer. See, for example, the udp_sendmsg() method.

* msg_namelen: The length of the address.

* msg_iov: A vector of data blocks.

* msg_iovlen: The number of blocks in the msg_iov vector.

* msg_control: Control information (also known as ancillary data).

* msg_controllen: The length of the control information.

* msg_flags: Flags of received messages, like MSG_MORE. (See, for example, the section "Sending Packets with UDP" later in this chapter.)

Note that the maximum control buffer length that the kernel can process is limited per socket by the value in sysctl_optmem_max (/proc/sys/net/core/optmem_max).

In this section, I described the kernel implementation of the socket and the msghdr struct, which is used when sending and receiving packets. In the next section, I start the discussion of the transport layer (L4) protocols by describing the UDP protocol, which is the simplest of the protocols discussed in this chapter.

## UDP (User Datagram Protocol)

The UDP protocol, described in RFC 768 from 1980, is a thin layer around the IP layer that adds only port, length, and checksum information. It provides unreliable, message-oriented transport without congestion control. Many protocols use UDP. I will mention, for example, the RTP protocol (Real-time Transport Protocol), which is used for the delivery of audio and video over IP networks. This type of traffic can tolerate some packet loss. RTP is commonly used in VoIP applications, usually in conjunction with SIP (Session Initiation Protocol) based clients. (It should be mentioned here that the RTP protocol can also use TCP, as specified in RFC 4571, but this is not used much.) I should also mention UDP-Lite, which is an extension of the UDP protocol to support variable-length checksums (RFC 3828). Most of UDP-Lite is implemented in net/ipv4/udplite.c, but you will encounter it also in the main UDP module, net/ipv4/udp.c. The UDP header length is 8 bytes:

struct udphdr {
    __be16 source;
    __be16 dest;
    __be16 len;
    __sum16 check;
};

(include/uapi/linux/udp.h)

The following is a description of the members of the UDP header:

* source: The source port (16 bit), in the range 1-65535.

* dest: The destination port (16 bit), in the range 1-65535.

* len: The length in bytes (the payload length and the UDP header length).

* check: The checksum of the packet.

Figure 11-1 shows a UDP header (IPv4).

In this section, you learned about the UDP header and its members. To understand how userspace applications, which use the sockets API, communicate with the kernel (sending and receiving packets), you should know how UDP initialization is done, which is described in the next section.
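Before moving into the kernel, here is a minimal userspace sketch that exercises the sockets API described above: it creates a UDP socket and sends one datagram with sendto(). The address and port are arbitrary values chosen for illustration, not anything mandated by the kernel.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    /* Create a UDP socket; protocol 0 selects UDP for SOCK_DGRAM. */
    int sockfd = socket(AF_INET, SOCK_DGRAM, 0);
    if (sockfd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_in dst;
    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(9999);                 /* arbitrary port */
    inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

    const char msg[] = "hello";
    /* sendto() is handled in the kernel by udp_sendmsg(). */
    if (sendto(sockfd, msg, strlen(msg), 0,
               (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");

    close(sockfd);
    return 0;
}

Note that the destination address passed to sendto() here is what ends up in the msg_name member of the msghdr object that the kernel's udp_sendmsg() method receives.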
### UDP Initialization

We define the udp_protocol object (a net_protocol object) and add it with the inet_add_protocol() method. This sets the udp_protocol object to be an element in the global protocols array (inet_protos):

static const struct net_protocol udp_protocol = {
    .handler     = udp_rcv,
    .err_handler = udp_err,
    .no_policy   = 1,
    .netns_ok    = 1,
};

(net/ipv4/af_inet.c)

static int __init inet_init(void)
{
    . . .
    if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
        pr_crit("%s: Cannot add UDP protocol\n", __func__);
    . . .
}

(net/ipv4/af_inet.c)

We further define a udp_prot object and register it by calling the proto_register() method. This object contains mostly callbacks; these callbacks are invoked when opening a UDP socket in userspace and using the sockets API. For example, calling the setsockopt() system call on a UDP socket will invoke the udp_setsockopt() callback.

struct proto udp_prot = {
    .name       = "UDP",
    .owner      = THIS_MODULE,
    .close      = udp_lib_close,
    .connect    = ip4_datagram_connect,
    .disconnect = udp_disconnect,
    .ioctl      = udp_ioctl,
    . . .
    .setsockopt = udp_setsockopt,
    .getsockopt = udp_getsockopt,
    .sendmsg    = udp_sendmsg,
    .recvmsg    = udp_recvmsg,
    .sendpage   = udp_sendpage,
    . . .
};

(net/ipv4/udp.c)

int __init inet_init(void)
{
    int rc = -EINVAL;
    . . .
    rc = proto_register(&udp_prot, 1);
    . . .
}

(net/ipv4/af_inet.c)

Note

The UDP protocol, along with other core protocols, is initialized via the inet_init() method at boot time.

Now that you know about UDP initialization and its callback for sending packets, which is the udp_sendmsg() callback of the udp_prot object shown in this section, it is time to learn how packets are sent by UDP in IPv4.

### Sending Packets with UDP

Sending data from a UDP userspace socket can be done by several system calls: send(), sendto(), sendmsg(), and write(); eventually all of them are handled by the udp_sendmsg() method in the kernel. The userspace application builds a msghdr object that contains the data blocks and passes this msghdr object to the kernel. Let's take a look at this method:

int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t len)
{

In general, UDP packets are sent immediately. This behavior can be changed with the UDP_CORK socket option (introduced in kernel 2.5.44), which causes packet data passed to the udp_sendmsg() method to be accumulated until the final packet is released by unsetting the option. The same result can be achieved by setting the MSG_MORE flag:

    int corkreq = up->corkflag || msg->msg_flags & MSG_MORE;
    struct inet_sock *inet = inet_sk(sk);
    . . .

First we make some sanity checks. The specified len, for example, cannot be greater than 65535 (remember that the len field in the UDP header is 16 bits):

    if (len > 0xFFFF)
        return -EMSGSIZE;

We need to know the destination address and the destination port in order to build a flowi4 object, which is needed for sending the SKB with the udp_send_skb() method or with the ip_append_data() method. The destination port should not be 0. There are two cases here: either the destination is specified in the msg_name of the msghdr, or the socket is connected and its state is TCP_ESTABLISHED. Note that UDP (in contrast to TCP) is almost a fully stateless protocol; the notion of TCP_ESTABLISHED in UDP mostly means that the socket has passed some sanity checks.
    if (msg->msg_name) {
        struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
        if (msg->msg_namelen < sizeof(*usin))
            return -EINVAL;
        if (usin->sin_family != AF_INET) {
            if (usin->sin_family != AF_UNSPEC)
                return -EAFNOSUPPORT;
        }
        daddr = usin->sin_addr.s_addr;
        dport = usin->sin_port;

The Linux code honors the fact that zero UDP/TCP ports are reserved by the IANA. The reservation of port 0 in TCP and UDP dates back to RFC 1010, "Assigned Numbers" (1987); it was still present in RFC 1700, which was obsoleted by the online IANA database (see RFC 3232), where port 0 remains reserved. See www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml.

        if (dport == 0)
            return -EINVAL;
    } else {
        if (sk->sk_state != TCP_ESTABLISHED)
            return -EDESTADDRREQ;
        daddr = inet->inet_daddr;
        dport = inet->inet_dport;
        /* Open fast path for connected socket.
           Route will not be used, if at least one option is set.
         */
        connected = 1;
    }
    . . .

A userspace application can send control information (also known as ancillary data) by setting msg_control and msg_controllen in the msghdr object. Ancillary data is, in fact, a sequence of cmsghdr objects with appended data. (For more details, see man 3 cmsg.) You can send and receive ancillary data by calling the sendmsg() and recvmsg() methods, respectively. For example, you can create an IP_PKTINFO ancillary message to set a source route on an unconnected UDP socket. (See man 7 ip.) When msg_controllen is not 0, this is a control information message, which is handled by the ip_cmsg_send() method. The ip_cmsg_send() method builds an ipcm_cookie (IP Control Message Cookie) object by parsing the specified msghdr object. The ipcm_cookie structure includes information that is used further when processing the packet. For example, when using an IP_PKTINFO ancillary message, you can set the source address by setting an address field in the control messages, which eventually sets the addr field in the ipcm_cookie object. The ipcm_cookie is a short structure:

struct ipcm_cookie {
    __be32 addr;
    int oif;
    struct ip_options_rcu *opt;
    __u8 tx_flags;
};

(include/net/ip.h)

Let's continue our discussion of the udp_sendmsg() method:

    if (msg->msg_controllen) {
        err = ip_cmsg_send(sock_net(sk), msg, &ipc);
        if (err)
            return err;
        if (ipc.opt)
            free = 1;
        connected = 0;
    }
    . . .
    if (connected)
        rt = (struct rtable *)sk_dst_check(sk, 0);
    . . .

If the routing entry is NULL, a routing lookup should be performed:

    if (rt == NULL) {
        struct net *net = sock_net(sk);

        fl4 = &fl4_stack;
        flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
                           RT_SCOPE_UNIVERSE, sk->sk_protocol,
                           inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP,
                           faddr, saddr, dport, inet->inet_sport);

        security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
        rt = ip_route_output_flow(net, fl4, sk);
        if (IS_ERR(rt)) {
            err = PTR_ERR(rt);
            rt = NULL;
            if (err == -ENETUNREACH)
                IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
            goto out;
        }
    . . .

In kernel 2.6.39, a lockless transmit fast path was added. This means that when the corking feature is not set, we do not hold the socket lock, and we call the udp_send_skb() method; when the corking feature is set, we hold the socket lock by calling the lock_sock() method and then send the packet:

    /* Lockless fast path for the non-corking case.
     */
    if (!corkreq) {
        skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
                          sizeof(struct udphdr), &ipc, &rt,
                          msg->msg_flags);
        err = PTR_ERR(skb);
        if (!IS_ERR_OR_NULL(skb))
            err = udp_send_skb(skb, fl4);
        goto out;
    }

Now we handle the case when the corking feature is set:

    lock_sock(sk);
do_append_data:
    up->len += ulen;

The ip_append_data() method buffers the data for transmission but does not transmit it yet. Subsequently calling the udp_push_pending_frames() method will actually perform the transmission. Note that the udp_push_pending_frames() method also handles fragmentation by the specified getfrag callback:

    err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
                         sizeof(struct udphdr), &ipc, &rt,
                         corkreq ? msg->msg_flags | MSG_MORE : msg->msg_flags);

If the method failed, we should flush all pending SKBs. This is achieved by calling the udp_flush_pending_frames() method, which will free all the SKBs in the write queue of the socket (sk_write_queue) by the ip_flush_pending_frames() method:

    if (err)
        udp_flush_pending_frames(sk);
    else if (!corkreq)
        err = udp_push_pending_frames(sk);
    else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
        up->pending = 0;
    release_sock(sk);

You learned in this section about sending packets with UDP. Now, to complete our discussion of UDP in IPv4, it is time to learn how packets from the network layer (L3) are received with UDP in IPv4.

### Receiving Packets from the Network Layer (L3) with UDP

The main handler for receiving UDP packets from the network layer (L3) is the udp_rcv() method. All it does is invoke the __udp4_lib_rcv() method (net/ipv4/udp.c):

int udp_rcv(struct sk_buff *skb)
{
    return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}

Let's take a look at the __udp4_lib_rcv() method:

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
{
    struct sock *sk;
    struct udphdr *uh;
    unsigned short ulen;
    struct rtable *rt = skb_rtable(skb);
    __be32 saddr, daddr;
    struct net *net = dev_net(skb->dev);
    . . .

We fetch the UDP header, header length, and source and destination addresses from the SKB:

    uh = udp_hdr(skb);
    ulen = ntohs(uh->len);
    saddr = ip_hdr(skb)->saddr;
    daddr = ip_hdr(skb)->daddr;

We will skip some sanity checks that are performed here, like making sure that the UDP header length is not greater than the length of the packet and that the specified proto is the UDP protocol identifier (IPPROTO_UDP). If the packet is a broadcast or a multicast packet, it will be handled by the __udp4_lib_mcast_deliver() method:

    if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        return __udp4_lib_mcast_deliver(net, skb, uh,
                                        saddr, daddr, udptable);

Next we perform a lookup in the UDP sockets hash table:

    sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
    if (sk != NULL) {

We arrive here because the lookup we performed found a matching socket. So we process the SKB further by calling the udp_queue_rcv_skb() method, which invokes the generic sock_queue_rcv_skb() method, which in turn adds the specified SKB to the tail of sk->sk_receive_queue (by calling the __skb_queue_tail() method):

        int ret = udp_queue_rcv_skb(sk, skb);
        sock_put(sk);

        /* a return value > 0 means to resubmit the input, but
         * it wants the return to be -protocol, or 0
         */
        if (ret > 0)
            return -ret;

Everything is fine; return 0 to denote success:

        return 0;
    }
    . . .
We arrive here when the lookup for a socket failed, which means that we should not handle the packet. This can occur, for example, when there is no listening UDP socket on the destination port. If the checksum is incorrect, we drop the packet silently. If it is correct, we send an ICMP reply back to the sender: an ICMP "Destination Unreachable" message with code "Port Unreachable." Further on, we free the packet and update an SNMP MIB counter:

    /* No socket. Drop packet silently, if checksum is wrong */
    if (udp_lib_checksum_complete(skb))
        goto csum_error;

The next command increments the UDP_MIB_NOPORTS (NoPorts) MIB counter. Note that you can query various UDP MIB counters with cat /proc/net/snmp or with netstat -s:

    UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
    icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

    /*
     * Hmm. We got an UDP packet to a port to which we
     * don't wanna listen. Ignore it.
     */
    kfree_skb(skb);
    return 0;

Figure 11-2 illustrates our discussion in this section of receiving UDP packets.

Our discussion of UDP is now finished. The next section describes the TCP protocol, which is the most complex of the protocols discussed in this chapter.

## TCP (Transmission Control Protocol)

The TCP protocol is described in RFC 793 from 1981. In the years since then, there have been many updates, variations, and additions to the base TCP protocol. Some additions were for specific types of networks (high-speed, satellite), whereas others were for performance improvements.

The TCP protocol is the most commonly used transport protocol on the Internet today. Many well-known protocols are based upon TCP. The most well-known is probably HTTP, and we should also mention other well-known protocols such as FTP, SSH, Telnet, SMTP, and SSL. The TCP protocol provides a reliable and connection-oriented transport, as opposed to UDP. Transmission is made reliable by using sequence numbers and acknowledgments.

TCP is a very complex protocol; we will not discuss all the details, optimizations, and nuances of the TCP implementation in this chapter, as that would require a separate book in itself. TCP functionality consists of two ingredients: management of connections, and transmitting and receiving data. We focus in this section on TCP initialization and TCP connection setup, which pertain to the first ingredient, connection management, and on receiving and sending packets, which pertain to the second ingredient. These are the important basics that enable further delving into the TCP protocol implementation. We should note that the TCP protocol self-regulates the byte-stream flow via congestion control. Many different congestion-control algorithms have been specified, and Linux provides a pluggable and configurable architecture to support a wide variety of algorithms. Delving into the details of the individual congestion-control algorithms is beyond the scope of this book.

Every TCP packet starts with a TCP header. You must learn about the TCP header in order to understand the operation of TCP. The next section describes the IPv4 TCP header.
### TCP Header

The TCP header length is 20 bytes, but it scales up to 60 bytes when TCP options are used:

struct tcphdr {
    __be16 source;
    __be16 dest;
    __be32 seq;
    __be32 ack_seq;
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u16 res1:4,
          doff:4,
          fin:1,
          syn:1,
          rst:1,
          psh:1,
          ack:1,
          urg:1,
          ece:1,
          cwr:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
    __u16 doff:4,
          res1:4,
          cwr:1,
          ece:1,
          urg:1,
          ack:1,
          psh:1,
          rst:1,
          syn:1,
          fin:1;
#else
#error "Adjust your defines"
#endif
    __be16 window;
    __sum16 check;
    __be16 urg_ptr;
};

(include/uapi/linux/tcp.h)

The following is a description of the members of the tcphdr structure:

* source: The source port (16 bit), in the range 1-65535.

* dest: The destination port (16 bit), in the range 1-65535.

* seq: The sequence number (32 bits).

* ack_seq: The acknowledgment number (32 bits). If the ACK flag is set, the value of this field is the next sequence number that the receiver is expecting.

* res1: Reserved for future use (4 bits). It should always be set to 0.

* doff: Data offset (4 bits). The size of the TCP header in multiples of 4 bytes; the minimum is 5 (20 bytes) and the maximum is 15 (60 bytes).

The following are the TCP flags; each is 1 bit:

* fin: No more data from sender (used when one of the endpoints wants to close the connection).

* syn: The SYN flag is initially sent when establishing the 3-way handshake between two endpoints.

* rst: The Reset flag is used when a segment that is not intended for the current connection arrives.

* psh: The data should be passed to userspace as soon as possible.

* ack: Signifies that the acknowledgment number (ack_seq) value in the TCP header is meaningful.

* urg: Signifies that the urgent pointer is meaningful.

* ece: The ECN-Echo flag. ECN stands for "Explicit Congestion Notification." ECN provides a mechanism that sends end-to-end notification about network congestion without dropping packets. It was added by RFC 3168, "The Addition of Explicit Congestion Notification (ECN) to IP," from 2001.

* cwr: The Congestion Window Reduced flag.

After the flags come the remaining members:

* window: The TCP receive window size in bytes (16 bit).

* check: The checksum of the TCP header and TCP data.

* urg_ptr: Has significance only when the urg flag is set. It represents an offset from the sequence number indicating the last urgent data byte (16 bit).

Figure 11-3 shows a diagram of a TCP header (IPv4).

In this section, I described the IPv4 TCP header and its members. You saw that, as opposed to the UDP header, which has only 4 members, the TCP header has many more members, since TCP is a much more complex protocol. In the following section, I describe how TCP initialization is done, so that you will learn how and where the callbacks for receiving and sending TCP packets are initialized.

### TCP Initialization

We define the tcp_protocol object (a net_protocol object) and add it with the inet_add_protocol() method:

static const struct net_protocol tcp_protocol = {
    .early_demux = tcp_v4_early_demux,
    .handler     = tcp_v4_rcv,
    .err_handler = tcp_v4_err,
    .no_policy   = 1,
    .netns_ok    = 1,
};

(net/ipv4/af_inet.c)

static int __init inet_init(void)
{
    . . .
    if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
        pr_crit("%s: Cannot add TCP protocol\n", __func__);
    . . .
}

(net/ipv4/af_inet.c)

We further define a tcp_prot object and register it by calling the proto_register() method, as we did with UDP:

struct proto tcp_prot = {
    .name       = "TCP",
    .owner      = THIS_MODULE,
    .close      = tcp_close,
    .connect    = tcp_v4_connect,
    .disconnect = tcp_disconnect,
    .accept     = inet_csk_accept,
    .ioctl      = tcp_ioctl,
    .init       = tcp_v4_init_sock,
    . . .
};

(net/ipv4/tcp_ipv4.c)

static int __init inet_init(void)
{
    int rc;
    . . .
    rc = proto_register(&tcp_prot, 1);
    . . .
}

(net/ipv4/af_inet.c)

Note that in the tcp_prot definition, the init function pointer is set to the tcp_v4_init_sock() callback, which performs various initializations, like setting the timers by calling the tcp_init_xmit_timers() method, setting the socket state, and more. Conversely, in UDP, which is a much simpler protocol, the init function pointer is not defined at all, because there are no special initializations to perform in UDP. We will discuss the tcp_v4_init_sock() callback later in this section.

In the next section, I briefly describe the timers used by the TCP protocol.

### TCP Timers

TCP timers are handled in net/ipv4/tcp_timer.c. There are four timers used by TCP:

* Retransmit timer: Responsible for resending packets that were not acknowledged within a specified time interval. This can happen when a packet gets lost or corrupted. This timer is started after each segment is sent; if an ACK arrives before the timer expires, the timer is canceled.

* Delayed ACK timer: Delays sending ACK packets. It is set when TCP receives data that must be acknowledged but does not need to be acknowledged immediately.

* Keep Alive timer: Checks whether the connection is down. There are cases when sessions are idle for a long time and one side goes down. The Keep Alive timer detects such cases and calls the tcp_send_active_reset() method to reset the connection.

* Zero window probe timer (also known as the persistent timer): When the receive buffer is full, the receiver advertises a zero window and the sender stops sending. Now, if the receiver sends a segment with a new window size and this segment is lost, the sender will keep waiting forever. The solution is this: when the sender gets a zero window, it uses a persistent timer to probe the receiver for its window size; when it gets a non-zero window size, the persistent timer is stopped.

### TCP Socket Initialization

To use a TCP socket, a userspace application should create a SOCK_STREAM socket by calling the socket() system call. This is handled in the kernel by the tcp_v4_init_sock() callback, which invokes the tcp_init_sock() method to do the real work. Note that the tcp_init_sock() method performs address-family-independent initializations, and it is also invoked from the tcp_v6_init_sock() method. The important tasks of the tcp_init_sock() method are the following:

* Set the state of the socket to TCP_CLOSE.

* Initialize the TCP timers by calling the tcp_init_xmit_timers() method.

* Initialize the socket send buffer (sk_sndbuf) and receive buffer (sk_rcvbuf); sk_sndbuf is set to sysctl_tcp_wmem[1], which is by default 16384 bytes, and sk_rcvbuf is set to sysctl_tcp_rmem[1], which is by default 87380 bytes. These default values are set in the tcp_init() method; the default values of the sysctl_tcp_wmem and sysctl_tcp_rmem arrays can be overridden by writing to /proc/sys/net/ipv4/tcp_wmem and /proc/sys/net/ipv4/tcp_rmem, respectively.
See the "TCP Variables" section in Documentation/networking/ip-sysctl.txt.

* Initialize the out-of-order queue and the prequeue.

* Initialize various parameters. For example, the TCP initial congestion window is initialized to 10 segments (TCP_INIT_CWND), according to RFC 6928, "Increasing TCP's Initial Window," from 2013.

Now that you have learned how a TCP socket is initialized, I will discuss how a TCP connection is set up.

### TCP Connection Setup

TCP connection setup and teardown, as well as TCP connection properties, are described as transitions in a state machine. At any given moment, a TCP socket is in exactly one state; for example, the socket enters the TCP_LISTEN state when the listen() system call is invoked. The state of the sock object is represented by its sk_state member. For a list of all available states, refer to include/net/tcp_states.h.

A three-way handshake is used to set up a TCP connection between a TCP client and a TCP server:

* First, the client sends a SYN request to the server. Its state changes to TCP_SYN_SENT.

* The server socket, which is listening (its state is TCP_LISTEN), creates a request socket to represent the new connection in the TCP_SYN_RECV state and sends back a SYN ACK.

* The client that receives the SYN ACK changes its state to TCP_ESTABLISHED and sends an ACK to the server.

* The server receives the ACK and changes the request socket into a child socket in the TCP_ESTABLISHED state, as the connection is now established and data can be sent.

Note

To look further into the details of the TCP state machine, refer to the tcp_rcv_state_process() method (net/ipv4/tcp_input.c), which is the state machine engine, both for IPv4 and for IPv6. (It is called both from the tcp_v4_do_rcv() method and from the tcp_v6_do_rcv() method.)

The next section describes how packets are received from the network layer (L3) with TCP in IPv4.

### Receiving Packets from the Network Layer (L3) with TCP

The main handler for receiving TCP packets from the network layer (L3) is the tcp_v4_rcv() method (net/ipv4/tcp_ipv4.c). Let's take a look at this function:

int tcp_v4_rcv(struct sk_buff *skb)
{
    struct sock *sk;
    . . .

First we make some sanity checks (for example, checking whether the packet type is not PACKET_HOST, or whether the packet size is shorter than the TCP header) and discard the packet if there are any problems. Then some initializations are made, and a lookup for a corresponding socket is performed by calling the __inet_lookup_skb() method, which first performs a lookup in the established sockets hash table by calling the __inet_lookup_established() method. In the case of a lookup miss, it performs a lookup in the listening sockets hash table by calling the __inet_lookup_listener() method. If no socket is found, the packet is discarded at this stage.

    sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
    . . .
    if (!sk)
        goto no_tcp_socket;

Now we check whether the socket is owned by some application. The sock_owned_by_user() macro returns 1 when an application currently owns the socket, and 0 when no application owns it:

    if (!sock_owned_by_user(sk)) {
        . . .
        {

We arrive here if no application owns the socket, so it can accept packets. First we try to put the packet in the prequeue by calling the tcp_prequeue() method, as packets in the prequeue are processed more efficiently.
The tcp_prequeue() method will return false if processing in the prequeue is not possible (for example, when the queue has no space); in such a case, we call the tcp_v4_do_rcv() method, which we discuss shortly:

            if (!tcp_prequeue(sk, skb))
                ret = tcp_v4_do_rcv(sk, skb);
        }

When an application owns the socket, the socket is in a locked state, so it cannot accept packets. In such a case, we add the packet to the backlog by calling the sk_add_backlog() method:

    } else if (unlikely(sk_add_backlog(sk, skb,
                        sk->sk_rcvbuf + sk->sk_sndbuf))) {
        bh_unlock_sock(sk);
        NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
        goto discard_and_relse;
    }
}

Let's take a look at the tcp_v4_do_rcv() method:

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{

If the socket is in the TCP_ESTABLISHED state, we call the tcp_rcv_established() method:

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        . . .
        if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
            rsk = sk;
            goto reset;
        }
        return 0;

If the socket is in the TCP_LISTEN state, we call the tcp_v4_hnd_req() method:

    if (sk->sk_state == TCP_LISTEN) {
        struct sock *nsk = tcp_v4_hnd_req(sk, skb);
    }

If we are not in the TCP_LISTEN state, we invoke the tcp_rcv_state_process() method:

    if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
        rsk = sk;
        goto reset;
    }
    return 0;

reset:
    tcp_v4_send_reset(rsk, skb);
}

In this section, you learned about the reception of a TCP packet. In the next section, we conclude the TCP part of this chapter by describing how packets are sent with TCP in IPv4.

### Sending Packets with TCP

As with UDP, sending packets from TCP sockets that were created in userspace can be done by several system calls: send(), sendto(), sendmsg(), and write(). Eventually, all of them are handled by the tcp_sendmsg() method (net/ipv4/tcp.c). This method copies the payload from userspace into the kernel and sends it as TCP segments. It is much more complicated than the udp_sendmsg() method.

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t size)
{
    struct iovec *iov;
    struct tcp_sock *tp = tcp_sk(sk);
    struct sk_buff *skb;
    int iovlen, flags, err, copied = 0;
    int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
    bool sg;
    long timeo;
    . . .

I will not delve into all the details of copying the data from userspace into the SKB in this method. Once the SKB is built, it is sent with the tcp_push_one() method, which calls the tcp_write_xmit() method, which in turn invokes the tcp_transmit_skb() method:

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                            gfp_t gfp_mask)
{

The icsk_af_ops object (INET Connection Socket ops) is an address-family-specific object. In the case of IPv4 TCP, it is set to an inet_connection_sock_af_ops object named ipv4_specific in the tcp_v4_init_sock() method. The queue_xmit() callback is set to the generic ip_queue_xmit() method. See net/ipv4/tcp_ipv4.c.

    . . .
    err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
    . . .
}

(net/ipv4/tcp_output.c)

Now that you have learned about TCP and UDP, you are ready to proceed to the next section, which deals with SCTP (the Stream Control Transmission Protocol). SCTP combines features of both UDP and TCP, and it is newer than both of them.
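Before turning to SCTP, here is a minimal userspace sketch that ties together the TCP socket calls discussed in this section: socket() triggers tcp_v4_init_sock(), connect() starts the three-way handshake via tcp_v4_connect(), and send() ends up in tcp_sendmsg(). The address and port are arbitrary illustration values.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    /* socket() ends up in inet_create(); the init callback invoked
     * for a SOCK_STREAM INET socket is tcp_v4_init_sock(). */
    int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sockfd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_in srv;
    memset(&srv, 0, sizeof(srv));
    srv.sin_family = AF_INET;
    srv.sin_port = htons(5001);                 /* arbitrary port */
    inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

    /* connect() invokes tcp_v4_connect(); the socket moves to
     * TCP_SYN_SENT and, after the handshake, to TCP_ESTABLISHED. */
    if (connect(sockfd, (struct sockaddr *)&srv, sizeof(srv)) < 0) {
        perror("connect");
        close(sockfd);
        return 1;
    }

    /* send() is handled in the kernel by tcp_sendmsg(). */
    const char msg[] = "hello";
    if (send(sockfd, msg, strlen(msg), 0) < 0)
        perror("send");

    close(sockfd);
    return 0;
}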
## SCTP (Stream Control Transmission Protocol)

The SCTP protocol is specified in RFC 4960 from 2007; it was first specified in 2000. It was designed for Public Switched Telephone Network (PSTN) signaling over IP networks, but it can be used for other applications as well. The IETF SIGTRAN (Signaling Transport) working group originally developed the SCTP protocol and later handed it over to the Transport Area working group (TSVWG) for the continued evolution of SCTP as a general-purpose transport protocol. LTE (Long Term Evolution) uses SCTP; one of the main reasons for this is that SCTP is able to detect very quickly when a link goes down or when packets are dropped, whereas TCP does not have this ability. The SCTP flow-control and congestion-control algorithms are very similar to those of TCP. The SCTP protocol uses a variable for the advertised receiver window size (a_rwnd); this variable represents the current available space in the receiver buffer. The sender cannot send any new data if the receiver indicates that a_rwnd is 0 (no receive space available). The important features of SCTP are the following:

* SCTP combines features of TCP and UDP. It is a reliable transport protocol with congestion control like TCP; it is a message-oriented protocol like UDP, whereas TCP is stream-oriented.

* The SCTP protocol provides improved security with its 4-way handshake (compared to the TCP 3-way handshake) to protect against SYN flooding attacks. I will discuss the 4-way handshake later in this chapter, in the "Setting Up an SCTP Association" section.

* SCTP supports multihoming, that is, multiple IP addresses on both endpoints. This provides a network-level fault-tolerance capability. I will discuss SCTP multihoming later in this chapter.

* SCTP supports multistreaming, which means that it can send streams of data chunks in parallel. This can reduce the latency of streaming multimedia in some environments. I will discuss SCTP chunks later in this section.

* SCTP uses a heartbeat mechanism to detect idle/unreachable peers in the case of multihoming. I will discuss the SCTP heartbeat mechanism later in this chapter.

After this short description of the SCTP protocol, we will now discuss how SCTP initialization is done. The sctp_init() method allocates memory for various structures, initializes some sysctl variables, and registers the SCTP protocol in IPv4 and in IPv6:

int sctp_init(void)
{
    int status = -EINVAL;
    . . .
    status = sctp_v4_add_protocol();
    if (status)
        goto err_add_protocol;

    /* Register SCTP with inet6 layer. */
    status = sctp_v6_add_protocol();
    if (status)
        goto err_v6_add_protocol;
    . . .
}

(net/sctp/protocol.c)

The registration of the SCTP protocol is done by defining an instance of net_protocol (named sctp_protocol for IPv4 and sctpv6_protocol for IPv6) and calling the inet_add_protocol() method, quite similarly to what you saw in other transport protocols, like UDP. We also call the register_inetaddr_notifier() method to receive notifications about the addition or deletion of a network address. These events are handled by the sctp_inetaddr_event() method, which updates the SCTP global address list (sctp_local_addr_list) accordingly.

static const struct net_protocol sctp_protocol = {
    .handler     = sctp_rcv,
    .err_handler = sctp_v4_err,
    .no_policy   = 1,
};

(net/sctp/protocol.c)

static int sctp_v4_add_protocol(void)
{
    /* Register notifier for inet address additions/deletions.
     */
    register_inetaddr_notifier(&sctp_inetaddr_notifier);

    /* Register SCTP with inet layer. */
    if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0)
        return -EAGAIN;

    return 0;
}

(net/sctp/protocol.c)

Note

The sctp_v6_add_protocol() method (net/sctp/ipv6.c) is very similar, so it is not shown here.

Each SCTP packet starts with an SCTP common header. The next section describes this header and the SCTP chunks that follow it.

### SCTP Packets and Chunks

Each SCTP packet has an SCTP common header, which is followed by one or more chunks. Each chunk can contain either data or SCTP control information. Several chunks can be bundled into one SCTP packet (except for three chunks that are used when establishing and terminating a connection: INIT, INIT_ACK, and SHUTDOWN_COMPLETE). These chunks use the Type-Length-Value (TLV) format that you first encountered in Chapter 2.

#### SCTP Common Header

typedef struct sctphdr {
    __be16 source;
    __be16 dest;
    __be32 vtag;
    __le32 checksum;
} __attribute__((packed)) sctp_sctphdr_t;

(include/linux/sctp.h)

The following is a description of the members of the sctphdr structure:

* source: The SCTP source port.

* dest: The SCTP destination port.

* vtag: The Verification Tag, which is a 32-bit random value.

* checksum: The checksum of the SCTP common header and all chunks.

#### SCTP Chunk Header

The SCTP chunk header is represented by struct sctp_chunkhdr:

typedef struct sctp_chunkhdr {
    __u8 type;
    __u8 flags;
    __be16 length;
} __packed sctp_chunkhdr_t;

(include/linux/sctp.h)

The following is a description of the members of the sctp_chunkhdr structure:

* type: The SCTP chunk type. For example, the type of data chunks is SCTP_CID_DATA. See Table 11-2, "Chunk types," in the "Quick Reference" section at the end of this chapter, and also see the chunk ID enum definition (sctp_cid_t) in include/linux/sctp.h.

* flags: Usually, all 8 bits should be set to 0 by the sender and ignored by the receiver. There are cases where different values are used. For example, the ABORT chunk uses the T bit (the LSB): it is set to 0 if the sender filled in the Verification Tag, and it is set to 1 if the Verification Tag is reflected.

* length: The length of the SCTP chunk.

#### SCTP Chunk

The SCTP chunk is represented by struct sctp_chunk. Each chunk object contains the source and destination address for the chunk and a subheader (a member of the subh union) according to its type. For example, data packets have the sctp_datahdr subheader, and the INIT type has the sctp_inithdr subheader:

struct sctp_chunk {
    . . .
    atomic_t refcnt;

    union {
        __u8 *v;
        struct sctp_datahdr *data_hdr;
        struct sctp_inithdr *init_hdr;
        struct sctp_sackhdr *sack_hdr;
        struct sctp_heartbeathdr *hb_hdr;
        struct sctp_sender_hb_info *hbs_hdr;
        struct sctp_shutdownhdr *shutdown_hdr;
        struct sctp_signed_cookie *cookie_hdr;
        struct sctp_ecnehdr *ecne_hdr;
        struct sctp_cwrhdr *ecn_cwr_hdr;
        struct sctp_errhdr *err_hdr;
        struct sctp_addiphdr *addip_hdr;
        struct sctp_fwdtsn_hdr *fwdtsn_hdr;
        struct sctp_authhdr *auth_hdr;
    } subh;

    struct sctp_chunkhdr *chunk_hdr;
    struct sctphdr *sctp_hdr;
    struct sctp_association *asoc;

    /* What endpoint received this chunk? */
    struct sctp_ep_common *rcvr;
    . . .
    /* What is the origin IP address for this chunk? */
    union sctp_addr source;
    /* Destination address for this chunk.
     */
    union sctp_addr dest;
    . . .
    /* For an inbound chunk, this tells us where it came from.
     * For an outbound chunk, it tells us where we'd like it to
     * go. It is NULL if we have no preference.
     */
    struct sctp_transport *transport;
};

(include/net/sctp/structs.h)

We will now describe the SCTP association, which is the counterpart of a TCP connection.

### SCTP Associations

In SCTP, we use the term association instead of connection; a connection refers to communication between two IP addresses, whereas an association refers to communication between two endpoints that might have multiple IP addresses. An SCTP association is represented by struct sctp_association:

struct sctp_association {
    . . .
    sctp_assoc_t assoc_id;

    /* These are those association elements needed in the cookie. */
    struct sctp_cookie c;

    /* This is all information about our peer. */
    struct {
        struct list_head transport_addr_list;
        . . .
        __u16 transport_count;
        __u16 port;
        . . .
        struct sctp_transport *primary_path;
        struct sctp_transport *active_path;
    } peer;

    sctp_state_t state;
    . . .
    struct sctp_priv_assoc_stats stats;
};

(include/net/sctp/structs.h)

The following is a description of some of the important members of the sctp_association structure:

* assoc_id: The unique id of the association. It is set by the sctp_assoc_set_id() method.

* c: The state cookie (an sctp_cookie object) that is attached to the association.

* peer: An inner structure representing the peer endpoint of the association. Adding a peer is done by the sctp_assoc_add_peer() method; removing a peer is done by the sctp_assoc_rm_peer() method. The following is a description of some of the important members of the peer structure:

  * transport_addr_list: Represents one or more addresses of the peer. We can add addresses to this list, or remove addresses from it, with the sctp_connectx() method when an association is established.

  * transport_count: The number of peer addresses in the peer address list (transport_addr_list).

  * primary_path: Represents the address to which the initial connection was made (the INIT <--> INIT_ACK exchange). The association will attempt to always use the primary path if it is active.

  * active_path: The address of the peer that is currently used when sending data.

* state: The state the association is in, like SCTP_STATE_CLOSED or SCTP_STATE_ESTABLISHED. The various SCTP states are discussed later in this section.

Adding multiple local addresses to an SCTP association, or removing multiple addresses from one, can be done with the sctp_bindx() system call, in order to support the multihoming feature mentioned earlier. Every SCTP association includes a peer object, which represents the remote endpoint; the peer object includes a list of one or more addresses of the remote endpoint (transport_addr_list). We can add one or more addresses to this list by calling the sctp_connectx() system call when establishing an association. An SCTP association is created by the sctp_association_new() method and initialized by the sctp_association_init() method. At any given moment, an SCTP association is in one of 8 states; for example, when it is created, its state is SCTP_STATE_CLOSED. Later on, these states can change; see, for example, the "Setting Up an SCTP Association" section later in this chapter. These states are represented by the sctp_state_t enum (include/net/sctp/constants.h).
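The following userspace sketch illustrates sctp_bindx() and sctp_connectx(), using the libsctp library from the lksctp-tools project (mentioned again in a note later in this chapter). All addresses and ports are arbitrary illustration values, and the exact sctp_connectx() signature may vary slightly between lksctp-tools versions; treat this as a sketch under those assumptions, not a definitive recipe.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/sctp.h>   /* from lksctp-tools; link with -lsctp */

int main(void)
{
    int sd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
    if (sd < 0) {
        perror("socket");
        return 1;
    }

    /* Bind two local addresses to support multihoming. */
    struct sockaddr_in laddrs[2];
    memset(laddrs, 0, sizeof(laddrs));
    laddrs[0].sin_family = AF_INET;
    laddrs[0].sin_port = htons(5000);
    inet_pton(AF_INET, "192.0.2.1", &laddrs[0].sin_addr);    /* example addresses */
    laddrs[1].sin_family = AF_INET;
    laddrs[1].sin_port = htons(5000);
    inet_pton(AF_INET, "198.51.100.1", &laddrs[1].sin_addr);

    if (sctp_bindx(sd, (struct sockaddr *)laddrs, 2, SCTP_BINDX_ADD_ADDR) < 0)
        perror("sctp_bindx");

    /* Connect to a peer reachable at two addresses; both end up in the
     * transport_addr_list of the association's peer object. */
    struct sockaddr_in paddrs[2];
    memset(paddrs, 0, sizeof(paddrs));
    paddrs[0].sin_family = AF_INET;
    paddrs[0].sin_port = htons(6000);
    inet_pton(AF_INET, "203.0.113.1", &paddrs[0].sin_addr);
    paddrs[1].sin_family = AF_INET;
    paddrs[1].sin_port = htons(6000);
    inet_pton(AF_INET, "203.0.113.2", &paddrs[1].sin_addr);

    if (sctp_connectx(sd, (struct sockaddr *)paddrs, 2, NULL) < 0)
        perror("sctp_connectx");

    close(sd);
    return 0;
}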
To send data between two endpoints, an initialization process must be completed. In this process, an SCTP association between these two endpoints is set up, and a cookie mechanism is used to provide protection against synchronization attacks. This process is discussed in the following section.

### Setting Up an SCTP Association

The initialization process is a 4-way handshake that consists of the following steps:

* One endpoint ("A") sends an INIT chunk to the endpoint it wants to communicate with ("Z"). This chunk includes a locally generated Tag in the Initiate Tag field of the INIT chunk, and a verification tag (vtag in the SCTP header) with a value of 0 (zero).

* After sending the INIT chunk, the association enters the SCTP_STATE_COOKIE_WAIT state.

* The other endpoint ("Z") sends an INIT-ACK chunk to "A" as a reply. This chunk includes a locally generated Tag in the Initiate Tag field of the INIT-ACK chunk and the remote Initiate Tag as the verification tag (vtag in the SCTP header). "Z" should also generate a state cookie and send it with the INIT-ACK reply.

* When "A" receives the INIT-ACK chunk, it leaves the SCTP_STATE_COOKIE_WAIT state. "A" will use the remote Initiate Tag as the verification tag (vtag in the SCTP header) in all transmitted packets from now on. "A" sends the state cookie it received in a COOKIE ECHO chunk and enters the SCTP_STATE_COOKIE_ECHOED state.

* When "Z" receives the COOKIE ECHO chunk, it builds a TCB (Transmission Control Block). The TCB is a data structure containing connection information on either side of an SCTP connection. "Z" then changes its state to SCTP_STATE_ESTABLISHED and replies with a COOKIE ACK chunk. This is where the association is finally established on "Z", and from this point on, the association uses the saved tags.

* When "A" receives the COOKIE ACK, it moves from the SCTP_STATE_COOKIE_ECHOED state to the SCTP_STATE_ESTABLISHED state.

Note

An endpoint might respond to an INIT, INIT ACK, or COOKIE ECHO chunk with an ABORT chunk when some mandatory parameters are missing, or when it receives invalid parameter values. The cause of the ABORT should be specified in the reply.

Now that you have learned about SCTP associations and how they are created, you will see how packets are received and sent with SCTP.

### Receiving Packets with SCTP

The main handler for receiving SCTP packets is the sctp_rcv() method, which gets an SKB as its single parameter (net/sctp/input.c). First some sanity checks are made (size, checksum, and so on). If everything is fine, we proceed to check whether this packet is an "Out of the Blue" (OOTB) packet. A packet is an OOTB packet if it is correctly formed (that is, has no checksum error), but the receiver is not able to identify the SCTP association to which the packet belongs. (See section 8.4 in RFC 4960.) OOTB packets are handled by the sctp_rcv_ootb() method, which iterates over all the chunks of the packet and takes an action according to the chunk type, as specified in the RFC. Thus, for example, an ABORT chunk is discarded. If the packet is not an OOTB packet, it is put into an SCTP inqueue by calling the sctp_inq_push() method and proceeds on its journey via the sctp_assoc_bh_rcv() method or the sctp_endpoint_bh_rcv() method.

### Sending Packets with SCTP

Writing to a userspace SCTP socket reaches the sctp_sendmsg() method (net/sctp/socket.c).
The packet is passed to the lower layers by calling the sctp_primitive_SEND() method, which in turn calls the state machine callback, sctp_do_sm() (net/sctp/sm_sideeffect.c), with SCTP_ST_PRIMITIVE_SEND. The next stage is to call sctp_side_effects(), and eventually the sctp_packet_transmit() method.

### SCTP HEARTBEAT

The HEARTBEAT mechanism tests the connectivity of a transport or path by exchanging HEARTBEAT and HEARTBEAT-ACK SCTP packets. It declares the transport IP address to be down once the threshold of unreturned heartbeat acknowledgments is reached. A HEARTBEAT chunk is sent every 30 seconds by default to monitor the reachability of an idle destination transport address. This time interval is configurable by setting /proc/sys/net/sctp/hb_interval; the default is 30000 milliseconds (30 seconds). Sending heartbeat chunks is performed by the sctp_sf_sendbeat_8_3() method. The reason for the 8_3 in the method name is that it refers to section 8.3 (Path Heartbeat) in RFC 4960. When an endpoint receives a HEARTBEAT chunk, it replies with a HEARTBEAT-ACK chunk if it is in the SCTP_STATE_COOKIE_ECHOED state or the SCTP_STATE_ESTABLISHED state.

### SCTP Multistreaming

Streams are unidirectional data flows within a single association. The number of Outbound Streams and the number of Inbound Streams are declared during the association setup (by the INIT chunk), and the streams are valid during the entire association lifetime. A userspace application can set the number of streams by creating an sctp_initmsg object, initializing its sinit_num_ostreams and sinit_max_instreams members, and then calling the setsockopt() method with SCTP_INITMSG. Initialization of the number of streams can also be done with the sendmsg() system call. This, in turn, sets the corresponding fields in the initmsg object of the sctp_sock object. One of the biggest reasons streams were added was to remove the Head-of-Line blocking (HoL blocking) condition. Head-of-line blocking is a performance-limiting phenomenon that occurs when a line of packets is held up by the first packet, for example, with multiple requests in HTTP pipelining. With SCTP multistreaming, this problem does not exist, because each stream is sequenced separately and guaranteed to be delivered in order; if one of the streams is blocked due to loss or congestion, the other streams might not be blocked, and data continues to be delivered on them.

Note

Regarding using sockets for SCTP, I should mention the lksctp-tools project ( http://lksctp.sourceforge.net/ ). This project provides a Linux userspace library for SCTP (libsctp), including C language header files (netinet/sctp.h), for accessing SCTP-specific application programming interfaces not provided by the standard sockets, as well as some helper utilities around SCTP. I should also mention RFC 6458, "Sockets API Extensions for Stream Control Transmission Protocol (SCTP)," which describes a mapping of the Stream Control Transmission Protocol (SCTP) into the sockets API.

### SCTP Multihoming

SCTP multihoming refers to having multiple IP addresses on both endpoints. One of the really nice features of SCTP is that endpoints are multihomed by default if the local IP address was specified as a wildcard. There has also been a lot of confusion about the multihoming feature, because people expect that simply by binding to multiple addresses, the associations will end up being multihomed.
This is not true, because only destination multihoming is implemented. In other words, both connected endpoints have to be multihomed for true failover capability. If the local association knows about only a single destination address, there will be only one path and thus no multihoming.

This description of SCTP multihoming concludes the SCTP part of this chapter. The next section describes the DCCP protocol, which is the last transport protocol discussed in this chapter.

## DCCP: The Datagram Congestion Control Protocol

DCCP is an unreliable, congestion-controlled transport layer protocol and, as such, it borrows from both UDP and TCP while adding new features. Like UDP, it is message-oriented and unreliable. Like TCP, it is a connection-oriented protocol, and it also uses a 3-way handshake to set up the connection. The development of DCCP was helped by ideas from academia, through the participation of several research institutes, but it has not been tested so far in larger-scale Internet setups. The use of DCCP would make sense, for instance, in applications that require low delays and where a small degree of data loss is permitted, like telephony and streaming media applications.

Congestion control in DCCP differs from that in TCP in that the congestion-control algorithm (called a CCID) can be negotiated between endpoints, and congestion control can be applied on both the forward and reverse paths of a connection (called half-connections in DCCP). Two classes of pluggable congestion control have been specified so far. The first type is a rate-based, smooth "TCP-friendly" algorithm (CCID-3, RFC 4342 and RFC 5348), for which there is an experimental small-packet variation called CCID-4 (RFC 5622, RFC 4828). The second type of congestion control, "TCP-like" (RFC 4341), applies a basic TCP congestion-control algorithm with selective acknowledgments (SACK, RFC 2018) to DCCP flows. At least one CCID needs to be implemented by endpoints in order to function. The first DCCP Linux implementation was released in Linux kernel 2.6.14 (2005). This chapter describes the implementation principles of DCCPv4 (IPv4). Delving into the implementation details of the individual DCCP congestion-control algorithms is beyond the scope of this book.

Now that I have introduced the DCCP protocol in general, I will describe the DCCP header.

### DCCP Header

Every DCCP packet starts with a DCCP header. The minimum DCCP header length is 12 bytes. DCCP uses a variable-length header, which can range from 12 to 1020 bytes, depending on whether short sequence numbers are used and on which TLV packet options are used. DCCP sequence numbers are incremented for each packet (not per byte as in TCP) and can be shortened from 6 to 3 bytes.
struct dccp_hdr {
    __be16 dccph_sport,
           dccph_dport;
    __u8 dccph_doff;
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u8 dccph_cscov:4,
         dccph_ccval:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
    __u8 dccph_ccval:4,
         dccph_cscov:4;
#else
#error "Adjust your defines"
#endif
    __sum16 dccph_checksum;
#if defined(__LITTLE_ENDIAN_BITFIELD)
    __u8 dccph_x:1,
         dccph_type:4,
         dccph_reserved:3;
#elif defined(__BIG_ENDIAN_BITFIELD)
    __u8 dccph_reserved:3,
         dccph_type:4,
         dccph_x:1;
#else
#error "Adjust your defines"
#endif
    __u8 dccph_seq2;
    __be16 dccph_seq;
};

(include/uapi/linux/dccp.h)

The following is a description of the important members of the dccp_hdr structure:

* dccph_sport: The source port (16 bit).

* dccph_dport: The destination port (16 bit).

* dccph_doff: Data offset (8 bits). The size of the DCCP header in multiples of 4 bytes.

* dccph_cscov: Determines which part of the packet is covered by the checksum. Using partial checksumming might improve performance when it is used with applications that can tolerate corruption of some low percentage of the data.

* dccph_ccval: CCID-specific information from sender to receiver (not always used).

* dccph_x: The Extended Sequence Numbers bit (1 bit). This flag is set when using 48-bit Extended Sequence and Acknowledgment Numbers.

* dccph_type: The DCCP header type (4 bits). This can be, for example, DCCP_PKT_DATA for a data packet or DCCP_PKT_ACK for an ACK. See Table 11-3, "DCCP packet types," in the "Quick Reference" section at the end of this chapter.

* dccph_reserved: Reserved for future use (3 bits).

* dccph_checksum: The checksum (16 bit). The Internet checksum of the DCCP header and data, computed similarly to UDP and TCP. If partial checksums are used, only the length of the application data specified by dccph_cscov is checksummed.

* dccph_seq2: Sequence number. This is used when working with Extended Sequence Numbers (8 bit).

* dccph_seq: Sequence number. It is incremented by 1 for each packet (16 bit).

Note

DCCP sequence numbers depend on dccph_x. (For details, refer to the dccp_hdr_seq() method, include/linux/dccp.h.)

Figure 11-4 shows a DCCP header with the dccph_x flag set, so 48-bit Extended Sequence Numbers are used.

Figure 11-5 shows a DCCP header with the dccph_x flag not set, so 24-bit Sequence Numbers are used.

### DCCP Initialization

DCCP initialization happens much as in TCP and UDP. Considering the DCCPv4 case (net/dccp/ipv4.c), first a proto object (dccp_v4_prot) is defined and its DCCP-specific callbacks are set; we also define a net_protocol object (dccp_v4_protocol) and initialize it:

static struct proto dccp_v4_prot = {
    .name       = "DCCP",
    .owner      = THIS_MODULE,
    .close      = dccp_close,
    .connect    = dccp_v4_connect,
    .disconnect = dccp_disconnect,
    .ioctl      = dccp_ioctl,
    .init       = dccp_v4_init_sock,
    . . .
    .sendmsg    = dccp_sendmsg,
    .recvmsg    = dccp_recvmsg,
    . . .
+ +} + +(net/dccp/ipv4.c) + +static const struct net_protocol dccp_v4_protocol = { + +.handler = dccp_v4_rcv, + +.err_handler = dccp_v4_err, + +.no_policy = 1, + +.netns_ok = 1, + +}; + +(net/dccp/ipv4.c) + +We register the dccp_v4_prot object and the dccp_v4_protocol object in the dccp_v4_init() method: + +static int __init dccp_v4_init(void) + +{ + +int err = proto_register(&dccp_v4_prot, 1); + +if (err != 0) + +goto out; + +err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); + +if (err != 0) + +goto out_proto_unregister; + +(net/dccp/ipv4.c) + +### DCCP Socket Initialization + +Socket creation in DCCP from userspace uses the socket() system call, where the domain argument (SOCK_DCCP) indicates that a DCCP socket is to be created. Within the kernel, this causes DCCP socket initialization via the dccp_v4_init_sock() callback, which relies on the dccp_init_sock() method to perform the actual work: + +static int dccp_v4_init_sock(struct sock *sk) + +{ + +static __u8 dccp_v4_ctl_sock_initialized; + +int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); + +if (err == 0) { + +if (unlikely(!dccp_v4_ctl_sock_initialized)) + +dccp_v4_ctl_sock_initialized = 1; + +inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; + +} + +return err; + +} + +(net/dccp/ipv4.c) + +The most important tasks of the dccp_init_sock() method are these: + + * Initialization of the DCCP socket fields with sane default values (for example, the socket state is set to be DCCP_CLOSED) + + * Initialization of the DCCP timers (via the dccp_init_xmit_timers() method) + + * Initialization of the feature-negotiation part via calling the dccp_feat_init() method. Feature negotiation is a distinguishing feature of DCCP by which endpoints can mutually agree on properties of each side of the connection. It extends TCP feature negotiation and is described further in RFC 4340, sec. 6. + +### Receiving Packets from the Network Layer (L3) with DCCP + +The main handler for receiving DCCP packets from the network layer (L3) is the dccp_v4_rcv () method: + +static int dccp_v4_rcv(struct sk_buff *skb) + +{ + +const struct dccp_hdr *dh; + +const struct iphdr *iph; + +struct sock *sk; + +int min_cov; + +First we discard invalid packets. For example, if the packet is not for this host (the packet type is not PACKET_HOST), or if the packet size is shorter than the DCCP header (which is 12 bytes): + +if (dccp_invalid_packet(skb)) + +goto discard_it; + +Then we perform a lookup according to the flow: + +sk = __inet_lookup_skb(&dccp_hashinfo, skb, + +dh->dccph_sport, dh->dccph_dport); + +If no socket was found, the packet is dropped: + +if (sk == NULL) { + +. . . + +goto no_dccp_socket; + +} + +We make some more checks relating to Minimum Checksum Coverage, and if everything is fine, we proceed to the generic sk_receive_skb() method to pass the packet to the transport layer (L4). Note that the dccp_v4_rcv() method is very similar in structure and function to the tcp_v4_rcv() method. This is because the original author of DCCP in Linux, Arnaldo Carvalho de Melo, has worked quite hard to make the similarities between TCP and DCCP obvious and clear in the code. + +. . . + +return sk_receive_skb(sk, skb, 1); + +} + +(net/dccp/ipv4.c) + +### Sending Packets with DCCP + +Sending data from a DCCP userspace socket is eventually handled by the dccp_sendmsg() method in the kernel (net/dccp/proto.c). This parallels the TCP case, where the tcp_sendmsg() kernel method handles sending data from a TCP userspace socket. 
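Before looking at the kernel path, it may help to see the userspace side that eventually triggers it. The following is a minimal, illustrative DCCP client sketch (not taken from the book or the kernel tree); the server address and port are arbitrary example values:

```c
/* Minimal DCCP client sketch (illustrative only). SOCK_DCCP (6) and
 * IPPROTO_DCCP (33) may already be defined by the C library headers. */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef SOCK_DCCP
#define SOCK_DCCP 6
#endif
#ifndef IPPROTO_DCCP
#define IPPROTO_DCCP 33
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port   = htons(5001),	/* arbitrary example port */
	};
	int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

	if (fd < 0) {
		perror("socket");	/* fails if DCCP support is missing */
		return 1;
	}
	inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr);
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0)
		write(fd, "hello", 5);	/* ends up in dccp_sendmsg() in the kernel */
	close(fd);
	return 0;
}
```

Note that SOCK_DCCP is the socket type (unlike the more common SOCK_STREAM or SOCK_DGRAM), and that IPPROTO_DCCP is passed explicitly as the protocol.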
Let's take a look at the dccp_sendmsg() method:

```c
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	const int flags = msg->msg_flags;
	const int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int rc, size;
	long timeo;
```

Allocate an SKB:

```c
	/* size is set to sk->sk_prot->max_header + len earlier (not shown) */
	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
	lock_sock(sk);
	if (skb == NULL)
		goto out_release;

	skb_reserve(skb, sk->sk_prot->max_header);
```

Copy the data blocks from the msghdr object to the SKB:

```c
	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (rc != 0)
		goto out_discard;

	if (!timer_pending(&dp->dccps_xmit_timer))
		dccp_write_xmit(sk);
```

Depending on the type of congestion control (window-based or rate-based) chosen for the connection, the dccp_write_xmit() method will either cause the packet to be sent later (upon dccps_xmit_timer expiry) or pass it on for immediate sending by the dccp_xmit_packet() method. This, in turn, relies on the dccp_transmit_skb() method to initialize the outgoing DCCP header and pass it to the L3-specific queue_xmit sending callback (the ip_queue_xmit() method for IPv4, and the inet6_csk_xmit() method for IPv6). I will conclude our discussion of DCCP with a short section about DCCP and NAT.

### DCCP and NAT

Some NAT devices do not let DCCP through (usually because their firmware is typically small and hence does not support "exotic" IP protocols such as DCCP). RFC 5597 (September 2009) suggested behavioral requirements for NATs to support NATed DCCP communications; however, it is not clear to what extent these recommendations have been implemented in consumer devices. One of the motivations for DCCP-UDP was the absence of NAT devices that would let DCCP through (RFC 6773, sec. 1). There is a detail that might be interesting in comparison with TCP: the latter, by default, supports simultaneous open (RFC 793, section 3.4), whereas the initial specification of DCCP in RFC 4340, section 4.6, disallowed the use of simultaneous open. To support NAPT traversal, RFC 5596 updated RFC 4340 in September 2009 with a "near simultaneous open" technique, which added one packet type (DCCP-LISTEN, RFC 5596, section 2.2.1) to the list and changed the state machine to support two more states (section 2.2.2) for near-simultaneous open. The motivation was a NAT "hole punching" technique, which would require, however, that NATs supporting DCCP existed (the same problem as above). As a result of this chicken-and-egg problem, DCCP has not seen much exposure over the Internet. Perhaps the UDP encapsulation will change that, but then it would no longer really be considered a transport layer protocol.

## Summary

This chapter discussed four transport protocols: UDP and TCP, which are the most commonly used, and SCTP and DCCP, which are newer protocols. You learned the basic differences between these protocols. You learned that TCP is a much more complex protocol than UDP, as it uses a state machine and several timers and requires acknowledgments. You learned about the header of each of these protocols and about sending and receiving packets with these protocols. I discussed some unique features of the SCTP protocol, like multihoming and multistreaming.

The next chapter will deal with the wireless subsystem and its implementation in Linux.
In the "Quick Reference" section that follows, I will cover the top methods related to the topics discussed in this chapter, ordered by their context, and I will also present the tables that were mentioned in this chapter.

## Quick Reference

I will conclude this chapter with a short list of the important methods of the sockets and transport-layer protocols discussed in this chapter. Afterward, there is one macro and there are three tables.

### Methods

Here are the methods.

#### int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc);

This method builds an ipcm_cookie object by parsing the specified msghdr object.

#### void sock_put(struct sock *sk);

This method decrements the reference count of the specified sock object.

#### void sock_hold(struct sock *sk);

This method increments the reference count of the specified sock object.

#### int sock_create(int family, int type, int protocol, struct socket **res);

This method performs some sanity checks, and if everything is fine, it allocates a socket by calling the sock_alloc() method and then calls net_families[family]->create. (In the case of IPv4, this is the inet_create() method.)

#### int sock_map_fd(struct socket *sock, int flags);

This method allocates a file descriptor and fills in the file entry.

#### bool sock_flag(const struct sock *sk, enum sock_flags flag);

This method returns true if the specified flag is set in the specified sock object.

#### int tcp_v4_rcv(struct sk_buff *skb);

This method is the main handler for processing incoming TCP packets arriving from the network layer (L3).

#### void tcp_init_sock(struct sock *sk);

This method performs address-family-independent socket initializations.

#### struct tcphdr *tcp_hdr(const struct sk_buff *skb);

This method returns the TCP header associated with the specified skb.

#### int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size);

This method handles sending TCP packets from userspace.

#### struct tcp_sock *tcp_sk(const struct sock *sk);

This method returns the tcp_sock object associated with the specified sock object (sk).

#### int udp_rcv(struct sk_buff *skb);

This method is the main handler for processing incoming UDP packets arriving from the network layer (L3).

#### struct udphdr *udp_hdr(const struct sk_buff *skb);

This method returns the UDP header associated with the specified skb.

#### int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len);

This method handles UDP packets that are sent from userspace.

#### struct sctphdr *sctp_hdr(const struct sk_buff *skb);

This method returns the SCTP header associated with the specified skb.

#### struct sctp_sock *sctp_sk(const struct sock *sk);

This method returns the SCTP socket (sctp_sock object) associated with the specified sock object.

#### int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t msg_len);

This method handles SCTP packets that are sent from userspace.

#### struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, sctp_scope_t scope, gfp_t gfp);

This method allocates and initializes a new SCTP association.

#### void sctp_association_free(struct sctp_association *asoc);

This method frees the resources of an SCTP association.
#### void sctp_chunk_hold(struct sctp_chunk *ch);

This method increments the reference count of the specified SCTP chunk.

#### void sctp_chunk_put(struct sctp_chunk *ch);

This method decrements the reference count of the specified SCTP chunk. If the reference count reaches 0, it frees the chunk by calling the sctp_chunk_destroy() method.

#### int sctp_rcv(struct sk_buff *skb);

This method is the main Rx handler for incoming SCTP packets.

#### static int dccp_v4_rcv(struct sk_buff *skb);

This method is the main Rx handler for processing incoming DCCP packets that arrive from the network layer (L3).

#### int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len);

This method handles DCCP packets that are sent from userspace.

### Macros

And here is the macro.

#### sctp_chunk_is_data()

This macro returns 1 if the specified chunk is a data chunk; otherwise, it returns 0.

### Tables

Take a look at the tables used in this chapter.

Table 11-1.

TCP and UDP proto_ops objects

| proto_ops callback | TCP | UDP |
|---|---|---|
| release | inet_release | inet_release |
| bind | inet_bind | inet_bind |
| connect | inet_stream_connect | inet_dgram_connect |
| socketpair | sock_no_socketpair | sock_no_socketpair |
| accept | inet_accept | sock_no_accept |
| getname | inet_getname | inet_getname |
| poll | tcp_poll | udp_poll |
| ioctl | inet_ioctl | inet_ioctl |
| listen | inet_listen | sock_no_listen |
| shutdown | inet_shutdown | inet_shutdown |
| setsockopt | sock_common_setsockopt | sock_common_setsockopt |
| getsockopt | sock_common_getsockopt | sock_common_getsockopt |
| sendmsg | inet_sendmsg | inet_sendmsg |
| recvmsg | inet_recvmsg | inet_recvmsg |
| mmap | sock_no_mmap | sock_no_mmap |
| sendpage | inet_sendpage | inet_sendpage |
| splice_read | tcp_splice_read | - |
| compat_setsockopt | compat_sock_common_setsockopt | compat_sock_common_setsockopt |
| compat_getsockopt | compat_sock_common_getsockopt | compat_sock_common_getsockopt |
| compat_ioctl | inet_compat_ioctl | inet_compat_ioctl |

Note

See the inet_stream_ops and the inet_dgram_ops definitions in net/ipv4/af_inet.c.

Table 11-2.

Chunk types

| Chunk Type | Linux Symbol | Value |
|---|---|---|
| Payload Data | SCTP_CID_DATA | 0 |
| Initiation | SCTP_CID_INIT | 1 |
| Initiation Acknowledgment | SCTP_CID_INIT_ACK | 2 |
| Selective Acknowledgment | SCTP_CID_SACK | 3 |
| Heartbeat Request | SCTP_CID_HEARTBEAT | 4 |
| Heartbeat Acknowledgment | SCTP_CID_HEARTBEAT_ACK | 5 |
| Abort | SCTP_CID_ABORT | 6 |
| Shutdown | SCTP_CID_SHUTDOWN | 7 |
| Shutdown Acknowledgment | SCTP_CID_SHUTDOWN_ACK | 8 |
| Operation Error | SCTP_CID_ERROR | 9 |
| State Cookie | SCTP_CID_COOKIE_ECHO | 10 |
| Cookie Acknowledgment | SCTP_CID_COOKIE_ACK | 11 |
| Explicit Congestion Notification Echo (ECNE) | SCTP_CID_ECN_ECNE | 12 |
| Congestion Window Reduced (CWR) | SCTP_CID_ECN_CWR | 13 |
| Shutdown Complete | SCTP_CID_SHUTDOWN_COMPLETE | 14 |
| SCTP Authentication Chunk (RFC 4895) | SCTP_CID_AUTH | 0x0F |
| Forward Transmission Sequence Number (FWD TSN) | SCTP_CID_FWD_TSN | 0xC0 |
| Address Configuration Change Chunk | SCTP_CID_ASCONF | 0xC1 |
| Address Configuration Acknowledgment Chunk | SCTP_CID_ASCONF_ACK | 0x80 |

Table 11-3.

DCCP packet types

| Linux Symbol | Description |
|---|---|
| DCCP_PKT_REQUEST | Sent by the client to initiate a connection (the first part of the three-way initiation handshake). |
| DCCP_PKT_RESPONSE | Sent by the server in response to a DCCP-Request (the second part of the three-way initiation handshake). |
| DCCP_PKT_DATA | Used to transmit application data. |
| DCCP_PKT_ACK | Used to transmit pure acknowledgments. |
| DCCP_PKT_DATAACK | Used to transmit application data with piggybacked acknowledgment information. |
| DCCP_PKT_CLOSEREQ | Sent by the server to request that the client close the connection. |
| DCCP_PKT_CLOSE | Used by the client or the server to close the connection; elicits a DCCP-Reset packet in response. |
| DCCP_PKT_RESET | Used to terminate the connection, either normally or abnormally. |
| DCCP_PKT_SYNC | Used to resynchronize sequence numbers after large bursts of packet loss. |
| DCCP_PKT_SYNCACK | Acknowledges a DCCP_PKT_SYNC. |

# 12. Wireless in Linux

Abstract

Chapter 11 deals with Layer 4 protocols, which enable us to communicate with userspace. This chapter deals with the wireless stack in the Linux kernel. I describe the Linux wireless stack (the mac80211 subsystem) and discuss some implementation details of important mechanisms in it, such as packet aggregation and block acknowledgment, used in IEEE 802.11n, and power save mode. Becoming familiar with the 802.11 MAC header is essential in order to understand the wireless subsystem implementation. The 802.11 MAC header, its members, and their usage are described in depth in this chapter. I also discuss some common wireless topologies, like infrastructure BSS, independent BSS, and Mesh networking.

## Mac80211 Subsystem

At the end of the 1990s, there were discussions in IEEE regarding a protocol for wireless local area networks (WLANs). The original version of the IEEE 802.11 spec for WLANs was released in 1997 and revised in 1999. In the following years, some extensions were added, formally termed 802.11 amendments. These extensions can be divided into PHY (Physical) layer extensions, MAC (Medium Access Control) layer extensions, regulatory extensions, and others. PHY layer extensions are, for example, 802.11b from 1999, 802.11a (also from 1999), and 802.11g from 2003. MAC layer extensions are, for example, 802.11e for QoS and 802.11s for Mesh networking. The "Mesh Networking" section of this chapter deals with the Linux kernel implementation of the IEEE 802.11s amendment. The IEEE 802.11 spec was revised, and in 2007 a second version of 1,232 pages was released. In 2012, a spec of 2,793 pages was released, available from http://standards.ieee.org/findstds/standard/802.11-2012.html . I refer to this spec as IEEE 802.11-2012 in this chapter.
Following is a partial list of important 802.11 amendments:

* IEEE 802.11d: International (country-to-country) roaming extensions (2001).

* IEEE 802.11e: Enhancements: QoS, including packet bursting (2005).

* IEEE 802.11h: Spectrum Managed 802.11a for European compatibility (2004).

* IEEE 802.11i: Enhanced security (2004).

* IEEE 802.11j: Extensions for Japan (2004).

* IEEE 802.11k: Radio resource measurement enhancements (2008).

* IEEE 802.11n: Higher-throughput improvements using MIMO (multiple input, multiple output antennas) (2009).

* IEEE 802.11p: WAVE: Wireless Access for the Vehicular Environment (such as ambulances and passenger cars). It has some peculiarities, such as not using the BSS concept and using narrower (5/10 MHz) channels. Note that IEEE 802.11p isn't supported in Linux as of this writing.

* IEEE 802.11v: Wireless network management.

* IEEE 802.11w: Protected Management Frames.

* IEEE 802.11y: 3650–3700 MHz operation in the U.S. (2008).

* IEEE 802.11z: Extensions to Direct Link Setup (DLS) (Aug 2007–Dec 2011).

It was only around 2001, about four years after the first IEEE 802.11 spec was approved, that laptops became very popular; many of these laptops were sold with wireless network interfaces. Today every laptop includes WiFi as standard equipment. It was important to the Linux community at that time to provide Linux drivers for these wireless network interfaces and to provide a Linux wireless network stack, in order to stay competitive with other OSes (such as Windows and Mac OS). Less effort went into architecture and design; "They just want their hardware to work," as Jeff Garzik, the Linux kernel wireless maintainer at that time, put it. When the first wireless drivers for Linux were developed, there was no general wireless API. As a result, there were many cases of code duplication between drivers, as developers implemented their drivers from scratch. Some drivers were based on FullMAC, which means that most of the management layer (MLME) is managed in hardware. In the years since, a new 802.11 wireless stack called mac80211 was developed. It was integrated into the Linux kernel in July 2007, for the 2.6.22 Linux kernel. The mac80211 stack is based on the d80211 stack, which is an open source, GPL-licensed stack by a company named Devicescape.

I cannot delve into the details of the PHY layer, because that subject is very wide and deserves a book of its own. However, I must note that there are many differences between 802.11 and 802.3 wired Ethernet. Here are two major differences:

* Ethernet works with CSMA/CD, whereas 802.11 works with CSMA/CA. CSMA/CA stands for carrier sense multiple access/collision avoidance, and CSMA/CD stands for carrier sense multiple access/collision detection. The difference, as you might guess, is the collision detection. With Ethernet, a station starts to transmit when the medium is idle; if a collision is detected during transmission, it stops, and a random backoff period starts. Wireless stations cannot detect collisions while transmitting, whereas wired stations can. With CSMA/CA, the wireless station waits for a free medium and only then transmits the frame. In case of a collision, the station will not notice it, but because no acknowledgment frame will be sent for this packet, it is retransmitted after a timeout has elapsed if an acknowledgment is not received.

* Wireless traffic is sensitive to interference.
As a result, the 802.11 spec requires that every frame, except for broadcast and multicast frames, be acknowledged when it is received. Packets that are not acknowledged in time should be retransmitted. Note that since IEEE 802.11e there is a mode that does not require acknowledgment (the QoSNoAck mode), but it is rarely used in practice.

## The 802.11 MAC Header

Each MAC frame consists of a MAC header, a frame body of variable length, and an FCS (Frame Check Sequence), which is a 32-bit CRC. Figure 12-1 shows the 802.11 header.

Figure 12-1.

IEEE 802.11 header. Note that not all members are always used, as this section will shortly explain

The 802.11 header is represented in mac80211 by the ieee80211_hdr structure:

```c
struct ieee80211_hdr {
	__le16 frame_control;
	__le16 duration_id;
	u8 addr1[6];
	u8 addr2[6];
	u8 addr3[6];
	__le16 seq_ctrl;
	u8 addr4[6];
} __packed;
```
(include/linux/ieee80211.h)

In contrast to an Ethernet header (struct ethhdr), which contains only three fields (source MAC address, destination MAC address, and Ethertype), the 802.11 header contains up to six addresses and some other fields. For a typical data frame, though, only three addresses are used (for example, for Access Point (AP)/client communication). With an ACK frame, only the receiver address is used. Note that Figure 12-1 shows only four addresses, but when working with Mesh networking, a Mesh extension header with two additional addresses is used.

I now turn to a description of the 802.11 header fields, starting with the first field in the 802.11 header, called the frame control. This is an important field, and in many cases its contents determine the meaning of other fields of the 802.11 MAC header (especially addresses).

### The Frame Control

The frame control length is 16 bits. Figure 12-2 shows its fields and the size of each field.

Figure 12-2.

Frame control fields

The following is a description of the frame control members:

* Protocol version: The version of the 802.11 MAC in use. Currently there is only one version of the MAC, so this field is always 0.

* Type: There are three types of packets in 802.11: management, control, and data.

  * Management packets (IEEE80211_FTYPE_MGMT) are for management actions like association, authentication, scanning, and more.

  * Control packets (IEEE80211_FTYPE_CTL) usually have some relevance to data packets; for example, a PS-Poll packet is for retrieving packets from an AP buffer. Another example: a station that wants to transmit first sends a control packet named RTS (request to send); if the medium is free, the destination station will send back a control packet named CTS (clear to send).

  * Data packets (IEEE80211_FTYPE_DATA) are the raw data packets. Null packets are a special case of raw packets, carrying no data and used mostly for power management control purposes. I discuss null packets in the "Power Save Mode" section later in this chapter.

* Subtype: For all three of the aforementioned packet types (management, control, and data), there is a sub-type field which identifies the character of the packet used. For example:

  * A value of 0100 for the sub-type field in a management frame denotes that the packet is a Probe Request (IEEE80211_STYPE_PROBE_REQ) management packet, which is used in a scan operation.

  * A value of 1011 for the sub-type field in a control packet denotes that this is a request to send (IEEE80211_STYPE_RTS) control packet.
  * A value of 0100 for the sub-type field of a data packet denotes that this is a null data (IEEE80211_STYPE_NULLFUNC) packet, which is used for power management control.

  * A value of 1000 (IEEE80211_STYPE_QOS_DATA) for the sub-type of a data packet means that this is a QoS data packet; this sub-type was added by the IEEE 802.11e amendment, which dealt with QoS enhancements.

* ToDS: When this bit is set, it means the packet is destined for the distribution system.

* FromDS: When this bit is set, it means the packet comes from the distribution system.

* More Frag: When fragmentation is used, this bit is set to 1.

* Retry: When a packet is retransmitted, this bit is set to 1. A typical case of retransmission is when a packet that was sent did not receive an acknowledgment in time. The acknowledgments are usually sent by the firmware of the wireless device.

* Pwr Mgmt: When the power management bit is set, it means that the station will enter power save mode. I discuss power save mode in the "Power Save Mode" section later in this chapter.

* More Data: When an AP sends packets that it buffered for a sleeping station, it sets the More Data bit to 1 as long as the buffer is not empty. Thus the station knows that there are more packets it should retrieve. When the buffer has been emptied, this bit is set to 0.

* Protected Frame: This bit is set to 1 when the frame body is encrypted; only data frames and authentication frames can be encrypted.

* Order: With the MAC service called strict ordering, the order of frames is important. When this service is in use, the order bit is set to 1. It is rarely used.

Note

The action frame (IEEE80211_STYPE_ACTION) was introduced with the 802.11h amendment, which dealt with spectrum and transmit power management. However, because of a lack of space for management packet sub-types, action frames are also used in various newer amendments to the standard; for example, HT action frames in 802.11n.

## The Other 802.11 MAC Header Members

The following describes the other members of the 802.11 header, after the frame control:

* Duration/ID: The duration holds values for the Network Allocation Vector (NAV) in microseconds, and it consists of 15 bits of the Duration/ID field; the sixteenth bit is 0. When working in power save mode, it is the AID (association id) of a station for PS-Poll frames (see 8.2.4.2 (a) in IEEE 802.11-2012). The Network Allocation Vector (NAV) is a virtual carrier-sensing mechanism. I do not delve into NAV internals because that is beyond the scope of this chapter.

* Sequence Control: This is a 2-byte field specifying the sequence control. In 802.11, it is possible for a packet to be received more than once, most commonly when an acknowledgment is not received for some reason. The sequence control field consists of a fragment number (4 bits) and a sequence number (12 bits). The sequence number is generated by the transmitting station, in the ieee80211_tx_h_sequence() method. In the case of a duplicate frame in a retransmission, the frame is dropped, and a counter of dropped duplicate frames (dot11FrameDuplicateCount) is incremented by 1; this is done in the ieee80211_rx_h_check() method. The Sequence Control field is not present in control packets.

* Address1 – Address4: There are four addresses, but you don't always use all of them. Address 1 is the Receive Address (RA) and is used in all packets. Address 2 is the Transmit Address (TA), and it exists in all packets except ACK and CTS packets.
Address 3 is used only for management and data packets. Address 4 is used when the ToDS and FromDS bits of the frame control are set; this happens when operating in a Wireless Distribution System.

* QoS Control: The QoS control field was added by the 802.11e amendment and is present only in QoS data packets. Because it is not part of the original 802.11 spec, it is not part of the original mac80211 implementation either, so it is not a member of the ieee80211_hdr struct. In fact, it was added at the end of the 802.11 header, and it can be accessed by the ieee80211_get_qos_ctl() method. The QoS control field includes the tid (Traffic Identifier), the ACK Policy, and a field called A-MSDU Present, which tells whether an A-MSDU is present. I discuss the A-MSDU later in this chapter, in the "High Throughput (ieee802.11n)" section.

* HT Control Field: The HT (high throughput) control field was added by the 802.11n amendment (see 7.1.3.5(a) of the 802.11n-2009 spec).

This section covered the 802.11 MAC header, with a description of its members and their use. Becoming familiar with the 802.11 MAC header is essential for understanding the mac80211 stack.

## Network Topologies

There are two popular network topologies in 802.11 wireless networks. The first topology I discuss is Infrastructure BSS mode, which is the most popular. You encounter Infrastructure BSS wireless networks in home wireless networks and offices. Later I discuss the IBSS (Ad Hoc) mode. Note that IBSS is not Infrastructure BSS; IBSS is Independent BSS, which is an ad hoc network, discussed later in this section.

### Infrastructure BSS

When working in Infrastructure BSS mode, there is a central device, called an Access Point (AP), and some client stations. Together they form a BSS (Basic Service Set). The client stations must first perform association and authentication against the AP to be able to transmit packets via the AP. On many occasions, client stations perform scanning prior to authentication and association, in order to get details about the AP. Association is exclusive: a client can be associated with only one AP at a given moment. When a client associates with an AP successfully, it gets an AID (association id), which is a unique number (within this BSS) in the range 1–2007. An AP is in fact a wireless network device with some hardware additions (like Ethernet ports, LEDs, a button to reset to manufacturer defaults, and more). A management daemon runs on the AP device. An example of such software is the hostapd daemon. This software handles some of the management tasks of the MLME layer, such as authentication and association requests. It achieves this by registering itself to receive the relevant management frames via nl80211. The hostapd project is an open source project which enables several wireless network devices to operate as an AP.

Clients can communicate with other clients (or with stations in a different network that is bridged to the AP) by sending packets to the AP, which relays them to their final destination. To cover a large area, you can deploy multiple APs and connect them by wire. This type of deployment is called an Extended Service Set (ESS). Within an ESS deployment, there are two or more BSSs. Multicasts and broadcasts sent in one BSS, which may arrive at a nearby BSS, are rejected by the stations of the nearby BSS (the bssid in the 802.11 header does not match). Within such a deployment, each AP usually uses a different channel to minimize interference.
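Since hostapd was just mentioned: as a rough, hypothetical illustration of the AP side, a minimal hostapd configuration might look like the following sketch (the interface name, SSID, and channel are arbitrary example values, and the exact set of required options depends on the hostapd version):

```
# Minimal hostapd.conf sketch (illustrative values only)
# Wireless interface that will act as the AP:
interface=wlan0
# Use the kernel nl80211/mac80211 interface:
driver=nl80211
# Human-readable network name (the SSID):
ssid=TestBss
# 2.4 GHz band (802.11b/g):
hw_mode=g
# Operating channel:
channel=1
```

With such a configuration, hostapd registers via nl80211 to receive the relevant management frames and performs the authentication and association handling described above.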
### IBSS, or Ad Hoc Mode

An IBSS network is often formed without preplanning, for only as long as the WLAN is needed. An IBSS network is also called an ad hoc network. Creating an IBSS is a simple procedure. You can set up an IBSS by running this iw command from the command line (note that the 2412 parameter is the frequency, in MHz, of channel 1):

```
iw wlan0 ibss join AdHocNetworkName 2412
```

Or, when using the iwconfig tool, with these two commands:

```
iwconfig wlan0 mode ad-hoc
iwconfig wlan0 essid AdHocNetworkName
```

This triggers IBSS creation by calling the ieee80211_sta_create_ibss() method (net/mac80211/ibss.c). Then the ssid (AdHocNetworkName in this case) has to be distributed manually (or otherwise) to everyone who wants to connect to the ad hoc network. When working with an IBSS, you do not have an AP. The bssid of the IBSS is a random 48-bit address (based on calling the get_random_bytes() method). Power management in Ad Hoc mode is a bit more complex than power management in Infrastructure BSS; it uses Announcement Traffic Indication Map (ATIM) messages. ATIM is not supported by mac80211 and is not discussed in this chapter.

The next section describes power save mode, which is one of the most important mechanisms of the mac80211 network stack.

## Power Save Mode

Apart from relaying packets, there is another important function of the AP: buffering packets for client stations that enter power save mode. Clients are usually battery-powered devices. From time to time, the wireless network interface enters power save mode.

### Entering Power Save Mode

When a client station enters power save mode, it informs the AP about it, usually by sending a null data packet. In fact, technically speaking, it does not have to be a null data packet; it is enough that it is a packet with PM=1 (PM is the Power Management flag in the frame control). An AP that gets such a packet starts keeping unicast packets destined for that station in a special buffer called ps_tx_buf; there is such a buffer for every station. This buffer is in fact a linked list of packets, and it can hold up to 128 packets (STA_MAX_TX_BUFFER) for each station. If the buffer fills up, the packets that were received first are discarded (FIFO). Apart from this, there is a single buffer called bc_buf, for multicast and broadcast packets (in the 802.11 stack, multicast packets should be received and processed by all the stations in the same BSS). The bc_buf buffer can also hold up to 128 packets (AP_MAX_BC_BUFFER). When a wireless network interface is in power save mode, it cannot receive or send packets.

### Exiting Power Save Mode

From time to time, an associated station wakes up by itself (via some timer); it then checks for special management packets, called beacons, which the AP sends periodically. Typically, an AP sends 10 beacons per second; on most APs, this is a configurable parameter. These beacons contain data in information elements, which constitute the data in the management packet. The station that awoke checks a specific information element called the TIM (Traffic Indication Map), by calling the ieee80211_check_tim() method (include/linux/ieee80211.h). The TIM is an array of 2008 entries. Because the full TIM is 251 bytes (2008 bits) in size, you are allowed to send a partial virtual bitmap, which is smaller.
If the entry in the TIM for a station is set, it means that the AP has saved unicast packets for that station, so the station should empty the buffer of packets that the AP kept for it. The station starts sending null packets (or, more rarely, special control packets called PS-Poll packets) to retrieve these buffered packets from the AP. Usually, after the buffer has been emptied, the station goes back to sleep (however, this is not mandatory according to the spec).

### Handling the Multicast/Broadcast Buffer

The AP buffers multicast and broadcast packets whenever at least one station is in sleeping mode. The AID for multicast/broadcast stations is 0; so, in such a case, TIM[0] is set to true. The Delivery Traffic Indication Map (DTIM), which is a special type of TIM, is sent not in every beacon, but once in a predefined number of beacon intervals (the DTIM period). After a DTIM is sent, the AP sends its buffered broadcast and multicast packets. You retrieve packets from the multicast/broadcast buffer (bc_buf) by calling the ieee80211_get_buffered_bc() method. In Figure 12-3 you can see an AP that contains a linked list of stations (sta_info objects), each of them with a unicast buffer (ps_tx_buf) of its own, and a single bc_buf buffer, for storing multicast and broadcast packets.

Figure 12-3.

Buffering packets in an AP

The AP is implemented as an ieee80211_if_ap object in mac80211. Each such ieee80211_if_ap object has a member called ps (an instance of ps_data), where power save data is stored. One of the members of the ps_data structure is the broadcast/multicast buffer, bc_buf.

In Figure 12-4 you can see a flow of PS-Poll packets that a client sends in order to retrieve packets from the AP unicast buffer, ps_tx_buf. Note that the AP sends all the packets with the IEEE80211_FCTL_MOREDATA flag, except for the last one. Thus, the client knows that it should keep on sending PS-Poll packets until the buffer is emptied. For the sake of simplicity, the ACK traffic is not included in this diagram, but it should be mentioned here that the packets should be acknowledged.

Figure 12-4.

Sending PS-Poll packets from a client to retrieve packets from the ps_tx_buf buffer within an AP

Note

Power management and power save mode are two different topics. Power management deals with handling machines that perform suspend (whether it is suspend to RAM or suspend to disk, aka hibernate, or, in some cases, both suspend to RAM and suspend to disk, aka hybrid suspend), and it is handled in net/mac80211/pm.c. In the drivers, power management is handled by the resume/suspend methods. Power save mode, on the other hand, deals with handling stations that enter sleep mode and wake up; it has nothing to do with suspend and hibernation.

This section described power save mode and the buffering mechanism. The next section discusses the management layer and the different tasks it handles.

## The Management Layer (MLME)

There are three components in the 802.11 management architecture:

* The Physical Layer Management Entity (PLME).

* The Station Management Entity (SME).

* The MAC Layer Management Entity (MLME).

### Scanning

There are two types of scanning: passive scanning and active scanning. Passive scanning means listening passively for beacons, without transmitting any packets for scanning. When performing passive scanning (the flags of the scan channel contain IEEE80211_CHAN_PASSIVE_SCAN), the station moves from channel to channel, trying to receive beacons.
Passive scanning is needed in some higher 802.11a frequency bands, because you're not allowed to transmit anything at all until you've heard an AP beacon. With active scanning, each station sends a Probe Request packet; this is a management packet with the Probe Request sub-type (IEEE80211_STYPE_PROBE_REQ). With active scanning too, the station moves from channel to channel, sending a Probe Request management packet on each channel (by calling the ieee80211_send_probe_req() method). The scan itself is initiated by calling the ieee80211_request_scan() method. Changing channels is done via a call to the ieee80211_hw_config() method, passing IEEE80211_CONF_CHANGE_CHANNEL as a parameter. Note that there is a one-to-one correspondence between the channel in which a station operates and the frequency in which it operates; the ieee80211_channel_to_frequency() method (net/wireless/util.c) returns the frequency in which a station operates, given its channel.

### Authentication

Authentication is done by calling the ieee80211_send_auth() method (net/mac80211/util.c). It sends a management frame with the authentication sub-type (IEEE80211_STYPE_AUTH). There are many authentication types; the original IEEE 802.11 spec talked about only two forms: open-system authentication and shared key authentication. The only authentication method mandated by the IEEE 802.11 spec is open-system authentication (WLAN_AUTH_OPEN). This is a very simple authentication algorithm; in fact, it is a null authentication algorithm. Any client that requests authentication with this algorithm will become authenticated. An example of another authentication algorithm is shared key authentication (WLAN_AUTH_SHARED_KEY). In shared key authentication, the station should authenticate using a Wired Equivalent Privacy (WEP) key.

### Association

In order to associate, a station sends a management frame with the association sub-type (IEEE80211_STYPE_ASSOC_REQ). Association is done by calling the ieee80211_send_assoc() method (net/mac80211/mlme.c).

### Reassociation

When a station moves between APs within an ESS, it is said to be roaming. The roaming station sends a reassociation request to the new AP by sending a management frame with the reassociation sub-type (IEEE80211_STYPE_REASSOC_REQ). Reassociation is done by calling the ieee80211_send_assoc() method; there are many similarities between association and reassociation, so this method handles both. In addition, with reassociation, the AP returns an AID (association id) to the client in case of success.

This section talked about the management layer (MLME) and some of the operations it supports, like scanning, authentication, association, and more. In the next section I describe some mac80211 implementation details that are important for understanding the wireless stack.

## Mac80211 Implementation

Mac80211 has an API for interfacing with the low-level device drivers. The implementation of mac80211 is complex and full of many small details. I cannot give an exhaustive description of the mac80211 API and implementation; I do discuss some important points that can give a good starting point to those who want to delve into the code. A fundamental structure of the mac80211 API is the ieee80211_hw struct (include/net/mac80211.h); it represents hardware information. The priv member of ieee80211_hw (a pointer to a private area) is of an opaque type (void *).
Most wireless device drivers define a private structure for this private area, like lbtf_private (the Marvell wireless driver) or iwl_priv (iwlwifi from Intel). Memory allocation and initialization of the ieee80211_hw struct are done by the ieee80211_alloc_hw() method. Here are some methods related to the ieee80211_hw struct:

* int ieee80211_register_hw(struct ieee80211_hw *hw): Called by wireless drivers for registering the specified ieee80211_hw object.

* void ieee80211_unregister_hw(struct ieee80211_hw *hw): Unregisters the specified 802.11 hardware device.

* struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops): Allocates an ieee80211_hw object and initializes it.

* ieee80211_rx_irqsafe(): This method is for receiving a packet. It is implemented in net/mac80211/rx.c and is called from low-level wireless drivers.

The ieee80211_ops object, which is passed to the ieee80211_alloc_hw() method as you saw earlier, consists of pointers to callbacks to the driver. Not all of these callbacks must be implemented by the drivers. The following is a short description of these methods:

* tx(): The transmit handler, called for each transmitted packet. It usually returns NETDEV_TX_OK (except under certain limited conditions).

* start(): Activates the hardware device and is called before the first hardware device is enabled. It turns on frame reception.

* stop(): Turns off frame reception and usually turns off the hardware.

* add_interface(): Called when a network device attached to the hardware is enabled.

* remove_interface(): Informs the driver that the interface is going down.

* config(): Handles configuration requests, such as hardware channel configuration.

* configure_filter(): Configures the device's Rx filter.

Figure 12-5 shows a block diagram of the architecture of the Linux wireless subsystem. You can see that the interface between the wireless device driver layer and the mac80211 layer is the ieee80211_ops object and its callbacks.

Figure 12-5.

Linux wireless architecture

Another important structure is the sta_info struct (net/mac80211/sta_info.h), which represents a station. Among the members of this structure are various statistics counters, various flags, debugfs entries, the ps_tx_buf array for buffering unicast packets, and more. Stations are organized in a hash table (sta_hash) and a list (sta_list). The important methods related to sta_info are as follows:

* int sta_info_insert(struct sta_info *sta): Adds a station.

* int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr): Removes a station (by calling the __sta_info_destroy() method).

* struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr): Fetches a station; the address of the station (its bssid) is passed as a parameter.

### Rx Path

The ieee80211_rx() function (net/mac80211/rx.c) is the main receive handler. The status of the received packet (ieee80211_rx_status) is passed by the wireless driver to mac80211, embedded in the SKB control buffer (cb). The IEEE80211_SKB_RXCB() macro is used to fetch this status. The flag field of the Rx status specifies, for example, whether the FCS check failed on the packet (RX_FLAG_FAILED_FCS_CRC). The various values possible for the flag field are presented in Table 12-1 in the "Quick Reference" section of this chapter.
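To illustrate the driver side of this hand-off, here is a sketch of a hypothetical driver Rx routine (mydrv_handle_rx() and its parameters are made up for the example; IEEE80211_SKB_RXCB() and ieee80211_rx_irqsafe() are the real mac80211 interfaces described in this section):

```c
/* Illustrative sketch only; mydrv_handle_rx() is hypothetical. The driver
 * fills an ieee80211_rx_status object, which lives in the SKB control
 * buffer (cb), and then hands the frame over to mac80211. */
#include <net/mac80211.h>

static void mydrv_handle_rx(struct ieee80211_hw *hw, struct sk_buff *skb,
			    u16 freq_mhz, s8 signal_dbm)
{
	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);

	memset(status, 0, sizeof(*status));
	status->band = IEEE80211_BAND_2GHZ;	/* band the frame was received on */
	status->freq = freq_mhz;		/* center frequency, in MHz */
	status->signal = signal_dbm;		/* signal strength reported by the hardware */
	/* on a failed checksum, a driver would also set:
	 * status->flag |= RX_FLAG_FAILED_FCS_CRC; */

	/* safe to call from interrupt context, unlike ieee80211_rx() */
	ieee80211_rx_irqsafe(hw, skb);
}
```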
In the ieee80211_rx() method, the ieee80211_rx_monitor() method is invoked to remove the FCS (checksum) and to remove a radiotap header (struct ieee80211_radiotap_header) that might have been added if the wireless interface is in monitor mode. (You use a network interface in monitor mode for sniffing, for example. Not all wireless network interfaces support monitor mode; see the section "Wireless Modes" later in this chapter.)

If you work with HT (802.11n), AMPDU reordering is performed if needed, by invoking the ieee80211_rx_reorder_ampdu() method. Then the __ieee80211_rx_handle_packet() method is called, which eventually calls the ieee80211_invoke_rx_handlers() method. Then various receive handlers are called, one by one (using a macro named CALL_RXH). The order of calling these handlers is important. Each handler checks whether it should handle the packet. If it decides it should not handle the packet, it returns RX_CONTINUE, and the next handler is tried. If it decides it should handle the packet, it returns RX_QUEUED.

There are certain cases when a handler decides to drop a packet; in these cases, it returns RX_DROP_MONITOR or RX_DROP_UNUSABLE. For example, if a PS-Poll packet arrives and the type of the receiver shows that it is not an AP, RX_DROP_UNUSABLE is returned. Another example: for a management frame, if the length of the SKB is less than the minimum (24 bytes), the packet is discarded and RX_DROP_MONITOR is returned; the same happens if the packet is not a management packet. Here is the code snippet from the ieee80211_rx_h_mgmt_check() method that implements this:

```c
ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
{
	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
	. . .
	if (rx->skb->len < 24)
		return RX_DROP_MONITOR;

	if (!ieee80211_is_mgmt(mgmt->frame_control))
		return RX_DROP_MONITOR;
	. . .
}
```
(net/mac80211/rx.c)

### Tx Path

The ieee80211_tx() method is the main handler for transmission (net/mac80211/tx.c). First it invokes the ieee80211_tx_prepare() method, which performs some checks and sets certain flags. Then it calls the invoke_tx_handlers() method, which calls, one by one, various transmit handlers (using a macro named CALL_TXH). If a transmit handler finds that it should do nothing with the packet, it returns TX_CONTINUE, and the next handler is tried. If it decides it should handle a certain packet, it returns TX_QUEUED, and if it decides it should drop the packet, it returns TX_DROP. The invoke_tx_handlers() method returns 0 upon success.
Let's take a short look at the implementation of the ieee80211_tx() method:

```c
static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
			 struct sk_buff *skb, bool txpending,
			 enum ieee80211_band band)
{
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_tx_data tx;
	ieee80211_tx_result res_prepare;
	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
	bool result = true;
	int led_len;
```

Perform a sanity check, and drop the SKB if its length is less than 10:

```c
	if (unlikely(skb->len < 10)) {
		dev_kfree_skb(skb);
		return true;
	}

	/* initialises tx */
	led_len = skb->len;
	res_prepare = ieee80211_tx_prepare(sdata, &tx, skb);

	if (unlikely(res_prepare == TX_DROP)) {
		ieee80211_free_txskb(&local->hw, skb);
		return true;
	} else if (unlikely(res_prepare == TX_QUEUED)) {
		return true;
	}
```

Invoke the Tx handlers; if everything is fine, continue by invoking the __ieee80211_tx() method:

```c
	. . .
	if (!invoke_tx_handlers(&tx))
		result = __ieee80211_tx(local, &tx.skbs, led_len,
					tx.sta, txpending);

	return result;
}
```
(net/mac80211/tx.c)

### Fragmentation

Fragmentation in 802.11 is done only for unicast packets. Each station is assigned a fragmentation threshold size (in bytes); packets bigger than this threshold are fragmented. You can lower the number of collisions by reducing the fragmentation threshold size, thereby making the packets smaller. You can inspect the fragmentation threshold of a station by running iwconfig or by inspecting the corresponding debugfs entry (see the "Mac80211 debugfs" section later in this chapter). You can set the fragmentation threshold with the iwconfig command; for example, you can set the fragmentation threshold to 512 bytes with:

```
iwconfig wlan0 frag 512
```

Each fragment is acknowledged. The More Frag field in the header of a fragment is set to 1 if there are more fragments to come. Each fragment has a fragment number (a subfield of the sequence control field in the 802.11 MAC header). Reassembly of the fragments on the receiver side is done according to the fragment numbers. Fragmentation on the transmitter side is done by the ieee80211_tx_h_fragment() method (net/mac80211/tx.c). Reassembly on the receiver side is done by the ieee80211_rx_h_defragment() method (net/mac80211/rx.c). Fragmentation is incompatible with aggregation (used for higher throughput), and given the high rates and thus short (in time) packets, it is very rarely used nowadays.

### Mac80211 debugfs

debugfs is a virtual filesystem devoted to debugging information; it enables exporting debugging information to userspace, and its entries are usually mounted under /sys/kernel/debug. For mac80211, debugfs handling is mostly in net/mac80211/debugfs.c. After mounting debugfs, various mac80211 statistics and information entries can be inspected. Mounting debugfs is performed like this:

```
mount -t debugfs none_debugs /sys/kernel/debug
```

Note

CONFIG_DEBUG_FS must be set when building the kernel to be able to mount and work with debugfs.

For example, let's say your phy is phy0; the following discusses some of the entries under /sys/kernel/debug/ieee80211/phy0:

* total_ps_buffered: This is the total number of packets (unicast and multicast/broadcast) that the AP buffered for the station. The total_ps_buffered counter is incremented by ieee80211_tx_h_unicast_ps_buf() for unicasts, and by ieee80211_tx_h_multicast_ps_buf() for multicasts or broadcasts.
* Under /sys/kernel/debug/ieee80211/phy0/statistics, you have various statistical information; for example:

  * frame_duplicate_count denotes the number of duplicate frames. This debugfs entry represents the duplicate frames counter, dot11FrameDuplicateCount, which is incremented by the ieee80211_rx_h_check() method.

  * transmitted_frame_count denotes the number of transmitted packets. This debugfs entry represents dot11TransmittedFrameCount; it is incremented by the ieee80211_tx_status() method.

  * retry_count denotes the number of retransmissions. This debugfs entry represents dot11RetryCount; it is also incremented by the ieee80211_tx_status() method.

  * fragmentation_threshold: The size of the fragmentation threshold, in bytes. See the "Fragmentation" section earlier.

* Under /sys/kernel/debug/ieee80211/phy0/netdev:wlan0, you have some entries that give information about the interface; for example, if the interface is in station mode, you will have aid for the association id of the station, assoc_tries for the number of times the station tried to perform association, bssid for the bssid of the station, and so on.

* Every station uses a rate control algorithm. Its name is exported by the following debugfs entry: /sys/kernel/debug/ieee80211/phy0/rc/name.

### Wireless Modes

You can set a wireless network interface to operate in one of several modes, depending on its intended use and the topology of the network in which it is deployed. In some cases, you can set the mode with the iwconfig command, and in some cases you must use a tool like hostapd for this. Note that not all devices support all modes. See www.linuxwireless.org/en/users/Drivers for a list of Linux drivers that support different modes. Alternatively, you can check which values the interface_modes field of the wiphy member (in the ieee80211_hw object) is initialized to in the driver code. The interface_modes are initialized to one or more modes of the nl80211_iftype enum, like NL80211_IFTYPE_STATION or NL80211_IFTYPE_ADHOC (see include/uapi/linux/nl80211.h). The following is a detailed description of these wireless modes:

* AP mode: In this mode, the device acts as an AP (NL80211_IFTYPE_AP). The AP maintains and manages a list of associated stations. The network (BSS) name is the MAC address of the AP (bssid). There is also a human-readable name for the BSS, called the SSID.

* Station infrastructure mode: A managed station in an infrastructure mode (NL80211_IFTYPE_STATION).

* Monitor mode: All incoming packets are handed over unfiltered in monitor mode (NL80211_IFTYPE_MONITOR). This is useful for sniffing. It is usually possible to transmit packets in monitor mode. This is termed packet injection; these packets are marked with a special flag (IEEE80211_TX_CTL_INJECTED).

* Ad Hoc (IBSS) mode: A station in an ad hoc (IBSS) network (NL80211_IFTYPE_ADHOC). In Ad Hoc mode, there is no AP device in the network.

* Wireless Distribution System (WDS) mode: A station in a WDS network (NL80211_IFTYPE_WDS).

* Mesh mode: A station in a Mesh network (NL80211_IFTYPE_MESH_POINT), discussed in the "Mesh Networking (802.11s)" section later in this chapter.

The next section discusses the ieee802.11n technology, which provides higher performance, and how it is implemented in the Linux wireless stack. You will also learn about block acknowledgment and packet aggregation in 802.11n, and how these techniques are used to improve performance.
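Before moving on, here is a short example of switching between these modes from userspace. Assuming the device supports monitor mode, something along these lines puts an interface into it:

```
ip link set wlan0 down
iw dev wlan0 set type monitor
ip link set wlan0 up
```

The same iw subcommand can set other interface types from the list above (for example, ibss), subject to what the driver advertises in interface_modes.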
## High Throughput (ieee802.11n)

A little after 802.11g was approved, a new task group was created in IEEE, called the High Throughput Task Group (TGn). IEEE 802.11n became a final spec at the end of 2009. The IEEE 802.11n protocol allows coexistence with legacy devices. Some vendors were already selling 802.11n pre-standard devices based on the 802.11n draft before the official approval. Broadcom set a precedent of releasing wireless interfaces based on a draft: in 2003, it released a chipset of a wireless device based on a draft of 802.11g. Following this precedent, as early as 2005 some vendors released products based on the 802.11n draft. For example, the Intel Santa Rosa platform came with Intel Next-Gen Wireless-N (Intel WiFi Link 5000 series) interfaces, which support 802.11n. Other Intel wireless network interfaces, like the 4965AGN, also supported 802.11n. Other vendors, including Atheros and Ralink, also released 802.11n draft-based wireless devices. The WiFi Alliance started certification of 802.11n draft devices in June 2007. A long list of vendors released products that comply with Wi-Fi CERTIFIED 802.11n draft 2.0.

802.11n can operate on the 2.4 GHz and/or 5 GHz bands, whereas 802.11g and 802.11b operate only in the 2.4 GHz radio frequency band, and 802.11a operates only in the 5 GHz radio frequency band. The 802.11n MIMO (Multiple Input, Multiple Output) technology increases the range and reliability of traffic over the wireless coverage area. MIMO technology uses multiple transmitter and receiver antennas on both APs and clients, to allow for simultaneous data streams. The result is increased range and increased throughput. With 802.11n you can achieve a theoretical PHY rate of up to 600 Mbps (actual throughput will be much lower due to medium access rules, and so on).

802.11n added many improvements to the 802.11 MAC layer. The best known is packet aggregation, which concatenates multiple packets of application data into a single transmission frame. A block acknowledgment (BA) mechanism was added (discussed in the next section). BA permits multiple packets to be acknowledged by a single packet instead of sending an ACK for each received packet, and the wait time between two consecutive packets is cut. This enables sending multiple data packets with the fixed overhead cost of a single packet. The BA protocol was introduced in the 802.11e amendment of 2005.

### Packet Aggregation

There are two types of packet aggregation:

* AMSDU: Aggregated MAC Service Data Unit

* AMPDU: Aggregated MAC Protocol Data Unit

Note that AMSDU is supported only on Rx, not on Tx, and is wholly independent of the Block Ack mechanism described in this section; so the discussion in this section pertains only to AMPDU.

There are two sides to a Block Ack session: the originator and the recipient. Each block session has a different Traffic Identifier (TID). The originator starts the block acknowledgment session by calling the ieee80211_start_tx_ba_session() method. This is typically done from a rate control algorithm method in the driver. For example, with the ath9k wireless driver, the ath_tx_status() function (drivers/net/wireless/ath/ath9k/rc.c), which is a rate control callback, invokes the ieee80211_start_tx_ba_session() method. The ieee80211_start_tx_ba_session() method sets the state to HT_ADDBA_REQUESTED_MSK and sends an ADDBA request packet by invoking the ieee80211_send_addba_request() method.
The call to ieee80211_send_addba_request() passes parameters for the session, such as the wanted reorder buffer size and the TID of the session. + +The reorder buffer size is limited to 64K (see the definition of ieee80211_max_ampdu_length_exp in include/linux/ieee80211.h). These parameters are part of the capability member (capab) in the struct addba_req. The response to the ADDBA request should be received within 1 Hz, which is one second in x86_64 machines (ADDBA_RESP_INTERVAL). If you do not get a response in time, the sta_addba_resp_timer_expired() method will stop the BA session by calling the ___ieee80211_stop_tx_ba_session() method. When the other side (the recipient) receives the ADDBA request, it first sends an ACK (every packet in ieee802.11 should be acknowledged, as mentioned before). Then it processes the ADDBA request by calling the ieee80211_process_addba_request() method; if everything is okay, it sets the aggregation state of this machine to operational (HT_AGG_STATE_OPERATIONAL) and sends an ADDBA response by calling the ieee80211_send_addba_resp() method. It also stops the response timer (the timer which has as its callback the sta_addba_resp_timer_expired() method) by calling del_timer_sync()on this timer. After a session is started, a data block containing multiple MPDU packets is sent. Consequently, the originator sends a Block Ack Request (BAR) packet by calling the ieee80211_send_bar() method. + +#### Block Ack Request (BAR) + +The BAR is a control packet with Block Ack Request sub-type (IEEE80211_STYPE_BACK_REQ). The BAR packet includes the SSN (start sequence number), which is the sequence number of the oldest MSDU in the block that should be acknowledged. The recipient receives the BAR and reorders the ampdu buffer accordingly, if needed. Figure 12-6 shows a BAR request. + +Figure 12-6. + +BAR request + +When sending a BAR, the type subfield in the frame control is control (IEEE80211_FTYPE_CTL), and the subtype subfield is Block Ack request (IEEE80211_STYPE_BACK_REQ). The BAR is represented by the ieee80211_bar struct: + +struct ieee80211_bar { + +__le16 frame_control; + +__le16 duration; + +__u8 ra[6]; + +__u8 ta[6]; + +__le16 control; + +__le16 start_seq_num; + +} __packed; + +(include/linux/ieee80211.h) + +The RA is the recipient address, and the TA is the transmitter (originator) address. The control field of the BAR request includes the TID. + +#### Block Ack + +There are two types of Block Ack: Immediate Block Ack and Delayed Block Ack. Figure 12-7 shows Immediate Block Ack. + +Figure 12-7. + +Immediate Block Ack + +The difference between Immediate Block Ack and Delayed Block Ack is that with Delayed Block Ack, the BAR request itself is answered first with an ACK, and then after some delay, with a BA (Block Ack). When using Delayed Block Ack, there is more time to process the BAR, and this is sometime needed when working with software based processing. Using Immediate Block Ack is better in terms of performance. The BA itself is also acknowledged. When the originator has no more data to send, it can terminate the Block Ack session by calling the ieee80211_send_delba() method; this function sends a DELBA request packet to the other side. The DELBA request is handled by the ieee80211_process_delba() method. The DELBA message, which causes a Block Ack session tear down, can be sent either from the originator or recipient of the Block Ack session. The AMPDU maximum length is 65535 octets. 
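+
+As a quick illustration of the originator side described above, here is a hedged sketch of a driver asking mac80211 to start a TX Block Ack session on TID 0 (my_try_start_agg() is a made-up name; a timeout argument of 0 requests a session with no inactivity timeout):
+
+#include <net/mac80211.h>
+
+/* Sketch: start TX aggregation for TID 0 on a station. On success,
+ * mac80211 sends the ADDBA request and later invokes the driver's
+ * ampdu_action() callback with IEEE80211_AMPDU_TX_START. */
+static void my_try_start_agg(struct ieee80211_sta *sta)
+{
+        int ret = ieee80211_start_tx_ba_session(sta, 0, 0);
+
+        if (ret)
+                pr_debug("BA session not started (err %d)\n", ret);
+}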
Note that packet aggregation is implemented only for APs and managed stations; packet aggregation for IBSS is not supported by the spec.
+
+## Mesh Networking (802.11s)
+
+The IEEE 802.11s protocol started as a Study Group of IEEE in September 2003, and became a Task Group named TGs in 2004. In 2006, two proposals, out of 15 (the "SEEMesh" and "Wi-Mesh" proposals), were merged into one, which resulted in draft D0.01. 802.11s was ratified in July 2011 and is now part of IEEE 802.11-2012. Mesh networks allow the creation of an 802.11 Basic Service Set over fully and partially connected Mesh topologies. This can be seen as an improvement over the 802.11 ad hoc network, which requires a fully connected Mesh topology. Figures 12-8 and 12-9 illustrate the difference between the two types of Mesh topologies. In a full Mesh, every node is connected directly to every other node.
+
+Figure 12-8. Full Mesh
+
+In a partially connected Mesh, nodes are connected to only some of the other nodes, but not to all of them. This topology is much more common in wireless Mesh networks. Figure 12-9 shows an example of a partial Mesh.
+
+Figure 12-9. Partial Mesh
+
+Wireless Mesh networks forward data packets over multiple wireless hops. Each Mesh node acts as a relay point/router for the other Mesh nodes. In kernel 2.6.26 (2008), support for the draft of wireless Mesh networking (802.11s) was added to the wireless network stack, thanks to the open80211s project. The open80211s project's goal was to create the first open implementation of 802.11s. The project got some sponsorship from the OLPC project and from some commercial companies. Luis Carlos Cobo, Javier Cardona, and other developers from Cozybit developed the Linux mac80211 Mesh code.
+
+Now that you have learned a bit about Mesh networking and Mesh network topologies, you are ready for the next section, which covers the HWMP routing protocol for Mesh networks.
+
+### HWMP Protocol
+
+The 802.11s protocol defines a default routing protocol called HWMP (Hybrid Wireless Mesh Protocol). The HWMP protocol works at Layer 2 and deals with MAC addresses, as opposed to, for example, the IPv4 routing protocol, which works at Layer 3 and deals with IP addresses. HWMP routing is based on two types of routing (hence it is called hybrid). The first is on-demand routing, and the second is proactive routing. The main difference between the two mechanisms has to do with the time at which path establishment is initiated (path is the name used for a route in Layer 2). In on-demand routing, a path to a destination is established by the protocol only after the protocol stack has received frames for such a destination. This minimizes the amount of management traffic required to maintain the Mesh network, at the expense of introducing additional latency in data traffic. Proactive routing can be used if a Mesh node is known to be the recipient of a lot of Mesh traffic. In that case, the node will periodically announce itself over the Mesh network and trigger path establishments to itself from all the Mesh nodes in the network. Both on-demand and proactive routing are implemented in the Linux kernel. There are four types of routing messages:
+
+ * PREQ (Path Request): This type of message is sent as a broadcast when you look for some destination that you do not yet have a path to. This PREQ message is propagated in the Mesh network until it gets to its destination. A lookup is performed on each station until the final destination is reached (by calling the mesh_path_lookup() method). If the lookup fails, the PREQ is forwarded (as a broadcast) to the other stations. The PREQ message is sent in a management packet; its sub-type is action (IEEE80211_STYPE_ACTION). It is handled by the hwmp_preq_frame_process() method.
+
+ * PREP (Path Reply): This type is a unicast packet that is sent as a reply to a PREQ message. This packet is sent along the reverse path. The PREP message is also sent in a management packet, and its sub-type is also the action sub-type (IEEE80211_STYPE_ACTION). It is handled by the hwmp_prep_frame_process() method. Both the PREQ and the PREP messages are sent by the mesh_path_sel_frame_tx() method.
+
+ * PERR (Path Error): If there is some failure on the way, a PERR is sent. A PERR message is transmitted by the mesh_path_error_tx() method.
+
+ * RANN (Root Announcement): The root Mesh point periodically broadcasts this frame. Mesh points that receive it send a unicast PREQ to the root via the Mesh point from which they received the RANN. In response, the root Mesh point sends a PREP response to each PREQ.
+
+Note
+
+The route takes into consideration a radio-aware metric (airtime metric). The airtime metric is calculated by the airtime_link_metric_get() method (based on the rate and other hardware parameters). Mesh points continuously monitor their links and update metric values with their neighbors.
+
+The station that sent the PREQ may try to send packets to the final destination while still not knowing the route to that destination; these packets are kept in a buffer of SKBs named frame_queue, which is a member of the mesh_path object (net/mac80211/mesh.h). In such a case, when a PREP finally arrives, the pending packets of this buffer are sent to the final destination (by calling the mesh_path_tx_pending() method). The maximum number of frames buffered per unresolved destination is 10 (MESH_FRAME_QUEUE_LEN). A short code sketch of this on-demand flow appears at the end of this section. The advantages of Mesh networking are as follows:
+
+ * Rapid deployment
+
+ * Minimal configuration, inexpensive
+
+ * Easy to deploy in hard-to-wire environments
+
+ * Connectivity while nodes are in motion
+
+ * Higher reliability: no single point of failure and the ability to heal itself
+
+The disadvantages are as follows:
+
+ * Many broadcasts limit network performance.
+
+ * Not all wireless drivers support Mesh mode at the moment.
+
+### Setting Up a Mesh Network
+
+There are two sets of userspace tools for managing wireless devices and networks in Linux. One is the older Wireless Tools for Linux, an open source project based on IOCTLs; examples of its command line utilities are iwconfig, iwlist, ifrename, and more. The newer tool is iw, based on generic netlink sockets (described in Chapter 2). There are some tasks that only iw can perform; for example, you can set a wireless device to work in Mesh mode only with the iw command.
+
+Example: setting a wireless network interface (wlan0) to work in Mesh mode can be done like this:
+
+iw wlan0 set type mesh
+
+Note
+
+Setting a wireless network interface (wlan0) to work in Mesh mode can also be done like this: iw wlan0 set type mp
+
+mp stands for Mesh Point. See "Adding interfaces with iw" in http://wireless.kernel.org/en/users/Documentation/iw
+
+Joining the Mesh is done by: iw wlan0 mesh join "my-mesh-ID"
+
+You can display statistics with the following commands:
+
+ * iw wlan0 station dump (station statistics)
+
+ * iw wlan0 mpath dump (the Mesh path table)
+
+I should also mention here the authsae and wpa_supplicant tools, which can be used to create secure Mesh networks and do not depend upon iw.
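+
+Before leaving the Mesh discussion, here is the promised sketch of the on-demand HWMP flow, loosely modeled on net/mac80211/mesh_hwmp.c; it is a simplified illustration (locking and PREQ generation are omitted, and my_mesh_tx() is a made-up name), not the actual kernel implementation:
+
+/* Sketch: resolve a next hop for an outgoing mesh frame. If no active
+ * path exists yet, park the frame on the path's frame_queue; it will be
+ * sent by mesh_path_tx_pending() once a PREP resolves the path. */
+static int my_mesh_tx(struct ieee80211_sub_if_data *sdata,
+                      struct sk_buff *skb, const u8 *dst)
+{
+        struct mesh_path *mpath;
+
+        mpath = mesh_path_lookup(sdata, dst);
+        if (mpath && (mpath->flags & MESH_PATH_ACTIVE))
+                return 0;       /* path known; the frame can be sent */
+
+        /* No resolved path yet: queue the frame (the queue is bounded
+         * by MESH_FRAME_QUEUE_LEN) and let HWMP issue a PREQ. */
+        if (mpath && skb_queue_len(&mpath->frame_queue) < MESH_FRAME_QUEUE_LEN)
+                skb_queue_tail(&mpath->frame_queue, skb);
+
+        return -ENOENT;
+}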
+
+## Linux Wireless Development Process
+
+Most development is done using the git distributed version control system, as with many other Linux subsystems. There are three main git trees; the bleeding edge is the wireless-testing tree. There are also the regular wireless tree and the wireless-next tree. The following are the links to the git repositories of the development trees:
+
+ * wireless-testing development tree:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
+
+ * wireless development tree:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git
+
+ * wireless-next development tree:
+
+git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6.git
+
+Patches are sent and discussed on the wireless mailing list: linux-wireless@vger.kernel.org. From time to time a pull request is sent to the kernel networking mailing list, netdev, mentioned in Chapter 1.
+
+As mentioned in the "Mac80211 subsystem" section earlier in this chapter, some wireless network interface vendors maintain their own development trees for their Linux drivers on their own sites. In some cases, the code they use does not use the mac80211 API; this is true, for example, of some Ralink and Realtek wireless device drivers. Since January 2006, the maintainer of the Linux wireless subsystem has been John W. Linville, who replaced Jeff Garzik. Johannes Berg has been the maintainer of mac80211 since October 2007. There have been some annual Linux wireless summits; the first took place in 2006 in Beaverton (OR). A very detailed wiki page is available at http://wireless.kernel.org/ . This web site includes a lot of important documentation; for example, a table specifies the modes each wireless network interface supports. There is also a lot of information in this wiki regarding many wireless device drivers, hardware, and various tools (such as CRDA, the central regulatory domain agent, hostapd, iw, and more).
+
+## Summary
+
+A lot of development has been done in the Linux wireless stack in recent years. The most significant change is the integration of the mac80211 stack and the porting of wireless drivers to use the mac80211 API, making the code much better organized. The situation is much better than before; many more wireless devices are supported in Linux. Mesh networking got a boost recently thanks to the open80211s project; it was integrated in the Linux 2.6.26 kernel. The future will probably see more drivers that support the new standard, IEEE802.11ac, a 5 GHz-only technology that can reach maximum throughputs well above a gigabit per second, and more drivers that support P2P.
+
+Chapter 13 discusses InfiniBand and RDMA in the Linux kernel. The "Quick Reference" section covers the top methods that are related to the topics discussed in this chapter, ordered by their context.
+
+## Quick Reference
+
+I conclude this chapter with a short list of important methods of the Linux wireless subsystem, some of which were mentioned in this chapter. Table 12-1 shows the various possible values for the flag member of the ieee80211_rx_status object.
+
+### Methods
+
+This section discusses the methods.
+
+#### void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn);
+
+This method sends a Block Ack Request.
+
+#### int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, u16 timeout);
+
+This method starts a Block Ack session by calling the wireless driver's ampdu_action() callback, passing IEEE80211_AMPDU_TX_START. As a result, the driver will later call the ieee80211_start_tx_ba_cb() callback or the ieee80211_start_tx_ba_cb_irqsafe() callback, which will start the aggregation session.
+
+#### int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid);
+
+This method stops a Block Ack session by calling the wireless driver's ampdu_action() callback, passing IEEE80211_AMPDU_TX_STOP. The driver must later call the ieee80211_stop_tx_ba_cb() callback or the ieee80211_stop_tx_ba_cb_irqsafe() callback.
+
+#### static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u8 dialog_token, u16 start_seq_num, u16 agg_size, u16 timeout);
+
+This method sends an ADDBA message. An ADDBA message is a management action message.
+
+#### void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len);
+
+This method handles an ADDBA message.
+
+#### static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, u16 buf_size, u16 timeout);
+
+This method sends an ADDBA response. An ADDBA response is a management packet, with a sub-type of action (IEEE80211_STYPE_ACTION).
+
+#### static ieee80211_rx_result debug_noinline ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx);
+
+This method handles AMSDU aggregation (Rx path).
+
+#### void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len);
+
+This method handles a DELBA message.
+
+#### void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, u16 initiator, u16 reason_code);
+
+This method sends a DELBA message.
+
+#### void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb);
+
+This method receives a packet. The ieee80211_rx_irqsafe() method can be called in hardware interrupt context.
+
+#### static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, struct sk_buff_head *frames);
+
+This method handles the A-MPDU reorder buffer.
+
+#### static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, struct sk_buff_head *frames);
+
+This method also handles the A-MPDU reorder buffer.
+
+#### static ieee80211_rx_result debug_noinline ieee80211_rx_h_check(struct ieee80211_rx_data *rx);
+
+This method drops duplicate frames of a retransmission and increments dot11FrameDuplicateCount and the station's num_duplicates counter.
+
+#### void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, int powersave);
+
+This method sends a special NULL data frame.
+
+#### void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata);
+
+This method sends a PS-Poll control packet to an AP.
+
+#### static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata);
+
+This method performs association or reassociation by sending a management packet with an association sub-type of IEEE80211_STYPE_ASSOC_REQ or IEEE80211_STYPE_REASSOC_REQ, respectively. The ieee80211_send_assoc() method is invoked from the ieee80211_do_assoc() method.
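+
+Since the ieee80211_rx_irqsafe() entry above is a central driver-facing API, here is a hedged sketch of how a driver's interrupt path might hand a received frame to mac80211 (the band and frequency values are placeholders, and my_handle_rx() is a made-up name):
+
+/* Sketch: attach minimal RX status through the skb control buffer,
+ * then pass the frame to mac80211 from hard-IRQ context. */
+static void my_handle_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+        memset(status, 0, sizeof(*status));
+        status->band = IEEE80211_BAND_2GHZ;
+        status->freq = 2412;            /* channel 1 */
+
+        ieee80211_rx_irqsafe(hw, skb);
+}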
+
+#### void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *da, const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx, u32 tx_flags);
+
+This method performs authentication by sending a management packet with the authentication sub-type (IEEE80211_STYPE_AUTH).
+
+#### static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, u8 tim_len, u16 aid);
+
+This method checks whether tim[aid] is set; the aid is passed as a parameter, and it represents the association id of the station.
+
+#### int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req);
+
+This method starts active scanning.
+
+#### void mesh_path_tx_pending(struct mesh_path *mpath);
+
+This method sends pending packets from the frame_queue.
+
+#### struct mesh_path *mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst);
+
+This method performs a lookup in the Mesh path table (routing table) of a Mesh point. The second parameter to the mesh_path_lookup() method is the hardware address of the destination. It returns NULL if there is no entry in the table; otherwise, it returns a pointer to the mesh path structure that was found.
+
+#### static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata);
+
+This method creates an IBSS.
+
+#### int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
+
+This method is called to apply various configuration changes; in most cases, it delegates the call to the driver's config() callback, if implemented. The second parameter specifies which action to take (for instance, IEEE80211_CONF_CHANGE_CHANNEL to change the channel, or IEEE80211_CONF_CHANGE_PS to change the power save mode of the driver).
+
+#### struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops);
+
+This method allocates a new 802.11 hardware device.
+
+#### int ieee80211_register_hw(struct ieee80211_hw *hw);
+
+This method registers an 802.11 hardware device.
+
+#### void ieee80211_unregister_hw(struct ieee80211_hw *hw);
+
+This method unregisters an 802.11 hardware device and frees its allocated resources.
+
+#### int sta_info_insert(struct sta_info *sta);
+
+This method adds a station to the hash table of stations and to the list of stations.
+
+#### int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr);
+
+This method removes a station and frees its resources.
+
+#### struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr);
+
+This method returns a pointer to a station by performing a lookup in the hash table of stations.
+
+#### void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, bool directed, u32 tx_flags, struct ieee80211_channel *channel, bool scan);
+
+This method sends a probe request management packet.
+
+#### static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
+
+This method transmits an SKB.
+
+#### int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band);
+
+This method returns the frequency on which a station operates, given its channel. There is a one-to-one correspondence between a channel and a frequency.
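+
+As a usage note for the station-table methods above, here is a hedged sketch of a lookup with sta_info_get(); mac80211 expects such lookups to be done under RCU protection (my_sta_exists() is a made-up name):
+
+/* Sketch: look up a station entry by its MAC address under RCU. */
+static bool my_sta_exists(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+        struct sta_info *sta;
+        bool found;
+
+        rcu_read_lock();
+        sta = sta_info_get(sdata, addr);
+        found = (sta != NULL);
+        rcu_read_unlock();
+
+        return found;
+}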
+
+#### static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, const u8 *orig_addr, __le32 orig_sn, u8 target_flags, const u8 *target, __le32 target_sn, const u8 *da, u8 hop_count, u8 ttl, __le32 lifetime, __le32 metric, __le32 preq_id, struct ieee80211_sub_if_data *sdata);
+
+This method sends a PREQ or PREP management packet.
+
+#### static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, const u8 *preq_elem, u32 metric);
+
+This method handles a PREQ message.
+
+#### struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb);
+
+This method returns the ieee80211_rx_status object associated with the control buffer (cb) of the specified SKB.
+
+#### static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool txpending, enum ieee80211_band band);
+
+This method is the main handler for transmission.
+
+### Table
+
+Table 12-1 shows the bits of the flag member (a 32-bit field) of the ieee80211_rx_status structure and the corresponding Linux symbols.
+
+Table 12-1. Rx Flags: Various Possible Values for the Flag Field of the ieee80211_rx_status Object
+
+Linux Symbol | Bit | Description
+---|---|---
+RX_FLAG_MMIC_ERROR | 0 | Michael MIC error was reported on this frame.
+RX_FLAG_DECRYPTED | 1 | This frame was decrypted in hardware.
+RX_FLAG_MMIC_STRIPPED | 3 | The Michael MIC is stripped off this frame; verification has been done by the hardware.
+RX_FLAG_IV_STRIPPED | 4 | The IV/ICV are stripped from this frame.
+RX_FLAG_FAILED_FCS_CRC | 5 | The FCS check failed on the frame.
+RX_FLAG_FAILED_PLCP_CRC | 6 | The PLCP check failed on the frame.
+RX_FLAG_MACTIME_START | 7 | The timestamp passed in the RX status is valid and contains the time the first symbol of the MPDU was received.
+RX_FLAG_SHORTPRE | 8 | Short preamble was used for this frame.
+RX_FLAG_HT | 9 | HT MCS was used, and rate_idx is the MCS index.
+RX_FLAG_40MHZ | 10 | HT40 (40 MHz) was used.
+RX_FLAG_SHORT_GI | 11 | Short guard interval was used.
+RX_FLAG_NO_SIGNAL_VAL | 12 | The signal strength value is not present.
+RX_FLAG_HT_GF | 13 | This frame was received in an HT-greenfield transmission.
+RX_FLAG_AMPDU_DETAILS | 14 | A-MPDU details are known; in particular, the reference number must be populated and be a distinct number for each A-MPDU.
+RX_FLAG_AMPDU_REPORT_ZEROLEN | 15 | Driver reports 0-length subframes.
+RX_FLAG_AMPDU_IS_ZEROLEN | 16 | This is a zero-length subframe, for monitoring purposes only.
+RX_FLAG_AMPDU_LAST_KNOWN | 17 | Last subframe is known; should be set on all subframes of a single A-MPDU.
+RX_FLAG_AMPDU_IS_LAST | 18 | This subframe is the last subframe of the A-MPDU.
+RX_FLAG_AMPDU_DELIM_CRC_ERROR | 19 | A delimiter CRC error has been detected on this subframe.
+RX_FLAG_AMPDU_DELIM_CRC_KNOWN | 20 | The delimiter CRC field is known (the CRC is stored in the ampdu_delimiter_crc field of the ieee80211_rx_status).
+RX_FLAG_MACTIME_END | 21 | The timestamp passed in the RX status is valid and contains the time the last symbol of the MPDU (including FCS) was received.
+RX_FLAG_VHT | 22 | VHT MCS was used, and rate_idx is the MCS index.
+RX_FLAG_80MHZ | 23 | 80 MHz was used.
+RX_FLAG_80P80MHZ | 24 | 80+80 MHz was used.
+RX_FLAG_160MHZ | 25 | 160 MHz was used.
+
+# 13. InfiniBand
+
+Abstract
+
+Chapter 12 dealt with the wireless subsystem and its implementation in Linux. In this chapter, I will discuss the InfiniBand subsystem and its implementation in Linux. Though the InfiniBand technology might be perceived as a very complex technology for those who are unfamiliar with it, the concepts behind it are surprisingly straightforward, as you will see in this chapter. I will start our discussion with Remote Direct Memory Access (RDMA), and discuss its main data structures and its API. I will give some examples illustrating how to work with RDMA, and conclude this chapter with a short discussion about using the RDMA API from the kernel level and userspace.
+
+> This chapter was written by Dotan Barak, an InfiniBand Expert. Dotan is a Senior Software Manager at Mellanox Technologies working on RDMA Technologies. Dotan has been working at Mellanox for more than 10 years in various roles, both as a developer and a manager. Additionally, Dotan maintains a blog about the RDMA technology: http://www.rdmamojo.com .
+
+## RDMA and InfiniBand—General
+
+Remote Direct Memory Access (RDMA) is the ability of one machine to access—that is, to read or write to—memory on a remote machine. There are three main network protocols that support RDMA: InfiniBand, RDMA over Converged Ethernet (RoCE), and the internet Wide Area RDMA Protocol (iWARP); all of them share the same API. InfiniBand is a completely new networking protocol, and its specifications can be found in the document "InfiniBand Architecture Specifications," which is maintained by the InfiniBand Trade Association (IBTA). RoCE allows you to have RDMA over an Ethernet network, and its specification can be found as an annex to the InfiniBand specifications. iWARP is a protocol that allows using RDMA over TCP/IP, and its specifications can be found in the document "An RDMA Protocol Specification," which is maintained by the RDMA Consortium. Verbs is the description of the API used to access RDMA from client code. The RDMA API implementation was introduced to the Linux kernel in version 2.6.11. At the beginning, it supported only InfiniBand; after several kernel versions, iWARP and RoCE support were added to it as well. When describing the API, I mention only one of them, but the text refers to all three. All of the definitions of this API can be found in include/rdma/ib_verbs.h. Here are some notes about the API and the implementation of the RDMA stack:
+
+ * Some of the functions are inline functions, and some of them aren't. Future implementations might change this behavior.
+
+ * Most of the APIs have the prefix "ib"; however, this API supports InfiniBand, iWARP, and RoCE.
+
+ * The header ib_verbs.h contains functions and structures to be used by:
+
+ * The RDMA stack itself
+
+ * Low-level drivers for RDMA devices
+
+ * Kernel modules that use the stack as consumers
+
+I will concentrate on functions and structures that are relevant only for kernel modules that use the stack as consumers (the third case). The following section discusses the RDMA stack organization in the kernel tree.
+
+### The RDMA Stack Organization
+
+Almost all of the kernel RDMA stack code is under drivers/infiniband in the kernel tree. The following are some of its important modules (this is not an exhaustive list, as I do not cover the entire RDMA stack in this chapter):
+
+ * CM: Communication manager (drivers/infiniband/core/cm.c)
+
+ * IPoIB: IP over InfiniBand (drivers/infiniband/ulp/ipoib/)
+
+ * iSER: iSCSI extension for RDMA (drivers/infiniband/ulp/iser/)
+
+ * RDS: Reliable Datagram Socket (net/rds/)
+
+ * SRP: SCSI RDMA protocol (drivers/infiniband/ulp/srp/)
+
+ * Hardware low-level drivers of different vendors (drivers/infiniband/hw)
+
+ * verbs: Kernel verbs (drivers/infiniband/core/verbs.c)
+
+ * uverbs: User verbs (drivers/infiniband/core/uverbs_*.c)
+
+ * MAD: Management datagram (drivers/infiniband/core/mad.c)
+
+Figure 13-1 shows the Linux InfiniBand stack architecture.
+
+Figure 13-1. Linux InfiniBand stack architecture
+
+In this section, I covered the RDMA stack organization and the kernel modules that are part of it in the Linux kernel.
+
+### RDMA Technology Advantages
+
+Here I will cover the advantages of the RDMA technology and explain the features that make it popular in many markets:
+
+ * Zero copy: The ability to directly write data to and read data from remote memory allows you to access remote buffers directly, without the need to copy the data between different software layers.
+
+ * Kernel bypass: Sending and receiving data from the same code context (that is, userspace or kernel level) saves context-switch time.
+
+ * CPU offload: The ability to send or receive data using dedicated hardware without any CPU intervention decreases the CPU usage on the remote side, because it doesn't perform any active operations.
+
+ * Low latency: RDMA technologies allow you to reach a very low latency for short messages. (In current hardware and on current servers, the latency for sending up to tens of bytes can be a couple of hundred nanoseconds.)
+
+ * High bandwidth: In an Ethernet device, the maximum bandwidth is limited by the technology (that is, 10 or 40 Gbits/sec). In InfiniBand, the same protocol and equipment can be used from 2.5 Gbits/sec up to 120 Gbits/sec. (In current hardware and on current servers, the bandwidth can be up to 56 Gbits/sec.)
+
+### InfiniBand Hardware Components
+
+As in other interconnect technologies, several hardware components are described in the InfiniBand spec. Some of them are packet endpoints (they generate packets and are the targets of packets), and some of them forward packets in the same subnet or between different subnets. Here I will cover the most common ones:
+
+ * Host Channel Adapter (HCA): The network adapter that can be placed at a host or at any other system (for example, a storage device). This component initiates packets or is the target of packets.
+
+ * Switch: A component that knows how to receive a packet from one port and send it to another port. If needed, it can duplicate multicast messages. (Broadcast isn't supported in InfiniBand.) Unlike in other technologies, every switch is a very simple device with forwarding tables that are configured by the Subnet Manager (SM), which is an entity that configures and manages the subnet (later in this section, I will discuss its role in more detail). The switch doesn't learn anything by itself or parse and analyze packets; it forwards packets only within the same subnet.
+
+ * Router: A component that connects several different InfiniBand subnets.
+
+A subnet is a set of HCAs, switches, and router ports that are connected together. In this section, I described the various hardware components in InfiniBand, and now I will discuss the addressing of the devices, systems, and ports in InfiniBand.
+
+### Addressing in InfiniBand
+
+Here are some rules about InfiniBand addressing and an example:
+
+ * In InfiniBand, the unique identifier of components is the Globally Unique Identifier (GUID), which is a 64-bit value that is unique in the world.
+
+ * Every node in the subnet has a Node GUID. This is the identifier of the node and a constant attribute of it.
+
+ * Every port in the subnet, including in HCAs and in switches, has a port GUID. This is the identifier of the port and a constant attribute of it.
+
+ * In systems that are made from several components, there can be a system GUID. All of the components in that system have the same system GUID.
+
+Here is an example that demonstrates all the aforementioned GUIDs: a big switch system that is composed of several switch chips. Every switch chip has a unique Node GUID. Every port in every switch has a unique port GUID. All of the chips in that system have the same system GUID.
+
+ * A Global IDentifier (GID) is used to identify an end port or a multicast group. Every port has at least one valid GID in the GID table, at index 0. It is based on the port GUID plus the identifier of the subnet that this port is part of.
+
+ * A Local IDentifier (LID) is a 16-bit value that is assigned to every subnet port by the Subnet Manager. A switch is an exception: its management port has the LID assignment, not each of its ports. Every port can be assigned one LID, or a contiguous range of LIDs in order to have several paths to this port. Each LID is unique at a specific point in time in the same subnet, and it is used by the switches when forwarding packets, to know which egress port to use. The unicast LID range is 0x001 to 0xbfff. The multicast LID range is 0xc000 to 0xfffe.
+
+### InfiniBand Features
+
+Here we will cover some of the InfiniBand protocol features:
+
+ * InfiniBand allows you to configure partitions of ports of HCAs, switches, and routers, and allows you to provide virtual isolation over the same physical subnet. Every Partition Key (P_Key) is a 16-bit value composed as follows: the 15 LSBs are the key value, and the MSB is the membership level, where 0 means limited membership and 1 means full membership. Every port has a P_Key table that is configured by the SM, and every Queue Pair (QP), the actual object in InfiniBand that sends and receives data, is associated with one P_Key index in this table. One QP can send packets to or receive packets from a remote QP only if, for the P_Keys that each of them is associated with, the following is true:
+
+ * The key value is equal.
+
+ * At least one of them has full membership.
+
+ * Queue Key (Q_Key): An Unreliable Datagram (UD) QP will get unicast or multicast messages from a remote UD QP only if the Q_Key of the message is equal to the Q_Key value of this UD QP.
+
+ * Virtual Lanes (VL): This is a mechanism for creating multiple virtual links over a single physical link. Every virtual lane represents an autonomous set of buffers for sent and received packets in each port. The number of supported VLs is an attribute of a port.
+
+ * Service Level (SL): InfiniBand supports up to 16 service levels. The protocol doesn't specify the policy of each level. In InfiniBand, QoS is implemented using the SL-to-VL mapping and the resources allotted for each VL.
+
+ * Failover: Connected QPs are QPs that can send packets to or receive packets from only one remote QP. InfiniBand allows defining a primary path and an alternate path for connected QPs. If there is a problem with the primary path, instead of reporting an error, the alternate path will be used automatically.
+
+In the next section, we will look at what packets in InfiniBand look like. This is very useful when you debug problems in InfiniBand.
+
+### InfiniBand Packets
+
+Every packet in InfiniBand is a combination of several headers and, in many cases, a payload, which is the data of the messages that the clients want to send. Messages that contain only an ACK, or messages with zero bytes (for example, if only immediate data is being sent), won't contain a payload. These headers describe where the packet was sent from, what the target of the packet is, the operation used, the information needed to separate the packets into messages, and enough information to detect packet loss errors.
+
+Figure 13-2 presents the InfiniBand packet headers.
+
+Figure 13-2. InfiniBand packet headers
+
+Here are the headers in InfiniBand:
+
+ * Local Routing Header (LRH): 8 bytes. Always present. It identifies the local source and destination ports of the packet. It also specifies the requested QoS attributes (SL and VL) of the message.
+
+ * Global Routing Header (GRH): 40 bytes. Optional. Present for multicast packets or packets that travel across multiple subnets. It describes the source and destination ports using GIDs. Its format is identical to the IPv6 header.
+
+ * Base Transport Header (BTH): 12 bytes. Always present. It specifies the source and destination QPs, the operation, the packet sequence number, and the partition.
+
+ * Extended Transport Header (ETH): From 4 to 28 bytes. Optional. An extra family of headers that might be present, depending on the class of the service and the operation used.
+
+ * Payload: Optional. The data that the client wants to send.
+
+ * Immediate data: 4 bytes. Optional. An out-of-band, 32-bit value that can be added to Send and RDMA Write operations.
+
+ * Invariant CRC (ICRC): 4 bytes. Always present. It covers all fields that should not be changed as the packet travels in the subnet.
+
+ * Variant CRC (VCRC): 2 bytes. Always present. It covers all of the fields of the packet.
+
+### Management Entities
+
+The SM is the entity in the subnet that is responsible for analyzing the subnet and configuring it. These are some of its missions:
+
+ * Discover the physical topology of the subnet.
+
+ * Assign the LIDs and other attributes—such as active MTU, active speeds, and more—to each port in the subnet.
+
+ * Configure the forwarding tables in the subnet switches.
+
+ * Detect any changes in the topology (for example, if new nodes were added to or removed from the subnet).
+
+ * Handle various errors in the subnet.
+
+The Subnet Manager is usually a software entity that can run in a switch (which is then called a managed switch) or in any node in the subnet.
+
+Several SMs can be running in a subnet, but only one of them will be active; the rest of them will be in standby mode. There is an internal protocol that performs master election and decides which SM will be active. If the active SM goes down, one of the standby SMs will become the active SM. Every port in the subnet has a Subnet Management Agent (SMA), which is an agent that knows how to receive management messages sent by the SM, handle them, and return a response. The Subnet Administrator (SA) is a service that is part of the SM. These are some of its missions:
+
+ * Provide information about the subnet—for example, information about how to get from one port to another (that is, a path query).
+
+ * Allow you to register to get notifications about events.
+
+ * Provide services for management of the subnet, such as joining or leaving a multicast group. Those services might cause the SM to (re)configure the subnet.
+
+The Communication Manager (CM) is an entity that is capable of running on each port, if the port supports it, to establish, maintain, and tear down QP connections.
+
+## RDMA Resources
+
+In the RDMA API, a lot of resources need to be created and handled before any data can be sent or received. All of the resources are in the scope of a specific RDMA device; those resources cannot be shared or used across more than one local device, even if there are multiple devices in the same machine. Figure 13-3 presents the RDMA resource creation hierarchy.
+
+Figure 13-3. RDMA resource creation hierarchy
+
+### RDMA Device
+
+The client needs to register with the RDMA stack in order to be notified about any RDMA device that is added to the system or removed from it. After the initial registration, the client is notified about all existing RDMA devices. A callback will be invoked for every RDMA device, and the client can start working with these devices in the following ways:
+
+ * Query the device for various attributes
+
+ * Modify the device attributes
+
+ * Create, work with, and destroy resources
+
+The ib_register_client() method registers a kernel client that wants to use the RDMA stack. The specified callbacks will be invoked for every InfiniBand device that currently exists in the system and for every device that is later added to or removed from (using hot-plug functionality) the system. The ib_unregister_client() method unregisters a kernel client that wants to stop using the RDMA stack. Usually, it is called when the driver is being unloaded. Here is sample code that shows how to register with the RDMA stack in a kernel client:
+
+static void my_add_one(struct ib_device *device)
+{
+        ...
+}
+
+static void my_remove_one(struct ib_device *device)
+{
+        ...
+}
+
+static struct ib_client my_client = {
+        .name   = "my RDMA module",
+        .add    = my_add_one,
+        .remove = my_remove_one
+};
+
+static int __init my_init_module(void)
+{
+        int ret;
+
+        ret = ib_register_client(&my_client);
+        if (ret) {
+                printk(KERN_ERR "Failed to register IB client\n");
+                return ret;
+        }
+
+        return 0;
+}
+
+static void __exit my_cleanup_module(void)
+{
+        ib_unregister_client(&my_client);
+}
+
+module_init(my_init_module);
+module_exit(my_cleanup_module);
+
+Following is a description of several more methods for handling an InfiniBand device:
+
+ * The ib_set_client_data() method sets a client context to be associated with an InfiniBand device.
+
+ * The ib_get_client_data() method returns the client context that was associated with an InfiniBand device using the ib_set_client_data() method.
+
+ * The ib_register_event_handler() method registers a callback to be called for every asynchronous event that occurs on the InfiniBand device. The callback structure must be initialized with the INIT_IB_EVENT_HANDLER macro.
+
+ * The ib_unregister_event_handler() method unregisters the event handler.
+
+ * The ib_query_device() method queries the InfiniBand device for its attributes. Those attributes are constant and won't change in subsequent calls of this method.
+
+ * The ib_query_port() method queries the InfiniBand device port for its attributes. Some of those attributes are constant, and some of them might change in subsequent calls of this method—for example, the port LID, state, and some other attributes.
+
+ * The rdma_port_get_link_layer() method returns the link layer of the device port.
+
+ * The ib_query_gid() method queries the InfiniBand device port's GID table at a specific index. The ib_find_gid() method returns the index of a specific GID value in a port's GID table.
+
+ * The ib_query_pkey() method queries the InfiniBand device port's P_Key table at a specific index. The ib_find_pkey() method returns the index of a specific P_Key value in a port's P_Key table.
+
+### Protection Domain (PD)
+
+A PD can be associated with several other RDMA resources—such as an SRQ, QP, AH, or MR—in order to provide a means of protection among them. RDMA resources that are associated with PDx cannot work with RDMA resources that were associated with PDy; trying to mix those resources will end with an error. Typically, every module will have one PD. However, if a specific module wants to increase its security, it will use one PD for each remote QP or service that it uses. Allocation and deallocation of a PD is done like this:
+
+ * The ib_alloc_pd() method allocates a PD. It takes as an argument the pointer to the device object that was returned when the driver callback was called after its registration.
+
+ * The ib_dealloc_pd() method deallocates a PD. It is usually called when the driver is being unloaded or when the resources that are associated with the PD are being destroyed.
+
+### Address Handle (AH)
+
+An AH is used in the Send Request of a UD QP to describe the path of the message from the local port to the remote port. The same AH can be used for several QPs if all of them send messages to the same remote port using the same attributes. Following is a description of four methods related to the AH:
+
+ * The ib_create_ah() method creates an AH. It takes as arguments a PD and the attributes for the AH. The attributes of the AH can be filled in directly or by calling the ib_init_ah_from_wc() method, which takes as parameters a received Work Completion (an ib_wc object) that includes the attributes of a successfully completed incoming message, and the port it was received on. Instead of calling the ib_init_ah_from_wc() method and then the ib_create_ah() method, one can call the ib_create_ah_from_wc() method.
+
+ * The ib_modify_ah() method modifies the attributes of an existing AH.
+
+ * The ib_query_ah() method queries for the attributes of an existing AH.
+
+ * The ib_destroy_ah() method destroys an AH. It is called when there isn't a need to send any further messages to the node that the AH describes the path to.
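+
+To make the shortcut mentioned above concrete, here is a hedged sketch of answering the sender of a received UD message; pd, wc, grh, and port_num are assumed to already exist in the surrounding code:
+
+/* Sketch: build an AH describing the reverse path directly from a
+ * received Work Completion, instead of filling the attributes by hand. */
+struct ib_ah *ah;
+
+ah = ib_create_ah_from_wc(pd, &wc, &grh, port_num);
+if (IS_ERR(ah)) {
+        printk(KERN_ERR "Failed to create an AH from a Work Completion\n");
+        return PTR_ERR(ah);
+}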
+
+### Memory Region (MR)
+
+Every memory buffer that is accessed by the RDMA device needs to be registered. During the registration process, the following tasks are performed on the memory buffer:
+
+ * The contiguous memory buffer is separated into memory pages.
+
+ * The virtual-to-physical translation is mapped.
+
+ * The memory page permissions are checked, to ensure that the requested permissions for the MR are supported by the pages.
+
+ * The memory pages are pinned, to prevent them from being swapped out. This keeps the virtual-to-physical mapping unchanged.
+
+After a successful memory registration is completed, the MR has two keys:
+
+ * Local key (lkey): A key for accessing this memory by local Work Requests.
+
+ * Remote key (rkey): A key for accessing this memory by a remote machine using RDMA operations.
+
+Those keys will be used in Work Requests when referring to those memory buffers. The same memory buffers can be registered several times, even with different permissions. The following is a description of some methods related to the MR:
+
+ * The ib_get_dma_mr() method returns a Memory Region for system memory that is usable for DMA. It takes a PD and the requested access permissions for the MR as arguments.
+
+ * The ib_dma_map_single() method maps a kernel virtual address, which was allocated by the kmalloc() method family, to a DMA address. This DMA address will be used to access local and remote memory. The ib_dma_mapping_error() method should be used to check whether the mapping was successful.
+
+ * The ib_dma_unmap_single() method unmaps a DMA mapping that was done using ib_dma_map_single(). It should be called when this memory isn't needed anymore.
+
+Note
+
+There are some more flavors of ib_dma_map_single() that allow the mapping of pages, mapping according to DMA attributes, mapping using a scatter/gather list, or mapping using a scatter/gather list with DMA attributes: ib_dma_map_page(), ib_dma_map_single_attrs(), ib_dma_map_sg(), and ib_dma_map_sg_attrs(). All of them have corresponding unmap functions.
+
+Before accessing DMA-mapped memory, the ib_dma_sync_single_for_cpu() method should be called if the DMA region is going to be accessed by the CPU, or the ib_dma_sync_single_for_device() method if the DMA region is going to be accessed by the InfiniBand device. Here are some more related methods:
+
+ * The ib_dma_alloc_coherent() method allocates a memory block that can be accessed by the CPU and maps it for DMA.
+
+ * The ib_dma_free_coherent() method frees a memory block that was allocated using ib_dma_alloc_coherent().
+
+ * The ib_reg_phys_mr() method takes a set of physical pages, registers them, and prepares a virtual address that can be accessed by an RDMA device. If you want to change the registration after it was created, you should call the ib_rereg_phys_mr() method.
+
+ * The ib_query_mr() method retrieves the attributes of a specific MR. Note that most low-level drivers do not implement this method.
+
+ * The ib_dereg_mr() method deregisters an MR.
+
+### Fast Memory Region (FMR) Pool
+
+Registration of a Memory Region is a "heavy" procedure that might take some time to complete, and the context that performs it might even sleep if required resources aren't available when it is called. This behavior might be problematic when performed in certain contexts—for example, in an interrupt handler. Working with an FMR pool allows you to work with FMRs whose registration is "lightweight" and can be performed in any context. The API of the FMR pool can be found in include/rdma/ib_fmr_pool.h.
+
+### Memory Window (MW)
+
+Enabling remote access to memory can be done in two ways:
+
+ * Register a memory buffer with remote permissions enabled.
+
+ * Register a Memory Region and then bind a Memory Window to it.
+
+Both of these ways will create a remote key (rkey) that can be used to access this memory with the specified permissions. However, if you wish to invalidate the rkey to prevent remote access to this memory, deregistering the Memory Region might be a heavy procedure. Working with a Memory Window on this Memory Region, and binding or unbinding it when needed, provides a "lightweight" procedure for enabling and disabling remote access to memory. Following is a description of three methods related to the MW:
+
+ * The ib_alloc_mw() method allocates a Memory Window. It takes a PD and the MW type as arguments.
+
+ * The ib_bind_mw() method binds a Memory Window to a specified Memory Region with a specific address, size, and remote permissions, by posting a special Work Request to a QP. It is called when you want to allow temporary remote access to its memory. A Work Completion in the Send Queue of the QP will be generated to describe the status of this operation. If ib_bind_mw() is called for a Memory Window that is already bound, whether to the same Memory Region or a different one, the previous binding will be invalidated.
+
+ * The ib_dealloc_mw() method deallocates the specified MW object.
+
+### Completion Queue (CQ)
+
+Every posted Work Request, to either the Send or Receive Queue, is considered outstanding until there is a corresponding Work Completion for it or for any Work Request that was posted after it. While the Work Request is outstanding, the content of the memory buffers that it points to is undetermined:
+
+ * If the RDMA device reads this memory and sends its content over the wire, the client cannot know if this buffer can be (re)used or released. If this is a reliable QP, a successful Work Completion means that the message was received by the remote side. If this is an unreliable QP, a successful Work Completion means that the message was sent.
+
+ * If the RDMA device writes a message to this memory, the client cannot know if this buffer contains the incoming message.
+
+A Work Completion specifies that the corresponding Work Request was completed and provides some information about it: its status, the opcode used, its size, and so on. A CQ is an object that contains the Work Completions. The client needs to poll the CQ in order to read the Work Completions that it holds. The CQ works on a first-in, first-out (FIFO) basis: the order in which Work Completions are de-queued by the client matches the order in which they were enqueued to the CQ by the RDMA device. The client can read the Work Completions in polling mode or request to get a notification when a new Work Completion is added to the CQ. A CQ cannot hold more Work Completions than its size. If more Work Completions than its capacity are added to it, a Work Completion with an error will be added, a CQ error asynchronous event will be generated, and all the Work Queues associated with it will get an error. Here are some methods related to the CQ:
+
+ * The ib_create_cq() method creates a CQ. It takes the following as its arguments: the pointer to the device object that was returned when the driver callback was called after its registration, and the attributes for the CQ, including its size and the callbacks that will be called when there is an asynchronous event on this CQ or a Work Completion is added to it.
+
+ * The ib_resize_cq() method changes the size of a CQ. The new number of entries cannot be less than the number of the Work Completions that currently populate the CQ.
+
+ * The ib_modify_cq() method changes the moderation parameters for a CQ. A Completion event will be generated if at least a specific number of Work Completions enter the CQ or a timeout expires. Using it might help reduce the number of interrupts generated by an RDMA device.
+
+ * The ib_peek_cq() method returns the number of available Work Completions in a CQ.
+
+ * The ib_req_notify_cq() method requests that a Completion event notification be generated when the next Work Completion, or the next Work Completion that includes a solicited event indication, is added to the CQ. If no Work Completion is added to the CQ after the ib_req_notify_cq() method was called, no Completion event notification will occur.
+
+ * The ib_req_ncomp_notif() method requests that a Completion event notification be created when a specific number of Work Completions exists in the CQ. Unlike the ib_req_notify_cq() method, when calling the ib_req_ncomp_notif() method, a Completion event notification will be generated even if the CQ currently holds this number of Work Completions.
+
+ * The ib_poll_cq() method polls for Work Completions from a CQ. It reads the Work Completions from the CQ in the order they were added to it and removes them from it.
+
+Here is an example of code that empties a CQ—that is, reads all the Work Completions from a CQ and checks their status:
+
+struct ib_wc wc;
+int num_comp = 0;
+
+while (ib_poll_cq(cq, 1, &wc) > 0) {
+        if (wc.status != IB_WC_SUCCESS) {
+                printk(KERN_ERR "The Work Completion[%d] has a bad status %d\n",
+                       num_comp, wc.status);
+                return -EINVAL;
+        }
+        num_comp++;
+}
+
+### eXtended Reliable Connected (XRC) Domain
+
+An XRC Domain is an object that is used to limit the XRC SRQs that an incoming message can target. An XRC domain can be associated with several other RDMA resources that work with XRC, such as an SRQ or a QP.
+
+### Shared Receive Queue (SRQ)
+
+An SRQ is a way for the RDMA architecture to be more scalable on the receive side. Instead of having a separate Receive Queue for every Queue Pair, there is a shared Receive Queue that all of the QPs are connected to. When they need to consume a Receive Request, they fetch it from the SRQ. Figure 13-4 presents QPs that are associated with an SRQ.
+
+Figure 13-4. QPs that are associated with an SRQ
+
+Consider what happens if you have N QPs, and each of them might receive a burst of M messages at a random time:
+
+ * Without an SRQ, you post N*M Receive Requests.
+
+ * With an SRQ, you post K*M (where K << N) Receive Requests.
+
+Unlike a QP, which doesn't have any mechanism to determine the number of outstanding Work Requests in it, with an SRQ you can set a watermark limit. When the number of Receive Requests drops below this limit, an SRQ limit asynchronous event will be created for this SRQ. The downside of using an SRQ is that you cannot predict which QP will consume each posted Receive Request from the SRQ, so the message size that each posted Receive Request can hold must be the maximum incoming message size that any of the QPs might get. This limitation can be handled by creating several SRQs, one for each different maximum message size, and associating them with the relevant QPs according to their expected message sizes.
+
+Here is a description of some methods related to the SRQ and an example:
+
+ * The ib_create_srq() method creates an SRQ. It takes a PD and the attributes for the SRQ.
+
+ * The ib_modify_srq() method modifies the attributes of the SRQ. It is used to set a new watermark value for the SRQ's limit event or to resize the SRQ, for devices that support it.
+
+Here is an example of setting the value of the watermark, to get an asynchronous event when the number of Receive Requests in the SRQ drops below 5:
+
+struct ib_srq_attr srq_attr;
+int ret;
+
+memset(&srq_attr, 0, sizeof(srq_attr));
+srq_attr.srq_limit = 5;
+
+ret = ib_modify_srq(srq, &srq_attr, IB_SRQ_LIMIT);
+if (ret) {
+        printk(KERN_ERR "Failed to set the SRQ's limit value\n");
+        return ret;
+}
+
+Following is a description of several more methods for handling an SRQ:
+
+ * The ib_query_srq() method queries for the current SRQ attributes. This method is usually used to check the content of the SRQ's limit value. The value 0 in the srq_limit member of the ib_srq_attr object means that there isn't any SRQ limit watermark set.
+
+ * The ib_destroy_srq() method destroys an SRQ.
+
+ * The ib_post_srq_recv() method takes a linked list of Receive Requests as an argument and adds them to a specified Shared Receive Queue for future processing.
+
+Here is an example of posting a single Receive Request to an SRQ. It saves an incoming message in a memory buffer, using its registered DMA address in a single gather entry:
+
+struct ib_recv_wr wr, *bad_wr;
+struct ib_sge sg;
+int ret;
+
+memset(&sg, 0, sizeof(sg));
+sg.addr = dma_addr;
+sg.length = len;
+sg.lkey = mr->lkey;
+
+memset(&wr, 0, sizeof(wr));
+wr.next = NULL;
+wr.wr_id = (uintptr_t)dma_addr;
+wr.sg_list = &sg;
+wr.num_sge = 1;
+
+ret = ib_post_srq_recv(srq, &wr, &bad_wr);
+if (ret) {
+        printk(KERN_ERR "Failed to post Receive Request to an SRQ\n");
+        return ret;
+}
+
+### Queue Pair (QP)
+
+The Queue Pair is the actual object used to send and receive data in InfiniBand. It has two separate Work Queues: a Send Queue and a Receive Queue. Every Work Queue has a specific number of Work Requests (WRs) that can be posted to it, a number of scatter/gather elements that are supported for each WR, and a CQ to which Work Requests whose processing has ended add a Work Completion. Those Work Queues can be created with similar or different attributes—for example, the number of WRs that can be posted to each Work Queue. The order within each Work Queue is guaranteed—that is, the processing of a Work Request in the Send Queue will start according to the order of Send Request submission, and the same behavior applies to the Receive Queue. However, there isn't any relation between them—that is, an outstanding Send Request can be processed even if it was posted after a Receive Request was posted to the Receive Queue. Figure 13-5 presents a QP.
+
+Figure 13-5. QP (Queue Pair)
+
+Upon creation, every QP has a number that is unique across the RDMA device at a specific point in time.
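+
+Before looking at the QP transport types, here is a hedged sketch of posting a single Send Request with one gather entry, in the same style as the SRQ example above (dma_addr, len, and mr are assumed to come from an earlier registration, and the QP is assumed to be in a state that allows sending):
+
+struct ib_send_wr wr, *bad_wr;
+struct ib_sge sg;
+int ret;
+
+memset(&sg, 0, sizeof(sg));
+sg.addr = dma_addr;
+sg.length = len;
+sg.lkey = mr->lkey;
+
+memset(&wr, 0, sizeof(wr));
+wr.wr_id = (uintptr_t)dma_addr;
+wr.sg_list = &sg;
+wr.num_sge = 1;
+wr.opcode = IB_WR_SEND;
+wr.send_flags = IB_SEND_SIGNALED;
+
+ret = ib_post_send(qp, &wr, &bad_wr);
+if (ret) {
+        printk(KERN_ERR "Failed to post a Send Request\n");
+        return ret;
+}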
#### QP Transport Types

There are several QP transport types supported in InfiniBand:

* Reliable Connected (RC): One RC QP is connected to a single remote RC QP, and reliability is guaranteed—that is, the arrival of all packets, in order and with the same content that they were sent with, is guaranteed. Every message is fragmented into packets of the path MTU size at the sender side and defragmented at the receiver side. This QP supports Send, RDMA Write, RDMA Read, and Atomic operations.

* Unreliable Connected (UC): One UC QP is connected to a single remote UC QP, and reliability isn't guaranteed. Also, if a packet in a message is lost, the whole message is lost. Every message is fragmented into packets of the path MTU size at the sender side and defragmented at the receiver side. This QP supports Send and RDMA Write operations.

* Unreliable Datagram (UD): One UD QP can send a unicast message to any UD QP in the subnet. Multicast messages are supported. Reliability isn't guaranteed. Every message is limited to one packet, with its size limited to the path MTU size. This QP supports only Send operations.

* eXtended Reliable Connected (XRC): Several QPs from the same node can send messages to a remote SRQ in a specific node. This is useful for decreasing the number of QPs between two nodes from the order of the number of CPU cores (that is, a QP per core in each process) to one QP. This QP supports all operations that are supported by an RC QP. This type is relevant only for userspace applications.

* Raw packet: Allows the client to build a complete packet, including the L2 headers, and send it as is. On the receiver side, no header will be stripped by the RDMA device.

* Raw IPv6/Raw Ethertype: QPs that allow sending raw packets that aren't interpreted by the IB device. Currently, neither of these types is supported by any RDMA device.

There are special QP transport types that are used for subnet management and special services:

* SMI/QP0: QP used for subnet management packets.

* GSI/QP1: QP used for general services packets.

The ib_create_qp() method creates a QP. It takes as arguments a PD and the requested attributes that this QP will be created with. Here is an example of creating an RC QP using a PD that was created earlier, with two different CQs: one for the Send Queue and one for the Receive Queue.

struct ib_qp_init_attr init_attr;
struct ib_qp *qp;

memset(&init_attr, 0, sizeof(init_attr));
init_attr.event_handler = my_qp_event;
init_attr.cap.max_send_wr = 2;
init_attr.cap.max_recv_wr = 2;
init_attr.cap.max_recv_sge = 1;
init_attr.cap.max_send_sge = 1;
init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_attr.qp_type = IB_QPT_RC;
init_attr.send_cq = send_cq;
init_attr.recv_cq = recv_cq;

qp = ib_create_qp(pd, &init_attr);
if (IS_ERR(qp)) {
        printk(KERN_ERR "Failed to create a QP\n");
        return PTR_ERR(qp);
}

#### QP State Machine

A QP has a state machine that defines what the QP is capable of doing in each state:

* Reset state: Each QP is generated in this state. In this state, no Send Requests or Receive Requests can be posted to it. All incoming messages are silently dropped.

* Initialized state: In this state, no Send Requests can be posted to it. However, Receive Requests can be posted, although they won't be processed. All incoming messages are silently dropped.
It is good practice to post Receive Requests to a QP in this state, before moving it to RTR (Ready To Receive). Doing this prevents a case where the remote QP sends messages that need to consume Receive Requests before any were posted.

* Ready To Receive (RTR) state: In this state, no Send Requests can be posted, but Receive Requests can be posted and processed. All incoming messages will be handled. The first incoming message that is received in this state generates the communication-established asynchronous event. A QP that only receives messages can stay in this state.

* Ready To Send (RTS) state: In this state, both Send Requests and Receive Requests can be posted and processed. All incoming messages will be handled. This is the common state for QPs.

* Send Queue Drained (SQD) state: In this state, the QP completes the processing of all the Send Requests whose processing has already started. Only when there aren't any messages left to send can you change some of the QP attributes. This state is separated into two internal states:

  * Draining: Messages are still being sent.

  * Drained: The sending of the messages has completed.

* Send Queue Error (SQE) state: The RDMA device automatically moves a QP to this state when there is an error in the Send Queue, for unreliable transport types. The Send Request that caused the error is completed with the error reason, and all of the consecutive Send Requests are flushed. The Receive Queue still works—that is, Receive Requests can be posted, and incoming messages are handled. The client can recover from this state and modify the QP state back to RTS.

* Error state: In this state, all of the outstanding Work Requests are flushed. The RDMA device moves the QP to this state if this is a reliable transport type and there was an error with a Send Request, or if there was an error in the Receive Queue, regardless of which transport type was used. All incoming messages are silently dropped.

A QP can be transitioned by ib_modify_qp() from any state to the Reset state and to the Error state. Moving the QP to the Error state flushes all of the outstanding Work Requests. Moving the QP to the Reset state clears all previously configured attributes and removes all of the outstanding Work Requests, as well as the Work Completions of this QP in the Completion Queues that this QP works with. Figure 13-6 presents the QP state machine diagram.

Figure 13-6. QP state machine

The ib_modify_qp() method modifies the attributes of a QP. It takes as arguments the QP to modify and the attributes of the QP that will be modified. The state machine of the QP can be changed according to the diagram shown in Figure 13-6. Every QP transport type requires different attributes to be set in each QP state transition.

Here is an example of modifying a newly created RC QP to the RTS state, in which it can send and receive packets. The local attributes are the outgoing port, the SL to use, and the starting Packet Serial Number for the Send Queue. The needed remote attributes are the Receive PSN, the QP number, and the LID of the port that it uses.
struct ib_qp_attr attr = {
        .qp_state = IB_QPS_INIT,
        .pkey_index = 0,
        .port_num = port,
        .qp_access_flags = 0
};

ret = ib_modify_qp(qp, &attr,
                   IB_QP_STATE |
                   IB_QP_PKEY_INDEX |
                   IB_QP_PORT |
                   IB_QP_ACCESS_FLAGS);
if (ret) {
        printk(KERN_ERR "Failed to modify QP to INIT state\n");
        return ret;
}

attr.qp_state = IB_QPS_RTR;
attr.path_mtu = mtu;
attr.dest_qp_num = remote->qpn;
attr.rq_psn = remote->psn;
attr.max_dest_rd_atomic = 1;
attr.min_rnr_timer = 12;
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = remote->lid;
attr.ah_attr.sl = sl;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;

ret = ib_modify_qp(qp, &attr,
                   IB_QP_STATE |
                   IB_QP_AV |
                   IB_QP_PATH_MTU |
                   IB_QP_DEST_QPN |
                   IB_QP_RQ_PSN |
                   IB_QP_MAX_DEST_RD_ATOMIC |
                   IB_QP_MIN_RNR_TIMER);
if (ret) {
        printk(KERN_ERR "Failed to modify QP to RTR state\n");
        return ret;
}

attr.qp_state = IB_QPS_RTS;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 6;
attr.sq_psn = my_psn;
attr.max_rd_atomic = 1;

ret = ib_modify_qp(qp, &attr,
                   IB_QP_STATE |
                   IB_QP_TIMEOUT |
                   IB_QP_RETRY_CNT |
                   IB_QP_RNR_RETRY |
                   IB_QP_SQ_PSN |
                   IB_QP_MAX_QP_RD_ATOMIC);
if (ret) {
        printk(KERN_ERR "Failed to modify QP to RTS state\n");
        return ret;
}

Following is a description of several more methods for handling a QP:

* The ib_query_qp() method queries for the current QP attributes. Some of the attributes are constant (the values that the client specified), and some of them can change (for example, the state).

* The ib_destroy_qp() method destroys a QP. It is called when the QP isn't needed anymore.

### Work Request Processing

Every Work Request posted to either the Send Queue or the Receive Queue is considered outstanding until a Work Completion is polled, from the CQ associated with that Work Queue, for this Work Request or for a Work Request in the same Work Queue that was posted after it. Every outstanding Work Request in the Receive Queue ends with a Work Completion. The Work Request processing flow in a Work Queue follows the diagram shown in Figure 13-7.

Figure 13-7. Work Request processing flow

In the Send Queue, you can choose (when creating a QP) whether you want every Send Request to end with a Work Completion or whether you want to select which Send Requests will end with Work Completions—that is, selective signaling. However, if an unsignaled Send Request encounters an error, a Work Completion with bad status will nevertheless be generated for it.

While a Work Request is outstanding, you cannot (re)use or free the resources that were specified when posting it. For example:

* When posting a Send Request for a UD QP, the AH cannot be freed.

* When posting a Receive Request, the memory buffers that were referred to in its scatter/gather (s/g) list cannot be read, because it is unknown whether the RDMA device has already written the data to them.

"Fencing" is the ability to prevent the processing of a specific Send Request until the processing of the previous RDMA Read and Atomic operations ends. Adding the Fence indication to a Send Request can be useful, for example, when using RDMA Read from a remote address and sending the data, or part of it, in the same Send Queue. Without fencing, the send operation might start before the data is retrieved and available in local memory.
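In the kernel API, the fence indication is expressed as a Send Request flag. As a minimal sketch, reusing the style of the Send Request examples in this chapter, a fenced Send Request differs only in its send_flags member:

/* Wait for prior RDMA Read and Atomic operations on this Send
 * Queue to complete before processing this Send Request. */
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;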
When posting a Send Request to a UC or RC QP, the path to the target is known, because it was provided when moving the QP to the RTR state. However, when posting a Send Request to a UD QP, you need to add an AH to describe the path to the target (or targets) of this message. If there is an error related to the Send Queue, and this is an Unreliable transport type, the Send Queue moves to the Error state (that is, the SQE state), but the Receive Queue is still fully functional. The client can recover from this state and change the QP state back to RTS. If there is an error related to the Receive Queue, the QP is moved to the Error state, because this is an unrecoverable error. When a Work Queue is moved to the Error state, the Work Request that caused the error ends with a status that indicates the nature of the error, and the rest of the Work Requests in this Queue are flushed with an error.

### Supported Operations in the RDMA Architecture

There are several operation types supported in InfiniBand:

* Send: Send a message over the wire. The remote side needs to have a Receive Request available, and the message will be written in its buffers.

* Send with Immediate: Send a message over the wire with an extra 32 bits of out-of-band data. The remote side needs to have a Receive Request available, and the message will be written in its buffers. The immediate data will be available in the Work Completion of the receiver.

* RDMA Write: Send a message over the wire, and write it to a remote address (a posting sketch appears after this list).

* RDMA Write with Immediate: Send a message over the wire, and write it to a remote address. The remote side needs to have a Receive Request available. The immediate data will be available in the Work Completion of the receiver. This operation can be seen as RDMA Write + Send with Immediate with a zero-byte message.

* RDMA Read: Read a remote address, and fill the local buffer with its content.

* Compare and Swap: Compare the content of a remote address with valueX; if they are equal, replace its content with valueY. All of this is performed atomically. The original remote memory content is sent and saved locally.

* Fetch and Add: Add a value to the content of a remote address atomically. The original remote memory content is sent and saved locally.

* Masked Compare and Swap: Compare the part of the content of a remote address selected by maskX with valueX; if they are equal, replace the part of its content selected by the bits in maskY with valueY. All of this is performed atomically. The original remote memory content is sent and saved locally.

* Masked Fetch and Add: Add a value to the content of a remote address atomically, changing only the bits that are specified in the mask. The original remote memory content is sent and saved locally.

* Bind Memory Window: Bind a Memory Window to a specific Memory Region.

* Fast registration: Register a Fast Memory Region using a Work Request.

* Local invalidate: Invalidate a Fast Memory Region using a Work Request. If anyone uses its old lkey/rkey, it will be considered an error. It can be combined with Send/RDMA Read; in such a case, first the send/read will be performed, and only then will this Fast Memory Region be invalidated.
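To make the RDMA Write operation from the list above concrete, here is a minimal sketch of posting one. It assumes that remote_addr and rkey, which describe the target memory at the peer, were exchanged out of band (for example, over a Send/Receive handshake), and that dma_addr, len, and mr are a local DMA-mapped buffer and its MR, as in the earlier examples:

struct ib_send_wr wr, *bad_wr;
struct ib_sge sg;
int ret;

memset(&sg, 0, sizeof(sg));
sg.addr = dma_addr;
sg.length = len;
sg.lkey = mr->lkey;

memset(&wr, 0, sizeof(wr));
wr.wr_id = (uintptr_t)dma_addr;
wr.sg_list = &sg;
wr.num_sge = 1;
wr.opcode = IB_WR_RDMA_WRITE;
wr.send_flags = IB_SEND_SIGNALED;
/* The target of the write on the remote side: */
wr.wr.rdma.remote_addr = remote_addr;
wr.wr.rdma.rkey = rkey;

ret = ib_post_send(qp, &wr, &bad_wr);
if (ret) {
        printk(KERN_ERR "Failed to post RDMA Write Send Request\n");
        return ret;
}

Because a plain RDMA Write does not consume a Receive Request on the remote side, the peer is not notified that the data arrived; RDMA Write with Immediate can be used when such a notification is needed.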
For operations that consume a Receive Request, the Receive Request specifies where the incoming message will be saved. The total size of the memory buffers specified in its scatter list must be equal to or greater than the size of the incoming message.

For a UD QP, because the origin of the message is unknown in advance (same subnet or another subnet, unicast or multicast message), an extra 40 bytes, which is the GRH header size, must be added to the Receive Request buffers. The first 40 bytes will be filled with the GRH of the message, if one is available. This GRH information describes how to send a message back to the sender. The message itself starts at offset 40 in the memory buffers that were described in the scatter list.

The ib_post_recv() method takes a linked list of Receive Requests and adds them to the Receive Queue of a specific QP for future processing. Here is an example of posting a single Receive Request for a QP. It saves an incoming message in a memory buffer using its registered DMA address in a single gather entry. qp is a pointer to a QP that was created using ib_create_qp(). The memory buffer is a block that was allocated using kmalloc() and mapped for DMA using ib_dma_map_single(). The lkey used is from the MR that was registered using ib_get_dma_mr().

struct ib_recv_wr wr, *bad_wr;
struct ib_sge sg;
int ret;

memset(&sg, 0, sizeof(sg));
sg.addr = dma_addr;
sg.length = len;
sg.lkey = mr->lkey;

memset(&wr, 0, sizeof(wr));
wr.next = NULL;
wr.wr_id = (uintptr_t)dma_addr;
wr.sg_list = &sg;
wr.num_sge = 1;

ret = ib_post_recv(qp, &wr, &bad_wr);
if (ret) {
        printk(KERN_ERR "Failed to post Receive Request to a QP\n");
        return ret;
}

The ib_post_send() method takes as an argument a linked list of Send Requests and adds them to the Send Queue of a specific QP for future processing. Here is an example of posting a single Send Request of a Send operation for a QP. It sends the content of a memory buffer using its registered DMA address in a single gather entry.

struct ib_sge sg;
struct ib_send_wr wr, *bad_wr;
int ret;

memset(&sg, 0, sizeof(sg));
sg.addr = dma_addr;
sg.length = len;
sg.lkey = mr->lkey;

memset(&wr, 0, sizeof(wr));
wr.next = NULL;
wr.wr_id = (uintptr_t)dma_addr;
wr.sg_list = &sg;
wr.num_sge = 1;
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED;

ret = ib_post_send(qp, &wr, &bad_wr);
if (ret) {
        printk(KERN_ERR "Failed to post Send Request to a QP\n");
        return ret;
}

#### Work Completion Status

Every Work Completion can end successfully or with an error. If it ends successfully, the operation was finished and the data was sent according to the transport type's reliability level. If the Work Completion contains an error, the content of the memory buffers is unknown. There can be many reasons for a Work Request status that indicates an error: a protection violation, a bad address, and so on. Violation errors won't trigger any retransmission. However, there are two special retry flows that are worth mentioning. Both of them are performed automatically by the RDMA device, which retransmits packets until the problem is solved or until it exceeds the number of retransmissions. If the issue is solved, the client code won't be aware that it ever happened, besides a temporary performance hiccup. These flows are relevant only for Reliable transport types.

##### Retry Flow

If the receiver side didn't return any ACK or NACK to the sender side within the expected timeout, the sender might send the message again, according to the timeout and the retry count attributes that were configured in the QP attributes.
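Note that the timeout attribute used in the earlier ib_modify_qp() example is an exponent, not a number of seconds: the local ACK timeout is 4.096 usec * 2^timeout, so the value 14 used there corresponds to roughly 67 ms per attempt, and with retry_cnt set to 7 the sender gives up after about half a second of retries.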
There might be several reasons for having such a problem:

* The attributes of the remote QP, or the path to it, aren't correct.

* The remote QP state didn't get to (at least) the RTR state.

* The remote QP state moved to the Error state.

* The message itself was dropped on the way from the sender to the receiver (for example, because of a CRC error).

* The ACK or NACK of the message was dropped on the way from the receiver to the sender (for example, because of a CRC error).

Figure 13-8 presents a retry flow that overcomes a packet drop.

Figure 13-8. A retry flow (on reliable transport types)

If eventually the ACK/NACK is received by the sender QP successfully, it will continue to send the rest of the messages. If any message in the future has this problem too, the retry flow will be performed again for this message as well, without any history that this was done before. If, even after retrying several times, the receiver side still doesn't respond, a Work Completion with a Retry Error will be generated on the sender side.

##### Receiver Not Ready (RNR) Flow

If the receiver side got a message that needs to consume a Receive Request from the Receive Queue, but there isn't any outstanding Receive Request, the receiver sends back to the sender an RNR NACK. After a while, according to the time that was specified in the RNR NACK, the sender will try to send the message again.

If eventually the receiver side posts a Receive Request in time, and the incoming message consumes it, an ACK will be sent to the sender side to indicate that the message was saved successfully. If any message in the future has this problem too, the RNR retry flow will be performed again for this message as well, without any history that this was done before. If, even after retrying several times, the receiver side still didn't post a Receive Request and an RNR NACK was sent to the sender for each sent message, a Work Completion with an RNR Retry Error will be generated on the sender side. Figure 13-9 presents the RNR retry flow, which overcomes a missing Receive Request on the receiver side.

Figure 13-9. RNR retry flow (on reliable transport types)

In this section, I covered the Work Request status and some of the bad flows that can happen to a message. In the next section, I will discuss multicast groups.

### Multicast Groups

Multicast groups are a means to send a message from one UD QP to many UD QPs. Every UD QP that wants to get this message needs to be attached to the multicast group. When a device gets a multicast packet, it duplicates it to all of the QPs that are attached to that group. Following is a description of the two methods related to multicast groups:

* The ib_attach_mcast() method attaches a UD QP to a multicast group within an InfiniBand device. It accepts the QP to be attached and the multicast group attributes.

* The ib_detach_mcast() method detaches a UD QP from a multicast group.

### Difference Between the Userspace and the Kernel-Level RDMA API

The userspace and the kernel-level RDMA stack APIs are quite similar, because they cover the same technology and need to be able to provide the same functionality. When userspace calls a method of the control path from the RDMA API, it performs a context switch to the kernel level, to protect privileged resources and to synchronize objects that need to be synchronized (for example, the same QP number cannot be assigned to more than one QP at the same time).
However, there are some differences between the userspace and the kernel-level RDMA API and functionality:

* The prefix of all the APIs in the kernel level is "ib_", while in userspace the prefix is "ibv_".

* There are enumerations and macros that exist only in the kernel-level RDMA API.

* There are QP types that are available only in the kernel (for example, the SMI and GSI QPs).

* There are privileged operations that can be performed only in the kernel level—for example, registration of physical memory, registration of an MR using a WR, and FMRs.

* Some functionality isn't available in the userspace RDMA API—for example, the request for a notification after N Work Completions.

* The kernel API is asynchronous. There are callbacks that are called when there is an asynchronous event or a Completion event. In userspace, everything is synchronous, and the user needs to explicitly check whether there is an asynchronous event or a Completion event in its running context (that is, a thread).

* XRC isn't relevant for kernel-level clients.

* There are new features that were introduced at the kernel level but are not available (yet) in userspace.

The userspace API is supplied by the userspace library "libibverbs." And although the RDMA functionality available at the user level is somewhat less than at the kernel level, it is enough to enjoy the benefits of the InfiniBand technology.

## Summary

You have learned in this chapter about the advantages of the InfiniBand technology. I reviewed the RDMA stack organization. I discussed the resource-creation hierarchy and all of the important objects and their API, which is needed in order to write client code that uses InfiniBand. You also saw some examples that use this API. The next chapter will deal with advanced topics like network namespaces and the Bluetooth subsystem.

## Quick Reference

I will conclude this chapter with a short list of important methods of the RDMA API. Some of them were mentioned in this chapter.

### Methods

Here are the methods.

#### int ib_register_client(struct ib_client *client);

Register a kernel client that wants to use the RDMA stack.

#### void ib_unregister_client(struct ib_client *client);

Unregister a kernel client that wants to stop using the RDMA stack.

#### void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data);

Set a client context to be associated with an InfiniBand device.

#### void *ib_get_client_data(struct ib_device *device, struct ib_client *client);

Read the client context that was associated with an InfiniBand device.

#### int ib_register_event_handler(struct ib_event_handler *event_handler);

Register a callback to be called for every asynchronous event that occurs to the InfiniBand device.

#### int ib_unregister_event_handler(struct ib_event_handler *event_handler);

Unregister a callback that was registered to be called for every asynchronous event that occurs to the InfiniBand device.

#### int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr);

Query an InfiniBand device for its attributes.

#### int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr);

Query an InfiniBand device port for its attributes.

#### enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num);

Query for the link layer of the InfiniBand device's port.
#### int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid);

Query for the GID in a specific index in the InfiniBand device's port GID table.

#### int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey);

Query for the P_Key in a specific index in the InfiniBand device's port P_Key table.

#### int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index);

Find the index of a specific GID value in the InfiniBand device's port GID table.

#### int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index);

Find the index of a specific P_Key value in the InfiniBand device's port P_Key table.

#### struct ib_pd *ib_alloc_pd(struct ib_device *device);

Allocate a PD to be used later to create other InfiniBand resources.

#### int ib_dealloc_pd(struct ib_pd *pd);

Deallocate a PD.

#### struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);

Create an AH that will be used when posting a Send Request in a UD QP.

#### int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, struct ib_grh *grh, struct ib_ah_attr *ah_attr);

Initialize the attributes of an AH from a Work Completion of a received message and a GRH buffer. Those AH attributes can be used when calling the ib_create_ah() method.

#### struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, struct ib_grh *grh, u8 port_num);

Create an AH from a Work Completion of a received message and a GRH buffer.

#### int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

Modify the attributes of an existing AH.

#### int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

Query the attributes of an existing AH.

#### int ib_destroy_ah(struct ib_ah *ah);

Destroy an AH.

#### struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);

Return an MR for system memory that is usable for DMA.

#### static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr);

Check whether the DMA address points to an invalid address—that is, check whether the DMA mapping operation failed.

#### static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction);

Map a kernel virtual address to a DMA address.

#### static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

Unmap a DMA mapping of a virtual address.

#### static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

Map a kernel virtual address to a DMA address according to DMA attributes.

#### static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

Unmap a DMA mapping of a virtual address that was mapped according to DMA attributes.

#### static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction);

Map a physical page to a DMA address.

#### static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

Unmap a DMA mapping of a physical page.
#### static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

Map a scatter/gather list to a DMA address.

#### static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

Unmap a DMA mapping of a scatter/gather list.

#### static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

Map a scatter/gather list to a DMA address according to DMA attributes.

#### static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

Unmap a DMA mapping of a scatter/gather list according to DMA attributes.

#### static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg);

Return the address attribute of a scatter/gather entry.

#### static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg);

Return the length attribute of a scatter/gather entry.

#### static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

Transfer a DMA region ownership to the CPU. It should be called before the CPU accesses a DMA mapped region whose ownership was previously transferred to the device.

#### static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

Transfer a DMA region ownership to the device. It should be called before the device accesses a DMA mapped region whose ownership was previously transferred to the CPU.

#### static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag);

Allocate a memory block that can be accessed by the CPU, and map it for DMA.

#### static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle);

Free a memory block that was allocated using ib_dma_alloc_coherent().

#### struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

Take a physical page list, and prepare it for being accessed by the InfiniBand device.

#### int ib_rereg_phys_mr(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

Change the attributes of an MR.

#### int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);

Query for the attributes of an MR.

#### int ib_dereg_mr(struct ib_mr *mr);

Deregister an MR.

#### struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);

Allocate an MW. This MW will be used to allow remote access to an MR.

#### static inline int ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind);

Bind an MW to an MR to allow remote access to local memory with specific permissions.

#### int ib_dealloc_mw(struct ib_mw *mw);

Deallocate an MW.

#### struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, int cqe, int comp_vector);

Create a CQ. This CQ will be used to indicate the status of ended Work Requests for Send or Receive Queues.
#### int ib_resize_cq(struct ib_cq *cq, int cqe);

Change the number of entries in a CQ.

#### int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);

Modify the moderation attributes of a CQ. This method is used to decrease the number of interrupts of an InfiniBand device.

#### int ib_peek_cq(struct ib_cq *cq, int wc_cnt);

Return the number of available Work Completions in a CQ.

#### static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);

Request that a Completion notification event be generated when the next Work Completion is added to the CQ.

#### static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt);

Request that a Completion notification event be generated when there is a specific number of Work Completions in a CQ.

#### static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);

Read and remove one or more Work Completions from a CQ. They are read in the order they were added to the CQ.

#### struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr);

Create an SRQ that will be used as a shared Receive Queue for several QPs.

#### int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask);

Modify the attributes of an SRQ.

#### int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);

Query for the attributes of an SRQ. The SRQ limit value might change between subsequent calls to this method.

#### int ib_destroy_srq(struct ib_srq *srq);

Destroy an SRQ.

#### struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr);

Create a QP. Every new QP is assigned a QP number that isn't in use by another QP at the same time.

#### int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask);

Modify the attributes of a QP, which include the Send and Receive Queue attributes and the QP state.

#### int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);

Query for the attributes of a QP. Some of the attributes might change between subsequent calls to this method.

#### int ib_destroy_qp(struct ib_qp *qp);

Destroy a QP.

#### static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

Add a linked list of Receive Requests to an SRQ.

#### static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

Add a linked list of Receive Requests to the Receive Queue of a QP.

#### static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr);

Add a linked list of Send Requests to the Send Queue of a QP.

#### int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

Attach a UD QP to a multicast group.

#### int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

Detach a UD QP from a multicast group.

# 14. Advanced Topics

Abstract

Chapter 13 dealt with the InfiniBand subsystem and its implementation in Linux. This chapter deals with several advanced topics and some topics that didn't fit logically into other chapters.
The chapter starts with a discussion of network namespaces, a type of lightweight process virtualization mechanism that was added to Linux in recent years. I will discuss the namespaces implementation in general and network namespaces in particular. You will learn that only two new system calls are needed in order to implement namespaces. You will also see several examples of how simple it is to create and manage network namespaces with the ip command of iproute2, how simple it is to move a network device from one network namespace to another, and how to attach a specified process to a specified network namespace. The cgroups subsystem also provides a resource management solution, which is different from namespaces. I will describe the cgroups subsystem and its two network modules, net_prio and cls_cgroup, and give two examples of using these cgroup network modules.

Later on in this chapter, you will learn about Busy Poll Sockets and how to tune them. The Busy Poll Sockets feature provides an interesting performance optimization technique for sockets that need low latency and are willing to pay the cost of higher CPU utilization. The Busy Poll Sockets feature is available from kernel 3.11. I will also cover the Bluetooth subsystem, the IEEE 802.15.4 subsystem, and the Near Field Communication (NFC) subsystem; these three subsystems typically work in short-range networks, and the development of new features for these subsystems is progressing at a rapid pace. I will also discuss Notification Chains, which is an important mechanism that you may encounter while developing or debugging kernel networking code, and the PCI subsystem, as many network devices are PCI devices. I will not delve deep into the PCI subsystem details, as this book is not about device drivers. I will conclude the chapter with three short sections: one about the teaming network driver (which is the new kernel link aggregation solution), one about the Point-to-Point over Ethernet (PPPoE) protocol, and finally one about Android.

## Network Namespaces

This section covers Linux namespaces: what they are for and how they are implemented. It includes an in-depth discussion of network namespaces, with some examples that demonstrate their usage. Linux namespaces are essentially a virtualization solution. Operating system virtualization was implemented in mainframes many years before solutions like Xen or KVM hit the market.
Likewise, the idea behind Linux namespaces, which are a form of process virtualization, is not new at all; it was tried in the Plan 9 operating system (see this article from 1992: "The Use of Name Spaces in Plan 9", www.cs.bell-labs.com/sys/doc/names.html ).

Namespaces are a form of lightweight process virtualization that provides resource isolation. As opposed to virtualization solutions like KVM or Xen, with namespaces you do not create additional instances of the operating system on the same host, but use only a single operating system instance. I should mention in this context that the Solaris operating system has a virtualization solution named Solaris Zones, which also uses a single operating system instance, but its scheme of resource partitioning is somewhat different from that of Linux namespaces (for example, in Solaris Zones there is a global zone, which is the primary zone and has more capabilities). In the FreeBSD operating system, there is a mechanism called jails, which also provides resource partitioning without running more than one instance of the kernel.

The main idea of Linux namespaces is to partition resources among groups of processes to enable a process (or several processes) to have a different view of the system than processes in other groups. This feature is used, for example, to provide resource isolation in the Linux containers project ( http://lxc.sourceforge.net/ ). The Linux containers project also uses another resource management mechanism, provided by the cgroups subsystem, which will be described later in this chapter. With containers, you can run different Linux distributions on the same host using one instance of the operating system. Namespaces are also needed for the checkpoint/restore feature, which is used in high performance computing (HPC). For example, it is used in CRIU ( http://criu.org/Main_Page ), a software tool of OpenVZ ( http://openvz.org/Main_Page ), which implements checkpoint/restore functionality for Linux processes mostly in userspace, though in a very few places CRIU kernel patches were merged. I should mention that there were some projects to implement checkpoint/restore in the kernel, but these projects were not accepted in mainline because they were too complex. Take, for example, the CKPT project: https://ckpt.wiki.kernel.org/index.php/Main_Page . The checkpoint/restore feature (sometimes referred to as checkpoint/restart) enables stopping and saving several processes on a filesystem, and at a later time restoring those processes (possibly on a different host) from the filesystem, resuming their execution from where they were stopped. Without namespaces, checkpoint/restore has very limited use cases; in particular, live migration is only possible with them. Another use case for network namespaces is when you need to set up an environment that simulates different network stacks, for testing, debugging, and so on. For readers who want to learn more about checkpoint/restart, I suggest reading the article "Virtual Servers and Checkpoint/Restart in Mainstream Linux," by Sukadev Bhattiprolu, Eric W. Biederman, Serge Hallyn, and Daniel Lezcano.

Mount namespaces were the first type of Linux namespaces to be merged, in 2002, for kernel 2.4.19. User namespaces were the last to be implemented, in kernel 3.8, for almost all filesystem types. It could be that additional namespaces will be developed, as is discussed later in this section.
For creating any namespace other than the user namespace, you must have the CAP_SYS_ADMIN capability; trying to create one without it will result in an -EPERM error ("Operation not permitted"). Many developers took part in the development of namespaces, among them Eric W. Biederman, Pavel Emelyanov, Al Viro, Cyrill Gorcunov, Andrew Vagin, and more.

After getting some background about process virtualization and Linux namespaces and how they are used, you are now ready to dive into the gory implementation details.

### Namespaces Implementation

As of this writing, six namespaces are implemented in the Linux kernel. Here is a description of the main additions and changes that were needed in order to implement namespaces in the Linux kernel and to support namespaces in userspace packages:

* A structure called nsproxy (namespace proxy) was added. This structure contains pointers to five of the six namespaces that are implemented. There is no pointer to the user namespace in the nsproxy structure; however, all the other five namespace objects contain a pointer to the user namespace object that owns them, and in each of these five namespaces, the user namespace pointer is called user_ns. The user namespace is a special case; it is a member of the credentials structure (cred), called user_ns. The cred structure represents the security context of a process. Each process descriptor (task_struct) contains two cred objects, for the effective and objective process descriptor credentials. I will not delve into all the details and nuances of the user namespaces implementation, since this is not in the scope of this book. An nsproxy object is created by the create_nsproxy() method, and it is released by the free_nsproxy() method. A pointer to an nsproxy object, which is also called nsproxy, was added to the process descriptor (a process descriptor is represented by the task_struct structure, include/linux/sched.h). Let's take a look at the nsproxy structure, as it's quite short and should be quite self-explanatory:

struct nsproxy {
        atomic_t count;
        struct uts_namespace *uts_ns;
        struct ipc_namespace *ipc_ns;
        struct mnt_namespace *mnt_ns;
        struct pid_namespace *pid_ns;
        struct net *net_ns;
};

(include/linux/nsproxy.h)

* You can see in the nsproxy structure five pointers to namespaces (there is no user namespace pointer). Using the nsproxy object in the process descriptor (the task_struct object) instead of five namespace objects is an optimization. When performing fork(), a new child is likely to live in the same set of namespaces as its parent, so instead of five reference counter increments (one per namespace), only one reference counter increment happens (of the nsproxy object). The nsproxy count member is a reference counter, which is initialized to 1 when the nsproxy object is created by the create_nsproxy() method; it is decremented by the put_nsproxy() method and incremented by the get_nsproxy() method. Note that the pid_ns member of the nsproxy object was renamed pid_ns_for_children in kernel 3.11.

* A new system call, unshare(), was added. This system call gets a single parameter that is a bitmask of CLONE* flags.
When the flags argument consists of one or more namespace CLONE_NEW* flags, the unshare() system call performs the following steps:

  * First, it creates a new namespace (or several namespaces) according to the specified flags. This is done by calling the unshare_nsproxy_namespaces() method, which in turn creates a new nsproxy object and one or more namespaces by calling the create_new_namespaces() method. The type of the new namespace (or namespaces) is determined according to the specified CLONE_NEW* flags. The create_new_namespaces() method returns a new nsproxy object that contains the newly created namespace (or namespaces).

  * Then it attaches the calling process to that newly created nsproxy object by calling the switch_task_namespaces() method.

* When CLONE_NEWPID is the flag of the unshare() system call, it works differently than with the other flags; it is an implicit argument to fork(): only the child task will live in a new PID namespace, not the one calling the unshare() system call. Other CLONE_NEW* flags immediately put the calling process into a new namespace.

* The six CLONE_NEW* flags, which were added to support the creation of namespaces, are described later in this section. The implementation of the unshare() system call is in kernel/fork.c.

* A new system call, setns(), was added. It attaches the calling thread to an existing namespace. Its prototype is int setns(int fd, int nstype); the parameters are:

  * fd: A file descriptor that refers to a namespace. Such file descriptors are obtained by opening links from the /proc/<pid>/ns/ directory.

  * nstype: An optional parameter. When it is one of the CLONE_NEW* namespace flags, the specified file descriptor must refer to a namespace that matches the type of the specified CLONE_NEW* flag. When nstype is not set (its value is 0), the fd argument can refer to a namespace of any type. If nstype does not correspond to the namespace type associated with the specified fd, a value of -EINVAL is returned.

You can find the implementation of the setns() system call in kernel/nsproxy.c.

* The following six new clone flags were added in order to support namespaces:

  * CLONE_NEWNS (for mount namespaces)

  * CLONE_NEWUTS (for UTS namespaces)

  * CLONE_NEWIPC (for IPC namespaces)

  * CLONE_NEWPID (for PID namespaces)

  * CLONE_NEWNET (for network namespaces)

  * CLONE_NEWUSER (for user namespaces)

* The clone() system call is traditionally used to create a new process. It was adjusted to support these new flags so that it creates a new process attached to a new namespace (or namespaces). You will encounter usage of the CLONE_NEWNET flag, for creating a new network namespace, in some of the examples later in this chapter.

* Each of the six subsystems that have namespace support implements a unique namespace of its own. For example, the mount namespace is represented by a structure called mnt_namespace, and the network namespace is represented by a structure called net, which is discussed later in this section. I will mention the other namespaces later in this chapter.

* For namespace creation, a method named create_new_namespaces() was added (kernel/nsproxy.c). This method gets as its first parameter a CLONE_NEW* flag or a bitmask of CLONE_NEW* flags.
It first creates an nsproxy object by calling the create_nsproxy() method, and then it associates namespaces according to the specified flags; since the flags can be a bitmask of flags, the create_new_namespaces() method can associate more than one namespace. Let's take a look at the create_new_namespaces() method:

static struct nsproxy *create_new_namespaces(unsigned long flags,
        struct task_struct *tsk, struct user_namespace *user_ns,
        struct fs_struct *new_fs)
{
        struct nsproxy *new_nsp;
        int err;

Allocate an nsproxy object and initialize its reference counter to 1:

        new_nsp = create_nsproxy();
        if (!new_nsp)
                return ERR_PTR(-ENOMEM);
        . . .

After successfully creating an nsproxy object, we should create namespaces according to the specified flags, or associate an existing namespace with the new nsproxy object we created. We start by calling copy_mnt_ns(), for the mount namespace, and then we call copy_utsname(), for the UTS namespace. I will briefly describe the copy_utsname() method here, because the UTS namespace is discussed in the "UTS Namespaces Implementation" section later in this chapter. If CLONE_NEWUTS is not set in the flags passed to the copy_utsname() method, the copy_utsname() method does not create a new UTS namespace; it returns the UTS namespace that was passed as its last parameter, tsk->nsproxy->uts_ns. If CLONE_NEWUTS is set, the copy_utsname() method clones the specified UTS namespace by calling the clone_uts_ns() method. The clone_uts_ns() method, in turn, allocates a new UTS namespace object, copies the new_utsname object of the specified UTS namespace (tsk->nsproxy->uts_ns) into the new_utsname object of the newly created UTS namespace object, and returns the newly created UTS namespace. You will learn more about the new_utsname structure in the "UTS Namespaces Implementation" section later in this chapter:

        new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
        if (IS_ERR(new_nsp->uts_ns)) {
                err = PTR_ERR(new_nsp->uts_ns);
                goto out_uts;
        }
        . . .

After handling the UTS namespace, we continue by calling the copy_ipcs() method to handle the IPC namespace, copy_pid_ns() to handle the PID namespace, and copy_net_ns() to handle the network namespace. Note that there is no call to a copy_user_ns() method, as nsproxy does not contain a pointer to the user namespace, as was mentioned earlier. I will briefly describe the copy_net_ns() method here. If CLONE_NEWNET is not set in the flags passed to the create_new_namespaces() method, the copy_net_ns() method returns the network namespace that was passed as its third parameter, tsk->nsproxy->net_ns, much like copy_utsname() did, as you saw earlier in this section.
If CLONE_NEWNET is set, the copy_net_ns() method allocates a new network namespace by calling the net_alloc() method, initializes it by calling the setup_net() method, and adds it to the global list of all network namespaces, net_namespace_list:

        new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
        if (IS_ERR(new_nsp->net_ns)) {
                err = PTR_ERR(new_nsp->net_ns);
                goto out_net;
        }

        return new_nsp;
}

* Note that the setns() system call, which does not create a new namespace but only attaches the calling thread to a specified namespace, also calls create_new_namespaces(), but it passes 0 as the first parameter; this implies that only an nsproxy is created by calling the create_nsproxy() method, and no new namespace is created; instead, the calling thread is associated with an existing namespace that is identified by the specified fd argument of the setns() system call. Later in the setns() system call implementation, the switch_task_namespaces() method is invoked, and it assigns the newly created nsproxy to the calling thread (see kernel/nsproxy.c).

* A method named exit_task_namespaces() was added in kernel/nsproxy.c. It is called when a process is terminated, by the do_exit() method (kernel/exit.c). The exit_task_namespaces() method gets the process descriptor (a task_struct object) as its single parameter. In fact, the only thing it does is call the switch_task_namespaces() method, passing the specified process descriptor and a NULL nsproxy object as arguments. The switch_task_namespaces() method, in turn, nullifies the nsproxy object of the process descriptor of the process that is being terminated. If no other process uses that nsproxy, it is freed.

* A method named get_net_ns_by_fd() was added. This method gets a file descriptor as its single parameter and returns the network namespace associated with the inode that corresponds to the specified file descriptor. For readers who are not familiar with filesystems and with inode semantics, I suggest reading the "Inode Objects" section of Chapter 12, "The Virtual Filesystem," in Understanding the Linux Kernel by Daniel P. Bovet and Marco Cesati (O'Reilly, 2005).

* A method named get_net_ns_by_pid() was added. This method gets a PID number as its single argument, and it returns the network namespace object to which this process is attached.

* Six entries were added under /proc/<pid>/ns, one for each namespace. These files, when opened, can be fed into the setns() system call. You can use ls -al or readlink to display the unique proc inode number that is associated with each namespace. This unique proc inode is created by the proc_alloc_inum() method when the namespace is created and is freed by the proc_free_inum() method when the namespace is released. See, for example, the create_pid_namespace() method in kernel/pid_namespace.c. In the following example, the number in square brackets on the right is the unique proc inode number of each namespace:

ls -al /proc/1/ns/
total 0
dr-x--x--x 2 root root 0 Nov 3 13:32 .
dr-xr-xr-x 8 root root 0 Nov 3 12:17 ..
lrwxrwxrwx 1 root root 0 Nov 3 13:32 ipc -> ipc:[4026531839]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 mnt -> mnt:[4026531840]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 net -> net:[4026531956]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 pid -> pid:[4026531836]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 user -> user:[4026531837]
lrwxrwxrwx 1 root root 0 Nov 3 13:32 uts -> uts:[4026531838]

* A namespace can stay alive if either one of the following conditions is met:

  * A file descriptor for the namespace file under /proc/<pid>/ns/ is held open.

  * The namespace proc file is bind mounted somewhere else; for example, for the PID namespace, by: mount --bind /proc/self/ns/pid /some/filesystem/path

* For each of the six namespaces, a proc namespace operations object (an instance of the proc_ns_operations structure) is defined. This object consists of callbacks, such as inum, which returns the unique proc inode number associated with the namespace, and install, for namespace installation (in the install callback, namespace-specific actions are performed, such as attaching the specific namespace object to the nsproxy object, and more; the install callback is invoked by the setns() system call). The proc_ns_operations structure is defined in include/linux/proc_fs.h. Following is the list of the six proc_ns_operations objects:

  * utsns_operations for the UTS namespace (kernel/utsname.c)

  * ipcns_operations for the IPC namespace (ipc/namespace.c)

  * mntns_operations for mount namespaces (fs/namespace.c)

  * pidns_operations for PID namespaces (kernel/pid_namespace.c)

  * userns_operations for the user namespace (kernel/user_namespace.c)

  * netns_operations for the network namespace (net/core/net_namespace.c)

* For each namespace, except the mount namespace, there is an initial namespace:

  * init_uts_ns: For the UTS namespace (init/version.c).

  * init_ipc_ns: For the IPC namespace (ipc/msgutil.c).

  * init_pid_ns: For the PID namespace (kernel/pid.c).

  * init_net: For the network namespace (net/core/net_namespace.c).

  * init_user_ns: For the user namespace (kernel/user.c).

* An initial, default nsproxy object is defined: it is called init_nsproxy, and it contains pointers to the five initial namespaces; they are all initialized to the corresponding initial namespace, except for the mount namespace, which is initialized to NULL:

struct nsproxy init_nsproxy = {
        .count = ATOMIC_INIT(1),
        .uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
        .ipc_ns = &init_ipc_ns,
#endif
        .mnt_ns = NULL,
        .pid_ns = &init_pid_ns,
#ifdef CONFIG_NET
        .net_ns = &init_net,
#endif
};

(kernel/nsproxy.c)

* A method named task_nsproxy() was added; it gets as its single parameter a process descriptor (a task_struct object), and it returns the nsproxy associated with the specified task_struct object. See include/linux/nsproxy.h.

These are the six namespaces available in the Linux kernel as of this writing:

* Mount namespaces: A mount namespace allows a process to see its own view of the filesystem and of its mount points. Mounting a filesystem in one mount namespace does not propagate to the other mount namespaces. Mount namespaces are created by setting the CLONE_NEWNS flag when calling the clone() or unshare() system calls. In order to implement mount namespaces, a structure called mnt_namespace was added (fs/mount.h), and nsproxy holds a pointer to an mnt_namespace object called mnt_ns. Mount namespaces are available from kernel 2.4.19.
These are the six namespaces available in the Linux kernel as of this writing:

 * Mount namespaces: The mount namespace allows a process to have its own view of the filesystem and of its mount points. Mounting a filesystem in one mount namespace does not propagate to the other mount namespaces. Mount namespaces are created by setting the CLONE_NEWNS flag when calling the clone() or unshare() system calls. In order to implement mount namespaces, a structure called mnt_namespace was added (fs/mount.h), and nsproxy holds a pointer to an mnt_namespace object called mnt_ns. Mount namespaces have been available since kernel 2.4.19. Mount namespaces are implemented primarily in fs/namespace.c. When creating a new mount namespace, the following rules apply:

 * All previous mounts are visible in the new mount namespace.

 * Mounts/unmounts in the new mount namespace are invisible to the rest of the system.

 * Mounts/unmounts in the global mount namespace are visible in the new mount namespace.

 * Mount namespaces use a VFS enhancement called shared subtrees, which was introduced in the Linux 2.6.15 kernel; the shared subtrees feature introduced new flags: MS_PRIVATE, MS_SHARED, MS_SLAVE and MS_UNBINDABLE. (See http://lwn.net/Articles/159077/ and Documentation/filesystems/sharedsubtree.txt.) I will not discuss the internals of the mount namespaces implementation. For readers who want to learn more about mount namespaces usage, I suggest reading the following article: "Applying Mount Namespaces," by Serge E. Hallyn and Ram Pai ( http://www.ibm.com/developerworks/linux/library/l-mount-namespaces/index.html ).

 * PID namespaces: PID namespaces provide the ability for different processes in different PID namespaces to have the same PID. This feature is a building block for Linux containers. It is important for checkpoint/restore of a process, because a process checkpointed on one host can be restored on a different host even if a process with the same PID exists on that host. When creating the first process in a new PID namespace, its PID is 1. The behavior of this process is somewhat like the behavior of the init process. This means that when a process dies, all its orphaned children are reparented to the process with PID 1 (child reaping). Sending a SIGKILL signal to a process with PID 1 does not kill the process, regardless of the namespace from which the SIGKILL signal was sent, whether the initial PID namespace or any other PID namespace. But killing the init of one PID namespace from its parent namespace does work; in this case, all of the tasks living in the former namespace are killed, and the PID namespace is stopped. PID namespaces are created by setting the CLONE_NEWPID flag when calling the clone() or unshare() system calls. In order to implement PID namespaces, a structure called pid_namespace was added (include/linux/pid_namespace.h), and nsproxy holds a pointer to a pid_namespace object called pid_ns. For PID namespaces support, CONFIG_PID_NS should be set. PID namespaces have been available since kernel 2.6.24. PID namespaces are implemented primarily in kernel/pid_namespace.c.

 * Network namespaces: The network namespace allows creating what appears to be multiple instances of the kernel network stack. Network namespaces are created by setting the CLONE_NEWNET flag when calling the clone() or unshare() system calls. In order to implement network namespaces, a structure called net was added (include/net/net_namespace.h), and nsproxy holds a pointer to a net object called net_ns. For network namespaces support, CONFIG_NET_NS should be set. I will discuss network namespaces later in this section. Network namespaces have been available since kernel 2.6.29. Network namespaces are implemented primarily in net/core/net_namespace.c.

 * IPC namespaces: The IPC namespace allows a process to have its own System V IPC resources and POSIX message queue resources. IPC namespaces are created by setting the CLONE_NEWIPC flag when calling the clone() or unshare() system calls.
In order to implement IPC namespaces, a structure called ipc_namespace was added (include/linux/ipc_namespace.h), and nsproxy holds a pointer to an ipc_namespace object called ipc_ns. For IPC namespaces support, CONFIG_IPC_NS should be set. Support for System V IPC resources has been available in IPC namespaces since kernel 2.6.19. Support for POSIX message queue resources in IPC namespaces was added later, in kernel 2.6.30. IPC namespaces are implemented primarily in ipc/namespace.c.

 * UTS namespaces: The UTS namespace provides the ability for different UTS namespaces to have a different host name or domain name (or other information returned by the uname() system call). UTS namespaces are created by setting the CLONE_NEWUTS flag when calling the clone() or unshare() system calls. The UTS namespace implementation is the simplest among the six namespaces. In order to implement the UTS namespace, a structure called uts_namespace was added (include/linux/utsname.h), and nsproxy holds a pointer to a uts_namespace object called uts_ns. For UTS namespaces support, CONFIG_UTS_NS should be set. UTS namespaces have been available since kernel 2.6.19. UTS namespaces are implemented primarily in kernel/utsname.c.

 * User namespaces: The user namespace allows mapping of user and group IDs. This mapping is done by writing to two procfs entries that were added for supporting user namespaces: /proc/<pid>/uid_map and /proc/<pid>/gid_map. A process attached to a user namespace can have a different set of capabilities than on the host. User namespaces are created by setting the CLONE_NEWUSER flag when calling the clone() or unshare() system calls. In order to implement user namespaces, a structure called user_namespace was added (include/linux/user_namespace.h). The user_namespace object contains a pointer to the user namespace object that created it (parent). As opposed to the other five namespaces, nsproxy does not hold a pointer to a user_namespace object. I will not delve into more implementation details of user namespaces, as it is probably the most complex namespace, and it is beyond the scope of this book. For user namespaces support, CONFIG_USER_NS should be set. User namespaces have been available since kernel 3.8 for almost all filesystem types. User namespaces are implemented primarily in kernel/user_namespace.c.

Support for namespaces was added in four userspace packages:

 * In util-linux:

 * The unshare utility can create any of the six namespaces; available since version 2.17.

 * The nsenter utility (which is in fact a light wrapper around the setns() system call); available since version 2.23.

 * In iproute2, management of network namespaces is done with the ip netns command, and you will see several examples of this later in this chapter. Moreover, you can move a network interface to a different network namespace with the ip link command, as you will see in the "Moving a Network Interface to a Different Network Namespace" section later in this chapter.

 * In ethtool, support was added for finding out whether the NETIF_F_NETNS_LOCAL feature is set for a specified network interface. When the NETIF_F_NETNS_LOCAL feature is set, it indicates that the network interface is local to that network namespace, and you cannot move it to a different network namespace. The NETIF_F_NETNS_LOCAL feature is discussed later in this section.
 * In the wireless iw package, an option was added to enable moving a wireless interface to a different namespace.

Note

In a presentation at the Ottawa Linux Symposium (OLS) in 2006, "Multiple Instances of the Global Linux Namespaces," Eric W. Biederman (one of the main developers of Linux namespaces) mentioned ten namespaces; the other four namespaces that he mentioned in this presentation and that are not implemented yet are: the device namespace, the security namespace, the security keys namespace, and the time namespace. (See https://www.kernel.org/doc/ols/2006/ols2006v1-pages-101-112.pdf .) For more information about namespaces, I suggest reading the series of six articles about them by Michael Kerrisk ( https://lwn.net/Articles/531114/ ). Mobile OS virtualization projects triggered a development effort to support device namespaces; for more information about device namespaces, which are not yet part of the kernel, see "Device Namespaces" by Jake Edge ( http://lwn.net/Articles/564854/ ) and also ( http://lwn.net/Articles/564977/ ). There has also been some work on implementing a new syslog namespace (see the article "Stepping Closer to Practical Containers: 'syslog' namespaces," http://lwn.net/Articles/527342/ ).

The following three system calls can be used with namespaces:

 * clone(): Creates a new process attached to a new namespace (or namespaces). The type of the namespace is specified by a CLONE_NEW* flag that is passed as a parameter. Note that you can also use a bitmask of several CLONE_NEW* flags. The implementation of the clone() system call is in kernel/fork.c.

 * unshare(): Discussed earlier in this section.

 * setns(): Discussed earlier in this section.
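As a minimal illustration of the clone() usage just described (a sketch, not from the kernel tree; CAP_SYS_ADMIN is required), the following program creates a child process in a new network namespace; inside it, only the loopback device is visible:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)
static char child_stack[STACK_SIZE];

static int child_fn(void *arg)
{
        /* Runs in a fresh network namespace: only the loopback
           device exists here, and it is down by default */
        return system("ip link show");
}

int main(void)
{
        /* CLONE_NEW* flags may be OR'ed together, e.g.
           CLONE_NEWNET | CLONE_NEWUTS, to create several namespaces at once */
        pid_t pid = clone(child_fn, child_stack + STACK_SIZE,
                          CLONE_NEWNET | SIGCHLD, NULL);
        if (pid == -1) {
                perror("clone");
                exit(EXIT_FAILURE);
        }
        waitpid(pid, NULL, 0);
        return 0;
}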
Note

Namespaces do not have names inside the kernel that userspace processes could use to refer to them. If namespaces had names, these would have to be kept globally, in yet another special namespace; this would complicate the implementation and could raise problems in checkpoint/restore, for example. Instead, userspace processes should open namespace files under /proc/<pid>/ns/, and the resulting file descriptors can be used to refer to a specific namespace and to keep that namespace alive. Namespaces are identified by a unique proc inode number, generated when they are created and freed when they are released. Each of the six namespace structures contains an integer member called proc_inum, which is the namespace's unique proc inode number and is assigned by calling the proc_alloc_inum() method. Each of the six namespaces also has a proc_ns_operations object, which includes namespace-specific callbacks; one of these callbacks, called inum, returns the proc_inum of the associated namespace (for the definition of the proc_ns_operations structure, refer to include/linux/proc_fs.h).

Before discussing network namespaces, let's describe how the simplest namespace, the UTS namespace, is implemented. This is a good starting point for understanding the other, more complex namespaces.

### UTS Namespaces Implementation

In order to implement UTS namespaces, a struct called uts_namespace was added:

struct uts_namespace {
struct kref kref;
struct new_utsname name;
struct user_namespace *user_ns;
unsigned int proc_inum;
};
(include/linux/utsname.h)

Here is a short description of the members of the uts_namespace structure:

 * kref: A reference counter. It is a generic kernel reference counter, incremented by the kref_get() method and decremented by the kref_put() method. Besides the UTS namespace, the PID namespace also has a kref object as its reference counter; the other four namespaces all use an atomic counter for reference counting. For more information about the kref API, see Documentation/kref.txt.

 * name: A new_utsname object, which contains fields like domainname and nodename (discussed shortly).

 * user_ns: The user namespace associated with the UTS namespace.

 * proc_inum: The unique proc inode number of the UTS namespace.

The nsproxy structure contains a pointer to the uts_namespace:

struct nsproxy {
. . .
struct uts_namespace *uts_ns;
. . .
};
(include/linux/nsproxy.h)

As you saw earlier, the uts_namespace object contains an instance of the new_utsname structure. Let's take a look at the new_utsname structure, which is the essence of the UTS namespace:

struct new_utsname {
char sysname[__NEW_UTS_LEN + 1];
char nodename[__NEW_UTS_LEN + 1];
char release[__NEW_UTS_LEN + 1];
char version[__NEW_UTS_LEN + 1];
char machine[__NEW_UTS_LEN + 1];
char domainname[__NEW_UTS_LEN + 1];
};
(include/uapi/linux/utsname.h)

The nodename member of the new_utsname is the host name, and domainname is the domain name. A method named utsname() was added; this method simply returns the new_utsname object associated with the process that currently runs (current):

static inline struct new_utsname *utsname(void)
{
return &current->nsproxy->uts_ns->name;
}
(include/linux/utsname.h)

Now, the new gethostname() system call implementation is the following:

SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
int i, errno;
struct new_utsname *u;
if (len < 0)
return -EINVAL;
down_read(&uts_sem);

Invoke the utsname() method, which accesses the new_utsname object of the UTS namespace associated with the current process:

u = utsname();
i = 1 + strlen(u->nodename);
if (i > len)
i = len;
errno = 0;

Copy the nodename of the new_utsname object that the utsname() method returned to userspace:

if (copy_to_user(name, u->nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
}
(kernel/sys.c)

You can find a similar approach in the sethostname() and uname() system calls, which are also defined in kernel/sys.c. I should note that the UTS namespaces implementation also handles UTS procfs entries. There are only two writable UTS procfs entries, /proc/sys/kernel/domainname and /proc/sys/kernel/hostname (which means you can change them from userspace). Other UTS procfs entries, like /proc/sys/kernel/ostype and /proc/sys/kernel/osrelease, are not writable. If you look at the table of UTS procfs entries, uts_kern_table (kernel/utsname_sysctl.c), you will see that some entries, like ostype and osrelease, have a mode of "0444", which means they are not writable, and only two of them, hostname and domainname, have a mode of "0644", which means they are writable. Reading and writing the UTS procfs entries is handled by the proc_do_uts_string() method. Readers who want to learn more about how UTS procfs entries are handled should look into the proc_do_uts_string() method and the get_uts() method; both are in kernel/utsname_sysctl.c.
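The following minimal userspace sketch (assuming CAP_SYS_ADMIN) shows the UTS namespace in action: after unshare(CLONE_NEWUTS), sethostname() changes only the nodename field of the new namespace's new_utsname object, leaving the hostname of the original namespace untouched:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        char name[64];
        /* Detach from the parent's UTS namespace; from now on,
           current->nsproxy->uts_ns points to a new uts_namespace object */
        if (unshare(CLONE_NEWUTS) < 0) {
                perror("unshare");
                exit(EXIT_FAILURE);
        }
        /* Updates the nodename of the new new_utsname object only */
        if (sethostname("ns-test", 7) < 0) {
                perror("sethostname");
                exit(EXIT_FAILURE);
        }
        /* Reads it back via the per-namespace object, as in the
           gethostname() implementation shown above */
        if (gethostname(name, sizeof(name)) < 0) {
                perror("gethostname");
                exit(EXIT_FAILURE);
        }
        printf("hostname inside the new UTS namespace: %s\n", name);
        return 0;
}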
Now that you have learned how the simplest namespace, the UTS namespace, is implemented, it is time to learn about network namespaces and their implementation.

### Network Namespaces Implementation

A network namespace is logically another copy of the network stack, with its own network devices, routing tables, neighbouring tables, netfilter tables, network sockets, network procfs entries, network sysfs entries, and other network resources. A practical feature of network namespaces is that network applications running in a given namespace (let's say ns1) will first look for configuration files under /etc/netns/ns1, and only afterward under /etc. So, for example, if you created a namespace called ns1 and you have created /etc/netns/ns1/hosts, every userspace application that tries to access the hosts file will first access /etc/netns/ns1/hosts and only then (if the entry being looked for does not exist) will it read /etc/hosts. This feature is implemented using bind mounts and is available only for network namespaces created with the ip netns add command.

#### The Network Namespace Object (struct net)

Let's turn now to the definition of the net structure, which is the fundamental data structure that represents a network namespace:

struct net {
. . .
struct user_namespace *user_ns; /* Owning user namespace */
unsigned int proc_inum;
struct proc_dir_entry *proc_net;
struct proc_dir_entry *proc_net_stat;
. . .
struct list_head dev_base_head;
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
. . .
int ifindex;
. . .
struct net_device *loopback_dev; /* The loopback */
. . .
atomic_t count; /* To decided when the network
* namespace should be shut down.
*/
struct netns_ipv4 ipv4;
#if IS_ENABLED(CONFIG_IPV6)
struct netns_ipv6 ipv6;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
struct netns_sctp sctp;
#endif
. . .
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
#endif
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
struct netns_nf_frag nf_frag;
#endif
. . .
struct net_generic __rcu *gen;
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
. . .
};
(include/net/net_namespace.h)

Here is a short description of several members of the net structure:

 * user_ns represents the user namespace that created the network namespace; it owns the network namespace and all its resources. It is assigned in the setup_net() method. For the initial network namespace object (init_net), the user namespace that created it is the initial user namespace, init_user_ns.

 * proc_inum is the unique proc inode number associated with the network namespace. This unique proc inode is created by the proc_alloc_inum() method, which also assigns proc_inum to be the proc inode number. The proc_alloc_inum() method is invoked by the network namespace initialization method, net_ns_net_init(), and the inode is freed by calling the proc_free_inum() method in the network namespace cleanup method, net_ns_net_exit().

 * proc_net represents the network namespace procfs entry (/proc/net), as each network namespace maintains its own procfs entry.

 * proc_net_stat represents the network namespace procfs statistics entry (/proc/net/stat), as each network namespace maintains its own procfs statistics entry.

 * dev_base_head points to a linked list of all network devices.

 * dev_name_head points to a hashtable of network devices, where the key is the network device name.

 * dev_index_head points to a hashtable of network devices, where the key is the network device index.
 * ifindex is the last device index assigned inside a network namespace. Indices are virtualized in network namespaces; this means that the loopback device always has an index of 1 in every network namespace, and other network devices may have coinciding indices when living in different network namespaces.

 * loopback_dev is the loopback device. Every new network namespace is created with only one network device, the loopback device. The loopback_dev object of a network namespace is assigned in the loopback_net_init() method, drivers/net/loopback.c. You cannot move the loopback device from one network namespace to another.

 * count is the network namespace reference counter. It is initialized to 1 when the network namespace is created by the setup_net() method. It is incremented by the get_net() method and decremented by the put_net() method. If the count reference counter reaches 0 in the put_net() method, the __put_net() method is called. The __put_net() method, in turn, adds the network namespace to a global list of network namespaces to be removed, cleanup_list, and later removes it. (A short sketch of this reference-counting usage appears after this list.)

 * ipv4 (an instance of the netns_ipv4 structure) is for the IPv4 subsystem. The netns_ipv4 structure contains IPv4-specific fields that differ between namespaces. For example, in Chapter 6 you saw that the multicast routing table of a specified network namespace called net is stored in net->ipv4.mrt. I will discuss the netns_ipv4 structure later in this section.

 * ipv6 (an instance of the netns_ipv6 structure) is for the IPv6 subsystem.

 * sctp (an instance of the netns_sctp structure) is for SCTP sockets.

 * ct (an instance of the netns_ct structure, which is discussed in Chapter 9) is for the netfilter connection tracking subsystem.

 * gen (an instance of the net_generic structure, defined in include/net/netns/generic.h) is a set of generic pointers to structures describing the network namespace context of optional subsystems. For example, the sit module (Simple Internet Transition, an IPv6 tunnel, implemented in net/ipv6/sit.c) puts its private data on struct net using this mechanism. It was introduced so as not to flood struct net with a pointer for every single network subsystem that wants a per-network-namespace context.

 * xfrm (an instance of the netns_xfrm structure, which is mentioned several times in Chapter 10) is for the IPsec subsystem.
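To illustrate the count reference counter mentioned in the list above, here is a short kernel-style sketch (the context structure and helper names are invented for illustration) of the common pattern of holding a reference to a network namespace while work on it is deferred:

#include <linux/netdevice.h>
#include <net/net_namespace.h>

/* A hypothetical context that outlives the code path that created it */
struct foo_deferred_ctx {
        struct net *net;
};

static void foo_ctx_init(struct foo_deferred_ctx *ctx, struct net_device *dev)
{
        /* dev_net() returns the namespace the device lives in (dev->nd_net);
           get_net() increments net->count so the namespace cannot be
           freed while the deferred work still references it */
        ctx->net = get_net(dev_net(dev));
}

static void foo_ctx_release(struct foo_deferred_ctx *ctx)
{
        /* put_net() decrements net->count; when it drops to 0,
           __put_net() queues the namespace on cleanup_list for removal */
        put_net(ctx->net);
        ctx->net = NULL;
}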
Let's take a look at the IPv4-specific namespace, the netns_ipv4 structure:

struct netns_ipv4 {
. . .
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules;
struct fib_table *fib_local;
struct fib_table *fib_main;
struct fib_table *fib_default;
#endif
. . .
struct hlist_head *fib_table_hash;
struct sock *fibnl;
struct sock **icmp_sk;
. . .
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
struct xt_table *iptable_mangle;
struct xt_table *iptable_raw;
struct xt_table *arptable_filter;
#ifdef CONFIG_SECURITY
struct xt_table *iptable_security;
#endif
struct xt_table *nat_table;
#endif
int sysctl_icmp_echo_ignore_all;
int sysctl_icmp_echo_ignore_broadcasts;
int sysctl_icmp_ignore_bogus_error_responses;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_errors_use_inbound_ifaddr;
int sysctl_tcp_ecn;
kgid_t sysctl_ping_group_range[2];
long sysctl_tcp_mem[3];
atomic_t dev_addr_genid;
#ifdef CONFIG_IP_MROUTE
#ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES
struct mr_table *mrt;
#else
struct list_head mr_tables;
struct fib_rules_ops *mr_rules_ops;
#endif
#endif
};
(net/netns/ipv4.h)

You can see in the netns_ipv4 structure many IPv4-specific tables and variables, like the routing tables, the netfilter tables, the multicast routing tables, and more.

#### Network Namespaces Implementation: Other Data Structures

In order to support network namespaces, a member called nd_net, which is a pointer to a network namespace, was added to the network device object (struct net_device). Setting the network namespace of a network device is done by calling the dev_net_set() method, and getting the network namespace associated with a network device is done by calling the dev_net() method. Note that a network device can belong to only a single network namespace at any given moment. The nd_net member is typically set when a network device is registered or when a network device is moved to a different network namespace. For example, when registering a VLAN device, both of the methods just mentioned are used:

static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
{
struct net_device *new_dev;

The network namespace to be assigned to the new VLAN device is the network namespace associated with the real device, which is passed as a parameter to the register_vlan_device() method; we get this namespace by calling dev_net(real_dev):

struct net *net = dev_net(real_dev);
. . .
new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, vlan_setup);
if (new_dev == NULL)
return -ENOBUFS;

Set the network namespace by calling the dev_net_set() method:

dev_net_set(new_dev, net);
. . .
}

A member called sk_net, a pointer to a network namespace, was added to struct sock, which represents a socket. Setting the network namespace of a sock object is done by calling the sock_net_set() method, and getting the network namespace associated with a sock object is done by calling the sock_net() method. As with the nd_net member, a sock object can belong to only a single network namespace at any given moment.

When the system boots, a default network namespace, init_net, is created. After boot, all physical network devices and all sockets belong to that initial namespace, as does the network loopback device.

Some network devices and some network subsystems should have network-namespace-specific data. In order to enable this, a structure named pernet_operations was added; this structure includes init and exit callbacks:

struct pernet_operations {
. . .
int (*init)(struct net *net);
void (*exit)(struct net *net);
. . .
int *id;
size_t size;
};
(include/net/net_namespace.h)
Network devices that need network-namespace-specific data should define a pernet_operations object with init() and exit() callbacks for device-specific initialization and cleanup, respectively, and call the register_pernet_device() method in their module initialization and the unregister_pernet_device() method when the module is removed, passing the pernet_operations object as a single parameter in both cases. For example, the PPPoE module exports information about PPPoE sessions via a procfs entry, /proc/net/pppoe. The information exported by this procfs entry depends on the network namespace to which the PPPoE device belongs (since different PPPoE devices can belong to different network namespaces). So the PPPoE module defines a pernet_operations object called pppoe_net_ops:

static struct pernet_operations pppoe_net_ops = {
.init = pppoe_init_net,
.exit = pppoe_exit_net,
.id = &pppoe_net_id,
.size = sizeof(struct pppoe_net),
};
(net/ppp/pppoe.c)

The init callback, pppoe_init_net(), only creates the PPPoE procfs entry, /proc/net/pppoe, by calling the proc_create() method:

static __net_init int pppoe_init_net(struct net *net)
{
struct pppoe_net *pn = pppoe_pernet(net);
struct proc_dir_entry *pde;
rwlock_init(&pn->hash_lock);
pde = proc_create("pppoe", S_IRUGO, net->proc_net, &pppoe_seq_fops);
#ifdef CONFIG_PROC_FS
if (!pde)
return -ENOMEM;
#endif
return 0;
}
(net/ppp/pppoe.c)

And the exit callback, pppoe_exit_net(), only removes the PPPoE procfs entry, /proc/net/pppoe, by calling the remove_proc_entry() method:

static __net_exit void pppoe_exit_net(struct net *net)
{
remove_proc_entry("pppoe", net->proc_net);
}
(net/ppp/pppoe.c)

Network subsystems that need network-namespace-specific data should call register_pernet_subsys() when the subsystem is initialized and unregister_pernet_subsys() when the subsystem is removed. You can find an example in net/ipv4/route.c, and there are many other call sites of these methods throughout the networking stack. The network namespace module itself also defines a net_ns_ops object and registers it in the boot phase:

static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
.exit = net_ns_net_exit,
};

static int __init net_ns_init(void)
{
. . .
register_pernet_subsys(&net_ns_ops);
. . .
}
(net/core/net_namespace.c)

Each time a new network namespace is created, the init callback (net_ns_net_init) is called, and each time a network namespace is removed, the exit callback (net_ns_net_exit) is called. All that the net_ns_net_init() method does is allocate a unique proc inode for the newly created namespace by calling the proc_alloc_inum() method; the newly created unique proc inode number is assigned to net->proc_inum:

static __net_init int net_ns_net_init(struct net *net)
{
return proc_alloc_inum(&net->proc_inum);
}

And all that the net_ns_net_exit() method does is remove that unique proc inode by calling the proc_free_inum() method:

static __net_exit void net_ns_net_exit(struct net *net)
{
proc_free_inum(net->proc_inum);
}
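To make the pernet_operations pattern concrete, here is a minimal, hypothetical subsystem sketch (the foo_* names are invented for illustration); it uses the id and size members so the core allocates per-namespace private data, reachable through the gen member of struct net via net_generic():

#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Hypothetical per-namespace private data */
struct foo_net {
        int pkt_count;
};

static int foo_net_id;

static int __net_init foo_init_net(struct net *net)
{
        /* net_generic() returns the slot of net->gen that the core
           allocated for us (size bytes, indexed by *id) */
        struct foo_net *fn = net_generic(net, foo_net_id);
        fn->pkt_count = 0;
        return 0;
}

static void __net_exit foo_exit_net(struct net *net)
{
        /* Nothing to release here; the slot itself is freed by the core */
}

static struct pernet_operations foo_net_ops = {
        .init = foo_init_net,
        .exit = foo_exit_net,
        .id = &foo_net_id,
        .size = sizeof(struct foo_net),
};

static int __init foo_module_init(void)
{
        /* The init callback is invoked for every existing namespace
           and for every namespace created afterward */
        return register_pernet_subsys(&foo_net_ops);
}

static void __exit foo_module_exit(void)
{
        unregister_pernet_subsys(&foo_net_ops);
}

module_init(foo_module_init);
module_exit(foo_module_exit);
MODULE_LICENSE("GPL");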
When you create a new network namespace, it has only the network loopback device. The most common ways to create a network namespace are:

 * By a userspace application that creates a network namespace with the clone() or unshare() system call, setting the CLONE_NEWNET flag in both cases.

 * Using the ip netns command of iproute2 (you will shortly see examples).

 * Using the unshare utility of util-linux, with the --net flag.

### Network Namespaces Management

Next you will see some examples of using the ip netns command of the iproute2 package to perform actions such as creating a network namespace, deleting a network namespace, showing all the network namespaces, and more.

 * Creating a network namespace named ns1 is done by:

 * ip netns add ns1

 * Running this command first triggers the creation of a file called /var/run/netns/ns1, and then the creation of the network namespace by the unshare() system call, passing it the CLONE_NEWNET flag. Then /var/run/netns/ns1 is attached to the network namespace (/proc/self/ns/net) by a bind mount (calling the mount() system call with MS_BIND). Note that network namespaces can be nested, which means that from within ns1 you can also create a new network namespace, and so on.

 * Deleting a network namespace named ns1 is done by:

 * ip netns del ns1

 * Note that this does not delete a network namespace if one or more processes are attached to it. If there are no such processes, the /var/run/netns/ns1 file is deleted. Note also that when deleting a namespace, all its network devices are moved to the initial, default network namespace, init_net, except for network namespace local devices, which are network devices whose NETIF_F_NETNS_LOCAL feature is set; such network devices are deleted. See more in the "Moving a Network Interface to a Different Network Namespace" section later in this chapter and in Appendix A.

 * Showing all the network namespaces in the system that were added by ip netns add is done by:

 * ip netns list

 * In fact, running ip netns list simply shows the names of the files under /var/run/netns. Note that network namespaces not added by ip netns add are not displayed by ip netns list, because creating such network namespaces did not trigger the creation of any file under /var/run/netns. So, for example, a network namespace created by unshare --net bash will not appear when running ip netns list.

 * Monitoring creation and removal of network namespaces is done by:

 * ip netns monitor

 * After running ip netns monitor, when you add a new namespace by ip netns add ns2 you will see the message "add ns2" on screen, and after you delete that namespace by ip netns delete ns2 you will see the message "delete ns2" on screen. Note that adding and removing network namespaces other than by running ip netns add and ip netns delete, respectively, does not trigger any messages from ip netns monitor. The ip netns monitor command is implemented by setting an inotify watch on /var/run/netns (a minimal sketch of this technique appears after this list). Note that if you run ip netns monitor before at least one network namespace has been added with ip netns add, you will get the following error: inotify_add_watch failed: No such file or directory. The reason is that trying to set a watch on /var/run/netns, which does not exist yet, fails. See man inotify_init and man inotify_add_watch.

 * Starting a shell in a specified namespace (ns1 in this example) is done by:

 * ip netns exec ns1 bash

 * Note that with ip netns exec you can run any command in a specified network namespace. For example, the following command will display all network interfaces in the network namespace called ns1:

 * ip netns exec ns1 ifconfig -a
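As mentioned in the list above, ip netns monitor is implemented with an inotify watch on /var/run/netns. Here is a minimal sketch of the same technique (for brevity it reports only the first event of each read, and it assumes the directory already exists):

#include <limits.h>
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
        char buf[sizeof(struct inotify_event) + NAME_MAX + 1]
                __attribute__((aligned(__alignof__(struct inotify_event))));
        int fd = inotify_init();
        if (fd < 0) {
                perror("inotify_init");
                return 1;
        }
        /* Namespace files appear and disappear under /var/run/netns
           when `ip netns add` and `ip netns delete` run */
        if (inotify_add_watch(fd, "/var/run/netns", IN_CREATE | IN_DELETE) < 0) {
                perror("inotify_add_watch");
                return 1;
        }
        for (;;) {
                struct inotify_event *ev = (struct inotify_event *)buf;
                if (read(fd, buf, sizeof(buf)) <= 0)
                        break;
                printf("%s %s\n", (ev->mask & IN_CREATE) ? "add" : "delete", ev->name);
        }
        return 0;
}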
In recent versions of iproute2 (since version 3.8), you have these two additional helpful commands:

 * Showing the network namespace associated with a specified PID is done by:

 * ip netns identify #pid

 * This is implemented by reading /proc/<pid>/ns/net and iterating over the files under /var/run/netns to find a match (using the stat() system call); a simplified sketch of this technique follows.

 * Showing the PID of a process (or list of processes) attached to a network namespace called ns1 is done by:

 * ip netns pids ns1

 * This is implemented by reading /var/run/netns/ns1, and then iterating over the /proc/<pid> entries to find a matching /proc/<pid>/ns/net entry (using the stat() system call).

Note

For more information about the various ip netns command options, see man ip netns.
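Here is a simplified sketch of the ip netns identify technique just described: the inode number of /proc/<pid>/ns/net (the namespace's proc_inum) is compared, using stat(), against the inodes of the files under /var/run/netns:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        char path[PATH_MAX];
        struct stat target, cand;
        struct dirent *de;
        DIR *dir;
        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);
        if (stat(path, &target) < 0) {
                perror("stat");
                return 1;
        }
        dir = opendir("/var/run/netns");
        if (!dir) {
                perror("opendir");
                return 1;
        }
        while ((de = readdir(dir)) != NULL) {
                if (de->d_name[0] == '.')
                        continue;
                snprintf(path, sizeof(path), "/var/run/netns/%s", de->d_name);
                /* Two files refer to the same namespace iff their
                   inode numbers and devices match */
                if (stat(path, &cand) == 0 &&
                    cand.st_ino == target.st_ino && cand.st_dev == target.st_dev)
                        printf("%s\n", de->d_name);
        }
        closedir(dir);
        return 0;
}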
#### Moving a Network Interface to a Different Network Namespace

Moving a network interface to a network namespace named ns1 can be done with the ip command, for example by: ip link set eth0 netns ns1. As part of implementing network namespaces, a new feature named NETIF_F_NETNS_LOCAL was added to the features of the net_device object. (The net_device structure represents a network interface; for more information about the net_device structure and its features, see Appendix A.) You can find out whether the NETIF_F_NETNS_LOCAL feature is set for a specified network device by looking at the netns-local flag in the output of ethtool -k eth0 or in the output of ethtool --show-features eth0 (both commands are equivalent). Note that you cannot set the NETIF_F_NETNS_LOCAL feature with ethtool. This feature, when set, denotes that the network device is a network namespace local device. For example, the loopback, bridge, VXLAN, and PPP devices are network namespace local devices. Trying to move a network device whose NETIF_F_NETNS_LOCAL feature is set to a different namespace fails with an error of -EINVAL, as you will shortly see in the following code snippet. The dev_change_net_namespace() method is invoked when trying to move a network interface to a different network namespace, for example by: ip link set eth0 netns ns1. Let's take a look at the dev_change_net_namespace() method:

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
int err;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;

Return -EINVAL in case the device is a namespace local device (the NETIF_F_NETNS_LOCAL flag is set in the features of the net_device object):

if (dev->features & NETIF_F_NETNS_LOCAL)
goto out;
. . .

Actually switch the network namespace by setting nd_net of the net_device object to the new specified namespace:

dev_net_set(dev, net);
. . .
out:
return err;
}
(net/core/dev.c)

Note

You can also move a network interface to a network namespace named ns1 by specifying a PID of a process that is attached to that namespace, without specifying the namespace name explicitly. For example, if you know that a process whose PID is <pid> is attached to ns1, running ip link set eth1 netns <pid> will move eth1 to the ns1 namespace. Implementation details: getting the network namespace object when specifying one of the PIDs of its attached processes is implemented by the get_net_ns_by_pid() method, whereas getting the network namespace object when specifying the network namespace name is implemented by the get_net_ns_by_fd() method; both methods are in net/core/net_namespace.c. In order to move a wireless network interface to a different network namespace, you should use the iw command. For example, if you want to move wlan0 to a network namespace and you know that a process whose PID is <pid> is attached to that namespace, you can run iw phy phy0 set netns <pid> to move it to that network namespace. For the implementation details, refer to the nl80211_wiphy_netns() method in net/wireless/nl80211.c.

#### Communicating Between Two Network Namespaces

I will end the network namespaces section with a short example of how two network namespaces can communicate with each other. This can be done either by using Unix sockets or by using the Virtual Ethernet (VETH) network driver to create a pair of virtual network devices and moving one of them to another network namespace. For example, first create two namespaces, ns1 and ns2:

ip netns add ns1
ip netns add ns2

Start a shell in ns1:

ip netns exec ns1 bash

Create a virtual Ethernet device (its type is veth):

ip link add name if_one type veth peer name if_one_peer

Move if_one_peer to ns2:

ip link set dev if_one_peer netns ns2

You can now set addresses on if_one and on if_one_peer as usual, with the ifconfig command or with the ip command, and send packets from one network namespace to the other.

Note

Network namespaces are not mandatory for a kernel image. By default, network namespaces are enabled (CONFIG_NET_NS is set) in most distributions. However, you can build and boot a kernel with network namespaces disabled.

I have discussed in this section what namespaces are, and in particular what network namespaces are. I mentioned some of the major changes that were required in order to implement namespaces in general, like adding six new CLONE_NEW* flags, adding two new system calls, adding an nsproxy object to the process descriptor, and more. I also described the implementation of UTS namespaces, the simplest among all namespaces, and the implementation of network namespaces. Several examples were given showing how simple it is to manipulate network namespaces with the ip netns command of the iproute2 package. Next I will describe the cgroups subsystem, which provides another form of resource management, and two network modules that belong to it.

## Cgroups

The cgroups subsystem is a project started by Paul Menage, Rohit Seth, and other Google developers in 2006. It was initially called "process containers," but was later renamed "Control Groups." It provides resource management and resource accounting for groups of processes. It has been part of the mainline kernel since kernel 2.6.24, and it is used in several projects: for example, by systemd (a service manager that replaced SysV init scripts; used, for example, by Fedora and openSUSE), by the Linux Containers project, which was mentioned earlier in this chapter, by Google containers ( https://github.com/google/lmctfy/ ), by libvirt ( http://libvirt.org/cgroups.html ), and more. The cgroups kernel implementation lies mostly in paths that are not performance-critical. The cgroups subsystem implements a new Virtual File System (VFS) type named "cgroup".
All cgroups actions are performed via filesystem operations, like creating cgroup directories in a cgroup filesystem, writing to or reading from entries in these directories, mounting cgroup filesystems, and so on. There is a library called libcgroup (a.k.a. libcg), which provides a set of userspace utilities for cgroups management: for example, cgcreate to create a new cgroup, cgdelete to delete a cgroup, cgexec to run a task in a specified control group, and more. In fact, these utilities work by invoking the cgroup filesystem operations from the libcg library. The libcg library is likely to see reduced usage in the future, because it doesn't provide any coordination among multiple parties trying to use the cgroup controllers. It could be that in the future all the cgroup file operations will be performed by a library or by a daemon and not directly. The cgroups subsystem, as currently implemented, needs some form of coordination, because there is only a single controller for each resource type; when multiple actors modify it, this necessarily leads to conflicts. The cgroups controllers can be used by many projects, like libvirt, systemd, lxc, and more, simultaneously. When working only via cgroups filesystem operations, and when all the projects try to impose their own policy through cgroups at too low a level, without knowing about each other, they may accidentally step on each other. If each instead talked to a daemon, for example, such clashes would be avoided. For more information about libcg, see http://libcg.sourceforge.net/ .

As opposed to namespaces, no new system calls were added for implementing the cgroup subsystem. As with namespaces, cgroups can be nested. There were code additions in the boot phase, mainly for the initialization of the cgroups subsystem, and in various subsystems, like the memory subsystem or the security subsystem. Following is a short, partial list of tasks that you can perform with cgroups:

 * Assign a set of CPUs to a set of processes, with the cpusets cgroup controller. You can also control the NUMA node memory is allocated from with the cpusets cgroup controller.

 * Manipulate the out of memory (OOM) killer operation or create a process with a limited amount of memory with the memory cgroup controller (memcg). You will see an example later in this chapter.

 * Assign permissions to devices under /dev, with the devices cgroup. You will see an example of using the devices cgroup later, in the "Cgroup Devices Controller: A Simple Example" section.

 * Assign priority to traffic (see the section "The net_prio Module" later in this chapter).

 * Freeze processes with the freezer cgroup.

 * Report CPU resource usage of the tasks of a cgroup with the cpuacct cgroup. Note that there is also the cpu controller, which can provision CPU cycles either by priority or by absolute bandwidth, and provides the same or a superset of statistics.

 * Tag network traffic with a class identifier (classid); see the section "The cls_cgroup Classifier" later in this chapter.

Next I will briefly describe some of the changes that were made to support cgroups.

### Cgroups Implementation

The cgroup subsystem is very complex. Here are several implementation details about the cgroup subsystem that should give you a good starting point to delve into its internals:

 * A new structure called cgroup_subsys was added (include/linux/cgroup.h). It represents a cgroup subsystem (also known as a cgroup controller).
The following cgroup subsystems are implemented:

 * mem_cgroup_subsys: mm/memcontrol.c

 * blkio_subsys: block/blk-cgroup.c

 * cpuset_subsys: kernel/cpuset.c

 * devices_subsys: security/device_cgroup.c

 * freezer_subsys: kernel/cgroup_freezer.c

 * net_cls_subsys: net/sched/cls_cgroup.c

 * net_prio_subsys: net/core/netprio_cgroup.c

 * perf_subsys: kernel/events/core.c

 * cpu_cgroup_subsys: kernel/sched/core.c

 * cpuacct_subsys: kernel/sched/core.c

 * hugetlb_subsys: mm/hugetlb_cgroup.c

 * A new structure called cgroup was added; it represents a control group (linux/cgroup.h).

 * A new virtual filesystem was added; this was done by defining the cgroup_fs_type object and a cgroup_ops object (an instance of super_operations):

static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
};
static const struct super_operations cgroup_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = cgroup_show_options,
.remount_fs = cgroup_remount,
};
(kernel/cgroup.c)

It is registered, like any other filesystem, with the register_filesystem() method in the cgroup_init() method; see kernel/cgroup.c.

 * The sysfs entry /sys/fs/cgroup is created by default when the cgroup subsystem is initialized; this is done by calling kobject_create_and_add("cgroup", fs_kobj) in the cgroup_init() method. Note that cgroup controllers can also be mounted on other directories.

 * There is a global array of cgroup_subsys objects named subsys, defined in kernel/cgroup.c (note that from kernel 3.11, the array name was changed from subsys to cgroup_subsys). There are CGROUP_SUBSYS_COUNT elements in this array. A procfs entry called /proc/cgroups is exported by the cgroup subsystem. You can display the elements of the global subsys array in two ways:

 * By running cat /proc/cgroups.

 * By the lssubsys utility of libcgroup-tools.

 * Creating a new cgroup always entails generating these four control files under that cgroup's directory:

 * notify_on_release: Its initial value is inherited from its parent. It represents a boolean variable, and its usage is related to the release_agent topmost-only control file, which is explained shortly.

 * cgroup.event_control: This file enables getting notifications from a cgroup, using the eventfd() system call. See man 2 eventfd and fs/eventfd.c.

 * tasks: A list of the PIDs that are attached to this cgroup. Attaching a process to a cgroup is done by writing the value of its PID to the tasks control file (a minimal sketch follows this list); this is handled by the cgroup_attach_task() method, kernel/cgroup.c. Displaying the cgroups to which a process is attached is done by cat /proc/<pid>/cgroup. This is handled in the kernel by the proc_cgroup_show() method, in kernel/cgroup.c.

 * cgroup.procs: A list of the thread group IDs that are attached to this cgroup. The tasks entry allows attaching threads of the same process to different cgroup controllers, whereas cgroup.procs has process-level granularity (all threads of a single process are moved together and belong to the same cgroup).
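As a minimal sketch of the tasks attachment mechanism noted above (the cgroup path is illustrative and assumes a devices cgroup named "0", like the one created in the example later in this section):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        /* Writing a PID to the tasks control file attaches that process
           to the cgroup; in the kernel this is handled by the
           cgroup_attach_task() method */
        FILE *f = fopen("/sys/fs/cgroup/devices/0/tasks", "w");
        if (!f) {
                perror("fopen");
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%d\n", getpid());
        fclose(f);
        return 0;
}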
In addition to these four control files, a control file named release_agent is created for the topmost cgroup root object only. The value of this file is the path of an executable that will be executed when the last process of a cgroup terminates; the notify_on_release flag mentioned earlier should be set for the release_agent feature to be enabled. The release_agent can be assigned as a cgroup mount option; this is the case, for example, in systemd in Fedora. The release_agent mechanism is based on a user-mode helper: the call_usermodehelper() method is invoked, and a new userspace process is created, each time the release_agent is activated, which is costly in terms of performance. See "The past, present, and future of control groups," lwn.net/Articles/574317/. For the release_agent implementation details, see the cgroup_release_agent() method in kernel/cgroup.c.

 * Apart from these four default control files and the release_agent topmost-only control file, each subsystem can create its own specific control files. This is done by defining an array of cftype (Control File type) objects and assigning this array to the base_cftypes member of the cgroup_subsys object. For example, for the memory cgroup controller, we have this definition for the usage_in_bytes control file:

static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
},
. . .
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
. . .
.base_cftypes = mem_cgroup_files,
};
(mm/memcontrol.c)

 * A member called cgroups, which is a pointer to a css_set object, was added to the process descriptor, task_struct. The css_set object contains an array of pointers to cgroup_subsys_state objects (one such pointer for each cgroup subsystem). The process descriptor itself (task_struct) does not contain a direct pointer to the cgroup subsystems it is associated with, but this can be determined from this array of cgroup_subsys_state pointers.

Two cgroups networking modules were added; they are discussed later in this section:

 * net_prio (net/core/netprio_cgroup.c).

 * cls_cgroup (net/sched/cls_cgroup.c).

Note

The cgroup subsystem is still in its early days and is likely to see a fair amount of development in its features and interface.

Next you will see a short example that illustrates how the devices cgroup controller can be used to change the write permission of a device file.

### Cgroup Devices Controller: A Simple Example

Let's look at a simple example of using the devices cgroup. Running the following command will create a devices cgroup:

mkdir /sys/fs/cgroup/devices/0

Three control files will be created under /sys/fs/cgroup/devices/0:

 * devices.deny: Devices for which access is denied.

 * devices.allow: Devices for which access is allowed.

 * devices.list: Available devices.

Each entry in these control files consists of four fields:

 * type: Possible values are 'a' for all, 'c' for a char device, and 'b' for a block device.

 * The device major number.

 * The device minor number.

 * Access permission: 'r' is permission to read, 'w' is permission to write, and 'm' is permission to perform mknod.
By default, when creating a new devices cgroup, it has all the permissions:

cat /sys/fs/cgroup/devices/0/devices.list
a *:* rwm

The following command adds the current shell to the devices cgroup that you created earlier:

echo $$ > /sys/fs/cgroup/devices/0/tasks

The following command denies access to all devices:

echo a > /sys/fs/cgroup/devices/0/devices.deny
echo "test" > /dev/null
-bash: /dev/null: Operation not permitted

The following command restores the access permissions for all devices:

echo a > /sys/fs/cgroup/devices/0/devices.allow

Running the following command, which previously failed, will now succeed:

echo "test" > /dev/null

### Cgroup Memory Controller: A Simple Example

You can disable the out of memory (OOM) killer as follows, for example:

mkdir /sys/fs/cgroup/memory/0
echo $$ > /sys/fs/cgroup/memory/0/tasks
echo 1 > /sys/fs/cgroup/memory/0/memory.oom_control

Now, if you run some memory-hogging userspace program, the OOM killer will not be invoked. The OOM killer can be re-enabled by:

echo 0 > /sys/fs/cgroup/memory/0/memory.oom_control

You can use the eventfd() system call to get notifications in a userspace application about a change in the status of a cgroup (a minimal sketch appears after the following note). See man 2 eventfd.

Note

You can limit the memory available to the processes in a cgroup to 20M, for example, by:

echo 20M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
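Here is a minimal sketch of the eventfd()-based notification mechanism just mentioned, applied to OOM events of the memory cgroup created above: the eventfd's fd and the fd of memory.oom_control are registered together in cgroup.event_control, and a read() on the eventfd then blocks until an OOM event occurs:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        char line[64];
        uint64_t counter;
        int efd = eventfd(0, 0);
        int ofd = open("/sys/fs/cgroup/memory/0/memory.oom_control", O_RDONLY);
        int cfd = open("/sys/fs/cgroup/memory/0/cgroup.event_control", O_WRONLY);
        if (efd < 0 || ofd < 0 || cfd < 0) {
                perror("eventfd/open");
                exit(EXIT_FAILURE);
        }
        /* Writing "<eventfd fd> <control file fd>" arms the notification */
        snprintf(line, sizeof(line), "%d %d", efd, ofd);
        if (write(cfd, line, strlen(line)) < 0) {
                perror("write");
                exit(EXIT_FAILURE);
        }
        /* Blocks until an OOM event occurs in the cgroup */
        if (read(efd, &counter, sizeof(counter)) == sizeof(counter))
                printf("OOM events in cgroup: %llu\n", (unsigned long long)counter);
        return 0;
}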
### The net_prio Module

The network priority control group (net_prio) provides an interface for setting the priority of network traffic that is generated by various userspace applications. Usually this can be done by setting the SO_PRIORITY socket option, which sets the priority of the SKB, but using this socket option is not always desirable. To support the net_prio module, an object called priomap, an instance of the netprio_map structure, was added to the net_device object. Let's take a look at the netprio_map structure:

struct netprio_map {
struct rcu_head rcu;
u32 priomap_len;
u32 priomap[];
};
(include/net/netprio_cgroup.h)

The priomap array is populated via the net_prio cgroup entries, as you will see shortly. The net_prio module exports two entries to cgroup sysfs: net_prio.ifpriomap and net_prio.prioidx. The net_prio.ifpriomap entry is used to set the priomap object of a specified network device, as you will see in the example immediately following. In the Tx path, the dev_queue_xmit() method invokes the skb_update_prio() method to set skb->priority according to the priomap associated with the outgoing network device (skb->dev). The net_prio.prioidx entry is read-only and shows the id of the cgroup. The net_prio module is a good example of how simple it is to develop a cgroup kernel module, in less than 400 lines of code. The net_prio module was developed by Neil Horman and is available from kernel 3.3. For more information, see Documentation/cgroups/net_prio.txt. The following is an example of how to use the network priority cgroup module (note that you must load the netprio_cgroup.ko kernel module in case CONFIG_NETPRIO_CGROUP is built as a module and not as a built-in):

mkdir /sys/fs/cgroup/net_prio
mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
mkdir /sys/fs/cgroup/net_prio/0
echo "eth1 4" > /sys/fs/cgroup/net_prio/0/net_prio.ifpriomap

This sequence of commands sets any traffic originating from processes belonging to the netprio "0" group and outgoing on interface eth1 to have a priority of 4. The last command triggers writing an entry to the priomap field of the net_device object.

Note

In order to work with net_prio, CONFIG_NETPRIO_CGROUP should be set.

### The cls_cgroup Classifier

The cls_cgroup classifier provides an interface for tagging network packets with a class identifier (classid). You can use it in conjunction with the tc tool to assign different priorities to packets from different cgroups, as the example you will soon see demonstrates. The cls_cgroup module exports one entry to cgroup sysfs, net_cls.classid. The control group classifier (cls_cgroup) was merged in kernel 2.6.29 and was developed by Thomas Graf. Like the net_prio module discussed in the previous section, this cgroup kernel module is also less than 400 lines of code, which proves again that adding a cgroup controller as a kernel module is not a heavy task. Here is an example of using the control group classifier (note that you must load the cls_cgroup.ko kernel module in case CONFIG_NET_CLS_CGROUP is built as a module and not as a built-in):

mkdir /sys/fs/cgroup/net_cls
mount -t cgroup -onet_cls none /sys/fs/cgroup/net_cls
mkdir /sys/fs/cgroup/net_cls/0
echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid

The last command assigns classid 10:1 to group 0. The iproute2 package contains a utility named tc for managing traffic control settings. You can use the tc tool with this classid, for example:

tc qdisc add dev eth0 root handle 10: htb
tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup

For more information, see Documentation/cgroups/net_cls.txt (available only from kernel 3.10).

Note

In order to work with cls_cgroup, CONFIG_NET_CLS_CGROUP should be set.

I will conclude the discussion of the cgroup subsystem with a short section about mounting cgroups.

### Mounting cgroup Subsystems

Mounting a cgroup subsystem can also be done at mount points other than /sys/fs/cgroup, which is created by default. For example, you can mount the memory controller on /mycgroup/mymemtest with the following sequence:

mkdir -p /mycgroup/mymemtest
mount -t cgroup -o memory mymemtest /mycgroup/mymemtest

Here are some of the mount options available when mounting cgroup subsystems:

 * all: Mount all cgroup controllers.

 * none: Do not mount any controller.

 * release_agent: A path to an executable that will be executed when the last process of a cgroup terminates. systemd uses the release_agent cgroup mount option.

 * noprefix: Avoid the prefix in control files. Each cgroup controller has its own prefix for its own control files; for example, the cpuset controller entry mem_exclusive appears as cpuset.mem_exclusive. The noprefix mount option avoids adding the controller prefix. For example:

mkdir /cgroup
mount -t tmpfs xxx /cgroup/
mount -t cgroup -o noprefix,cpuset xxx /cgroup/
ls /cgroup/
cgroup.clone_children mem_hardwall mems
cgroup.event_control memory_migrate notify_on_release
cgroup.procs memory_pressure release_agent
cpu_exclusive memory_pressure_enabled sched_load_balance
cpus memory_spread_page sched_relax_domain_level
mem_exclusive memory_spread_slab tasks

Note

Readers who want to delve into how the parsing of the cgroups mount options is implemented should look into the parse_cgroupfs_options() method, kernel/cgroup.c.
For more information about cgroups, see the following resources:

 * Documentation/cgroups

 * cgroups mailing list: cgroups@vger.kernel.org

 * cgroups mailing list archives: http://news.gmane.org/gmane.linux.kernel.cgroups

 * git repository: git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git

Note

Linux namespaces and cgroups are orthogonal and are not related technically. You can build a kernel with namespaces support and without cgroups support, and vice versa. In the past there were experiments with a cgroups namespace subsystem, called "ns", but the code was eventually removed.

You have seen what cgroups are and learned about their two network modules, net_prio and cls_cgroup. You also saw short examples demonstrating how the devices, memory, and networking cgroup controllers can be used. The Busy Poll Sockets feature, which was added in kernel 3.11, provides lower latency for sockets. Let's take a look at how it is implemented, and how it is configured and used.

## Busy Poll Sockets

Traditionally, when the socket queue runs dry, the networking stack sleeps, waiting for the driver to put more data on the socket queue (or returns immediately, in the case of a non-blocking operation). This causes additional latency due to interrupts and context switches. For socket applications that need the lowest possible latency and are willing to pay the cost of higher CPU utilization, Linux added a capability for Busy Polling on Sockets in kernel 3.11 (in the beginning this technique was called Low Latency Sockets Poll, but it was renamed Busy Poll Sockets following a suggestion by Linus Torvalds). Busy Polling takes a more aggressive approach toward moving data to the application. When the application asks for more data and there is none in the socket queue, the networking stack actively calls into the device driver. The driver checks for newly arrived data and pushes it through the network layer (L3) to the socket. The driver may find data for other sockets and will push that data as well. When the poll call returns to the networking stack, the socket code checks whether new data is pending on the socket receive queue.

For a network driver to support busy polling, it should supply its busy polling method and set it as the ndo_busy_poll callback of the net_device_ops object. This ndo_busy_poll callback should move the packets into the network stack; see, for example, the ixgbe_low_latency_recv() method, drivers/net/ethernet/intel/ixgbe/ixgbe_main.c. The ndo_busy_poll callback should return the number of packets that were moved to the stack, or 0 if there were no such packets, and LL_FLUSH_FAILED or LL_FLUSH_BUSY in case of a problem. An unmodified driver that does not fill in the ndo_busy_poll callback will continue to work as usual and will not be busy polled.

An important component of providing low latency is busy polling itself: sometimes, when the driver polling routine returns with no data, more data is just arriving and narrowly misses being returned to the networking stack. This is where busy polling comes into play: the networking stack polls the driver for a configurable period of time, so new packets can be picked up as soon as they arrive.

The active and busy polling of the device driver can provide latency very close to that of the hardware.
Busy polling can be used for a large number of sockets at the same time, but it will not yield the best results, since busy polling on some sockets will slow down other sockets that use the same CPU core. Figure 14-1 contrasts the traditional receive flow with that of a socket that has been enabled for Busy Polling.

Figure 14-1.

Traditional receive flow versus Busy Poll Sockets receive flow

1. Application checks for receive. 1. Application checks for receive.

2. No immediate receive – thus block. 2. Check device driver for pending packet (poll starts).

3. Packet received. 3. Meanwhile, packet received to NIC.

4. Driver passes packet to the protocol layer. 4. Driver processes pending packet.

5. Protocol/socket wakes application. 5. Driver passes to the protocol layer.

- Bypass context switch and interrupt.

6. Application receives data through sockets. 6. Application receives data through sockets.

Repeat. Repeat.

### Enabling Globally

Busy Polling on Sockets can be turned on globally for all sockets via procfs parameters, or it can be turned on for individual sockets by setting the SO_BUSY_POLL socket option. For global enabling, there are two parameters: net.core.busy_poll and net.core.busy_read, which are exported to procfs as /proc/sys/net/core/busy_poll and /proc/sys/net/core/busy_read, respectively. Both are zero by default, which means that Busy Polling is off. Setting these values enables Busy Polling globally. A value of 50 will usually yield good results, but some experimentation might help find a better value for some applications.

 * busy_read controls the time limit when busy polling on blocking read operations. For a non-blocking read, if busy polling is enabled for the socket, the stack code polls just once before returning control to the user.

 * busy_poll controls how long select and poll will busy poll while waiting for new events on any of the sockets that are enabled for Busy Polling. Only sockets with the busy read socket option enabled are busy polled.

For more information, see Documentation/sysctl/net.txt.

### Enabling Per Socket

A better way to enable Busy Polling is to modify the application to use the SO_BUSY_POLL socket option, which sets the sk_ll_usec field of the socket object (an instance of the sock structure). By using this socket option, an application can specify which sockets are busy polled, so CPU utilization is increased only for those sockets. Sockets from other applications and services will continue to use the traditional receive path. The recommended starting value for SO_BUSY_POLL is 50. The net.core.busy_read sysctl value must be set to 0, and the net.core.busy_poll sysctl value should be set as described in Documentation/sysctl/net.txt (a minimal code sketch appears after the tuning list below).

### Tuning and Configuration

Here are several ways in which you can tune and configure Busy Poll Sockets:

 * The interrupt coalescing (ethtool -C setting for rx-usecs) on the network device should be on the order of 100 to lower the interrupt rate. This limits the number of context switches caused by interrupts.

 * Disabling GRO and LRO by using ethtool -K on the network device may avoid out-of-order packets on the receive queue. This should only be an issue when mixed bulk and low-latency traffic arrive on the same queue. Generally, keeping GRO and LRO enabled gives the best results.

 * Application threads and the network device IRQs should be bound to separate CPU cores. Both sets of cores should be on the same CPU NUMA node as the network device. When the application and the IRQ run on the same core, there is a small penalty. If interrupt coalescing is set to a low value, this penalty can be very large.

 * For the lowest latency, it may help to turn off the I/O Memory Management Unit (IOMMU) support. This may already be disabled by default on some systems.
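Here is the minimal sketch of the per-socket option described above, assuming a kernel built with busy poll support; the fallback definition of SO_BUSY_POLL (46, from the kernel's asm-generic/socket.h) is only needed with older libc headers:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL 46   /* from asm-generic/socket.h */
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        int usecs = 50;   /* recommended starting value, in microseconds */

        if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
                                 &usecs, sizeof(usecs)) < 0)
                perror("SO_BUSY_POLL");

        /* ... bind()/recv() as usual; blocking reads on fd may now busy poll ... */
        return 0;
}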
### Performance

Many applications that use Busy Poll Sockets should show reduced latency and jitter, as well as improved transactions per second. However, overloading the system with too many busy-polling sockets can hurt performance as CPU contention increases. The parameters net.core.busy_poll and net.core.busy_read and the SO_BUSY_POLL socket option are all tunable. Experimenting with these values may give better results for various applications.

I will now start a discussion of three wireless subsystems, which typically serve short-range, low-power devices: the Bluetooth subsystem, IEEE 802.15.4, and NFC. There is growing interest in these three subsystems, as new and exciting features are added quite steadily. I will start the discussion with the Bluetooth subsystem.

## The Linux Bluetooth Subsystem

The Bluetooth protocol is one of the major transport protocols, mainly for small and embedded devices. Bluetooth network interfaces are included nowadays in almost every new laptop or tablet, in every mobile phone, and in many electronic gadgets. The Bluetooth protocol was created by the mobile vendor Ericsson in 1994. In the beginning, it was intended to be a cable replacement for point-to-point connections. Later, it evolved to enable wireless Personal Area Networks (PANs). Bluetooth operates in the 2.4 GHz Industrial, Scientific and Medical (ISM) radio-frequency band, which is license-free for low-power transmissions. The Bluetooth specifications are formalized by the Bluetooth Special Interest Group (SIG), which was founded in 1998; see https://www.bluetooth.org . The SIG is responsible for the development of the Bluetooth specification and for the qualification process that helps ensure interoperability between Bluetooth devices from different vendors. The Bluetooth core specification is freely available. There have been several Bluetooth specifications over the years; I will mention the most recent:

 * Bluetooth v2.0 + Enhanced Data Rate (EDR) from 2004.

 * Bluetooth v2.1 + EDR from 2007; included an improvement of the pairing process with secure simple pairing (SSP).

 * Bluetooth v3.0 + HS (High Speed) from 2009; the main new feature is AMP (Alternate MAC/PHY), the addition of 802.11 as a high-speed transport.

 * Bluetooth v4.0 + BLE (Bluetooth Low Energy, formerly known as WiBree) from 2010.

There is a variety of uses for the Bluetooth protocol, like file transfer, audio streaming, health-care devices, networking, and more. Bluetooth is designed for short-distance data exchange, in a range that typically extends up to 10 meters. There are three classes of Bluetooth devices, with the following ranges:

 * Class 1 – about 100 m.

 * Class 2 – about 10 m.

 * Class 3 – about 1 m.

The Linux Bluetooth protocol stack is called BlueZ. Originally it was a project started by Qualcomm. It was officially integrated into kernel 2.4.6 (2001). Figure 14-2 shows the Bluetooth stack.

Figure 14-2.

Bluetooth stack
Note: In the layer above L2CAP there can be other Bluetooth protocols that are not discussed in this chapter, like the Audio/Video Distribution Transport Protocol (AVDTP), the Hands-Free Profile (HFP), the Audio/Video Control Transport Protocol (AVCTP), and more.

 * The lower three layers (the Radio layer, the Link Controller, and the Link Management Protocol) are implemented in hardware or firmware.

 * The Host Controller Interface (HCI) specifies how the host interacts and communicates with a local Bluetooth device (the controller). I will discuss it in the "HCI Layer" section, later in this chapter.

 * L2CAP (Logical Link Control and Adaptation Protocol) provides the ability to transmit and receive packets from other Bluetooth devices. An application can use the L2CAP protocol as a message-based, unreliable data-delivery transport protocol, similarly to the UDP protocol. Access to the L2CAP protocol from userspace is done by the BSD sockets API, which was discussed in Chapter 11. Note that in L2CAP, packets are always delivered in the order they were sent, as opposed to UDP. Figure 14-2 shows three protocols that are located on top of L2CAP (there are other protocols on top of L2CAP that are not discussed in this chapter, as mentioned earlier).

 * BNEP: Bluetooth Network Encapsulation Protocol. I will present an example of using the BNEP protocol later in this chapter.

 * RFCOMM: The Radio Frequency Communications (RFCOMM) protocol is a reliable, streams-based protocol. RFCOMM allows operation over only 30 ports. RFCOMM is used for emulating communication over a serial port and for sending unframed data.

 * SDP: Service Discovery Protocol. Enables an application to register a description and a port number in the SDP server it runs. Clients can perform a lookup in the SDP server by providing the description.

 * The SCO (Synchronous Connection-Oriented) layer: used for sending audio; I do not delve into its details in this chapter, as it falls outside the scope of this book.

 * Bluetooth profiles are definitions of possible applications; they specify general behaviors that Bluetooth-enabled devices use to communicate with other Bluetooth devices. There are many Bluetooth profiles, and I will mention some of the most commonly used ones:

 * File Transfer Profile (FTP): Manipulates and transfers objects (files and folders) in an object store (file system) of another system.

 * Health Device Profile (HDP): Handles medical data.

 * Human Interface Device Profile (HID): A wrapper of USB HID (Human Interface Device) that provides support for devices like mice and keyboards.

 * Object Push Profile (OPP): For pushing objects.

 * Personal Area Networking Profile (PAN): Provides networking over a Bluetooth link; you will see an example of it in the BNEP section later in this chapter.

 * Headset Profile (HSP): Provides support for Bluetooth headsets, which are used with mobile phones.

The seven layers in this diagram are roughly parallel to the seven layers of the OSI model: the Radio (RF) layer is parallel to the Physical layer, the Link Controller is parallel to the Data Link layer, the Link Management Protocol is parallel to the Network layer, and so on. The Linux Bluetooth subsystem consists of several ingredients:

 * Bluetooth Core

 * HCI device and connection manager, scheduler; files: net/bluetooth/hci*.c, net/bluetooth/mgmt.c.

 * Bluetooth Address Family sockets; file: net/bluetooth/af_bluetooth.c.

 * SCO audio links; file: net/bluetooth/sco.c.
 * L2CAP (Logical Link Control and Adaptation Protocol); files: net/bluetooth/l2cap*.c.

 * SMP (Security Manager Protocol) on LE (Low Energy) links; file: net/bluetooth/smp.c.

 * AMP manager – Alternate MAC/PHY management; file: net/bluetooth/a2mp.c.

 * HCI device drivers (interface to the hardware); files: drivers/bluetooth/*. Includes vendor-specific drivers as well as generic drivers, like the generic Bluetooth USB driver, btusb.

 * RFCOMM module (RFCOMM protocol); files: net/bluetooth/rfcomm/*.

 * BNEP module (Bluetooth Network Encapsulation Protocol); files: net/bluetooth/bnep/*.

 * CMTP module (CAPI Message Transport Protocol), used by the ISDN protocol. CMTP is in fact obsolete; files: net/bluetooth/cmtp/*.

 * HIDP module (Human Interface Device Protocol); files: net/bluetooth/hidp/*.

I briefly discussed the Bluetooth protocol, the architecture of the Bluetooth stack, the Linux Bluetooth subsystem tree, and Bluetooth profiles. In the next section I will describe the HCI layer, which is the first layer above the LMP (see Figure 14-2 earlier in this section).

### HCI Layer

I will start the discussion of the HCI layer by describing the HCI device, which represents a Bluetooth controller. Later in this section I will describe the interface between the HCI layer and the layer below it, the Link Controller layer, and the interface between HCI and the layers above it, L2CAP and SCO.

#### HCI Device

A Bluetooth device is represented by struct hci_dev. This structure is quite big (over 100 members) and is shown here only partially:

struct hci_dev {
        char            name[8];
        unsigned long   flags;
        __u8            bus;
        bdaddr_t        bdaddr;
        __u8            dev_type;
        . . .
        struct work_struct rx_work;
        struct work_struct cmd_work;
        . . .
        struct sk_buff_head rx_q;
        struct sk_buff_head raw_q;
        struct sk_buff_head cmd_q;
        . . .
        int (*open)(struct hci_dev *hdev);
        int (*close)(struct hci_dev *hdev);
        int (*flush)(struct hci_dev *hdev);
        int (*send)(struct sk_buff *skb);
        void (*notify)(struct hci_dev *hdev, unsigned int evt);
        int (*ioctl)(struct hci_dev *hdev, unsigned int cmd, unsigned long arg);
};

(include/net/bluetooth/hci_core.h)

Here is a description of some of the important members of the hci_dev structure:

 * flags: Represents the state of the device, like HCI_UP or HCI_INIT.

 * bus: The bus associated with the device, like USB (HCI_USB), UART (HCI_UART), PCI (HCI_PCI), etc. (see include/net/bluetooth/hci.h).

 * bdaddr: Each HCI device has a unique 48-bit address. It is exported to sysfs by /sys/class/bluetooth/<HCI device name>/address.

 * dev_type: There are two types of Bluetooth devices:

 * Basic Rate devices (HCI_BREDR).

 * Alternate MAC and PHY devices (HCI_AMP).

 * rx_work: Handles receiving packets that are kept in the rx_q queue of the HCI device, by the hci_rx_work() callback.

 * cmd_work: Handles sending command packets that are kept in the cmd_q queue of the HCI device, by the hci_cmd_work() callback.

 * rx_q: Receive queue of SKBs. SKBs are added to the rx_q by calling the skb_queue_tail() method when receiving an SKB, in the hci_recv_frame() method.

 * raw_q: SKBs are added to the raw_q by calling the skb_queue_tail() method in the hci_sock_sendmsg() method.

 * cmd_q: Command queue. SKBs are added to the cmd_q by calling the skb_queue_tail() method in the hci_sock_sendmsg() method.
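To make these fields more concrete, here is a hedged sketch of how a driver might allocate and register an hci_dev, loosely following the generic btusb driver; the my_* callbacks are hypothetical placeholders:

static int my_register_hdev(void)
{
        struct hci_dev *hdev = hci_alloc_dev();

        if (!hdev)
                return -ENOMEM;

        hdev->bus   = HCI_USB;
        hdev->open  = my_open;   /* registration fails if open() is missing */
        hdev->close = my_close;  /* ... or if close() is missing */
        hdev->flush = my_flush;
        hdev->send  = my_send;   /* hands HCI frames to the hardware */

        if (hci_register_dev(hdev) < 0) {
                hci_free_dev(hdev);
                return -ENODEV;
        }
        return 0;
}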
As noted, the hci_dev callbacks (like open(), close(), send(), etc.) are typically assigned in the probe() method of a Bluetooth device driver (for example, refer to the generic USB Bluetooth driver, drivers/bluetooth/btusb.c).

The HCI layer exports methods for registering/unregistering an HCI device (the hci_register_dev() and hci_unregister_dev() methods, respectively). Both methods get an hci_dev object as their single parameter. The registration will fail if the open() or close() callback of the specified hci_dev object is not defined.

There are five types of HCI packets:

 * HCI_COMMAND_PKT: Commands sent from the host to the Bluetooth device.

 * HCI_ACLDATA_PKT: Asynchronous data that is sent to or received from a Bluetooth device. ACL stands for Asynchronous Connection-oriented Link.

 * HCI_SCODATA_PKT: Synchronous data that is sent to or received from a Bluetooth device (usually audio). SCO stands for Synchronous Connection-Oriented.

 * HCI_EVENT_PKT: Sent when an event (such as connection establishment) occurs.

 * HCI_VENDOR_PKT: Used in some Bluetooth device drivers for vendor-specific needs.

#### HCI and the Layer Below It (Link Controller)

The HCI communicates with the Link Controller layer below it by:

 * Sending data packets (HCI_ACLDATA_PKT or HCI_SCODATA_PKT) by calling the hci_send_frame() method, which delegates the call to the send() callback of the hci_dev object. The hci_send_frame() method gets an SKB as its single parameter.

 * Sending command packets (HCI_COMMAND_PKT) by calling the hci_send_cmd() method; for example, sending a scan command.

 * Receiving data packets, by calling the hci_acldata_packet() method or the hci_scodata_packet() method.

 * Receiving event packets, by calling the hci_event_packet() method. Handling HCI commands is asynchronous, so some time after sending a command packet (HCI_COMMAND_PKT), a single event or several events are received as a response by the HCI rx_work work queue (the hci_rx_work() method). There are more than 45 different events (see HCI_EV_* in include/net/bluetooth/hci.h). For example, when performing a scan for nearby Bluetooth devices using the command-line hcitool, by hcitool scan, a command packet (HCI_OP_INQUIRY) is sent. As a result, three event packets are returned asynchronously to be handled by the hci_event_packet() method: HCI_EV_CMD_STATUS, HCI_EV_EXTENDED_INQUIRY_RESULT, and HCI_EV_INQUIRY_COMPLETE.

#### HCI and the Layers Above It (L2CAP/SCO)

Let's take a look at the methods by which the HCI layer communicates with the layers above it, the L2CAP layer and the SCO layer:

 * HCI communicates with the L2CAP layer above it when receiving data packets by calling the hci_acldata_packet() method, which invokes the l2cap_recv_acldata() method of the L2CAP protocol.

 * HCI communicates with the SCO layer above it when receiving SCO packets by calling the hci_scodata_packet() method, which invokes the sco_recv_scodata() method of the SCO protocol.

### HCI Connection

An HCI connection is represented by the hci_conn structure:

struct hci_conn {
        struct list_head list;
        atomic_t         refcnt;
        bdaddr_t         dst;
        . . .
        __u8             type;
};

(include/net/bluetooth/hci_core.h)

The following is a description of some of the members of the hci_conn structure:

 * refcnt: A reference counter.

 * dst: The Bluetooth destination address.

 * type: Represents the type of the connection:

 * SCO_LINK for an SCO connection.
 * ACL_LINK for an ACL connection.

 * ESCO_LINK for an Extended Synchronous connection.

 * LE_LINK: represents an LE (Low Energy) connection; added in kernel 2.6.39 to support Bluetooth v4.0, which introduced the LE feature.

 * AMP_LINK: added in kernel 3.6 to support Bluetooth AMP controllers.

An HCI connection is created by calling the hci_connect() method. There are three types of connections: SCO, ACL, and LE.

### L2CAP

In order to provide several data streams, L2CAP uses channels, which are represented by the l2cap_chan structure (include/net/bluetooth/l2cap.h). There is a global linked list of channels, named chan_list. Access to this list is serialized by a global read-write lock, chan_list_lock.

The l2cap_recv_acldata() method, which I mentioned in the "HCI and the Layers Above It (L2CAP/SCO)" section earlier in this chapter, is called when HCI passes data packets to the L2CAP layer. The l2cap_recv_acldata() method first performs some sanity checks and drops the packet if something is wrong; then it invokes the l2cap_recv_frame() method in case a complete packet was received. Each received packet starts with an L2CAP header:

struct l2cap_hdr {
        __le16 len;
        __le16 cid;
} __attribute__ ((packed));

(include/net/bluetooth/l2cap.h)

The l2cap_recv_frame() method checks the channel ID of the received packet by inspecting the cid field of the l2cap_hdr object. In case it is an L2CAP command (the cid is 0x0001), the l2cap_sig_channel() method is invoked to handle it. For example, when another Bluetooth device wants to connect to our device, an L2CAP_CONN_REQ request is received on the L2CAP signal channel, which will be handled by the l2cap_connect_req() method, net/bluetooth/l2cap_core.c. In the l2cap_connect_req() method, an L2CAP channel is created by calling the l2cap_chan_create() method, via pchan->ops->new_connection(). The L2CAP channel state is set to BT_OPEN, and the configuration state is set to CONF_NOT_COMPLETE. This means that the channel should be configured in order to work with it.

### BNEP

The BNEP protocol enables IP over Bluetooth, which in practical terms means running TCP/IP applications on top of L2CAP Bluetooth channels. You can also run TCP/IP applications with PPP over Bluetooth RFCOMM, but networking over a serial PPP link is less efficient. The BNEP protocol uses the PAN profile. I will show a short example of using the BNEP protocol to set up IP over Bluetooth, and subsequently I will describe the kernel methods that implement such communication. Delving into the details of BNEP is beyond the scope of this book. If you want to learn more, see the BNEP spec, which can be found at http://grouper.ieee.org/groups/802/15/Bluetooth/BNEP.pdf . A very simple way to create a PAN is by running:

 * On the server side:

 * pand --listen --role=NAP

 * Note: NAP stands for Network Access Point.

 * On the client side:

 * pand --connect btAddressOfTheServer

On both endpoints, a virtual interface (bnep0) is created. Afterward, you can assign IP addresses to bnep0 on both endpoints with the ifconfig command (or with the ip command), just like with Ethernet devices, and you will have a network connection over Bluetooth between these endpoints. See more in http://bluez.sourceforge.net/contrib/HOWTO-PAN .

The pand --listen command creates an L2CAP server socket and calls the accept() system call, whereas pand --connect btAddressOfTheServer creates an L2CAP client socket and calls the connect() system call.
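Here is a minimal, hedged sketch of what such an L2CAP client socket looks like in userspace, using the BlueZ headers (link with -lbluetooth); the peer address and PSM are placeholders:

#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/l2cap.h>

int main(void)
{
        struct sockaddr_l2 addr;
        int fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.l2_family = AF_BLUETOOTH;
        addr.l2_psm    = htobs(0x1001);               /* example PSM */
        str2ba("00:11:22:33:44:55", &addr.l2_bdaddr); /* peer address */

        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("connect");
        else
                write(fd, "hello", 5);

        close(fd);
        return 0;
}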
When the connect request is received on the server side, it sends a BNEPCONNADD IOCTL, which is handled in the kernel by the bnep_add_connection() method (net/bluetooth/bnep/core.c), which performs the following tasks:

 * Creates a BNEP session (a bnep_session object).

 * Adds the BNEP session object to the BNEP session list (bnep_session_list) by calling the __bnep_link_session() method.

 * Creates a network device named bnepX (for the first BNEP device X is 0, for the second X is 1, and so on).

 * Registers the network device by calling the register_netdev() method.

 * Creates a kernel thread named "kbnepd btDeviceName". This kernel thread runs the bnep_session() method, which contains an endless loop to receive or transmit packets. This endless loop terminates only when a userspace application sends a BNEPCONNDEL IOCTL, which calls the bnep_del_connection() method to set the terminate flag of the BNEP session, or when the state of the socket changes and it is no longer connected.

 * The bnep_session() method invokes the bnep_rx_frame() method to receive incoming packets and pass them to the network stack, and it invokes the bnep_tx_frame() method to send outgoing packets.

### Receiving Bluetooth Packets: Diagram

Figure 14-3 shows the path of a received Bluetooth ACL packet (as opposed to SCO, which is for handling audio and is handled differently). The first layer where the packet is handled is the HCI layer, by the hci_acldata_packet() method. It then proceeds to the higher L2CAP layer by calling the l2cap_recv_acldata() method.

Figure 14-3.

Receiving an ACL packet

The l2cap_recv_acldata() method calls the l2cap_recv_frame() method, which fetches the L2CAP header (the l2cap_hdr object described earlier) from the SKB. An action is then taken according to the channel ID of the L2CAP header.

### L2CAP Extended Features

Support for L2CAP Extended Features (also called eL2CAP) was added in kernel 2.6.36. These extended features include:

 * Enhanced Retransmission Mode (ERTM), a reliable protocol with error and flow control.

 * Streaming Mode (SM), an unreliable protocol for streaming purposes.

 * Frame Check Sequence (FCS), a checksum for each received packet.

 * Segmentation and Reassembly (SAR) of L2CAP packets, which makes retransmission easier.

Some of these extensions were required for new profiles, like the Bluetooth Health Device Profile (HDP). Note that these features were available before as well, but they were considered experimental and were disabled by default, so you had to set CONFIG_BT_L2CAP_EXT_FEATURES to enable them.

### Bluetooth Tools

Accessing the kernel from userspace is done with sockets, with minor changes: instead of AF_INET sockets, we use AF_BLUETOOTH sockets. Here is a short description of some important and useful Bluetooth tools:

 * hciconfig: A tool for configuring Bluetooth devices. It displays information such as the interface type (BR/EDR or AMP), its Bluetooth address, its flags, and more. The hciconfig tool works by opening a raw HCI socket (BTPROTO_HCI) and sending IOCTLs; for example, in order to bring an HCI device up or down, an HCIDEVUP or HCIDEVDOWN IOCTL is sent, respectively. These IOCTLs are handled in the kernel by the hci_sock_ioctl() method, net/bluetooth/hci_sock.c. A minimal sketch of this technique appears after this list.

 * hcitool: A tool for configuring Bluetooth connections and sending special commands to Bluetooth devices. For example, hcitool scan will scan for nearby Bluetooth devices.

 * hcidump: Dumps raw HCI data coming from and going to a Bluetooth device.

 * l2ping: Sends an L2CAP echo request and receives the answer.

 * btmon: A friendlier version of hcidump.

 * bluetoothctl: A friendlier version of hciconfig/hcitool.
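As a small illustration of the technique hciconfig uses (see the first bullet above), here is a hedged sketch that opens a raw HCI socket and brings hci0 up; it normally requires root privileges (CAP_NET_ADMIN):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/hci.h>

int main(void)
{
        int ctl = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

        if (ctl < 0) {
                perror("socket");
                return 1;
        }

        /* 0 is the device index (hci0); handled by hci_sock_ioctl() */
        if (ioctl(ctl, HCIDEVUP, 0) < 0)
                perror("HCIDEVUP");

        close(ctl);
        return 0;
}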
You can find more information about the Linux Bluetooth subsystem at:

 * Linux BlueZ, the official Linux Bluetooth website: http://www.bluez.org .

 * Linux Bluetooth mailing list: linux-bluetooth@vger.kernel.org.

 * Linux Bluetooth mailing list archives: http://www.spinics.net/lists/linux-bluetooth/ . Note that this mailing list is for Bluetooth kernel patches as well as Bluetooth userspace patches.

 * IRC channels on freenode.net:

 * #bluez (development-related topics)

 * #bluez-users (non-development-related topics)

In this section I described the Linux Bluetooth subsystem, focusing on its networking aspects. You learned about the layers of the Bluetooth stack and how they are implemented in the Linux kernel. You also learned about important Bluetooth kernel structures like the HCI device and the HCI connection. Next, I will describe the second wireless subsystem, the IEEE 802.15.4 subsystem, and its implementation.

## IEEE 802.15.4 and 6LoWPAN

The IEEE 802.15.4 standard (IEEE Std 802.15.4-2011) specifies the Medium Access Control (MAC) layer and the Physical (PHY) layer for Low-Rate Wireless Personal Area Networks (LR-WPANs). It is intended for low-cost, low-power-consumption devices in short-range networks. Several bands are supported, among which the most common are the 2.4 GHz ISM band, 915 MHz, and 868 MHz. IEEE 802.15.4 devices can be used, for example, in wireless sensor networks (WSNs), security systems, industry automation systems, and more. It was designed to organize networks of sensors, switches, automation devices, etc. The maximum allowed bit rate is 250 kb/s. The standard also supports a 1000 kb/s bit rate for the 2.4 GHz band, but it is less common. The typical personal operating space is around 10 m. The IEEE 802.15.4 standard is maintained by the IEEE 802.15 working group ( http://www.ieee802.org/15/ ). Several protocols sit on top of IEEE 802.15.4; the best known are ZigBee and 6LoWPAN.

The ZigBee Alliance (ZA) has published non-GPL specifications for IEEE 802.15.4, but also the ZigBee IP (Z-IP) open standard ( http://www.zigbee.org/Specifications/ZigBeeIP/Overview.aspx ). It is based on Internet protocols such as IPv6, TCP, UDP, 6LoWPAN, and more. Using the IPv6 protocol for IEEE 802.15.4 is a good option because the huge IPv6 address space makes it possible to assign a unique routable address to each IPv6 node. The IPv6 header is simpler than the IPv4 header, and processing its extension headers is simpler than processing IPv4 header options. Using IPv6 with LR-WPANs is termed IPv6 over Low-power Wireless Personal Area Networks (6LoWPAN). IPv6 cannot be used on an LR-WPAN as-is and therefore requires an adaptation layer, as will be explained later in this section. There are five RFCs related to 6LoWPAN:

 * RFC 4944: "Transmission of IPv6 Packets over IEEE 802.15.4 Networks."

 * RFC 4919: "IPv6 over Low-Power Wireless Personal Area Networks (6LoWPANs): Overview, Assumptions, Problem Statement, and Goals."

 * RFC 6282: "Compression Format for IPv6 Datagrams over IEEE 802.15.4-Based Networks." This RFC introduced a new encoding format, the LOWPAN_IPHC Encoding Format, instead of LOWPAN_HC1 and LOWPAN_HC2.
 * RFC 6775: "Neighbor Discovery Optimization for IPv6 over Low-Power Wireless Personal Area Networks (6LoWPANs)."

 * RFC 6550: "RPL: IPv6 Routing Protocol for Low-Power and Lossy Networks."

The main challenges in implementing 6LoWPAN are:

 * Different packet sizes: IPv6 has an MTU of 1280, whereas IEEE 802.15.4 has an MTU of 127 (IEEE802154_MTU). In order to support packets larger than 127 bytes, an adaptation layer between IPv6 and IEEE 802.15.4 must be defined. This adaptation layer is responsible for the transparent fragmentation/defragmentation of IPv6 packets.

 * Different addresses: An IPv6 address is 128 bits, whereas IEEE 802.15.4 addresses are 64-bit IEEE extended addresses (IEEE802154_ADDR_LONG) or, after association and after a PAN id is assigned, 16-bit short addresses (IEEE802154_ADDR_SHORT), which are unique within that PAN. The main challenge is that compression mechanisms are needed to reduce the size of a 6LoWPAN packet, which is largely made up of the IPv6 addresses. 6LoWPAN can, for example, leverage the fact that IEEE 802.15.4 supports 16-bit short addresses to avoid the need for a 64-bit IID.

 * Multicast is not supported natively in IEEE 802.15.4, whereas IPv6 uses multicast for ICMPv6 and for protocols that rely on ICMPv6, like the Neighbour Discovery protocol.

IEEE 802.15.4 defines four types of frames:

 * Beacon frames (IEEE802154_FC_TYPE_BEACON)

 * MAC command frames (IEEE802154_FC_TYPE_MAC_CMD)

 * Acknowledgement frames (IEEE802154_FC_TYPE_ACK)

 * Data frames (IEEE802154_FC_TYPE_DATA)

IPv6 packets must be carried over the fourth type, data frames. Acknowledgment of data packets is not mandatory, although it is recommended. As with 802.11, there are device drivers that implement most parts of the protocol by themselves (HardMAC device drivers) and device drivers that handle most of the protocol in software (SoftMAC device drivers). There are three types of nodes in 6LoWPAN:

 * 6LoWPAN Node (6LN): Either a host or a router.

 * 6LoWPAN Router (6LR): Can send and receive Router Advertisement (RA) and Router Solicitation (RS) messages, as well as forward and route IPv6 packets. These nodes are more complex than simple 6LoWPAN nodes and may need more memory and processing capacity.

 * 6LoWPAN Border Router (6LBR): A border router located at the junction of separate 6LoWPAN networks or between a 6LoWPAN network and another IP network. The 6LBR is responsible for forwarding between the IP network and the 6LoWPAN network and for the IPv6 configuration of the 6LoWPAN nodes. A 6LBR requires much more memory and processing capacity than a 6LN. 6LBRs share context for the nodes in the LoWPAN and keep track of nodes registered with 6LoWPAN-ND and RPL. A 6LBR is generally always on, in contrast to 6LNs, which sleep most of the time. Figure 14-4 shows a simple setup with a 6LBR, which connects an IP network to a Wireless Sensor Network based on 6LoWPAN.

Figure 14-4.

A 6LBR connecting an IP network to a WSN that runs over 6LoWPAN

### Neighbor Discovery Optimization

There are two reasons we should have optimizations and extensions for the IPv6 Neighbor Discovery protocol:

 * The IEEE 802.15.4 link layer does not have multicast support, although it supports broadcast (it uses the 0xFFFF short address for message broadcasting).

 * The Neighbor Discovery protocol is designed for sufficiently powered devices, whereas IEEE 802.15.4 devices can sleep in order to preserve energy; moreover, they operate in a lossy network environment, as the RFC puts it.
RFC 6775, which deals with Neighbor Discovery optimization, added new optimizations such as:

 * Host-initiated refresh of Router Advertisement information. In IPv6, routers usually send Router Advertisements periodically. This feature removes the need for periodic or unsolicited Router Advertisements sent from routers to hosts.

 * EUI-64-based IPv6 addresses are considered globally unique. When such addresses are used, DAD (Duplicate Address Detection) is not needed.

 * Three options were added:

 * Address Registration Option (ARO): The ARO option (33) can be part of a unicast NS message that a host sends as part of NUD (Neighbor Unreachability Detection) to determine that it can still reach a default router. When a host has a non-link-local address, it periodically sends NS messages to its default routers with the ARO option in order to register its address. Unregistration is done by sending an NS with an ARO containing a lifetime of 0.

 * 6LoWPAN Context Option (6CO): The 6CO option (34) carries prefix information for LoWPAN header compression, and is similar to the Prefix Information Option (PIO) specified in RFC 4861.

 * Authoritative Border Router Option (ABRO): The ABRO option (35) enables disseminating prefixes and context information across a route-over topology.

 * Two new DAD messages were added:

 * Duplicate Address Request (DAR), with a new ICMPv6 type of 157.

 * Duplicate Address Confirmation (DAC), with a new ICMPv6 type of 158.

### Linux Kernel 6LoWPAN

The basic 6LoWPAN implementation was integrated into the v3.2 Linux kernel. It was contributed by the Embedded Systems Open Platform Group, from Siemens Corporate Technology. It has three layers:

 * Network layer – net/ieee802154 (includes the 6lowpan module, raw IEEE 802.15.4 sockets, the netlink interface, and more).

 * MAC layer – net/mac802154. Implements a partial MAC layer for SoftMAC device drivers.

 * PHY layer – drivers/net/ieee802154, the IEEE 802.15.4 device drivers.

 * Two 802.15.4 devices are currently supported:

 * The AT86RF230/231 transceiver driver.

 * The Microchip MRF24J40.

 * There is also the Fakelb driver (IEEE 802.15.4 loopback interface).

 * These two devices, as well as many other 802.15.4 transceivers, are connected via SPI. There is also a serial driver, although it is not included in the mainline kernel and is still experimental. There are devices like atusb, which are based on an AT86RF231 BN, that are not in mainline as of this writing.

#### 6LoWPAN Initialization

In the lowpan_init_module() method, initialization of 6LoWPAN netlink sockets is done by calling the lowpan_netlink_init() method, and a protocol handler is registered for 6LoWPAN packets by calling the dev_add_pack() method:

. . .
static struct packet_type lowpan_packet_type = {
        .type = __constant_htons(ETH_P_IEEE802154),
        .func = lowpan_rcv,
};
. . .

static int __init lowpan_init_module(void)
{
        . . .
        dev_add_pack(&lowpan_packet_type);
        . . .
}

(net/ieee802154/6lowpan.c)

The lowpan_rcv() method is the main Rx handler for 6LoWPAN packets, which have an ethertype of 0x00F6 (ETH_P_IEEE802154). It handles two cases:

 * Reception of uncompressed packets (dispatch type is IPv6).

 * Reception of compressed packets.

A virtual link is used to perform the translation between 6LoWPAN and IPv6 packets. One endpoint of this virtual link speaks IPv6 and has an MTU of 1280; this is the 6LoWPAN interface.
The other endpoint speaks 6LoWPAN and has an MTU of 127; this is the WPAN interface. Compressed 6LoWPAN packets are processed by the lowpan_process_data() method, which calls the lowpan_uncompress_addr() method to uncompress addresses and the lowpan_uncompress_udp_header() method to uncompress the UDP header, according to the IPHC header. The uncompressed IPv6 packet is then delivered to the 6LoWPAN interface by the lowpan_skb_deliver() method (net/ieee802154/6lowpan.c).

Figure 14-5 shows the 6LoWPAN adaptation layer.

Figure 14-5.

6LoWPAN adaptation layer

Figure 14-6 shows the path of a packet from the PHY layer (the driver) via the MAC layer to the 6LoWPAN adaptation layer.

Figure 14-6.

Receiving a packet

I will not delve into the details of the device driver implementations, as this is out of our scope. I will mention that each device driver should create an ieee802154_dev object by calling the ieee802154_alloc_device() method, passing an ieee802154_ops object as a parameter. Every driver should define some ieee802154_ops object callbacks, like xmit, start, stop, and more. This applies to SoftMAC drivers only.

I will mention here that an Internet-Draft was submitted for applying 6LoWPAN technology over Bluetooth Low Energy devices (these devices are part of the Bluetooth 4.0 specification, as was mentioned earlier in this chapter). See "Transmission of IPv6 Packets over Bluetooth Low Energy," http://tools.ietf.org/html/draft-ietf-6lowpan-btle-12 .

Note

Contiki is an open source operating system implementing the Internet of Things (IoT) concept; some patches of the Linux IEEE 802.15.4 6LoWPAN code are derived from it, like the UDP header compression and decompression. It implements 6LoWPAN and RPL. It was developed by Adam Dunkels. See http://www.contiki-os.org/ .

For additional resources about 6LoWPAN and 802.15.4:

 * Books:

 * "6LoWPAN: The Wireless Embedded Internet," by Zach Shelby and Carsten Bormann, Wiley, 2009.

 * "Interconnecting Smart Objects with IP: The Next Internet," by Jean-Philippe Vasseur and Adam Dunkels (the Contiki developer), Morgan Kaufmann, 2010.

 * An article about IPv6 Neighbor Discovery Optimization: http://www.internetsociety.org/articles/ipv6-neighbor-discovery-optimization .

The lowpan-tools package is a set of utilities to manage the Linux LoWPAN stack. See http://sourceforge.net/projects/linux-zigbee/files/linux-zigbee-sources/0.3/ .

Note

The IEEE 802.15.4 subsystem does not maintain a git repository of its own (though in the past there was one). Patches are sent to the netdev mailing list; some of the developers first send their patches to the linux-zigbee developer mailing list to get feedback: https://lists.sourceforge.net/lists/listinfo/linux-zigbee-devel

In this section I described the IEEE 802.15.4 standard and the 6LoWPAN protocol, together with the challenges 6LoWPAN poses for integration into the Linux kernel, like adding the new Neighbor Discovery messages. In the next section I will describe the third wireless subsystem, which is intended for the shortest range among the three wireless subsystems described in this chapter: the Near Field Communication (NFC) subsystem.

## Near Field Communication (NFC)

Near Field Communication is a very short range wireless technology (less than two inches) designed to transfer small amounts of data over a very low latency link at up to 424 kb/s. NFC payloads range from very simple URLs or raw text to more complex out-of-band data used to trigger connection handover.
Through its very short range and latency, NFC implements a tap-and-share concept by linking proximity to an immediate action triggered by the NFC data payload. Touch an NFC tag with your NFC-enabled mobile phone, and this will, for example, immediately fire up a web browser.

NFC runs on the 13.56 MHz band and is based on the Radio Frequency ID (RFID) ISO 14443 and FeliCa standards. The NFC Forum ( http://www.nfc-forum.org/ ) is a consortium responsible for standardizing the technology through a set of specifications, ranging from the NFC Digital layer up to high-level service definitions like the NFC Connection Handover or the Personal Health Device Communication (PHDC) ones. All adopted NFC Forum specifications are available free of charge; see http://www.nfc-forum.org/specs/ .

At the heart of the NFC Forum specifications is the NFC Data Exchange Format (NDEF) definition. It defines the NFC data structure used to exchange NFC payloads from NFC tags or between NFC peers. All NDEFs contain one or more NDEF Records that embed the actual payload. The NDEF record header contains metadata that allows applications to build the semantic link between the NFC payload and an action to trigger on the reader side.

### NFC Tags

NFC tags are cheap, mostly static, battery-less data containers. They're typically made of an inductive antenna connected to a very small amount of flash memory, packaged in many different form factors (labels, key rings, stickers, etc.). As per the NFC Forum definitions, NFC tags are passive devices, i.e., they're unable to generate any radio field. Instead, they're powered by the RF fields that active NFC devices initiate. The NFC Forum defines four different tag types, each of them carrying a strong RFID and smart card legacy:

 * Type 1 specifications derive from the Innovision/Broadcom Topaz and Jewel card specifications. They can expose from 96 bytes up to 2 KB of data at 106 kb/s.

 * Type 2 tags are based on the NXP Mifare Ultralight specifications. They're very similar to Type 1 tags.

 * Type 3 tags are built on top of the non-secure parts of Sony FeliCa tags. They're more expensive than Type 1 and 2 tags, but can carry up to 1 MB at 212 or 424 kb/s.

 * Type 4 specifications are based on NXP DESFire cards; they support up to 32 KB and three transmission speeds: 106, 212, or 424 kb/s.

### NFC Devices

As opposed to NFC tags, NFC devices can generate their own magnetic field to initiate NFC communications. NFC-enabled mobile phones and NFC readers are the most common kinds of NFC devices. They support a larger feature set than NFC tags. They can read from or write to NFC tags, but they can also pretend to be a card and be seen as a simple NFC tag by any reader. One of the key advantages of the NFC technology over RFID is the possibility of having two NFC devices talk to each other in an NFC-specific peer-to-peer mode. The link between two NFC devices is kept alive as long as the two devices are in magnetic range. In practice, this means two NFC devices can maintain a peer-to-peer link while they physically touch each other. This introduces a whole new range of mobile use cases where one can exchange data, context, or credentials by touching someone else's NFC device.

### Communication and Operation Modes

The NFC Forum defines two communication modes and three operation modes. An active NFC communication is established when two NFC devices can talk to one another by alternately generating the magnetic field.
This implies that both devices have their own power supply, as they don't rely on any inductively generated power. Active communications can only be established in NFC peer-to-peer mode. On the other hand, in a passive NFC communication only one NFC device generates the radio field, and the other device replies by using that field.

There are three NFC operation modes:

 * Reader/Writer: An NFC device (e.g., an NFC-enabled mobile phone) reads from or writes to an NFC tag.

 * Peer-to-peer: Two NFC devices establish a Logical Link Control Protocol (LLCP) link over which several NFC services can be multiplexed: the Simple NDEF Exchange Protocol (SNEP) for exchanging NDEF-formatted data, Connection Handover for initiating a carrier (Bluetooth or WiFi) handover, or any proprietary protocol.

 * Card Emulation: An NFC device replies to a reader poll by pretending to be an NFC tag. Payment and transaction issuers rely on this mode to implement contactless payments on top of NFC. In card emulation mode, payment applets running on a trusted execution environment (also known as a "secure element") take control of the NFC radio and expose themselves as a legacy payment card that can be read by an NFC-enabled point-of-sale terminal.

### Host-Controller Interfaces

Communication between hardware controllers and host stacks must follow a precisely defined interface: the host-controller interface (HCI). The NFC hardware ecosystem is quite fragmented in that regard, as most of the initial NFC controllers implement an ETSI-specified HCI originally designed for communication between SIM cards and contactless front-ends (see http://www.etsi.org/deliver/etsi_ts/102600_102699/102622/07.00.00_60/ts_102622v070000p.pdf ). This HCI was not tailored for NFC-specific use cases, and so each and every manufacturer defined a large number of proprietary extensions to support their features. The NFC Forum tries to address that situation by defining its own, much more NFC-oriented interface, the NFC Controller Interface (NCI). The industry trend clearly shows that manufacturers are abandoning the ETSI HCI in favor of NCI, building a more standardized hardware ecosystem.

### Linux NFC Support

Unlike the Android operating system NFC stack, which is described later in this section, the standard Linux one is partly implemented by the kernel itself. Since the 3.1 Linux kernel release, Linux-based applications will find an NFC-specific socket domain, along with a generic netlink family for NFC (see http://git.kernel.org/?p=linux/kernel/git/sameo/nfc-next.git;a=shortlog;h=refs/heads/master ). The NFC generic netlink family is intended to be an NFC out-of-band channel for controlling and monitoring NFC adapters. The NFC socket domain supports two families:

 * Raw sockets for sending NFC frames that will arrive unmodified at the drivers.

 * LLCP sockets for implementing NFC peer-to-peer services.

The hardware abstraction is implemented in NFC kernel drivers that register against various parts of the stack, mostly depending on the host-controller interface used by the controllers they support. As a consequence, Linux applications can work on top of a hardware-agnostic and fully POSIX-compatible NFC kernel API. The Linux NFC stack is split between kernel and userspace. The kernel NFC sockets allow userspace applications to implement NFC tag support by sending tag-type-specific commands through the raw protocol. NFC peer-to-peer protocols (SNEP, Connection Handover, PHDC, etc.) can be implemented by transmitting their specific payloads through NFC sockets as well. Finally, card emulation mode is built on top of the secure element parts of the kernel NFC netlink API. The Linux NFC daemon, neard, sits on top of the kernel and implements all three NFC modes, regardless of the NFC controller physically wired to the host platform (see https://01.org/linux-nfc/ ).
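Before looking at the socket semantics in more detail below, here is a rough, hedged sketch of an LLCP client socket connecting to the SNEP service of a detected peer; the device and target indexes are placeholders that would normally be obtained from the netlink discovery events, and the exact sockaddr layout should be checked against your kernel's linux/nfc.h:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/nfc.h>

#ifndef AF_NFC
#define AF_NFC 39   /* from linux/socket.h */
#endif

int main(void)
{
        struct sockaddr_nfc_llcp addr;
        int fd = socket(AF_NFC, SOCK_STREAM, NFC_SOCKPROTO_LLCP);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.sa_family    = AF_NFC;
        addr.dev_idx      = 0;                  /* first NFC controller */
        addr.target_idx   = 1;                  /* peer found by a poll */
        addr.nfc_protocol = NFC_PROTO_NFC_DEP;  /* peer-to-peer protocol */
        strcpy(addr.service_name, "urn:nfc:sn:snep");
        addr.service_name_len = strlen("urn:nfc:sn:snep");

        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("connect");
        return 0;
}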
Figure 14-7 shows an overview of the NFC system.

Figure 14-7.

NFC overview

#### NFC Sockets

NFC sockets are of two kinds: raw and LLCP. Raw NFC sockets were designed with reader mode support in mind, as they provide a way to transmit tag-specific commands and receive the tag replies back. The neard daemon uses NFC raw sockets to implement support for all four tag types, in both reader and writer modes. LLCP sockets implement the NFC peer-to-peer logical link control protocol, on top of which neard implements all the NFC Forum specified peer-to-peer services (SNEP, Connection Handover, and PHDC).

Depending on the selected protocol, the NFC socket semantics differ.

##### Raw Sockets

 * connect: Select and enable a detected NFC tag.

 * bind: Not supported.

 * send/recv: Send and receive raw NFC payloads. The NFC core implementation does not modify those payloads.

##### LLCP Sockets

 * connect: Connect to a specific LLCP service on a detected peer device, like the SNEP or Connection Handover services.

 * bind: Link a device to a specific LLCP service. The service will be exported through the LLCP service name lookup (SNL) protocol for any NFC peer device to attempt a connection to it.

 * send/recv: Transmit LLCP service payloads to and from an NFC peer device. The kernel will handle the LLCP-specific link-layer encapsulation and fragmentation.

 * LLCP transport can be connected or connectionless, which is handled through the standard UNIX SOCK_STREAM and SOCK_DGRAM socket types. NFC LLCP sockets also support the SOCK_RAW type for monitoring and sniffing purposes.

#### NFC Netlink API

The NFC generic netlink API is designed to implement out-of-band NFC-specific operations. It also handles any discoverable secure element from an NFC controller. Through NFC netlink commands, you can:

 * List all available NFC controllers.

 * Power NFC controllers up and down.

 * Start (and stop) NFC polls for discovering NFC tags and devices.

 * Enable NFC peer-to-peer (a.k.a. LLCP) links between the local controller and remote NFC peers.

 * Send LLCP service name lookup requests, in order to discover the available LLCP services on a remote peer.

 * Enable and disable NFC discoverable secure elements (typically SIM-card-based or embedded secure elements).

 * Send ISO 7816 frames to enabled secure elements.

 * Trigger NFC controller firmware downloads.

The netlink API is not only about sending synchronous commands from NFC applications, but also about receiving asynchronous NFC-related events. Applications listening for broadcast NFC events on an NFC netlink socket will get notified about:

 * Detected NFC tags and devices

 * Discovered secure elements

 * Secure element transaction status

 * LLCP service name lookup replies

The entire netlink API (both commands and events), along with the socket one, is exported through the kernel headers and installed at /usr/include/linux/nfc.h on standard Linux distributions.

#### NFC Initialization

NFC initialization is done by the nfc_init() method:

static int __init nfc_init(void)
{
        int rc;
        . . .
Register the generic netlink NFC family and the NFC notifier callback, the nfc_genl_rcv_nl_event() method:

        rc = nfc_genl_init();
        if (rc)
                goto err_genl;

        /* the first generation must not be 0 */
        nfc_devlist_generation = 1;

Initialize NFC raw sockets:

        rc = rawsock_init();
        if (rc)
                goto err_rawsock;

Initialize NFC LLCP sockets:

        rc = nfc_llcp_init();
        if (rc)
                goto err_llcp_sock;

Initialize the AF_NFC protocol:

        rc = af_nfc_init();
        if (rc)
                goto err_af_nfc;

        return 0;
        . . .
}

(net/nfc/core.c)

#### Drivers API

As explained earlier, most NFC controllers nowadays use either HCI or NCI as their host-controller interface. Others define a proprietary interface over USB, like most PC-compatible NFC readers, for example. There are also some "soft" NFC controllers that expect the host platform to implement the NFC Forum Digital layer and talk to an analog-only capable firmware. In order to support this variety of hardware controllers, the NFC kernel implements the NFC NCI, HCI, and Digital layers. Depending on the NFC hardware they intend to support, device driver developers will need to register at module probing time against one of these stacks, or directly against the NFC core implementation for purely proprietary protocols. When registering, they typically provide a stack operations implementation, which is the actual hardware abstraction layer between NFC kernel drivers and the core parts of the NFC stack. The NFC driver registration APIs and operation prototypes are defined in the kernel include/net/nfc/ directory.

Figure 14-8 shows a block diagram of the NFC Linux architecture.

Figure 14-8.

NFC Linux Kernel Architecture. (Note that the NFC Digital layer is not in kernel 3.9. It is to be integrated into kernel 3.13.)

The hierarchy shown in this figure can be understood better by looking into the implementation details of the registration of NFC device drivers directly against the NFC core and against the HCI and NCI layers:

 * Registration directly against the NFC core is typically done in the driver's probe() callback. The registration is done using these steps:

 * Create an nfc_dev object by calling the nfc_allocate_device() method.

 * Call the nfc_register_device() method, passing the nfc_dev object created in the previous step as its single parameter.

 * See drivers/nfc/pn533.c.

 * Registration against the HCI layer is also typically done in the probe() callback of the driver; in the case of the pn544 and microread NFC device drivers, which are the only HCI drivers in kernel 3.9, this probe() method is invoked by the I2C subsystem. The registration is done using these steps:

 * Create an nfc_hci_dev object by calling the nfc_hci_allocate_device() method. The nfc_hci_dev structure is defined in include/net/nfc/hci.h.

 * Call the nfc_hci_register_device() method, passing the nfc_hci_dev object created in the previous step as its single parameter. The nfc_hci_register_device() method in turn performs a registration against the NFC core by calling the nfc_register_device() method.

 * See drivers/nfc/pn544/pn544.c and drivers/nfc/microread/microread.c.

 * Registration against the NCI layer is also typically done in the probe() callback of the driver, for example in the nfcwilink driver. The registration is done using these steps:

 * Create an nci_dev object by calling the nci_allocate_device() method. The nci_dev structure is defined in include/net/nfc/nci_core.h.
 * Call the nci_register_device() method, passing the nci_dev object created in the previous step as its single parameter. The nci_register_device() method in turn performs a registration against the NFC core by calling the nfc_register_device() method, similarly to what you saw earlier in this section for registration against the HCI layer.

 * See drivers/nfc/nfcwilink.c.

When working directly against the NFC core, the driver must define five callbacks in the nfc_ops object (this object is passed as the first parameter of the nfc_allocate_device() method):

 * start_poll: Set the driver to work in polling mode.

 * stop_poll: Stop polling.

 * activate_target: Activate a chosen target.

 * deactivate_target: Deactivate a chosen target.

 * im_transceive: Transceive operation.

When working with HCI, the hci_nfc_ops object, which is an instance of nfc_ops, defines these five callbacks, and when allocating an HCI object with the nfc_hci_allocate_device() method, the nfc_allocate_device() method is invoked with this hci_nfc_ops object as its first parameter. With NCI there is something quite similar, with the nci_nfc_ops object; see net/nfc/nci/core.c.

### Userspace Architecture

neard ( http://git.kernel.org/?p=network/nfc/neard.git;a=summary ) is the Linux NFC daemon that runs on top of the kernel NFC APIs. It is a single-threaded, GLib-based process that implements the higher layers of the NFC peer-to-peer stack, along with the commands specific to the four tag types for reading from and writing to NFC tags. The NDEF Push Protocol (NPP), SNEP, PHDC, and Connection Handover specifications are implemented through neard plugins. One of neard's main design goals is to provide a small, simple, and uniform NFC API for Linux-based applications willing to provide high-level NFC services. This is achieved through a small D-Bus API that abstracts tag and device interfaces and methods, hiding the NFC complexity away from application developers. This API is compatible with the freedesktop D-Bus ObjectManager API and provides the following interfaces:

 * org.neard.Adapter: For detecting new NFC controllers, turning them on and off, and starting NFC polls.

 * org.neard.Device, org.neard.Tag: For representing detected NFC devices and tags. Calling the Device.Push method will send NDEFs to the peer device, while Tag.Write will write them to the selected tag.

 * org.neard.Record: Represents human-readable and understandable NDEF record payloads and properties. Registering agents against the org.neard.NDEFAgent interface will give applications access to the raw NDEF payloads.

You can find more information about the neard userspace daemon here: http://git.kernel.org/cgit/network/nfc/neard.git/tree/doc .

### NFC on Android

Initial NFC support was added to the Android operating system in December 2010, with the official 2.3 (Gingerbread) release. Android 2.3 only supported the reader/writer mode, but things have improved significantly since then, and the latest Android releases (Jelly Bean 4.3) come with fully featured NFC support. For more information, see the Android NFC page: http://developer.android.com/guide/topics/connectivity/nfc/index.html . Following the classic Android architecture, a Java-specific NFC API is available for applications to provide NFC services and operations. It is left to integrators to implement these APIs through native hardware abstraction layers (HAL).
Google ships a Broadcom NFC HAL that currently only supports Broadcom NFC hardware. Here again, it is left to Android OEMs and integrators to either adapt the Broadcom NFC HAL to their selected NFC chipset or implement their own HAL. It is important to note that since the Broadcom stack implements the NFC Controller Interface (NCI) specification, it is relatively easy to adapt it to support any NCI-compatible NFC controller. The Android NFC architecture is what one could call a userspace NFC stack: in fact, the entire NFC implementation is done in userspace through the HAL. NFC frames are then pushed down to the NFC controller through a kernel driver stub. The driver simply encapsulates those frames into buffers that are ready to be sent over the physical link (e.g., I2C, SPI, UART) between the host platform and the NFC controller.

Note

Pull requests of the nfc-next git tree are sent to the wireless-next tree (apart from the NFC subsystem, the Bluetooth subsystem and the mac80211 subsystem pull requests are also handled by the wireless maintainer). From the wireless-next tree, pull requests are sent to the net-next tree, and from there to Linus's tree. The nfc-next tree is available at: git://git.kernel.org/pub/scm/linux/kernel/git/sameo/nfc-next.git

There is also an nfc-fixes git repository, which contains urgent and critical fixes for the current release (-rc*). The nfc-fixes tree is available at: git://git.kernel.org/pub/scm/linux/kernel/git/sameo/nfc-fixes.git/

NFC mailing list: linux-nfc@lists.01.org.

NFC mailing list archives: https://lists.01.org/pipermail/linux-nfc/ .

In this section you learned what NFC is in general, about the Linux NFC subsystem implementation, and about the Android NFC subsystem implementation. In the next section I will discuss the notification chains mechanism, which is an important mechanism for informing network devices about various events.

## Notification Chains

The state of network devices can change dynamically; from time to time, the user/administrator can register/unregister network devices, change their MAC address, change their MTU, etc. The network stack and other subsystems and modules should be able to be notified about these events and handle them properly. The network notification chains provide a mechanism for handling such events, and I will describe their API and the possible network events they handle in this section. For a full list of the events, see Table 14-1 later in this section. Every subsystem and every module can register itself with notification chains. This is done by defining a notifier_block and registering it. The core methods for notification chain registration and unregistration are the notifier_chain_register() and notifier_chain_unregister() methods, respectively. Generation of notification events is done by calling the notifier_call_chain() method. These three methods are not used directly (they are not exported; see kernel/notifier.c), and they do not use any locking mechanism. The following methods are wrappers around notifier_chain_register(), all of them implemented in kernel/notifier.c:

 * atomic_notifier_chain_register()

 * blocking_notifier_chain_register()

 * raw_notifier_chain_register()

 * srcu_notifier_chain_register()

 * register_die_notifier()

Table 14-1.
Table 14-1. Network Device Events

Event | Meaning
---|---
NETDEV_UP | Device up event
NETDEV_DOWN | Device down event
NETDEV_REBOOT | Detected a hardware crash and restarted the device
NETDEV_CHANGE | Device state change
NETDEV_REGISTER | Device registration event
NETDEV_UNREGISTER | Device unregistration event
NETDEV_CHANGEMTU | Device MTU changed
NETDEV_CHANGEADDR | Device MAC address changed
NETDEV_GOING_DOWN | Device is going down
NETDEV_CHANGENAME | Device has changed its name
NETDEV_FEAT_CHANGE | Device features changed
NETDEV_BONDING_FAILOVER | Bonding failover event
NETDEV_PRE_UP | This event enables vetoing a change of the device state to UP; for example, in cfg80211, denying interfaces to be set UP if the device is known to be rfkill'ed. See cfg80211_netdev_notifier_call().
NETDEV_PRE_TYPE_CHANGE | The device is about to change its type. This is a generalization of the NETDEV_BONDING_OLDTYPE flag, which was replaced by NETDEV_PRE_TYPE_CHANGE.
NETDEV_POST_TYPE_CHANGE | The device changed its type. This is a generalization of the NETDEV_BONDING_NEWTYPE flag, which was replaced by NETDEV_POST_TYPE_CHANGE.
NETDEV_POST_INIT | This event is generated during device registration (register_netdevice()), before creating the network device kobjects by netdev_register_kobject(); used in cfg80211 (net/wireless/core.c).
NETDEV_UNREGISTER_FINAL | An event which is generated to finalize the device unregistration.
NETDEV_RELEASE | The last slave of a bond is released (when working with netconsole over bonding). (This flag was also once used for bridges, in br_if.c.)
NETDEV_NOTIFY_PEERS | Notify network peers event (i.e., a device wants to inform the rest of the network about some sort of reconfiguration, such as a failover event or a virtual machine migration).
NETDEV_JOIN | The device added a slave. Used, for example, in the bonding driver, in the bond_enslave() method, where we add a slave; see drivers/net/bonding/bond_main.c.

There are also corresponding wrapper methods for unregistering notification chains and for generating notification events for each of these wrappers. For example, for a notification chain registered with the atomic_notifier_chain_register() method, the atomic_notifier_chain_unregister() method unregisters the chain, and the __atomic_notifier_call_chain() method generates notification events. Each of these wrappers also has a corresponding macro to define a notification chain; for the atomic_notifier_chain_register() wrapper it is the ATOMIC_NOTIFIER_HEAD macro (include/linux/notifier.h).

After registering a notifier_block object, whenever one of the events shown in Table 14-1 occurs, the callback specified in the notifier_block is invoked. The fundamental data structure of notification chains is the notifier_block structure; let's take a look:

```c
struct notifier_block {
	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
	struct notifier_block __rcu *next;
	int priority;
};
```

(include/linux/notifier.h)

 * notifier_call: The callback to be invoked.

 * priority: Callbacks of notifier_block objects with higher priority are performed first.

There are many chains in the networking subsystem and in other subsystems. Let's mention some of the important ones:

 * netdev_chain: Registered by the register_netdevice_notifier() method and unregistered by the unregister_netdevice_notifier() method (net/core/dev.c).
 * inet6addr_chain: Registered by the register_inet6addr_notifier() method and unregistered by the unregister_inet6addr_notifier() method. Notifications are generated by the inet6addr_notifier_call_chain() method (net/ipv6/addrconf_core.c).

 * netevent_notif_chain: Registered by the register_netevent_notifier() method and unregistered by the unregister_netevent_notifier() method. Notifications are generated by the call_netevent_notifiers() method (net/core/netevent.c).

 * inetaddr_chain: Registered by the register_inetaddr_notifier() method and unregistered by the unregister_inetaddr_notifier() method. Notifications are generated by calling the blocking_notifier_call_chain() method.

Let's take a look at an example of using the netdev_chain. You saw earlier that with netdev_chain, registration is done with the register_netdevice_notifier() method, which is a wrapper around the raw_notifier_chain_register() method. Following is an example of registering a callback named br_device_event. First, a notifier_block object is defined, and then it is registered by calling the register_netdevice_notifier() method:

```c
struct notifier_block br_device_notifier = {
	.notifier_call = br_device_event
};
```

(net/bridge/br_notify.c)

```c
static int __init br_init(void)
{
	...
	register_netdevice_notifier(&br_device_notifier);
	...
}
```

(net/bridge/br.c)

Notifications of the netdev_chain are generated by invoking the call_netdevice_notifiers() method. The first parameter of this method is the event. The call_netdevice_notifiers() method is in fact a wrapper around raw_notifier_call_chain().

So, when a network notification is generated, all the callbacks which were registered are invoked; in this example, the br_device_event() callback will be called, regardless of which network event occurred, and the callback will decide how to handle the notification, or perhaps ignore it. Let's take a look at the callback method, br_device_event():

```c
static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct net_bridge_port *p;
	struct net_bridge *br;
	bool changed_addr;
	int err;
	. . .
```

The second parameter of the br_device_event() method is the event (all the events are defined in include/linux/netdevice.h):

```c
	switch (event) {
	case NETDEV_CHANGEMTU:
		dev_set_mtu(br->dev, br_min_mtu(br));
		break;
	. . .
	}
```

Note

Registration of notification chains is not limited to the networking subsystem. Thus, for example, the clockevents subsystem defines a chain called clockevents_chain and registers it by calling the raw_notifier_chain_register() method, and the hung_task module defines a chain named panic_notifier_list and registers it by calling the atomic_notifier_chain_register() method.

Besides the notifications discussed in this section, there is another type of notifications, named RTNetlink notifications; these notifications are sent with the rtmsg_ifinfo() method. This type of notifications was discussed in Chapter 2, which dealt with Netlink Sockets.

The event types supported for networking are those listed in Table 14-1 earlier in this section; they are defined in include/linux/netdevice.h.

We have now covered notification events, a mechanism that enables network devices to get notifications about events such as change of MTU, change of MAC address, and more; a minimal usage sketch follows.
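The following sketch ties the pieces together: a module that registers its own callback on the netdev_chain. The my_* names are hypothetical, and the sketch assumes the convention of this kernel era (also used by br_device_event() above) that the notifier's third argument points directly to the net_device:

```c
#include <linux/module.h>
#include <linux/netdevice.h>

/* Log MTU changes of any network device. */
static int my_netdev_event(struct notifier_block *unused,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_CHANGEMTU)
		pr_info("%s: MTU changed to %u\n", dev->name, dev->mtu);

	return NOTIFY_DONE;
}

static struct notifier_block my_notifier = {
	.notifier_call = my_netdev_event,
};

static int __init my_init(void)
{
	return register_netdevice_notifier(&my_notifier);
}

static void __exit my_exit(void)
{
	unregister_netdevice_notifier(&my_notifier);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
```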
The next section briefly discusses the PCI subsystem, describing some of its main data structures.

## The PCI Subsystem

Many network interface cards are Peripheral Component Interconnect (PCI) devices and should work in conjunction with the Linux PCI subsystem. Not all network interfaces are PCI devices; there are many embedded devices where the network interface is not on a PCI bus. The initialization and handling of these devices is done in a different way, and the following discussion is not relevant for these non-PCI devices. Newer PCI devices are PCI Express (PCIe or PCIE) devices; the standard was created in 2004. They have a serial interface instead of a parallel interface, and as a result they have higher maximum system bus throughput. Each PCI device has a read-only configuration space; it is at least 256 bytes. The extended configuration space, available on PCI-X 2.0 and PCI Express buses, is 4096 bytes. You can read the PCI configuration space and the extended PCI configuration space with lspci (the lspci utility belongs to the pciutils package):

 * lspci -xxx: Shows a hexadecimal dump of the PCI configuration space.

 * lspci -xxxx: Shows a hexadecimal dump of the extended PCI configuration space.

The Linux PCI API provides three methods for reading the configuration space, handling 8-, 16-, and 32-bit granularity:

 * static inline int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val)

 * static inline int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val)

 * static inline int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val)

There are also three methods for writing the configuration space; likewise, 8-, 16-, and 32-bit granularities are handled:

 * static inline int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val)

 * static inline int pci_write_config_word(const struct pci_dev *dev, int where, u16 val)

 * static inline int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val)

Every PCI manufacturer assigns values to at least the vendor, device, and class fields in the configuration space of the PCI device. A PCI device is identified by the Linux PCI subsystem by a pci_device_id object. The pci_device_id struct is defined in include/linux/mod_devicetable.h:

```c
struct pci_device_id {
	__u32 vendor, device;		/* Vendor and device ID or PCI_ANY_ID */
	__u32 subvendor, subdevice;	/* Subsystem IDs or PCI_ANY_ID */
	__u32 class, class_mask;	/* (class, subclass, prog-if) triplet */
	kernel_ulong_t driver_data;	/* Data private to the driver */
};
```

The vendor, device, and class fields in pci_device_id identify a PCI device; most drivers do not need to specify the class, as vendor/device is normally sufficient. A short sketch of reading the configuration space follows.
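As a quick illustration of the configuration space accessors, here is a minimal, hedged sketch; dump_pci_ids() is a hypothetical helper, and PCI_VENDOR_ID and PCI_DEVICE_ID are the standard register offsets from pci_regs.h:

```c
#include <linux/pci.h>
#include <linux/printk.h>

/* Hypothetical helper: read the vendor and device IDs from the
 * configuration space of a PCI device (pdev is assumed valid). */
static void dump_pci_ids(struct pci_dev *pdev)
{
	u16 vendor, device;

	pci_read_config_word(pdev, PCI_VENDOR_ID, &vendor);
	pci_read_config_word(pdev, PCI_DEVICE_ID, &device);
	pr_info("PCI device %04x:%04x\n", vendor, device);
}
```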
Each PCI device driver declares a pci_driver object. Let's take a look at the pci_driver structure:

```c
struct pci_driver {
	. . .
	const char *name;
	const struct pci_device_id *id_table;	/* must be non-NULL for probe to be called */
	int (*probe)(struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
	void (*remove)(struct pci_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
	int (*suspend)(struct pci_dev *dev, pm_message_t state);	/* Device suspended */
	. . .
	int (*resume)(struct pci_dev *dev);	/* Device woken up */
	. . .
};
```

(include/linux/pci.h)

Here are short descriptions of the members of the pci_driver structure:

 * name: Name of the PCI driver.

 * id_table: An array of pci_device_id objects that the driver supports. Initializing id_table is usually done with the DEFINE_PCI_DEVICE_TABLE macro.

 * probe: A method for device initialization.

 * remove: A method for freeing the device. The remove() method usually frees all the resources that were assigned in the probe() method.

 * suspend: A power management callback that puts the device into a low power state, for devices that support power management.

 * resume: A power management callback that wakes the device from a low power state, for devices that support power management.

A PCI device is represented by struct pci_dev. It is a large structure; let's take a look at some of its members (they are self-explanatory):

```c
struct pci_dev {
	. . .
	unsigned short vendor;
	unsigned short device;
	unsigned short subsystem_vendor;
	unsigned short subsystem_device;
	. . .
	struct pci_driver *driver;	/* which driver has allocated this device */
	. . .
	pci_power_t current_state;	/* Current operating state. In ACPI-speak,
					   this is D0-D3, D0 being fully functional,
					   and D3 being off. */
	struct device dev;		/* Generic device interface */
	int cfg_size;			/* Size of configuration space */
	unsigned int irq;
};
```

(include/linux/pci.h)

Registering a PCI network device against the PCI subsystem is done by defining a pci_driver object and calling the pci_register_driver() macro, which gets a pci_driver object as its single argument. In order to initialize the PCI device before it is used, a driver should call the pci_enable_device() method. This method wakes up the device if it was suspended and allocates the required I/O resources and memory resources. Unregistering the PCI driver is done by the pci_unregister_driver() method. Usually the pci_register_driver() macro is called in the driver module_init() method, and the pci_unregister_driver() method is called in the driver module_exit() method. Each driver should call the request_irq() method, specifying the IRQ handler, when the device is brought up, and call free_irq() when the device is brought down. A skeleton tying these steps together is sketched below, after the following notes.

Allocation and freeing of DMA (Direct Memory Access) memory is usually done with dma_alloc_coherent()/dma_free_coherent() when working with an uncached memory buffer. With dma_alloc_coherent() we don't need to worry about cache coherency, as the mappings created by this method are cache-coherent. See, for example, e1000_alloc_ring_dma() in drivers/net/ethernet/intel/e1000e/netdev.c. The Linux DMA API is described in Documentation/DMA-API.txt.

Note

Single Root I/O Virtualization (SR-IOV) is a PCI feature that makes one physical device appear as several virtual devices. The SR-IOV specification was created by the PCI SIG. See http://www.pcisig.com/specifications/iov/single_root/ . For more information see Documentation/PCI/pci-iov-howto.txt.

More information about PCI can be found in the third edition of "Linux Device Drivers" by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman, which is available (under Creative Commons License) at this URL: http://lwn.net/Kernel/LDD3/ .
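Here is a minimal, hedged skeleton of a PCI network driver registration; the my_nic_* names and the vendor/device IDs are hypothetical, and a real probe() would also map BARs, allocate and register a net_device, set up DMA rings, and so on:

```c
#include <linux/module.h>
#include <linux/pci.h>

/* Hypothetical vendor/device IDs, for illustration only. */
static DEFINE_PCI_DEVICE_TABLE(my_nic_tbl) = {
	{ PCI_DEVICE(0x1234, 0x5678) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, my_nic_tbl);

static int my_nic_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int err;

	/* Wake the device and allocate its I/O and memory resources. */
	err = pci_enable_device(pdev);
	if (err)
		return err;

	/* Map BARs, allocate the net_device, call request_irq(), etc. */
	return 0;
}

static void my_nic_remove(struct pci_dev *pdev)
{
	/* Call free_irq(), unmap BARs, free the net_device, etc. */
	pci_disable_device(pdev);
}

static struct pci_driver my_nic_driver = {
	.name     = "my_nic",
	.id_table = my_nic_tbl,
	.probe    = my_nic_probe,
	.remove   = my_nic_remove,
};

static int __init my_nic_init(void)
{
	return pci_register_driver(&my_nic_driver);
}

static void __exit my_nic_exit(void)
{
	pci_unregister_driver(&my_nic_driver);
}

module_init(my_nic_init);
module_exit(my_nic_exit);
MODULE_LICENSE("GPL");
```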
### Wake-On-LAN (WOL)

Wake-On-LAN is a standard that allows a device that has been soft-powered-down to be powered up or awakened by a network packet. Wake-On-LAN is disabled by default. Some network device drivers let the sysadmin enable the Wake-On-LAN feature, usually by running the ethtool command from userspace. In order to support this, the network device driver should define a set_wol() callback in the ethtool_ops object; see, for example, the RealTek 8139cp driver (drivers/net/ethernet/realtek/8139cp.c). Running ethtool shows whether the network device supports Wake-On-LAN. ethtool also lets the sysadmin define which packets should wake the device; for example, ethtool -s eth1 wol g will enable Wake-On-LAN for MagicPacket frames (MagicPacket is a standard of AMD). You can use the ether-wake utility of the net-tools package to send Wake-On-LAN MagicPacket frames. A sketch of the corresponding ethtool_ops callbacks follows.
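The following is a minimal, hedged sketch of how a driver might wire up Wake-On-LAN in its ethtool_ops; the my_* names are hypothetical, and a real driver would program its hardware registers and report the currently enabled modes:

```c
#include <linux/ethtool.h>
#include <linux/netdevice.h>

/* Report MagicPacket as the only supported wake-up mode. */
static void my_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	wol->supported = WAKE_MAGIC;
	wol->wolopts = 0;	/* a real driver reports the enabled modes here */
}

/* Accept "ethtool -s ethX wol g" and reject everything else. */
static int my_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	if (wol->wolopts & ~WAKE_MAGIC)
		return -EOPNOTSUPP;

	/* Program the hardware to wake on MagicPacket frames here. */
	return 0;
}

static const struct ethtool_ops my_ethtool_ops = {
	.get_wol = my_get_wol,
	.set_wol = my_set_wol,
};
```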
## Teaming Network Device

The virtual teaming network device driver is intended to be a replacement for the bonding network device (drivers/net/bonding). The bonding network device provides a link aggregation solution (also known as "link bundling" or "trunking"); see Documentation/networking/bonding.txt. The bonding driver is implemented fully in the kernel and is known to be very large and prone to problems. The teaming network driver, as opposed to the bonding network driver, is controlled by userspace. The userspace daemon is called teamd, and it communicates with the kernel teaming driver through a library named libteam. The libteam library is based on generic netlink sockets (see Chapter 2).

There are four modes for the teaming driver:

 * loadbalance: Used in Link Aggregation Control Protocol (LACP), which is part of the 802.3ad standard.

 * net/team/team_mode_loadbalance.c

 * activebackup: Only one port is active at a given time. This port can transmit and receive SKBs. The other ports are backup ports. A userspace application can specify which port to use as the active port.

 * net/team/team_mode_activebackup.c

 * broadcast: All packets are sent by all ports.

 * net/team/team_mode_broadcast.c

 * roundrobin: Selection of ports is done by a round robin algorithm. No interaction with userspace is needed for this mode.

 * net/team/team_mode_roundrobin.c

Note

The teaming network driver resides under drivers/net/team and is developed by Jiri Pirko.

For more information see http://libteam.org/ .

libteam site: https://github.com/jpirko/libteam .

This concludes our brief overview of the teaming driver. Many readers use PPPoE services when surfing the Internet; the following short section covers the PPPoE protocol.

## The PPPoE Protocol

PPPoE is a specification for connecting multiple clients to a remote site. PPPoE is typically used by DSL providers to handle IP addresses and authenticate users. The PPPoE protocol provides the ability to use PPP encapsulation for Ethernet packets. The PPPoE protocol is specified in RFC 2516 from 1999, and the PPP protocol is specified in RFC 1661 from 1994. There are two stages in PPPoE:

 * PPPoE discovery stage. The discovery is done in a client-server session. The server is called an Access Concentrator, and there can be more than one. These Access Concentrators are often deployed by an Internet Service Provider (ISP). These are the four steps in the discovery stage:

 * The PPPoE Active Discovery Initiation (PADI). A broadcast packet is sent from a host. The code in the PPPoE header is 0x09 (PADI_CODE), and the session id (sid) in the PPPoE header must be 0.

 * The PPPoE Active Discovery Offer (PADO). An Access Concentrator replies to a PADI request with a PADO reply. The destination address is the address of the host that sent the PADI. The code in the PPPoE header is 0x07 (PADO_CODE). The session id (sid) in the PPPoE header must again be 0.

 * PPPoE Active Discovery Request (PADR). A host sends a PADR packet to an Access Concentrator after it receives a PADO reply. The code in the PPPoE header is 0x19 (PADR_CODE). The session id (sid) in the PPPoE header must again be 0.

 * PPPoE Active Discovery Session-confirmation (PADS). When the Access Concentrator gets a PADR request, it generates a unique session id and sends a PADS packet as a reply. The code in the PPPoE header is 0x65 (PADS_CODE). The session id (sid) in the PPPoE header is the session id that it generated. The destination of the packet is the IP address of the host that sent the PADR request.

 * A session is terminated by sending a PPPoE Active Discovery Terminate (PADT) packet. The code in the PPPoE header is 0xa7 (PADT_CODE). A PADT can be sent either by an Access Concentrator or a host, and it can be sent at any time after the session was established. The destination address is a unicast address. The ethertype of the Ethernet header of all five discovery packets (PADI, PADO, PADR, PADS, and PADT) is 0x8863 (ETH_P_PPP_DISC).

 * PPPoE Session stage. Once the PPPoE discovery stage has completed successfully, packets are sent using PPP encapsulation, which means adding a PPP header of two bytes. Using PPP enables registration and authentication using PPP subprotocols like the Password Authentication Protocol (PAP) or the Challenge Handshake Authentication Protocol (CHAP), and also the PPP subprotocol called the Link Control Protocol (LCP), which is responsible for establishing and testing the data-link connection. The ethertype of the Ethernet header is 0x8864 (ETH_P_PPP_SES).

Every PPPoE packet starts with a 6-byte PPPoE header, and you should learn about the PPPoE header in order to better understand the PPPoE protocol.

## PPPoE Header

I will start by showing the PPPoE header definition in the Linux kernel:

```c
struct pppoe_hdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8 ver : 4;
	__u8 type : 4;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u8 type : 4;
	__u8 ver : 4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
	__u8 code;
	__be16 sid;
	__be16 length;
	struct pppoe_tag tag[0];
} __packed;
```

(include/uapi/linux/if_pppox.h)

The following is a description of the members of the pppoe_hdr structure:

 * ver: The ver field is a 4-bit field, and it must be set to 0x1, according to section 4 in RFC 2516.

 * type: The type field is a 4-bit field, and it must also be set to 0x1, according to section 4 in RFC 2516.

 * code: The code field is an 8-bit field, and it can be one of the constants mentioned earlier: PADI_CODE, PADO_CODE, PADR_CODE, PADS_CODE, and PADT_CODE.

 * sid: Session ID (16-bit).

 * length: The length is a 16-bit field, and it represents the length of the PPPoE payload, without the length of the PPPoE header or the length of the Ethernet header.

 * tag[0]: The PPPoE payload can contain zero or more tags, in a type-length-value (TLV) format. A tag consists of 3 fields:

 * TAG_TYPE: 16-bit (for example, AC-Name, Service-Name, Generic-Error, and more).

 * TAG_LENGTH: 16-bit.

 * TAG_VALUE: Variable in length.

 * Appendix A of RFC 2516 lists the various TAG_TYPEs and TAG_VALUEs.

Figure 14-9 shows the PPPoE header.

Figure 14-9. PPPoE header
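As a small illustration of these fields, here is a hedged userspace sketch that fills the fixed part of a PADI discovery header; fill_padi_header() is a hypothetical helper, and tags are omitted for brevity:

```c
#include <string.h>
#include <arpa/inet.h>
#include <linux/if_pppox.h>

/* Hypothetical helper: fill the fixed 6-byte PPPoE header of a
 * tagless PADI packet, following section 4 of RFC 2516. */
static void fill_padi_header(struct pppoe_hdr *ph)
{
	memset(ph, 0, sizeof(*ph));
	ph->ver = 1;		/* must be 0x1 */
	ph->type = 1;		/* must be 0x1 */
	ph->code = PADI_CODE;	/* 0x09 */
	ph->sid = 0;		/* sid must be 0 during discovery */
	ph->length = htons(0);	/* payload length, excluding headers */
}
```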
### PPPoE Initialization

PPPoE initialization is done by the pppoe_init() method (drivers/net/ppp/pppoe.c). Two PPPoE protocol handlers are registered, one for PPPoE discovery packets and one for PPPoE session packets. Let's take a look at the PPPoE protocol handler registration:

```c
static struct packet_type pppoes_ptype __read_mostly = {
	.type = cpu_to_be16(ETH_P_PPP_SES),
	.func = pppoe_rcv,
};

static struct packet_type pppoed_ptype __read_mostly = {
	.type = cpu_to_be16(ETH_P_PPP_DISC),
	.func = pppoe_disc_rcv,
};

static int __init pppoe_init(void)
{
	int err;

	dev_add_pack(&pppoes_ptype);
	dev_add_pack(&pppoed_ptype);
	. . .
	return 0;
}
```

The dev_add_pack() method is the generic method for registering protocol handlers, which you encountered in previous chapters. The protocol handlers registered by the pppoe_init() method are:

 * The pppoe_disc_rcv() method is the handler for PPPoE discovery packets.

 * The pppoe_rcv() method is the handler for PPPoE session packets.

The PPPoE module exports an entry to procfs, /proc/net/pppoe. This entry shows the session id, the MAC address, and the device of each current PPPoE session. Running cat /proc/net/pppoe is handled by the pppoe_seq_show() method. A notifier chain is registered by the pppoe_init() method by calling register_netdevice_notifier(&pppoe_notifier).

#### PPPoX Sockets

PPPoX sockets are represented by the pppox_sock structure (include/linux/if_pppox.h) and are implemented in drivers/net/ppp/pppox.c. These sockets implement a generic PPP encapsulation socket family. Apart from PPPoE, they are also used by the Layer 2 Tunneling Protocol (L2TP) over PPP. PPPoX sockets are registered by calling register_pppox_proto(PX_PROTO_OE, &pppoe_proto) in the pppoe_init() method. Let's take a look at the definition of the pppox_sock structure:

```c
struct pppox_sock {
	/* struct sock must be the first member of pppox_sock */
	struct sock sk;
	struct ppp_channel chan;
	struct pppox_sock *next;	/* for hash table */
	union {
		struct pppoe_opt pppoe;
		struct pptp_opt pptp;
	} proto;
	__be16 num;
};
```

(include/linux/if_pppox.h)

When the PPPoX socket is used by PPPoE, the pppoe_opt member of the proto union of the pppox_sock object is used. The pppoe_opt structure includes a member called pa, which is an instance of the pppoe_addr structure. The pppoe_addr structure represents the parameters of the PPPoE session: the session id, the remote MAC address of the peer, and the name of the network device that is used:

```c
struct pppoe_addr {
	sid_t sid;			/* Session identifier */
	unsigned char remote[ETH_ALEN];	/* Remote address */
	char dev[IFNAMSIZ];		/* Local device to use */
};
```

(include/uapi/linux/if_pppox.h)

Note

Access to the pa member of the pppoe_opt structure, which is embedded in the proto union, is done in most cases in the PPPoE module using the pppoe_pa macro:

```c
#define pppoe_pa proto.pppoe.pa
```

(include/linux/if_pppox.h)

A userspace sketch showing how these addresses are used when opening a PPPoE session appears below.
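The following is a minimal, hedged userspace sketch of opening a PPPoE session socket; open_pppoe_session() is a hypothetical helper, and a real client, such as pppd's rp-pppoe plugin described in the next section, obtains the session id and the Access Concentrator's MAC address during the discovery stage:

```c
#include <string.h>
#include <sys/socket.h>
#include <linux/if_pppox.h>

/* Hypothetical helper: bind an AF_PPPOX socket to an established
 * PPPoE session. The connect() call below is handled in the kernel
 * by the pppoe_connect() method. */
static int open_pppoe_session(const char *ifname, __be16 sid,
			      const unsigned char *remote_mac)
{
	struct sockaddr_pppox sp;
	int fd;

	fd = socket(AF_PPPOX, SOCK_STREAM, PX_PROTO_OE);
	if (fd < 0)
		return -1;

	memset(&sp, 0, sizeof(sp));
	sp.sa_family = AF_PPPOX;
	sp.sa_protocol = PX_PROTO_OE;
	sp.sa_addr.pppoe.sid = sid;		/* learned during discovery */
	memcpy(sp.sa_addr.pppoe.remote, remote_mac, ETH_ALEN);
	strncpy(sp.sa_addr.pppoe.dev, ifname, IFNAMSIZ - 1);

	if (connect(fd, (struct sockaddr *)&sp, sizeof(sp)) < 0)
		return -1;

	return fd;
}
```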
### Sending and Receiving Packets with PPPoE

Once the discovery stage is completed, the PPP protocol must be used in order to enable traffic between the two peers, as was mentioned earlier. When starting a PPP connection by running, for example, pppd eth0 (see the example later in this section), the userspace pppd daemon creates a PPPoE socket by calling socket(AF_PPPOX, SOCK_STREAM, PX_PROTO_OE); this is done in the rp-pppoe plugin of the pppd daemon, in the PPPOEConnectDevice() method of pppd/plugins/rp-pppoe/plugin.c. This socket() system call is handled by the pppoe_create() method of the PPPoE kernel module. Releasing the socket after the PPPoE session has completed is done by the pppoe_release() method of the PPPoE kernel module. Let's take a look at the pppoe_create() method:

```c
static const struct proto_ops pppoe_ops = {
	.family		= AF_PPPOX,
	.owner		= THIS_MODULE,
	.release	= pppoe_release,
	.bind		= sock_no_bind,
	.connect	= pppoe_connect,
	. . .
	.sendmsg	= pppoe_sendmsg,
	.recvmsg	= pppoe_recvmsg,
	. . .
	.ioctl		= pppox_ioctl,
};

static int pppoe_create(struct net *net, struct socket *sock)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sock->state = SS_UNCONNECTED;
	sock->ops   = &pppoe_ops;

	sk->sk_backlog_rcv = pppoe_rcv_core;
	sk->sk_state	   = PPPOX_NONE;
	sk->sk_type	   = SOCK_STREAM;
	sk->sk_family	   = PF_PPPOX;
	sk->sk_protocol	   = PX_PROTO_OE;

	return 0;
}
```

(drivers/net/ppp/pppoe.c)

By defining pppoe_ops we set the callbacks for this socket, so calling the connect() system call from userspace on an AF_PPPOX socket will be handled by the pppoe_connect() method of the PPPoE module in the kernel. After creating a PPPoE socket, the PPPOEConnectDevice() method calls connect(). Let's take a look at the pppoe_connect() method:

```c
static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr,
			 int sockaddr_len, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr;
	struct pppox_sock *po = pppox_sk(sk);
	struct net_device *dev = NULL;
	struct pppoe_net *pn;
	struct net *net = NULL;
	int error;

	lock_sock(sk);

	error = -EINVAL;
	if (sp->sa_protocol != PX_PROTO_OE)
		goto end;

	/* Check for already bound sockets */
	error = -EBUSY;
```

The stage_session() method returns true when the session id is not 0 (as mentioned earlier, the session id is 0 only in the discovery stage). In case the socket is connected and is in the session stage, the socket is already bound, so we exit:

```c
	if ((sk->sk_state & PPPOX_CONNECTED) &&
	    stage_session(sp->sa_addr.pppoe.sid))
		goto end;
```

Reaching here means that the socket is not connected (its sk_state is not PPPOX_CONNECTED) and we need to register a PPP channel:

```c
	. . .
	/* Re-bind in session stage only */
	if (stage_session(sp->sa_addr.pppoe.sid)) {
		error = -ENODEV;
		net = sock_net(sk);
		dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev);
		if (!dev)
			goto err_put;

		po->pppoe_dev = dev;
		po->pppoe_ifindex = dev->ifindex;
		pn = pppoe_pernet(net);
```

The network device must be up:

```c
		if (!(dev->flags & IFF_UP)) {
			goto err_put;
		}

		memcpy(&po->pppoe_pa,
		       &sp->sa_addr.pppoe,
		       sizeof(struct pppoe_addr));

		write_lock_bh(&pn->hash_lock);
```

The __set_item() method inserts the pppox_sock object, po, into the PPPoE socket hashtable; the hash key is generated from the session id and the remote peer MAC address by the hash_item() method. The remote peer MAC address is po->pppoe_pa.remote.
If there is an entry in the hash table with the same session id, the same remote MAC address, and the same ifindex of the network device, the __set_item() method will return an error of -EALREADY:

```c
		error = __set_item(pn, po);
		write_unlock_bh(&pn->hash_lock);
		if (error < 0)
			goto err_put;
```

po->chan is a ppp_channel object; see the pppox_sock structure definition earlier. Before registering it with the ppp_register_net_channel() method, some of its members should be initialized:

```c
		po->chan.hdrlen = (sizeof(struct pppoe_hdr) +
				   dev->hard_header_len);
		po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr);
		po->chan.private = sk;
		po->chan.ops = &pppoe_chan_ops;

		error = ppp_register_net_channel(dev_net(dev), &po->chan);
		if (error) {
```

The delete_item() method deletes a pppox_sock object from the PPPoE socket hashtable:

```c
			delete_item(pn, po->pppoe_pa.sid,
				    po->pppoe_pa.remote, po->pppoe_ifindex);
			goto err_put;
		}
```

Set the socket state to be connected:

```c
		sk->sk_state = PPPOX_CONNECTED;
	}

	po->num = sp->sa_addr.pppoe.sid;

end:
	release_sock(sk);
	return error;
err_put:
	if (po->pppoe_dev) {
		dev_put(po->pppoe_dev);
		po->pppoe_dev = NULL;
	}
	goto end;
}
```

Registering a PPP channel allows us to use PPP services: PPPoE session packets are processed by calling the generic PPP method, ppp_input(), from the pppoe_rcv_core() method, and transmission of PPPoE session packets is done with the generic ppp_start_xmit() method.

RP-PPPoE is an open source project which provides a PPPoE client and a PPPoE server for Linux: http://www.roaringpenguin.com/products/pppoe . A simple example of running a PPPoE server is:

pppoe-server -I p3p1 -R 192.168.3.101 -L 192.168.3.210 -N 200

The options used in this example are:

 * -I: The interface name (p3p1)

 * -L: Set the local IP address (192.168.3.210)

 * -R: Set the starting remote IP address (192.168.3.101)

 * -N: Max number of concurrent PPPoE sessions (200 in this case)

For other options, see man 8 pppoe-server.

Clients on the same LAN can create a PPPoE connection to this server with the pppd daemon, using the rp-pppoe plugin.

Android's popularity as a mobile operating system for smartphones and tablets is growing steadily. I will conclude the book with a short section about Android, briefly discussing the Android development model and showing four examples of Android networking.

## Android

In recent years, the Android operating system has proved to be a very reliable and successful mobile OS. The Android operating system is based on a Linux kernel, with changes by Google developers. Android runs on hundreds of types of mobile devices, which are mostly based on the ARM processor. (I should mention that there is a project porting Android to Intel x86 processors, http://www.android-x86.org/ .) The first generation of Google TV devices is based on x86 processors by Intel, but the second generation of Google TV devices is based on ARM. Originally Android was developed by "Android Inc.", a company that was founded in California in 2003 by Andy Rubin and others. Google bought this company in 2005. The Open Handset Alliance (OHA), a consortium of over 80 companies, announced Android in 2007. Android is an open source operating system, and its source code is released under the Apache License. Unlike Linux, most of the development is done by Google employees behind closed doors.
As opposed to Linux, there is no public mailing list where developers send and discuss patches. One can, however, send patches to the public Gerrit (see http://source.android.com/source/submit-patches.html ), but it is up to Google alone to decide whether or not they will be included in the Android tree.

Google developers have contributed a lot to the Linux kernel. You learned earlier in this chapter that the cgroup subsystem was started by Google developers. I will also mention two Linux kernel networking patches, the Receive Packet Steering (RPS) patch and the Receive Flow Steering (RFS) patch, by Tom Herbert from Google (see http://lwn.net/Articles/362339/ and http://lwn.net/Articles/382428/ ), which were integrated into kernel 2.6.35. When working with multicore platforms, RPS and RFS let you steer packets to a specific CPU according to the hash of the payload. There are many other examples of contributions from Google to the Linux kernel, and it seems that in the future you will encounter many more important contributions to the Linux kernel from Google. One can find a lot of code from the Android kernel in the staging tree of the Linux kernel. However, it is difficult to say whether the Android kernel will be merged fully into the Linux kernel; probably a very large part of it will find its way into the Linux kernel. For more information about mainlining Android, see this wiki: http://elinux.org/Android_Mainlining_Project . In the past there were many obstacles in the way, as Google implemented unique mechanisms, like wakelocks, alternative power management, its own IPC (called Binder) based on a Lightweight Remote Procedure Call (RPC), the Android shared memory driver (Ashmem), the Low Memory Killer, and more. In fact, the kernel community rejected the Google power management wakelocks patches in 2010. But since then, some of these features were merged and the situation changed. (See "Autosleep and Wake Locks," https://lwn.net/Articles/479841/ , and "The LPC Android microconference", https://lwn.net/Articles/570406/ .) Linaro ( www.linaro.org/ ) is a non-profit organization that was established in 2010 by leading companies such as ARM, Freescale, IBM, Samsung, ST-Ericsson, and Texas Instruments (TI). Its engineering teams develop the Linux ARM kernel and also optimizations for the GCC toolchain. Linaro teams are doing an amazing job of coordinating and pushing/tweaking changes upstream. Delving into the details of the Android kernel implementation and mainlining is beyond the scope of this book.

### Android Networking

The main networking issue with Android is, however, not due to the Linux kernel but to Android userspace. Android relies heavily on the HAL even for networking, as well as for the system framework. Originally (i.e., up to 4.2), there was no Ethernet support at all at the framework level. If drivers are compiled in the kernel, the TCP/IP stack still allows basic Ethernet connectivity for Android Debug Bridge (ADB) debugging, but that's all. Starting with 4.0, the Android-x86 project fork added an early implementation (badly designed but somehow working) of Ethernet at the framework level. Starting with 4.2, official upstream sources support Ethernet, but there is no way to actually configure it (it detects Ethernet plug in/out, and if a DHCP server is there, it provides an IP address to the interface). Applications can actually make use of this interface through the framework, but mostly no one does this.
If you require real Ethernet support (i.e., being able to configure your interface, configure it statically or via DHCP, set a proxy, and ensure that all apps are using the interface), then a lot of hacks are still required (see www.slideshare.net/gxben/abs-2013-dive-into-android-networking-adding-ethernet-connectivity ). In all cases, only one interface is supported at a time (eth0 only, even if you have both eth0 and eth1), so don't expect your device to act as a router of any kind. I will show here four short examples of how Android networking differs from Linux kernel networking:

 * Security privileges and networking: Android added a security feature (named "paranoid network") to the Linux kernel, which restricts access to some networking features depending on the group of the calling process. As opposed to the standard Linux kernel, where any application can open a socket and transmit/receive with it, in Android access to network resources is filtered by GID (group ID). The network security part will probably be very difficult to merge into the mainline kernel, as it includes many features that are unique to Android. For more information about Android network security, see http://elinux.org/Android_Security#Paranoid_network-ing .

 * Bluetooth: Bluedroid is a Bluetooth stack based on code that was developed by Broadcom. It replaced the BlueZ-based stack in Android 4.2. Support for Bluetooth Low Energy (BLE, or Bluetooth LE) devices, also known as Bluetooth Smart and Smart Ready devices, was introduced in Android 4.3 (API Level 18), in July 2013. Prior to this, the Android Open Source Project (AOSP) did not have support for BLE devices, but there were some vendors who provided an API to BLE.

 * Netfilter: There is an interesting project from Google that provides better network statistics on Android. This is implemented by xt_qtaguid, a netfilter module, which enables userspace applications to tag their sockets. This project required some changes in the Linux kernel netfilter subsystem. Patches with these changes were also sent to the Linux Kernel Mailing List (LKML); see http://lwn.net/Articles/517358/ . For details, see "Android netfilter changes" http://www.linuxplumbersconf.org/2013/ocw/sessions/1491 .

 * NFC: As was described in the Near Field Communication (NFC) section earlier in this chapter, the Android NFC architecture is a userspace NFC stack: the implementation is done in userspace through the HAL, which is supplied by Broadcom or by Android OEMs.

### Android Internals: Resources

Although there are many resources about developing applications for Android (books, mailing lists, forums, courses, etc.), there are very few resources about the internals of Android. For readers who are interested in learning more, I suggest these resources:

 * The book Embedded Android: Porting, Extending, and Customizing, by Karim Yaghmour (O'Reilly Media, 2013)

 * Slides: Android System Development by Maxime Ripard, Alexandre Belloni (over 400 slides); http://free-electrons.com/doc/training/android/ .

 * Slides: Android Platform Anatomy by Benjamin Zores (59 slides); http://www.slideshare.net/gxben/droidcon-2013-france-android-platform-anatomy .

 * Slides: Jelly Bean Device Porting by Benjamin Zores (127 slides); http://www.slideshare.net/gxben/as-2013-jelly-bean-device-porting-walkthrough .

 * Website: http://developer.android.com/index.html .
 * Android platform internals forum archives: http://news.gmane.org/gmane.comp.handhelds.android.platform

 * Once a year, an Android Builders Summit (ABS) is held. The first ABS was held in 2011 in San Francisco. It is recommended to read the slides, watch the videos, or attend.

 * XDA Developers Conference: http://xda-devcon.com/ ; slides and videos at http://xda-devcon.com/presentations/

 * Slides: Android Internals, Marko Gargenta: http://www.scandevconf.se/db/Marakana-Android-Internals.pdf

Note

Android git repositories are available at https://android.googlesource.com/

Note that Android uses repo, a special Python-based tool for managing hundreds of git repositories, which makes working with git easier.

## Summary

I have dealt in this chapter with namespaces in Linux, focusing on network namespaces. I also described the cgroups subsystem and its implementation; furthermore, I described its two network modules, net_prio and cls_cgroup. The Linux Bluetooth subsystem and its implementation, the IEEE 802.15.4 Linux subsystem and 6LoWPAN, and the NFC subsystem were all covered. The optimization achieved by Low Latency Sockets Poll was also discussed in this chapter, along with the Notification Chains mechanism, which is widely used in the kernel networking stack (and you will encounter it when browsing the source code). Another topic that was briefly discussed was the PCI subsystem, in order to give some background about PCI devices, as many network devices are PCI devices. The chapter concluded with three short sections about the network teaming driver (which is intended to replace the bonding driver), the PPPoE implementation, and Android.

Although we've come to the end of the book, there is much more to learn about Linux Kernel networking, as it is a vast ocean of details, and it is progressing dynamically and at a fast pace. New features and new patches are added constantly. I hope you enjoyed the book and that you learned a thing or two!

## Quick Reference

I will conclude with a list of methods and macros that were mentioned in this chapter.

### Methods

The following list contains the prototypes and descriptions of several methods covered in this chapter.

#### void switch_task_namespaces(struct task_struct *p, struct nsproxy *new);

This method assigns the specified nsproxy object to the specified process descriptor (task_struct object).

#### struct nsproxy *create_nsproxy(void);

This method allocates an nsproxy object and initializes its reference counter to 1.

#### void free_nsproxy(struct nsproxy *ns);

This method releases the resources of the specified nsproxy object.

#### struct net *dev_net(const struct net_device *dev);

This method returns the network namespace object (nd_net) associated with the specified network device.

#### void dev_net_set(struct net_device *dev, struct net *net);

This method associates the specified network namespace with the specified network device by setting the nd_net member of the net_device object.

#### void sock_net_set(struct sock *sk, struct net *net);

This method associates the specified network namespace with the specified sock object.

#### struct net *sock_net(const struct sock *sk);

This method returns the network namespace object (sk_net) associated with the specified sock object.
#### int net_eq(const struct net *net1, const struct net *net2);

This method returns 1 if the first specified network namespace pointer equals the second specified network namespace pointer and 0 otherwise.

#### struct net *net_alloc(void);

This method allocates a network namespace. It is invoked from the copy_net_ns() method.

#### struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net);

This method creates a new network namespace if the CLONE_NEWNET flag is set in its first parameter, flags. It creates the new network namespace by first calling the net_alloc() method to allocate it, then initializes it by calling the setup_net() method, and finally adds it to the global list of all namespaces, net_namespace_list. In case the CLONE_NEWNET flag is not set in its first parameter, flags, there is no need to create a new namespace, and the specified old network namespace, old_net, is returned. Note that this description of the copy_net_ns() method refers to the case when CONFIG_NET_NS is set. When CONFIG_NET_NS is not set, there is a second implementation of copy_net_ns(), which merely verifies that CLONE_NEWNET is not set in the specified flags (returning an error if it is) and returns the specified old network namespace (old_net); see include/net/net_namespace.h.

#### int setup_net(struct net *net, struct user_namespace *user_ns);

This method initializes the specified network namespace object. It assigns the network namespace user_ns member to be the specified user_ns, initializes the reference counter (count) of the specified network namespace to 1, and performs more initializations. It is invoked from the copy_net_ns() method and from the net_ns_init() method.

#### int proc_alloc_inum(unsigned int *inum);

This method allocates a proc inode and sets *inum to be the generated proc inode number (an integer between 0xf0000000 and 0xffffffff). It returns 0 on success.

#### struct nsproxy *task_nsproxy(struct task_struct *tsk);

This method returns the nsproxy object which is attached to the specified process descriptor (tsk).

#### struct new_utsname *utsname(void);

This method returns the new_utsname object which is associated with the process that currently runs (current).

#### struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns);

This method creates a new UTS namespace object by calling the create_uts_ns() method, and copies the new_utsname object of the specified old_ns UTS namespace into the new_utsname of the newly created UTS namespace.

#### struct uts_namespace *copy_utsname(unsigned long flags, struct user_namespace *user_ns, struct uts_namespace *old_ns);

This method creates a new UTS namespace if the CLONE_NEWUTS flag is set in its first parameter, flags. It creates the new UTS namespace by calling the clone_uts_ns() method, and returns the newly created UTS namespace. In case the CLONE_NEWUTS flag is not set in its first parameter, there is no need to create a new namespace, and the specified old UTS namespace (old_ns) is returned.
#### int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat);

This method changes the network namespace of the specified network device to be the specified network namespace. It returns 0 on success or -errno on failure. Callers must hold the rtnl semaphore. If the NETIF_F_NETNS_LOCAL flag is set in the features of the network device, an error of -EINVAL is returned.

#### void put_net(struct net *net);

This method decrements the reference counter of the specified network namespace. In case it reaches zero, it calls the __put_net() method to free its resources.

#### struct net *get_net(struct net *net);

This method returns the specified network namespace object after incrementing its reference counter.

#### void get_nsproxy(struct nsproxy *ns);

This method increments the reference counter of the specified nsproxy object.

#### struct net *get_net_ns_by_pid(pid_t pid);

This method gets a process id (PID) as an argument and returns the network namespace object to which this process is attached.

#### struct net *get_net_ns_by_fd(int fd);

This method gets a file descriptor as an argument and returns the network namespace associated with the inode that corresponds to the specified file descriptor.

#### struct pid_namespace *ns_of_pid(struct pid *pid);

This method returns the PID namespace in which the specified pid was created.

#### void put_nsproxy(struct nsproxy *ns);

This method decrements the reference counter of the specified nsproxy object; in case it reaches 0, the specified nsproxy is freed by calling the free_nsproxy() method.

#### int register_pernet_device(struct pernet_operations *ops);

This method registers a network namespace device.

#### void unregister_pernet_device(struct pernet_operations *ops);

This method unregisters a network namespace device.

#### int register_pernet_subsys(struct pernet_operations *ops);

This method registers a network namespace subsystem.

#### void unregister_pernet_subsys(struct pernet_operations *ops);

This method unregisters a network namespace subsystem.

#### static int register_vlan_device(struct net_device *real_dev, u16 vlan_id);

This method registers a VLAN device associated with the specified physical device (real_dev).

#### void cgroup_release_agent(struct work_struct *work);

This method is called when a cgroup is released. It creates a userspace process by invoking the call_usermodehelper() method.

#### int call_usermodehelper(char *path, char **argv, char **envp, int wait);

This method prepares and starts a userspace application.

#### int bacmp(bdaddr_t *ba1, bdaddr_t *ba2);

This method compares two Bluetooth addresses. It returns 0 if they are equal.

#### void bacpy(bdaddr_t *dst, bdaddr_t *src);

This method copies the specified source Bluetooth address (src) to the specified destination Bluetooth address (dst).

#### int hci_send_frame(struct sk_buff *skb);

This method is the main Bluetooth method for transmitting SKBs (commands and data).

#### int hci_register_dev(struct hci_dev *hdev);

This method registers the specified HCI device. It is invoked from Bluetooth device drivers. If the open() or close() callbacks of the specified hci_dev object are not defined, the method will fail and return -EINVAL. This method sets the HCI_SETUP flag in the dev_flags member of the specified HCI device; it also creates a sysfs entry for the device.
#### void hci_unregister_dev(struct hci_dev *hdev);

This method unregisters the specified HCI device. It is invoked from Bluetooth device drivers. It sets the HCI_UNREGISTER flag in the dev_flags member of the specified HCI device; it also removes the sysfs entry of the device.

#### void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb);

This method handles events that are received from the HCI layer by the hci_rx_work() method.

#### int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);

This method is the main Rx handler for 6LoWPAN packets. 6LoWPAN packets have an ethertype of 0x00F6.

#### void pci_unregister_driver(struct pci_driver *dev);

This method unregisters a PCI driver. It is usually called in the network driver module_exit() method.

#### int pci_enable_device(struct pci_dev *dev);

This method initializes the PCI device before it is used by the driver.

#### int request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev);

This method registers the specified handler as the interrupt service routine for the specified irq.

#### void free_irq(unsigned int irq, void *dev_id);

This method frees an interrupt which was allocated with the request_irq() method.

#### int nfc_init(void);

This method performs initialization of the NFC subsystem by registering the generic netlink NFC family, initializing NFC Raw sockets and NFC LLCP sockets, and initializing the AF_NFC protocol.

#### int nfc_register_device(struct nfc_dev *dev);

This method registers an NFC device (an nfc_dev object) against the NFC core.

#### int nfc_hci_register_device(struct nfc_hci_dev *hdev);

This method registers an NFC HCI device (an nfc_hci_dev object) against the NFC HCI layer.

#### int nci_register_device(struct nci_dev *ndev);

This method registers an NFC NCI device (an nci_dev object) against the NFC NCI layer.

#### static int __init pppoe_init(void);

This method initializes the PPPoE layer (PPPoE protocol handlers, the sockets used by PPPoE, the network notification handler, the PPPoE procfs entry, and more).

#### struct pppoe_hdr *pppoe_hdr(const struct sk_buff *skb);

This method returns the PPPoE header associated with the specified skb.

#### static int pppoe_create(struct net *net, struct socket *sock);

This method creates a PPPoE socket. It returns 0 on success or -ENOMEM if allocation of a socket by the sk_alloc() method failed.

#### int __set_item(struct pppoe_net *pn, struct pppox_sock *po);

This method inserts the specified pppox_sock object into the PPPoE socket hashtable. The hash key is calculated according to the session id and the remote peer MAC address by the hash_item() method.

#### void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex);

This method removes the PPPoE socket hashtable entry which has the specified session id, the specified MAC address, and the specified network interface index (ifindex).

#### bool stage_session(__be16 sid);

This method returns true when the specified session id is not 0.

#### int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n);

This method registers the specified notifier_block object (n) to the specified notifier chain (nl). Note that this method is not used directly; there are several wrappers around it.
#### int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n);

This method unregisters the specified notifier_block object (n) from the specified notifier chain (nl). Note that this method is also not used directly; there are several wrappers around it.

#### int register_netdevice_notifier(struct notifier_block *nb);

This method registers the specified notifier_block object to netdev_chain by calling the raw_notifier_chain_register() method.

#### int unregister_netdevice_notifier(struct notifier_block *nb);

This method unregisters the specified notifier_block object from netdev_chain by calling the raw_notifier_chain_unregister() method.

#### int register_inet6addr_notifier(struct notifier_block *nb);

This method registers the specified notifier_block object to inet6addr_chain by calling the atomic_notifier_chain_register() method.

#### int unregister_inet6addr_notifier(struct notifier_block *nb);

This method unregisters the specified notifier_block object from inet6addr_chain by calling the atomic_notifier_chain_unregister() method.

#### int register_netevent_notifier(struct notifier_block *nb);

This method registers the specified notifier_block object to netevent_notif_chain by calling the atomic_notifier_chain_register() method.

#### int unregister_netevent_notifier(struct notifier_block *nb);

This method unregisters the specified notifier_block object from netevent_notif_chain by calling the atomic_notifier_chain_unregister() method.

#### int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls);

This method is for generating notification events. Note that this method is also not used directly; there are several wrappers around it.

#### int call_netdevice_notifiers(unsigned long val, struct net_device *dev);

This method is for generating notification events on the netdev_chain, by calling the raw_notifier_call_chain() method.

#### int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v);

This method is for generating notification events; eventually, after using a locking mechanism, it invokes the notifier_call_chain() method.

#### int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls);

This method is for generating notification events. Eventually, after using a locking mechanism, it invokes the notifier_call_chain() method.

### Macros

Here you'll find a description of the macro that was covered in this chapter.

#### pci_register_driver()

This macro registers a PCI driver in the PCI subsystem. It gets a pci_driver object as a parameter. It is usually called in the network driver module_init() method.

# Linux API
In this appendix I cover the two most fundamental data structures in the Linux Kernel Networking stack: the sk_buff and the net_device. This is reference material that can help when reading the rest of this book, as you will probably encounter these two structures in almost every chapter. Becoming familiar with and learning about these two data structures is essential for understanding the Linux Kernel Networking stack. Subsequently, there is a section about remote DMA (RDMA), which is further reference material for Chapter 13. It describes in detail the main methods and the main data structures that are used by RDMA. This appendix is a good place to return to, especially when looking for definitions of the basic terms.

## The sk_buff Structure

The sk_buff structure represents a packet. SKB stands for socket buffer. A packet can be generated by a local socket on the local machine, created by a userspace application; the packet can be sent outside, or to another socket on the same machine. A packet can also be created by a kernel socket. You can receive a physical frame from a network device (Layer 2), attach it to an sk_buff, and pass it on to Layer 3. When the packet destination is your local machine, it will continue to Layer 4. If the packet is not for your machine, it will be forwarded according to your routing table rules, if your machine supports forwarding. If the packet is damaged for any reason, it will be dropped. The sk_buff is a very large structure; I mention most of its members in this section. The sk_buff structure is defined in include/linux/skbuff.h. Here is a description of most of its members:

 * ktime_t tstamp

Timestamp of the arrival of the packet. Timestamps are stored in the SKB as offsets to a base timestamp. Note: do not confuse the tstamp of the SKB with hardware timestamping, which is implemented with the hwtstamps member of skb_shared_info. I describe the skb_shared_info object later in this appendix.

Helper methods:

 * skb_get_ktime(const struct sk_buff *skb): Returns the tstamp of the specified skb.

 * skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp): Converts the offset back to a struct timeval.

 * net_timestamp_set(struct sk_buff *skb): Sets the timestamp for the specified skb. The timestamp calculation is done with the ktime_get_real() method, which returns the time in ktime_t format.

 * net_enable_timestamp(): This method should be called to enable SKB timestamping.

 * net_disable_timestamp(): This method should be called to disable SKB timestamping.

 * struct sock *sk

The socket that owns the SKB, for locally generated traffic and for traffic that is destined for the local host. For packets that are being forwarded, sk is NULL. Usually when talking about sockets you deal with sockets which are created by calling the socket() system call from userspace. It should be mentioned that there are also kernel sockets, which are created by calling the sock_create_kern() method. See, for example, vxlan_init_net() in the VXLAN driver, drivers/net/vxlan.c.

Helper method:

 * skb_orphan(struct sk_buff *skb): If the specified skb has a destructor, call this destructor; set the sock object (sk) of the specified skb to NULL, and set the destructor of the specified skb to NULL.
* struct net_device *dev

The dev member is a net_device object that represents the network interface device associated with the SKB; you will sometimes encounter the term NIC (Network Interface Card) for such a network device. It can be the network device on which the packet arrives, or the network device on which the packet will be sent. The net_device structure will be discussed in depth in the next section.

* char cb[48]

This is the control buffer. It is free to use by any layer. This is an opaque area used to store private information. For example, the TCP protocol uses it for the TCP control buffer:

#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))

(include/net/tcp.h)

The Bluetooth protocol also uses the control block:

#define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb))

(include/net/bluetooth/bluetooth.h)

* unsigned long _skb_refdst

The destination entry (dst_entry) address. The dst_entry struct represents the routing entry for a given destination. For each packet, incoming or outgoing, you perform a lookup in the routing tables. Sometimes this lookup is called a FIB lookup. The result of this lookup determines how you should handle the packet: for example, whether it should be forwarded, and if so, on which interface it should be transmitted; or whether it should be dropped, whether an ICMP error message should be sent, and so on. The dst_entry object has a reference counter (the __refcnt field). There are cases when you use this reference count, and there are cases when you do not use it. The dst_entry object and the lookup in the FIB are discussed in more detail in Chapter 4.

Helper methods:

* skb_dst_set(struct sk_buff *skb, struct dst_entry *dst): Sets the skb dst, assuming a reference was taken on dst and should be released by the dst_release() method (which is invoked by the skb_dst_drop() method).

* skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst): Sets the skb dst, assuming a reference was not taken on dst. In this case, the skb_dst_drop() method will not call the dst_release() method for the dst.

Note

The SKB might have a dst_entry pointer attached to it; it can be reference counted or not. The low order bit of _skb_refdst is set if the reference counter was not taken.

* struct sec_path *sp

The security path pointer. It includes an array of IPsec XFRM transformation states (xfrm_state objects). IPsec (IP Security) is a Layer 3 protocol that is used mostly in VPNs. It is mandatory in IPv6 and optional in IPv4. Linux, like many other operating systems, implements IPsec both for IPv4 and IPv6. The sec_path structure is defined in include/net/xfrm.h. See more in Chapter 10, which deals with the IPsec subsystem.

Helper method:

* struct sec_path *skb_sec_path(struct sk_buff *skb): Returns the sec_path object (sp) associated with the specified skb.

* unsigned int len

The total number of packet bytes.

* unsigned int data_len

The data length. This field is used only when the packet has nonlinear data (paged data).

Helper method:

* skb_is_nonlinear(const struct sk_buff *skb): Returns true when the data_len of the specified skb is larger than 0.

* __u16 mac_len

The length of the MAC (Layer 2) header.

* __wsum csum

The checksum.

* __u32 priority

The queuing priority of the packet. In the Tx path, the priority of the SKB is set according to the socket priority (the sk_priority field of the socket).
The socket priority in turn can be set by calling the setsockopt() system call with the SO_PRIORITY socket option. Using the net_prio cgroup kernel module, you can define a rule that will set the priority for the SKB; see the description of the netprio_map field later in this section, and also Documentation/cgroups/net_prio.txt. For forwarded packets, the priority is set according to the TOS (Type Of Service) field in the IP header. There is a table named ip_tos2prio, which consists of 16 elements. The mapping from TOS to priority is done by the rt_tos2priority() method, according to the TOS field of the IP header; see the ip_forward() method in net/ipv4/ip_forward.c and the ip_tos2prio definition in include/net/route.h.

* __u8 local_df:1

Allow local fragmentation flag. If the value of the pmtudisc field of the socket which sends the packet is IP_PMTUDISC_DONT or IP_PMTUDISC_WANT, local_df is set to 1; if the value of the pmtudisc field of the socket is IP_PMTUDISC_DO or IP_PMTUDISC_PROBE, local_df is set to 0. See the implementation of the __ip_make_skb() method in net/ipv4/ip_output.c. Only when the packet's local_df is 0 do you set the IP header "don't fragment" flag, IP_DF; see the ip_queue_xmit() method in net/ipv4/ip_output.c:

...
if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
	iph->frag_off = htons(IP_DF);
else
	iph->frag_off = 0;
...

The frag_off field in the IP header is a 16-bit field, which represents the offset and the flags of the fragment. The 13 leftmost (MSB) bits are the offset (the offset unit is 8 bytes), and the 3 rightmost (LSB) bits are the flags. The flags can be IP_MF (there are more fragments), IP_DF (do not fragment), IP_CE (for congestion), or IP_OFFSET (offset part).

The reason behind this is that there are cases when you do not want to allow IP fragmentation. For example, in Path MTU Discovery (PMTUD), you set the DF (don't fragment) flag of the IP header; thus, you don't fragment the outgoing packets. Any network device along the path whose MTU is smaller than the packet will drop it and send back an ICMP packet ("Fragmentation Needed"). Getting these ICMP "Fragmentation Needed" packets is required in order to determine the Path MTU. See more in Chapter 3. From userspace, setting IP_PMTUDISC_DO is done, for example, thus (the following code snippet is taken from the source code of the tracepath utility from the iputils package; the tracepath utility finds the path MTU):

...
int on = IP_PMTUDISC_DO;
setsockopt(fd, SOL_IP, IP_MTU_DISCOVER, &on, sizeof(on));
...

* __u8 cloned:1

When the packet is cloned with the __skb_clone() method, this field is set to 1 in both the cloned packet and the primary packet. Cloning an SKB means creating a private copy of the sk_buff struct; the data block is shared between the clone and the primary SKB.

* __u8 ip_summed:2

Indicator of the packet checksum status; can be one of these values:

* CHECKSUM_NONE: When the device driver does not support hardware checksumming, it sets the ip_summed field to be CHECKSUM_NONE. This is an indication that checksumming should be done in software.

* CHECKSUM_UNNECESSARY: No need for any checksumming.

* CHECKSUM_COMPLETE: Calculation of the checksum was completed by the hardware, for incoming packets.

* CHECKSUM_PARTIAL: A partial checksum was computed for outgoing packets; the hardware should complete the checksum calculation. CHECKSUM_COMPLETE and CHECKSUM_PARTIAL replace the CHECKSUM_HW flag, which is now deprecated.
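For a concrete feel of how ip_summed is used, here is a hedged sketch (the function and parameter names are hypothetical) of the decision a driver's Rx path typically makes based on what its hardware reports:

#include <linux/types.h>
#include <linux/skbuff.h>

/* Hypothetical Rx-path fragment, for illustration only. */
static void rx_set_checksum(struct sk_buff *skb, bool hw_csum_ok)
{
	if (hw_csum_ok)
		skb->ip_summed = CHECKSUM_UNNECESSARY;	/* hardware verified it */
	else
		skb->ip_summed = CHECKSUM_NONE;	/* the stack must checksum in software */
}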
* __u8 nohdr:1

Payload reference only; the header must not be modified. There are cases when the owner of the SKB no longer needs to access the header at all. In such cases, you can call the skb_header_release() method, which sets the nohdr field of the SKB; this indicates that the header of this SKB should not be modified.

* __u8 nfctinfo:3

Connection Tracking info. Connection Tracking allows the kernel to keep track of all logical network connections or sessions. NAT relies on Connection Tracking information for its translations. The value of the nfctinfo field corresponds to the ip_conntrack_info enum values. So, for example, when a new connection is starting to be tracked, the value of nfctinfo is IP_CT_NEW. When the connection is established, the value of nfctinfo is IP_CT_ESTABLISHED. The value of nfctinfo can change to IP_CT_RELATED when the packet is related to an existing connection—for example, when the traffic is part of some FTP session or SIP session, and so on. For a full list of ip_conntrack_info enum values, see include/uapi/linux/netfilter/nf_conntrack_common.h. The nfctinfo field of the SKB is set in the resolve_normal_ct() method, net/netfilter/nf_conntrack_core.c. This method performs a Connection Tracking lookup, and if there is a miss, it creates a new Connection Tracking entry. Connection Tracking is discussed in depth in Chapter 9, which deals with the netfilter subsystem.

* __u8 pkt_type:3

For Ethernet, the packet type depends on the destination MAC address in the Ethernet header, and is determined by the eth_type_trans() method:

* PACKET_BROADCAST for broadcast

* PACKET_MULTICAST for multicast

* PACKET_HOST if the destination MAC address is the MAC address of the device which was passed as a parameter

* PACKET_OTHERHOST if these conditions are not met

See the definition of the packet types in include/uapi/linux/if_packet.h.

* __u8 ipvs_property:1

This flag indicates whether the SKB is owned by ipvs (IP Virtual Server), which is a kernel-based transport layer load-balancing solution. This field is set to 1 in the transmit methods of ipvs (net/netfilter/ipvs/ip_vs_xmit.c).

* __u8 peeked:1

This packet has already been seen, so statistics have been gathered for it—don't gather them again.

* __u8 nf_trace:1

The netfilter packet trace flag. This flag is set by the xt_TRACE netfilter module, which is used to mark packets for tracing (net/netfilter/xt_TRACE.c).

Helper method:

* nf_reset_trace(struct sk_buff *skb): Sets the nf_trace of the specified skb to 0.

* __be16 protocol

The protocol field is initialized in the Rx path by the eth_type_trans() method to be ETH_P_IP when working with Ethernet and IP.

* void (*destructor)(struct sk_buff *skb)

A callback that is invoked when freeing the SKB by calling the kfree_skb() method.

* struct nf_conntrack *nfct

The associated Connection Tracking object, if it exists. The nfct field, like the nfctinfo field, is set in the resolve_normal_ct() method. The Connection Tracking layer is discussed in depth in Chapter 9, which deals with the netfilter subsystem.

* int skb_iif

The ifindex of the network device on which the packet arrived.

* __u32 rxhash

The rxhash of the SKB is calculated in the receive path, according to the source and destination addresses of the IP header and the ports from the transport header. A value of zero indicates that the hash is not valid.
The rxhash is used to ensure that packets belonging to the same flow will be handled by the same CPU when working with Symmetrical Multiprocessing (SMP). This decreases the number of cache misses and improves network performance. The rxhash is part of the Receive Packet Steering (RPS) feature, which was contributed by Google developers (Tom Herbert and others). The RPS feature gives a performance improvement in SMP environments. See more in Documentation/networking/scaling.txt.

* __be16 vlan_proto

The VLAN protocol used—usually it is the 802.1q protocol. Recently, support for the 802.1ad protocol (also known as Stacked VLAN) was added.

The following is an example of creating 802.1ad and 802.1q VLAN devices in userspace, using the ip command of the iproute2 package:

ip link add link eth0 eth0.1000 type vlan proto 802.1ad id 1000
ip link add link eth0.1000 eth0.1000.1000 type vlan proto 802.1q id 100

Note: this feature is supported in kernel 3.10 and higher.

* __u16 vlan_tci

The VLAN tag control information (2 bytes), composed of ID and priority.

Helper method:

* vlan_tx_tag_present(__skb): This macro checks whether the VLAN_TAG_PRESENT flag is set in the vlan_tci field of the specified __skb.

* __u16 queue_mapping

Queue mapping for multiqueue devices.

Helper methods:

* skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping): Sets the specified queue_mapping for the specified skb.

* skb_get_queue_mapping(const struct sk_buff *skb): Returns the queue_mapping of the specified skb.

* __u8 pfmemalloc

Set when the SKB was allocated from PFMEMALLOC reserves.

Helper method:

* skb_pfmemalloc(): Returns true if the SKB was allocated from PFMEMALLOC reserves.

* __u8 ooo_okay:1

The ooo_okay flag is set to avoid ooo (out of order) packets.

* __u8 l4_rxhash:1

A flag that is set when a canonical 4-tuple hash over the transport ports is used. See the __skb_get_rxhash() method in net/core/flow_dissector.c.

* __u8 no_fcs:1

A flag that is set when you request the NIC to treat the last 4 bytes as the Ethernet Frame Check Sequence (FCS).

* __u8 encapsulation:1

The encapsulation field denotes that the SKB is used for encapsulation. It is used, for example, in the VXLAN driver. VXLAN is a standard protocol to transfer Layer 2 Ethernet packets over a UDP kernel socket. It can be used as a solution when there are firewalls that block tunnels and allow, for example, only TCP or UDP traffic. The VXLAN driver uses UDP encapsulation and sets the SKB encapsulation to 1 in the vxlan_init_net() method. The ip_gre module and the ipip tunnel module also use encapsulation and set the SKB encapsulation to 1.

* __u32 secmark

Security mark field. The secmark field is set by an iptables SECMARK target, which labels packets with any valid security context. For example:

iptables -t mangle -A INPUT -p tcp --dport 80 -j SECMARK --selctx system_u:object_r:httpd_packet_t:s0
iptables -t mangle -A OUTPUT -p tcp --sport 80 -j SECMARK --selctx system_u:object_r:httpd_packet_t:s0

In the preceding rules, you are statically labeling packets arriving at and leaving from port 80 as httpd_packet_t. See net/netfilter/xt_SECMARK.c.

Helper methods:

* void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from): Sets the value of the secmark field of the first specified SKB (to) to be equal to the value of the secmark field of the second specified SKB (from).
* void skb_init_secmark(struct sk_buff *skb): Initializes the secmark of the specified skb to be 0.

The next three fields (mark, dropcount, and reserved_tailroom) appear in a union.

* __u32 mark

This field enables identifying the SKB by marking it.

You can set the mark field of the SKB, for example, with the iptables MARK target in an iptables PREROUTING rule with the mangle table:

* iptables -A PREROUTING -t mangle -i eth1 -j MARK --set-mark 0x1234

This rule will assign the value of 0x1234 to the SKB mark field of every packet incoming on eth1 before performing a routing lookup. You can also run an iptables rule that will check the mark field of every SKB to match a specified value and act upon it. Netfilter targets and iptables are discussed in Chapter 9, which deals with the netfilter subsystem.

* __u32 dropcount

The dropcount counter represents the number of dropped packets (sk_drops) of the sk_receive_queue of the assigned sock object (sk). See the sock_queue_rcv_skb() method in net/core/sock.c.

* __u32 reserved_tailroom: Used in the sk_stream_alloc_skb() method.

* sk_buff_data_t transport_header

The transport layer (L4) header.

Helper methods:

* skb_transport_header(const struct sk_buff *skb): Returns the transport header of the specified skb.

* skb_transport_header_was_set(const struct sk_buff *skb): Returns 1 if the transport_header of the specified skb is set.

* sk_buff_data_t network_header

The network layer (L3) header.

Helper method:

* skb_network_header(const struct sk_buff *skb): Returns the network header of the specified skb.

* sk_buff_data_t mac_header

The link layer (L2) header.

Helper methods:

* skb_mac_header(const struct sk_buff *skb): Returns the MAC header of the specified skb.

* skb_mac_header_was_set(const struct sk_buff *skb): Returns 1 if the mac_header of the specified skb was set.

* sk_buff_data_t tail

The tail of the data.

* sk_buff_data_t end

The end of the buffer. The tail cannot exceed end.

* unsigned char *head

The head of the buffer.

* unsigned char *data

The data head. The data block is allocated separately from the sk_buff allocation.

See in __alloc_skb(), net/core/skbuff.c:

data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);

Helper methods:

* skb_headroom(const struct sk_buff *skb): This method returns the headroom, which is the number of bytes of free space at the head of the specified skb (skb->data – skb->head). See Figure A-1.

* skb_tailroom(const struct sk_buff *skb): This method returns the tailroom, which is the number of bytes of free space at the tail of the specified skb (skb->end – skb->tail). See Figure A-1.

Figure A-1 shows the headroom and the tailroom of an SKB.

Figure A-1. Headroom and tailroom of an SKB

The following are some methods for handling buffers:

* skb_put(struct sk_buff *skb, unsigned int len): Adds data to a buffer; this method adds len bytes to the buffer of the specified skb and increments the length of the specified skb by the specified len.

* skb_push(struct sk_buff *skb, unsigned int len): Adds data to the start of a buffer; this method decrements the data pointer of the specified skb by the specified len and increments the length of the specified skb by the specified len.

* skb_pull(struct sk_buff *skb, unsigned int len): Removes data from the start of a buffer; this method increments the data pointer of the specified skb by the specified len and decrements the length of the specified skb by the specified len.

* skb_reserve(struct sk_buff *skb, int len): Increases the headroom of an empty skb by reducing the tail.
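To see how these methods combine in practice, here is a hedged sketch (build_rx_skb() is a hypothetical helper) of the classic pattern for preparing a freshly allocated SKB: reserve headroom first, then append the payload:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Hypothetical helper, for illustration only. */
static struct sk_buff *build_rx_skb(struct net_device *dev,
				    const void *payload, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

	if (!skb)
		return NULL;
	/* Reserve headroom (this also aligns the IP header); it moves
	 * skb->data and skb->tail forward without changing skb->len. */
	skb_reserve(skb, NET_IP_ALIGN);
	/* skb_put() grows the data area at the tail by len bytes. */
	memcpy(skb_put(skb, len), payload, len);
	return skb;
}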
After describing some methods for handling buffers, I continue with listing the members of the sk_buff structure:

* unsigned int truesize

The total memory allocated for the SKB (including the SKB structure itself and the size of the allocated data block).

* atomic_t users

A reference counter, initialized to 1; incremented by the skb_get() method and decremented by the kfree_skb() or consume_skb() methods. The kfree_skb() method decrements the usage counter; if it reaches 0, the method frees the SKB—otherwise, the method returns without freeing it.

Helper methods:

* skb_get(struct sk_buff *skb): Increments the users reference counter by 1.

* skb_shared(const struct sk_buff *skb): Returns true if the number of users is not 1.

* skb_share_check(struct sk_buff *skb, gfp_t pri): If the buffer is not shared, the original buffer is returned. If the buffer is shared, the buffer is cloned, and the old copy drops a reference. A new clone with a single reference is returned. When being called from interrupt context or with spinlocks held, the pri parameter (priority) must be GFP_ATOMIC. If memory allocation fails, NULL is returned.

* consume_skb(struct sk_buff *skb): Decrements the users reference counter and frees the SKB if the users reference counter is zero.

### struct skb_shared_info

The skb_shared_info struct is located at the end of the data block (skb_end_pointer(SKB)). It consists of only a few fields. Let's take a look at it:

struct skb_shared_info {
	unsigned char nr_frags;
	__u8 tx_flags;
	unsigned short gso_size;
	unsigned short gso_segs;
	unsigned short gso_type;
	struct sk_buff *frag_list;
	struct skb_shared_hwtstamps hwtstamps;
	__be32 ip6_frag_id;
	atomic_t dataref;
	void *destructor_arg;
	skb_frag_t frags[MAX_SKB_FRAGS];
};

The following is a description of some of the important members of the skb_shared_info structure:

* nr_frags: Represents the number of elements in the frags array.

* tx_flags can be:

* SKBTX_HW_TSTAMP: Generate a hardware time stamp.

* SKBTX_SW_TSTAMP: Generate a software time stamp.

* SKBTX_IN_PROGRESS: Device driver is going to provide a hardware timestamp.

* SKBTX_DEV_ZEROCOPY: Device driver supports Tx zero-copy buffers.

* SKBTX_WIFI_STATUS: Generate WiFi status information.

* SKBTX_SHARED_FRAG: Indication that at least one fragment might be overwritten.

* When working with fragmentation, there are cases when you work with a list of sk_buffs (frag_list), and there are cases when you work with the frags array. It depends mostly on whether the Scatter/Gather mode is set.

Helper methods:

* skb_is_gso(const struct sk_buff *skb): Returns true if the gso_size of the skb_shared_info associated with the specified skb is not 0.

* skb_is_gso_v6(const struct sk_buff *skb): Returns true if the gso_type of the skb_shared_info associated with the skb is SKB_GSO_TCPV6.

* skb_shinfo(skb): A macro that returns the skb_shared_info associated with the specified skb.

* skb_has_frag_list(const struct sk_buff *skb): Returns true if the frag_list of the skb_shared_info of the specified skb is not NULL.

* dataref: A reference counter of the skb_shared_info struct. It is set to 1 in the __alloc_skb() method, which allocates the SKB and initializes its skb_shared_info.
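As a small illustration of accessing skb_shared_info, here is a hedged sketch (frag_bytes() is a hypothetical helper) that walks the frags array of a nonlinear SKB:

#include <linux/skbuff.h>

/* Hypothetical helper: count the bytes held in paged fragments. */
static unsigned int frag_bytes(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int i, total = 0;

	for (i = 0; i < shinfo->nr_frags; i++)
		total += skb_frag_size(&shinfo->frags[i]);
	/* With no frag_list, this equals skb->data_len. */
	return total;
}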
## The net_device Structure

The net_device struct represents the network device. It can be a physical device, like an Ethernet device, or it can be a software device, like a bridge device or a VLAN device. As with the sk_buff structure, I will list its important members. The net_device struct is defined in include/linux/netdevice.h:

* char name[IFNAMSIZ]

The name of the network device. This is the name that you see with the ifconfig or ip commands (for example, eth0, eth1, and so on). The maximum length of the interface name is 16 characters. In newer distributions with biosdevname support, the naming scheme corresponds to the physical location of the network device: PCI network devices are named p<slot>p<port> according to the chassis labels (for example, p2p1), and embedded ports (on-motherboard interfaces) are named em<port>—for example, em1, em2, and so on. There is a special suffix for SR-IOV devices and Network Partitioning (NPAR)–enabled devices. Biosdevname is developed by Dell: http://linux.dell.com/biosdevname. See also this white paper: http://linux.dell.com/files/whitepapers/consistent_network_device_naming_in_linux.pdf.

Helper method:

* dev_valid_name(const char *name): Checks the validity of the specified network device name. A network device name must obey certain restrictions in order to enable creating corresponding sysfs entries. For example, it cannot be "." or ".."; its length should not exceed 16 characters. Changing the interface name can be done like this, for example: ip link set p2p1 name <newName>. So, for example, ip link set p2p1 name a12345678901234567 will fail with this message: Error: argument "a12345678901234567" is wrong: "name" too long. The reason is that you tried to set a device name that is longer than 16 characters. And running ip link set p2p1 name . will fail with RTNETLINK answers: Invalid argument, since you tried to set the device name to be ".", which is an invalid value. See dev_valid_name() in net/core/dev.c.

* struct hlist_node name_hlist

This is a hash table of network devices, indexed by the network device name. A lookup in this hash table is performed by dev_get_by_name(). Insertion into this hash table is performed by the list_netdevice() method, and removal from this hash table is done with the unlist_netdevice() method.

* char *ifalias

SNMP alias interface name. Its length can be up to 256 (IFALIASZ).

You can create an alias to a network device using this command line:

ip link set <deviceName> alias myalias

The ifalias name is exported via sysfs by /sys/class/net/<deviceName>/ifalias.

Helper method:

* dev_set_alias(struct net_device *dev, const char *alias, size_t len): Sets the specified alias to the specified network device. The specified len parameter is the number of bytes of the specified alias to be copied; if the specified len is greater than 256 (IFALIASZ), the method will fail with -EINVAL.

* unsigned int irq

The Interrupt Request (IRQ) number of the device. The network driver should call request_irq() to register itself with this IRQ number. Typically this is done in the probe() callback of the network device driver. The prototype of the request_irq() method is: int request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev). The first argument is the IRQ number. The specified handler is the Interrupt Service Routine (ISR).
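For illustration, here is a hedged sketch (my_isr() and my_probe_fragment() are hypothetical) of registering a shared interrupt handler from a driver's probe() routine:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

/* Hypothetical ISR, for illustration only. */
static irqreturn_t my_isr(int irq, void *dev_id)
{
	/* Acknowledge the hardware, schedule NAPI, and so on. */
	return IRQ_HANDLED;
}

/* Hypothetical probe() fragment. */
static int my_probe_fragment(struct net_device *dev)
{
	/* IRQF_SHARED allows the IRQ line to be shared with other
	 * devices; dev is passed back to the ISR as dev_id. */
	return request_irq(dev->irq, my_isr, IRQF_SHARED, dev->name, dev);
}

The matching free_irq() call belongs in the teardown path, as the text explains next.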
The network driver should call the free_irq() method when it no longer uses this irq. In many cases, this irq is shared (the request_irq() method is called with the IRQF_SHARED flag). You can view the number of interrupts that occurred on each core by running cat /proc/interrupts. You can set the SMP affinity of the irq by echo <irqMask> > /proc/irq/<irq number>/smp_affinity.

In an SMP machine, setting the SMP affinity of interrupts means setting which cores are allowed to handle the interrupt. Some PCI network interfaces use Message Signaled Interrupts (MSIs). PCI MSI interrupts are never shared, so the IRQF_SHARED flag is not set when calling the request_irq() method in these network drivers. See more info in Documentation/PCI/MSI-HOWTO.txt.

* unsigned long state

A flag that can be one of these values:

* __LINK_STATE_START: This flag is set when the device is brought up, by the dev_open() method, and is cleared when the device is brought down.

* __LINK_STATE_PRESENT: This flag is set in device registration, by the register_netdevice() method, and is cleared in the netif_device_detach() method.

* __LINK_STATE_NOCARRIER: This flag shows whether the device detected loss of carrier. It is set by the netif_carrier_off() method and cleared by the netif_carrier_on() method. It is exported by sysfs via /sys/class/net/<deviceName>/carrier.

* __LINK_STATE_LINKWATCH_PENDING: This flag is set by the linkwatch_fire_event() method and cleared by the linkwatch_do_dev() method.

* __LINK_STATE_DORMANT: The dormant state indicates that the interface is not able to pass packets (that is, it is not "up"); however, this is a "pending" state, waiting for some external event. See section 3.1.12, "New states for IfOperStatus," in RFC 2863, "The Interfaces Group MIB."

The state flag can be set with the generic set_bit() method.

Helper methods:

* netif_running(const struct net_device *dev): Returns true if the __LINK_STATE_START flag of the state field of the specified device is set.

* netif_device_present(struct net_device *dev): Returns true if the __LINK_STATE_PRESENT flag of the state field of the specified device is set.

* netif_carrier_ok(const struct net_device *dev): Returns true if the __LINK_STATE_NOCARRIER flag of the state field of the specified device is not set.

These three methods are defined in include/linux/netdevice.h.

* netdev_features_t features

The set of currently active device features. These features should be changed only by the network core or in error paths of the ndo_set_features() callback. Network driver developers are responsible for setting the initial set of the device features. Sometimes they can use a wrong combination of features. The network core fixes this by removing an offending feature in the netdev_fix_features() method, which is invoked when the network interface is registered (in the register_netdevice() method); a proper message is also written to the kernel log.

I will mention some net_device features here and discuss them. For the full list of net_device features, look in include/linux/netdev_features.h.

* NETIF_F_IP_CSUM means that the network device can checksum L4 IPv4 TCP/UDP packets.

* NETIF_F_IPV6_CSUM means that the network device can checksum L4 IPv6 TCP/UDP packets.
* NETIF_F_HW_CSUM means that the device can checksum in hardware all L4 packets. You cannot activate NETIF_F_HW_CSUM together with NETIF_F_IP_CSUM, or together with NETIF_F_IPV6_CSUM, because that would cause duplicate checksumming.

If the driver features set includes both the NETIF_F_HW_CSUM and NETIF_F_IP_CSUM features, then you will get a kernel message saying "mixed HW and IP checksum settings." In such a case, the netdev_fix_features() method removes the NETIF_F_IP_CSUM feature. If the driver features set includes both the NETIF_F_HW_CSUM and NETIF_F_IPV6_CSUM features, you again get the same message as in the previous case; this time, the NETIF_F_IPV6_CSUM feature is the one that is removed by the netdev_fix_features() method. In order for a device to support TSO (TCP Segmentation Offload), it also needs to support Scatter/Gather and TCP checksumming; this means that both the NETIF_F_SG and NETIF_F_IP_CSUM features must be set. If the driver features set does not include the NETIF_F_SG feature, then you will get a kernel message saying "Dropping TSO features since no SG feature," and the NETIF_F_ALL_TSO feature will be removed. If the driver features set includes neither NETIF_F_IP_CSUM nor NETIF_F_HW_CSUM, then you will get a kernel message saying "Dropping TSO features since no CSUM feature," and the NETIF_F_TSO feature will be removed.

Note

In recent kernels, if the CONFIG_DYNAMIC_DEBUG kernel config item is set, you might need to explicitly enable printing of some messages, via the <debugfs>/dynamic_debug/control interface (debugfs is usually mounted at /sys/kernel/debug). See Documentation/dynamic-debug-howto.txt.

* NETIF_F_LLTX is the LockLess TX flag and is considered deprecated. When it is set, you don't use the generic Tx lock (this is why it is called LockLess TX). See the following macro (HARD_TX_LOCK) from net/core/dev.c:

#define HARD_TX_LOCK(dev, txq, cpu) { \
	if ((dev->features & NETIF_F_LLTX) == 0) { \
		__netif_tx_lock(txq, cpu); \
	} \
}

NETIF_F_LLTX is used in tunnel drivers like VXLAN and VETH, and in the IP over IP (IPIP) tunneling driver. For example, in the IPIP tunnel module, you set the NETIF_F_LLTX flag in the ipip_tunnel_setup() method (net/ipv4/ipip.c).

The NETIF_F_LLTX flag is also used in a few drivers that have implemented their own Tx lock, like the cxgb network driver.

In drivers/net/ethernet/chelsio/cxgb/cxgb2.c, you have:

static int __devinit init_one(struct pci_dev *pdev,
			      const struct pci_device_id *ent)
{
	...
	netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM |
			    NETIF_F_RXCSUM | NETIF_F_LLTX;
	...
}

* NETIF_F_GRO is used to indicate that the device supports GRO (Generic Receive Offload). With GRO, incoming packets are merged at reception time. The GRO feature improves network performance. GRO replaced LRO (Large Receive Offload), which was limited to TCP/IPv4. This flag is checked in the beginning of the dev_gro_receive() method; devices that do not have this flag set will not perform the GRO handling part in this method. A driver that wants to use GRO should call the napi_gro_receive() method in the Rx path of the driver. You can enable/disable GRO with ethtool, by ethtool -K <deviceName> gro on / ethtool -K <deviceName> gro off, respectively. You can check whether GRO is set by running ethtool -k <deviceName> and looking at the gro field.

* NETIF_F_GSO is set to indicate that the device supports Generic Segmentation Offload (GSO). GSO is a generalization of a previous solution called TSO (TCP Segmentation Offload), which dealt only with TCP in IPv4.
GSO can also handle IPv6, UDP, and other protocols. GSO is a performance optimization, based on traversing the networking stack once instead of many times, for big packets. So the idea is to avoid segmentation in Layer 4 and to defer segmentation as much as possible. The sysadmin can enable/disable GSO with ethtool, by ethtool -K <deviceName> gso on / ethtool -K <deviceName> gso off, respectively. You can check whether GSO is set by running ethtool -k <deviceName> and looking at the gso field. To work with GSO, you should work in Scatter/Gather mode; the NETIF_F_SG flag must be set.

* NETIF_F_NETNS_LOCAL is set for network namespace local devices. These are network devices that are not allowed to move between network namespaces. The loopback, VXLAN, and PPP network devices are examples of namespace local devices. All these devices have the NETIF_F_NETNS_LOCAL flag set. A sysadmin can check whether an interface has the NETIF_F_NETNS_LOCAL flag set or not by ethtool -k <deviceName>. This feature is fixed and cannot be changed by ethtool. Trying to move a network device of this type to a different namespace results in an error (-EINVAL). For details, look in the dev_change_net_namespace() method (net/core/dev.c). When deleting a network namespace, devices that do not have the NETIF_F_NETNS_LOCAL flag set are moved to the default initial network namespace (init_net). Network namespace local devices that have the NETIF_F_NETNS_LOCAL flag set are not moved to the default initial network namespace (init_net), but are deleted.

* NETIF_F_HW_VLAN_CTAG_RX is for use by devices that support VLAN Rx hardware acceleration. It was formerly called NETIF_F_HW_VLAN_RX and was renamed in kernel 3.10, when support for 802.1ad was added. "CTAG" was added to indicate that these devices differ from "STAG" devices (service provider tagging). A device driver that sets this feature must also define the ndo_vlan_rx_add_vid() and ndo_vlan_rx_kill_vid() callbacks. Failure to do so will prevent device registration and result in a "Buggy VLAN acceleration in driver" kernel error message.

* NETIF_F_HW_VLAN_CTAG_TX is for use by devices that support VLAN Tx hardware acceleration. It was formerly called NETIF_F_HW_VLAN_TX and was renamed in kernel 3.10, when support for 802.1ad was added.

* NETIF_F_VLAN_CHALLENGED is set for devices that can't handle VLAN packets. Setting this feature avoids registration of a VLAN device on top of them. Let's take a look at the VLAN registration method:

static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
{
	int err;
	...
	err = vlan_check_real_dev(real_dev, vlan_id);

The first thing the vlan_check_real_dev() method does is to check the network device features and return an error if the NETIF_F_VLAN_CHALLENGED feature is set:

int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
{
	const char *name = real_dev->name;

	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
		pr_info("VLANs not supported on %s\n", name);
		return -EOPNOTSUPP;
	}
	...
}

For example, some types of Intel e100 network device drivers set the NETIF_F_VLAN_CHALLENGED feature (see e100_probe() in drivers/net/ethernet/intel/e100.c).

You can check whether NETIF_F_VLAN_CHALLENGED is set by running ethtool -k <deviceName> and looking at the vlan-challenged field. This is a fixed value that you cannot change with the ethtool command.

* NETIF_F_SG is set when the network interface supports Scatter/Gather IO.
You can enable and disable Scatter/Gather with ethtool, by ethtool -K <deviceName> sg on / ethtool -K <deviceName> sg off, respectively. You can check whether Scatter/Gather is set by running ethtool -k <deviceName> and looking at the sg field.

* NETIF_F_HIGHDMA is set if the device can perform access by DMA to high memory. The practical implication of setting this feature is that the ndo_start_xmit() callback of the net_device_ops object can manage SKBs which have frags elements in high memory. You can check whether NETIF_F_HIGHDMA is set by running ethtool -k <deviceName> and looking at the highdma field. This is a fixed value that you cannot change with the ethtool command.

* netdev_features_t hw_features

The set of features that are changeable. This means that their state may possibly be changed (enabled or disabled) for a particular device by a user's request. This set should be initialized in the ndo_init() callback and not changed later.

* netdev_features_t wanted_features

The set of features that were requested by the user. A user may request to change various offloading features—for example, by running ethtool -K eth1 rx on. This generates a feature change event notification (NETDEV_FEAT_CHANGE), which is sent by the netdev_features_change() method.

* netdev_features_t vlan_features

The set of features whose state is inherited by child VLAN devices. For example, let's look at the rtl_init_one() method, which is the probe callback of the r8169 network device driver (see Chapter 14):

int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	...
	dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM |
			     NETIF_F_TSO | NETIF_F_HIGHDMA;
	...
}

(drivers/net/ethernet/realtek/r8169.c)

This initialization means that all child VLAN devices will have these features. For example, let's say that your eth0 device is an r8169 device, and you add a VLAN device thus: vconfig add eth0 100. Then, in the initialization in the VLAN module, there is this code related to vlan_features:

static int vlan_dev_init(struct net_device *dev)
{
	...
	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
	...
}

(net/8021q/vlan_dev.c)

This means that it sets the features of the VLAN child device to be the vlan_features of the real device (which is eth0 in this case), which were set according to what you saw earlier in the rtl_init_one() method.

* netdev_features_t hw_enc_features

The mask of features inherited by encapsulating devices. This field indicates what encapsulation offloads the hardware is capable of doing, and drivers will need to set them appropriately. For more info about the network device features, see Documentation/networking/netdev-features.txt.

* int ifindex

The ifindex (interface index) is a unique device identifier. This index is incremented by 1 each time you create a new network device, by the dev_new_index() method. The first network device you create, which is almost always the loopback device, has an ifindex of 1. Cyclic integer overflow is handled by the method that handles assignment of the ifindex number. The ifindex is exported by sysfs via /sys/class/net/<deviceName>/ifindex.

* struct net_device_stats stats

The statistics struct, which was left as a legacy, includes fields like the number of rx_packets or the number of tx_packets. New device drivers use the rtnl_link_stats64 struct (defined in include/uapi/linux/if_link.h) instead of the net_device_stats struct. Most of the network drivers implement the ndo_get_stats64() callback of net_device_ops (or the ndo_get_stats() callback of net_device_ops, when working with the older API). The statistics are exported via /sys/class/net/<deviceName>/statistics.
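Here is a hedged, minimal sketch of an ndo_get_stats64() callback (the driver private struct my_priv and its counters are hypothetical), using the 3.x-era signature that returns the storage pointer:

#include <linux/netdevice.h>

/* Hypothetical driver private data, for illustration only. */
struct my_priv {
	u64 rx_packets;
	u64 tx_packets;
};

static struct rtnl_link_stats64 *
my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
	struct my_priv *priv = netdev_priv(dev);

	/* Copy the driver's 64-bit counters into the storage
	 * supplied by the core. */
	stats->rx_packets = priv->rx_packets;
	stats->tx_packets = priv->tx_packets;
	return stats;
}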
Some drivers implement the get_ethtool_stats() callback. These drivers show statistics by ethtool -S <deviceName>. See, for example, the rtl8169_get_ethtool_stats() method in drivers/net/ethernet/realtek/r8169.c.

* atomic_long_t rx_dropped

A counter of the number of packets that were dropped in the Rx path by the core network stack. This counter should not be used by drivers. Do not confuse the rx_dropped field of the net_device with the dropped field of the softnet_data struct. The softnet_data struct represents a per-CPU object. They are not equivalent, because the rx_dropped of the net_device might be incremented in several methods, whereas the dropped counter of softnet_data is incremented only by the enqueue_to_backlog() method (net/core/dev.c). The dropped counter of softnet_data is exported by /proc/net/softnet_stat. In /proc/net/softnet_stat you have one line per CPU. The first column is the total packets counter, and the second one is the dropped packets counter.

For example:

cat /proc/net/softnet_stat

00000076 00000001 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000005 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

You see here one line per CPU (you have two CPUs); for the first CPU, you see 118 total packets (hex 0x76), where one packet is dropped. For the second CPU, you see 5 total packets and 0 dropped.

* struct net_device_ops *netdev_ops

The netdev_ops structure includes pointers for several callback methods that you want to define if you want to override the default behavior. Here are some callbacks of netdev_ops:

* The ndo_init() callback is called when a network device is registered.

* The ndo_uninit() callback is called when the network device is unregistered or when the registration fails.

* The ndo_open() callback handles the change of device state, when a network device state is being changed from down to up.

* The ndo_stop() callback is called when a network device state is being changed to down.

* The ndo_validate_addr() callback is called to check whether the MAC address is valid. Many network drivers set the generic eth_validate_addr() method to be the ndo_validate_addr() callback. The generic eth_validate_addr() method returns true if the MAC address is not a multicast address and is not all zeroes.

* The ndo_set_mac_address() callback sets the MAC address. Many network drivers set the generic eth_mac_addr() method to be the ndo_set_mac_address() callback of struct net_device_ops for setting their MAC address—for example, the VETH driver (drivers/net/veth.c) or the VXLAN driver (drivers/net/vxlan.c).

* The ndo_start_xmit() callback handles packet transmission. It cannot be NULL.

* The ndo_select_queue() callback is used to select a Tx queue, when working with multiqueues. If the ndo_select_queue() callback is not set, then __netdev_pick_tx() is called. See the implementation of the netdev_pick_tx() method in net/core/flow_dissector.c.

* The ndo_change_mtu() callback handles modifying the MTU. It should check that the specified MTU is not less than 68, which is the minimum MTU. In many cases, network drivers set the ndo_change_mtu() callback to be the generic eth_change_mtu() method. The eth_change_mtu() method should be overridden if jumbo frames are supported.

* The ndo_do_ioctl() callback is called when getting an IOCTL request which is not handled by the generic interface code.

* The ndo_tx_timeout() callback is called when the transmitter was idle for quite a while (for watchdog usage).

* The ndo_add_slave() callback is called to set a specified network device as a slave of another network device. It is used, for example, in the team network driver and in the bonding network driver.

* The ndo_del_slave() callback is called to remove a previously enslaved network device.

* The ndo_set_features() callback is called to update the configuration of a network device with new features.

* The ndo_vlan_rx_add_vid() callback is called when registering a VLAN id, if the network device supports VLAN filtering (the NETIF_F_HW_VLAN_FILTER flag is set in the device features).

* The ndo_vlan_rx_kill_vid() callback is called when unregistering a VLAN id, if the network device supports VLAN filtering (the NETIF_F_HW_VLAN_FILTER flag is set in the device features).

Note

From kernel 3.10, the NETIF_F_HW_VLAN_FILTER flag was renamed to NETIF_F_HW_VLAN_CTAG_FILTER.

* There are also several callbacks for handling SR-IOV devices, for example, ndo_set_vf_mac() and ndo_set_vf_vlan().

Before kernel 2.6.29, there was a callback named set_multicast_list() for the addition of multicast addresses; it was replaced by the ndo_set_rx_mode() callback, which is invoked from the dev_set_rx_mode() method (net/core/dev.c). The ndo_set_rx_mode() callback is called primarily whenever the unicast or multicast address lists or the network interface flags are updated.
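To show how a driver wires up these callbacks, here is a hedged, minimal sketch (the my_* names are hypothetical); real drivers fill in many more callbacks:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

/* Hypothetical callbacks, for illustration only. */
static int my_open(struct net_device *dev)
{
	return 0;	/* bring the hardware up here */
}

static int my_stop(struct net_device *dev)
{
	return 0;	/* bring the hardware down here */
}

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* hand skb to the hardware Tx queue here */
	return NETDEV_TX_OK;
}

static const struct net_device_ops my_netdev_ops = {
	.ndo_open		= my_open,
	.ndo_stop		= my_stop,
	.ndo_start_xmit		= my_start_xmit,	/* must not be NULL */
	.ndo_validate_addr	= eth_validate_addr,	/* generic helper */
	.ndo_set_mac_address	= eth_mac_addr,		/* generic helper */
	.ndo_change_mtu		= eth_change_mtu,	/* generic helper */
};

/* Typically assigned in the probe() or setup() routine:
 * dev->netdev_ops = &my_netdev_ops;
 */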
* struct ethtool_ops *ethtool_ops

The ethtool_ops structure includes pointers for several callbacks for handling offloads, getting and setting various device settings, reading registers, getting statistics, reading the RX flow hash indirection table, WakeOnLAN parameters, and many more. If the network driver does not initialize the ethtool_ops object, the networking core provides a default empty ethtool_ops object named default_ethtool_ops. The management of ethtool_ops is done in net/core/ethtool.c.

Helper method:

* SET_ETHTOOL_OPS(netdev, ops): A macro that sets the specified ethtool_ops for the specified net_device.

You can view the offload parameters of a network interface device by running ethtool -k <deviceName>. You can set some offload parameters of a network interface device by running ethtool -K <deviceName> <offloadParameter> on/off. See man 8 ethtool.

* const struct header_ops *header_ops

The header_ops struct includes callbacks for creating the Layer 2 header, parsing it, rebuilding it, and more. For Ethernet it is eth_header_ops, defined in net/ethernet/eth.c.

* unsigned int flags

The interface flags of the network device that you can see from userspace. Here are some flags (for a full list, see include/uapi/linux/if.h):

* The IFF_UP flag is set when the interface state is changed from down to up.

* IFF_PROMISC is set when the interface is in promiscuous mode (receives all packets). When running sniffers like wireshark or tcpdump, the network interface is in promiscuous mode.

* IFF_LOOPBACK is set for the loopback device.

* IFF_NOARP is set for devices that do not use the ARP protocol. IFF_NOARP is set, for example, in tunnel devices (see, for example, the ipip_tunnel_setup() method, net/ipv4/ipip.c).

* IFF_POINTOPOINT is set for PPP devices. See, for example, the ppp_setup() method, drivers/net/ppp/ppp_generic.c.
* IFF_MASTER is set for master devices. See, for example, for bonding devices, the bond_setup() method in drivers/net/bonding/bond_main.c.

* The IFF_LIVE_ADDR_CHANGE flag indicates that the device supports hardware address modification while it is running. See the eth_mac_addr() method in net/ethernet/eth.c.

* The IFF_UNICAST_FLT flag is set when the network driver handles unicast address filtering.

* IFF_BONDING is set for a bonding master device or a bonding slave device. The bonding driver provides a method for aggregating multiple network interfaces into a single logical interface.

* IFF_TEAM_PORT is set for a device used as a team port. The teaming driver is a load-balancing network software driver intended to replace the bonding driver.

* IFF_MACVLAN_PORT is set for a device used as a macvlan port.

* IFF_EBRIDGE is set for an Ethernet bridging device.

The flags field is exported by sysfs via /sys/class/net/<deviceName>/flags.

Some of these flags can be set by userspace tools. For example, ifconfig <deviceName> -arp will set the IFF_NOARP network interface flag, and ifconfig <deviceName> arp will clear the IFF_NOARP flag. Note that you can do the same with the iproute2 ip command: ip link set dev <deviceName> arp on and ip link set dev <deviceName> arp off.

* unsigned int priv_flags

The interface flags, which are invisible from userspace. For example, IFF_EBRIDGE for a bridge interface, IFF_BONDING for a bonding interface, or IFF_SUPP_NOFCS for an interface that supports sending a custom FCS.

Helper methods:

* netif_supports_nofcs(): Returns true if IFF_SUPP_NOFCS is set in the priv_flags of the specified device.

* is_vlan_dev(struct net_device *dev): Returns 1 if the IFF_802_1Q_VLAN flag is set in the priv_flags of the specified network device.

* unsigned short gflags

Global flags (kept as legacy).

* unsigned short padded

How much padding is added by the alloc_netdev() method.

* unsigned char operstate

RFC 2863 operstate.

* unsigned char link_mode

Mapping policy to operstate.

* unsigned int mtu

The network interface MTU (Maximum Transmission Unit) value: the maximum size of a frame that the device can handle. RFC 791 sets 68 as a minimum MTU. Each protocol has an MTU of its own. The default MTU for Ethernet is 1,500 bytes; it is set in the ether_setup() method, net/ethernet/eth.c. Ethernet packets with sizes larger than 1,500 bytes, up to 9,000 bytes, are called jumbo frames. The network interface MTU is exported by sysfs via /sys/class/net/<deviceName>/mtu.

Helper method:

* dev_set_mtu(struct net_device *dev, int new_mtu): Changes the MTU of the specified device to the new value specified by the new_mtu parameter.

The sysadmin can change the MTU of a network interface to 1,400, for example, in one of the following ways:

ifconfig <deviceName> mtu 1400
ip link set <deviceName> mtu 1400
echo 1400 > /sys/class/net/<deviceName>/mtu

Many drivers implement the ndo_change_mtu() callback to change the MTU in order to perform driver-specific needed actions (like resetting the network card).

* unsigned short type

The network interface hardware type. For example, for Ethernet it is ARPHRD_ETHER, and it is set in ether_setup() in net/ethernet/eth.c. For a PPP interface, it is ARPHRD_PPP, and it is set in the ppp_setup() method in drivers/net/ppp/ppp_generic.c. The type is exported by sysfs via /sys/class/net/<deviceName>/type.

* unsigned short hard_header_len

The hardware header length. Ethernet headers, for example, consist of a MAC source address, a MAC destination address, and a type. The MAC source and destination addresses are 6 bytes each, and the type is 2 bytes, so the Ethernet header length is 14 bytes. The Ethernet header length is set to 14 (ETH_HLEN) in the ether_setup() method, net/ethernet/eth.c. The ether_setup() method is responsible for initializing some Ethernet device defaults, like the hard header length, Tx queue length, MTU, type, and more.
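Since ether_setup() keeps coming up, here is a hedged sketch (my_dev_setup() and the "myeth%d" name template are hypothetical; the three-argument alloc_netdev() form shown matches kernels of this book's era) of how a driver might build an Ethernet-style device around it:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical setup callback, for illustration only. */
static void my_dev_setup(struct net_device *dev)
{
	/* Ethernet defaults: type, MTU, hard_header_len, broadcast
	 * address, Tx queue length, and so on. */
	ether_setup(dev);
	dev->tx_queue_len = 500;	/* override a default if needed */
}

/* Allocation (error handling omitted); 0 means no private data:
 * struct net_device *dev = alloc_netdev(0, "myeth%d", my_dev_setup);
 */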
* unsigned char perm_addr[MAX_ADDR_LEN]

The permanent hardware address (MAC address) of the device.

* unsigned char addr_assign_type

The hardware address assignment type; can be one of the following:

* NET_ADDR_PERM

* NET_ADDR_RANDOM

* NET_ADDR_STOLEN

* NET_ADDR_SET

By default, the MAC address is permanent (NET_ADDR_PERM). If the MAC address was generated with a helper method named eth_hw_addr_random(), the type of the MAC address is NET_ADDR_RANDOM. The type of the MAC address is stored in the addr_assign_type member of the net_device. Also, when changing the MAC address of the device with eth_mac_addr(), you reset the addr_assign_type with ~NET_ADDR_RANDOM (if it was marked as NET_ADDR_RANDOM before). When a network device is registered (by the register_netdevice() method), if the addr_assign_type equals NET_ADDR_PERM, dev->perm_addr is set to be dev->dev_addr. When you set a MAC address, you set the addr_assign_type to be NET_ADDR_SET. This indicates that the MAC address of a device has been set by the dev_set_mac_address() method. The addr_assign_type is exported by sysfs via /sys/class/net/<deviceName>/addr_assign_type.

* unsigned char addr_len

The hardware address length in octets. For Ethernet addresses, it is 6 (ETH_ALEN) bytes and is set in the ether_setup() method. The addr_len is exported by sysfs via /sys/class/net/<deviceName>/addr_len.

* unsigned char neigh_priv_len

Used in the neigh_alloc() method, net/core/neighbour.c; neigh_priv_len is initialized only in the ATM code (atm/clip.c).

* struct netdev_hw_addr_list uc

Unicast MAC addresses list, initialized by the dev_uc_init() method. There are three types of packets in Ethernet: unicast, multicast, and broadcast. Unicast is destined for one machine, multicast is destined for a group of machines, and broadcast is destined for all the machines in the LAN.

Helper methods:

* netdev_uc_empty(dev): Returns 1 if the unicast list of the specified device is empty (its count field is 0).

* dev_uc_flush(struct net_device *dev): Flushes the unicast addresses of the specified network device and zeroes the count field.

* struct netdev_hw_addr_list mc

Multicast MAC addresses list, initialized by the dev_mc_init() method.

Helper methods:

* netdev_mc_empty(dev): Returns 1 if the multicast list of the specified device is empty (its count field is 0).

* dev_mc_flush(struct net_device *dev): Flushes the multicast addresses of the specified network device and zeroes the count field.

* unsigned int promiscuity

A counter of the times a network interface card is told to work in promiscuous mode. With promiscuous mode, packets with a MAC destination address that is different from the interface MAC address are not rejected. The promiscuity counter is used, for example, to enable more than one sniffing client; when opening several sniffing clients (like wireshark), this counter is incremented by 1 for each client you open, and closing a client decrements the promiscuity counter. When the last instance of a sniffing client is closed, promiscuity will be set to 0, and the device will exit from working in promiscuous mode.
It is used also in the bridging subsystem, as the bridge interface needs to work in promiscuous mode. So when adding a bridge interface, the network interface card is set to work in promiscuous mode. See the call to the dev_set_promiscuity() method in br_add_if(), net/bridge/br_if.c.

Helper method:

* dev_set_promiscuity(struct net_device *dev, int inc): Increments/decrements the promiscuity counter of the specified network device according to the specified increment. The dev_set_promiscuity() method can get a positive increment or a negative increment parameter. As long as the promiscuity counter remains above zero, the interface remains in promiscuous mode. Once it reaches zero, the device reverts to normal filtering operation. Because promiscuity is an integer, the dev_set_promiscuity() method takes cyclic integer overflow into account: it handles the case when the promiscuity counter is incremented after it has already reached the maximum positive value an unsigned integer can hold.

* unsigned int allmulti

The allmulti counter of the network device enables or disables the allmulticast mode. When selected, all multicast packets on the network will be received by the interface. You can set a network device to work in allmulticast mode by ifconfig eth0 allmulti. You disable the allmulti flag by ifconfig eth0 -allmulti.

Enabling/disabling the allmulticast mode can also be performed with the ip command:

ip link set p2p1 allmulticast on
ip link set p2p1 allmulticast off

You can also see the allmulticast state by inspecting the flags that are shown by the ip command:

ip addr show

flags=4610<BROADCAST,ALLMULTI,MULTICAST> mtu 1500

Helper method:

* dev_set_allmulti(struct net_device *dev, int inc): Increments/decrements the allmulti counter of the specified network device according to the specified increment (which can be a positive or a negative integer). The dev_set_allmulti() method also sets the IFF_ALLMULTI flag of the network device when setting the allmulticast mode and removes this flag when disabling the allmulticast mode.

The next three fields are protocol-specific pointers:

* struct in_device __rcu *ip_ptr

This pointer is assigned to a pointer to struct in_device, which represents IPv4-specific data, in inetdev_init(), net/ipv4/devinet.c.

* struct inet6_dev __rcu *ip6_ptr

This pointer is assigned to a pointer to struct inet6_dev, which represents IPv6-specific data, in ipv6_add_dev(), net/ipv6/addrconf.c.

* struct wireless_dev *ieee80211_ptr

This is a pointer for the wireless device, assigned in the ieee80211_if_add() method, net/mac80211/iface.c.

* unsigned long last_rx

Time of the last Rx. It should not be set by network device drivers, unless really needed. It is used, for example, in the bonding driver code.

* struct list_head dev_list

The global list of network devices. Insertion into the list is done with the list_netdevice() method, when the network device is registered. Removal from the list is done with the unlist_netdevice() method, when the network device is unregistered.

* struct list_head napi_list

NAPI stands for New API, a technique by which the network driver works in polling mode, and not in interrupt-driven mode, when it is under high traffic. Using NAPI under high traffic has been proven to improve performance.
When working with NAPI, instead of getting an interrupt for each received packet, the network stack buffers the packets, and from time to time the kernel triggers the poll method that the driver registered with the netif_napi_add() method. The driver nonetheless starts out in interrupt-driven mode: when there is an interrupt for the first received packet, you reach the interrupt service routine (ISR), which is the method that was registered with request_irq(). Then the driver disables interrupts and notifies NAPI to take control, usually by calling the __napi_schedule() method from the ISR. See, for example, the cpsw_interrupt() method in drivers/net/ethernet/ti/cpsw.c.

When the traffic is low, the network driver switches back to interrupt-driven mode. Nowadays, most network drivers work with NAPI. The napi_list object is the list of napi_struct objects; the netif_napi_add() method adds napi_struct objects to this list, and the netif_napi_del() method deletes napi_struct objects from this list. When calling the netif_napi_add() method, the driver should specify its polling method and a weight parameter. The weight is a limit on the number of packets the driver will pass to the stack in each polling cycle. It is recommended to use a weight of 64. If a driver attempts to call netif_napi_add() with a weight higher than 64 (NAPI_POLL_WEIGHT), there is a kernel error message. NAPI_POLL_WEIGHT is defined in include/linux/netdevice.h.

The network driver should call napi_enable() to enable NAPI scheduling. Usually this is done in the ndo_open() callback of the net_device_ops object. The network driver should call napi_disable() to disable NAPI scheduling. Usually this is done in the ndo_stop() callback of net_device_ops. NAPI is implemented using softirqs. The softirq handler is the net_rx_action() method, and it is registered by calling open_softirq(NET_RX_SOFTIRQ, net_rx_action) in the net_dev_init() method in net/core/dev.c. The net_rx_action() method invokes the poll method of the network driver which was registered with NAPI. The maximum number of packets (taken from all interfaces which are registered for polling) in one polling cycle (NAPI poll) is by default 300. It is the netdev_budget variable, defined in net/core/dev.c, and it can be modified via a procfs entry, /proc/sys/net/core/netdev_budget. In the past, you could change the weight per device by writing values to a procfs entry, but currently the /sys/class/net/<deviceName>/weight sysfs entry is removed; see Documentation/sysctl/net.txt. I should also mention the napi_complete() method: when a network driver wants to return to interrupt-driven mode, it should call the napi_complete() method to remove itself from the polling list.

* struct list_head unreg_list

The list of unregistered network devices. Devices are added to this list when they are unregistered.

* unsigned char *dev_addr

The MAC address of the network interface. Sometimes you want to assign a random MAC address. You do that by calling the eth_hw_addr_random() method, which also sets the addr_assign_type to NET_ADDR_RANDOM.

The dev_addr field is exported by sysfs via /sys/class/net/<deviceName>/address.

You can change dev_addr with userspace tools like ifconfig or the ip command of iproute2.

Helper methods: many times you invoke the following helper methods on Ethernet addresses in general, and on the dev_addr field of a network device in particular:

* is_zero_ether_addr(const u8 *addr): Returns true if the address is all zeroes.

* is_multicast_ether_addr(const u8 *addr): Returns true if the address is a multicast address. By definition, the broadcast address is also a multicast address.

* is_valid_ether_addr(const u8 *addr): Returns true if the specified MAC address is not 00:00:00:00:00:00, is not a multicast address, and is not a broadcast address (FF:FF:FF:FF:FF:FF).
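Putting two of these helpers together, here is a hedged sketch (fixup_mac_address() is hypothetical) of a pattern many drivers follow when the hardware does not provide a usable MAC address:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

/* Hypothetical fragment, for illustration only. */
static void fixup_mac_address(struct net_device *dev)
{
	/* Reject all-zero, multicast, and broadcast addresses. */
	if (!is_valid_ether_addr(dev->dev_addr))
		/* Generate a random MAC; this also sets
		 * addr_assign_type to NET_ADDR_RANDOM. */
		eth_hw_addr_random(dev);
}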
Helper methods: Many times you invoke the following helper methods on Ethernet addresses in general and on the dev_addr field of a network device in particular:

  * is_zero_ether_addr(const u8 *addr): Returns true if the address is all zeroes.

  * is_multicast_ether_addr(const u8 *addr): Returns true if the address is a multicast address. By definition the broadcast address is also a multicast address.

  * is_valid_ether_addr(const u8 *addr): Returns true if the specified MAC address is not 00:00:00:00:00:00, is not a multicast address, and is not a broadcast address (FF:FF:FF:FF:FF:FF).

  * struct netdev_hw_addr_list dev_addrs

The list of device hardware addresses.

  * unsigned char broadcast[MAX_ADDR_LEN]

The hardware broadcast address. For Ethernet devices, the broadcast address is initialized to all 0xFF bytes (FF:FF:FF:FF:FF:FF) in the ether_setup() method, net/ethernet/eth.c. The broadcast address is exported by sysfs via /sys/class/net/<device>/broadcast.

  * struct kset *queues_kset

A kset is a group of kobjects of a specific type, belonging to a specific subsystem.

The kobject structure is the basic type of the device model. A Tx queue is represented by struct netdev_queue, and an Rx queue is represented by struct netdev_rx_queue. Each of them holds a kobject pointer. The queues_kset object is a group of all kobjects of the Tx queues and Rx queues. Each Rx queue has the sysfs entry /sys/class/net/<device>/queues/rx-<n>, and each Tx queue has the sysfs entry /sys/class/net/<device>/queues/tx-<n>. These entries are added with the rx_queue_add_kobject() method and the netdev_queue_add_kobject() method respectively, in net/core/net-sysfs.c. For more information about the kobject and the device model, see Documentation/kobject.txt.

  * struct netdev_rx_queue *_rx

An array of Rx queues (netdev_rx_queue objects), initialized by the netif_alloc_rx_queues() method. The Rx queue to be used is determined in the get_rps_cpu() method. See more info about RPS in the description of the rxhash field in the previous sk_buff section.

  * unsigned int num_rx_queues

The number of Rx queues allocated in the register_netdev() method.

  * unsigned int real_num_rx_queues

The number of Rx queues currently active in the device.

Helper method:

  * netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq): Sets the actual number of Rx queues used for the specified device according to the specified number of Rx queues. The relevant sysfs entries (/sys/class/net/<device>/queues/*) are updated (only in the case that the state of the device is NETREG_REGISTERED or NETREG_UNREGISTERING). Note that alloc_netdev_mq() initializes num_rx_queues, real_num_rx_queues, num_tx_queues, and real_num_tx_queues to the same value. One can set the number of Tx queues and Rx queues by using ip link when adding a device. For example, if you want to create a VLAN device with 6 Tx queues and 7 Rx queues, you can run this command:

ip link add link p2p1 name p2p1.1 numtxqueues 6 numrxqueues 7 type vlan id 8

  * rx_handler_func_t __rcu *rx_handler

Helper methods:

  * netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data)

The rx_handler callback is set by calling the netdev_rx_handler_register() method. It is used, for example, in bonding, team, openvswitch, macvlan, and bridge devices.

  * netdev_rx_handler_unregister(struct net_device *dev): Unregisters a receive handler for the specified network device.
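To make the rx_handler flow concrete, the following is a minimal sketch of registering a receive handler, in the spirit of what the bridge and macvlan drivers do. The mydev_* names are hypothetical, and a real handler would perform actual steering work with its context:

```c
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static rx_handler_result_t mydev_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	/* The context that was passed to netdev_rx_handler_register(): */
	void *ctx = rcu_dereference(skb->dev->rx_handler_data);

	(void)ctx;
	/* RX_HANDLER_PASS lets the stack continue processing the packet
	 * normally; a bridge, for example, returns RX_HANDLER_CONSUMED
	 * after taking over the skb and forwarding it itself. */
	return RX_HANDLER_PASS;
}

static int mydev_attach(struct net_device *dev, void *ctx)
{
	int err;

	/* netdev_rx_handler_register() must be called with the RTNL held. */
	rtnl_lock();
	err = netdev_rx_handler_register(dev, mydev_handle_frame, ctx);
	rtnl_unlock();
	return err;
}
```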
  * void __rcu *rx_handler_data

The rx_handler_data field is also set by the netdev_rx_handler_register() method, when a non-NULL value is passed to it.

  * struct netdev_queue __rcu *ingress_queue

Helper method:

  * struct netdev_queue *dev_ingress_queue(struct net_device *dev): Returns the ingress_queue of the specified net_device (include/linux/rtnetlink.h).

  * struct netdev_queue *_tx

An array of Tx queues (netdev_queue objects), initialized by the netif_alloc_netdev_queues() method.

Helper method:

  * netdev_get_tx_queue(const struct net_device *dev, unsigned int index): Returns the Tx queue (netdev_queue object), an element of the _tx array of the specified network device at the specified index.

  * unsigned int num_tx_queues

The number of Tx queues, allocated by the alloc_netdev_mq() method.

  * unsigned int real_num_tx_queues

The number of Tx queues currently active in the device.

Helper method:

  * netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq): Sets the actual number of Tx queues used.

  * struct Qdisc *qdisc

Each device maintains a queue of packets to be transmitted, managed by a qdisc. The Qdisc (Queuing Disciplines) layer implements the Linux kernel traffic management. The default qdisc is pfifo_fast. You can set a different qdisc using tc, the traffic control tool of the iproute2 package. You can view the qdisc of your network device by using the ip command:

ip addr show

For example, running

ip addr show eth1

can give:

2: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000

link/ether 00:e0:4c:53:44:58 brd ff:ff:ff:ff:ff:ff

inet 192.168.2.200/24 brd 192.168.2.255 scope global eth1

inet6 fe80::2e0:4cff:fe53:4458/64 scope link

valid_lft forever preferred_lft forever

In this example, you can see that a qdisc of pfifo_fast is used, which is the default.

  * unsigned long tx_queue_len

The maximum number of allowed packets per queue. Each hardware layer has its own tx_queue_len default. For Ethernet devices, tx_queue_len is set to 1,000 by default (see the ether_setup() method). For FDDI, tx_queue_len is set to 100 by default (see the fddi_setup() method in net/802/fddi.c).

The tx_queue_len field is set to 0 for virtual devices, such as the VLAN device, because the actual transmission of packets is done by the real device on which these virtual devices are based. You can set the Tx queue length of a device by using the command ifconfig (where this option is called txqueuelen) or by using the ip command (in ip link output it is shown as qlen), in this way, for example:

ifconfig p2p1 txqueuelen 900

ip link set txqueuelen 950 dev p2p1

The Tx queue length is exported via the following sysfs entry: /sys/class/net/<device>/tx_queue_len.

  * unsigned long trans_start

The time (in jiffies) of the last transmission.

  * int watchdog_timeo

The watchdog is a timer that invokes a callback when the network interface has been idle and has not performed any transmission within some specified timeout interval. Usually the driver defines a watchdog callback which resets the network interface in such a case. The ndo_tx_timeout() callback of net_device_ops serves as the watchdog callback. The watchdog_timeo field represents the timeout that is used by the watchdog. See the dev_watchdog() method, net/sched/sch_generic.c.

  * int __percpu *pcpu_refcnt

The per-CPU network device reference counter.

Helper methods:

  * dev_put(struct net_device *dev): Decrements the reference count.
  * dev_hold(struct net_device *dev): Increments the reference count.

  * struct hlist_node index_hlist

This is a node in a hash table of network devices, which is indexed by the network device index (the ifindex field). A lookup in this table is performed by the dev_get_by_index() method. Insertion into this table is performed by the list_netdevice() method, and removal from this list is done with the unlist_netdevice() method.

  * enum {...} reg_state

An enum that represents the various registration states of the network device.

Possible values:

  * NETREG_UNINITIALIZED: When the device memory is allocated, in the alloc_netdev_mqs() method.

  * NETREG_REGISTERED: When the net_device is registered, in the register_netdevice() method.

  * NETREG_UNREGISTERING: When unregistering a device, in the rollback_registered_many() method.

  * NETREG_UNREGISTERED: The network device is unregistered, but it is not freed yet.

  * NETREG_RELEASED: The network device is in the last stage of freeing the allocated memory of the network device, in the free_netdev() method.

  * NETREG_DUMMY: Used in the dummy device, in the init_dummy_netdev() method. See drivers/net/dummy.c.

  * bool dismantle

A Boolean flag that shows that the device is in the dismantle phase, which means that it is going to be freed.

  * enum {...} rtnl_link_state

This is an enum that can have two values that represent the two phases of creating a new link:

  * RTNL_LINK_INITIALIZING: The ongoing state, when creating the link is still not finished.

  * RTNL_LINK_INITIALIZED: The final state, when the work is finished.

See the rtnl_newlink() method in net/core/rtnetlink.c.

  * void (*destructor)(struct net_device *dev)

This destructor callback is called when unregistering a network device, in the netdev_run_todo() method. It enables network devices to perform additional tasks that need to be done for unregistering. For example, the loopback device destructor callback, loopback_dev_free(), calls free_percpu() for freeing its statistics object and then calls free_netdev(). Likewise, the team device destructor callback, team_destructor(), also calls free_percpu() for freeing its statistics object and then calls free_netdev(). And there are many other network device drivers that define a destructor callback.

  * struct net *nd_net

The network namespace this network device is inside. Network namespaces support was added in the 2.6.29 kernel. Namespaces provide process virtualization, which is considered lightweight in comparison to other virtualization solutions like KVM and Xen. There is currently support for six namespaces in the Linux kernel. In order to support network namespaces, a structure called net was added. This structure represents a network namespace. The process descriptor (task_struct) handles the network namespace and the other namespaces via a member which was added for namespaces support, named nsproxy. The nsproxy includes a network namespace object called net_ns, and also four other namespace objects of the following namespaces: the pid namespace, the mount namespace, the uts namespace, and the ipc namespace; the sixth namespace, the user namespace, is kept in struct cred (the credentials object), which is a member of the process descriptor, task_struct.

Network namespaces provide a partitioning and isolation mechanism which enables one process or a group of processes to have a private view of a full network stack of their own. By default, after boot all network interfaces belong to the default network namespace, init_net.
You can create a network namespace with userspace tools using the ip command from the iproute2 package or with the unshare command of util-linux, or by writing your own userspace application and invoking the unshare() or the clone() system calls with the CLONE_NEWNET flag. Moreover, you can also change the network namespace of a process by invoking the setns() system call. The setns() system call and the unshare() system call were added specially to support namespaces. The setns() system call can attach to the calling process an existing namespace of any type (network namespace, pid namespace, mount namespace, and so on). You need the CAP_SYS_ADMIN privilege to call setns() for all namespaces, except the user namespace. See man 2 setns.

A network device belongs to exactly one network namespace at a given moment, and a network socket belongs to exactly one network namespace at a given moment. Namespaces do not have names, but they do have a unique inode which identifies them. This unique inode is generated when the namespace is created and can be read by reading a procfs entry (the command ls -al /proc/<pid>/ns/ shows the symbolic links of a process, with their unique inode numbers; you can also read these symbolic links with the readlink command).

For example, using the ip command, creating a new namespace called myns1 is done thus:

ip netns add myns1

Each newly created network namespace includes only the loopback device and includes no sockets. Each device (like a bridge device or a VLAN device) that is created from a process that runs in that namespace (like a shell) belongs to that namespace.

Removing a namespace is done using the following command:

ip netns del myns1

Note

After deleting a namespace, all its physical network devices are moved to the default network namespace. Local devices (namespace-local devices that have the NETIF_F_NETNS_LOCAL flag set, like the PPP device or the VXLAN device) are not moved to the default network namespace but are deleted.

Showing the list of all network namespaces on the system is done with this command:

ip netns list

Assigning the p2p1 interface to the myns1 network namespace is done by the command:

ip link set p2p1 netns myns1

Opening a shell in myns1 is done thus:

ip netns exec myns1 bash

With the unshare utility, creating a new namespace and starting a bash shell inside is done thus:

unshare --net bash

Two network namespaces can communicate by using a special virtual Ethernet driver, veth (drivers/net/veth.c).

Helper methods:

  * dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat): Moves the network device to a different network namespace, specified by the net parameter. Local devices (devices in which the NETIF_F_NETNS_LOCAL feature is set) are not allowed to change their namespace. This method returns -EINVAL for this type of device. The pat parameter, when it is not NULL, is the name pattern to try if the current device name is already taken in the destination network namespace. The method also sends a KOBJ_REMOVE uevent for removing the old namespace entries from sysfs, and a KOBJ_ADD uevent to add the sysfs entries to the new namespace. This is done by invoking the kobject_uevent() method specifying the corresponding uevent.

  * dev_net(const struct net_device *dev): Returns the network namespace of the specified network device.
  * dev_net_set(struct net_device *dev, struct net *net): Decrements the reference count of the nd_net (namespace object) of the specified device and assigns the specified network namespace to it.

The following four fields are members in a union:

  * struct pcpu_lstats __percpu *lstats

The loopback network device statistics.

  * struct pcpu_tstats __percpu *tstats

The tunnel statistics.

  * struct pcpu_dstats __percpu *dstats

The dummy network device statistics.

  * struct pcpu_vstats __percpu *vstats

The VETH (Virtual Ethernet) statistics.

  * struct device dev

The device object associated with the network device. Every device in the Linux kernel is associated with a device object, which is an instance of the device structure. For more information about the device structure, I suggest you read the "Devices" section in Chapter 14 of Linux Device Drivers, 3rd Edition (O'Reilly, 2005) and Documentation/driver-model/overview.txt.

Helper methods:

  * to_net_dev(d): Returns the net_device object that contains the specified device as its device object.

  * SET_NETDEV_DEV(net, pdev): Sets the parent of the dev member of the specified network device to be the specified device (the second argument, pdev).

With virtual devices, you do not call the SET_NETDEV_DEV() macro. As a result, entries for these virtual devices are created under /sys/devices/virtual/net.

The SET_NETDEV_DEV() macro should be called before calling the register_netdev() method.

  * SET_NETDEV_DEVTYPE(net, devtype): Sets the type of the dev member of the specified network device to be the specified type. The type is a device_type object.

SET_NETDEV_DEVTYPE() is used, for example, in the br_dev_setup() method, in net/bridge/br_device.c:

static struct device_type br_type = {
.name = "bridge",
};

void br_dev_setup(struct net_device *dev)
{
...
SET_NETDEV_DEVTYPE(dev, &br_type);
...
}

With the udevadm tool (the udev management tool), you can find the device type, for example, for a bridge device named mybr:

udevadm info -q all -p /sys/devices/virtual/net/mybr
P: /devices/virtual/net/mybr
E: DEVPATH=/devices/virtual/net/mybr
E: DEVTYPE=bridge
E: ID_MM_CANDIDATE=1
E: IFINDEX=7
E: INTERFACE=mybr
E: SUBSYSTEM=net

  * const struct attribute_group *sysfs_groups[4]

Used by networking sysfs.

  * struct rtnl_link_ops *rtnl_link_ops

The rtnetlink link operations object. It consists of various callbacks for handling network devices, for example:

  * newlink() for configuring and registering a new device.

  * changelink() for changing parameters of an existing device.

  * dellink() for removing a device.

  * get_num_tx_queues() for getting the number of Tx queues.

  * get_num_rx_queues() for getting the number of Rx queues.

Registration and unregistration of an rtnl_link_ops object is done with the rtnl_link_register() method and the rtnl_link_unregister() method, respectively.

  * unsigned int gso_max_size

Helper method:

  * netif_set_gso_max_size(struct net_device *dev, unsigned int size): Sets the specified gso_max_size for the specified network device.

  * u8 num_tc

The number of traffic classes in the net device.

Helper methods:

  * netdev_set_num_tc(struct net_device *dev, u8 num_tc): Sets the num_tc of the specified network device (the maximum value of num_tc can be TC_MAX_QUEUE, which is 16).

  * int netdev_get_num_tc(struct net_device *dev): Returns the num_tc value of the specified network device. (A short sketch showing these helpers in use follows.)
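As promised, here is a short, hypothetical sketch of these helpers in use: a multiqueue driver carves eight Tx queues into two traffic classes and maps priority 5 to the second class. The netdev_set_tc_queue() and netdev_set_prio_tc_map() helpers fill the tc_to_txq and prio_tc_map fields, which are described next:

```c
#include <linux/netdevice.h>

static int mydrv_setup_tc(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);		/* two traffic classes */
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0: 4 Tx queues, starting at queue 0 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1: 4 Tx queues, starting at queue 4 */

	/* Map all priorities to TC0, except priority 5, which goes to TC1: */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio == 5 ? 1 : 0);

	return 0;
}
```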
  * struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]

  * u8 prio_tc_map[TC_BITMASK + 1];

  * struct netprio_map __rcu *priomap

The network priority cgroup module provides an interface to set the priority of network traffic. The cgroups layer is a Linux kernel layer that enables process resource management and process isolation. It enables assigning one task or several tasks to a system resource, like a networking resource, a memory resource, a CPU resource, and so on. The cgroups layer implements a Virtual File System (VFS) and is managed by filesystem operations like mounting/unmounting, creating files and directories, writing to cgroup VFS control files, and so forth. The cgroup project was started in 2005 by developers from Google (Paul Menage, Rohit Seth, and others). Some projects are based on cgroups usage, like systemd and lxc (Linux Containers). Google has its own implementation of containers, based on cgroups. There is no relation between the cgroup implementation and the namespaces implementation. In the past, there was a namespace controller in cgroups, but it was removed. No new system calls were added for the cgroups implementation, and the cgroup code additions are not critical in terms of performance. There are two networking cgroups modules: net_prio and net_cls. These two cgroup modules are relatively short and simple.

Setting the priority of network traffic with the netprio cgroup module is done by writing an entry to a cgroup control file, /sys/fs/cgroup/net_prio/<group>/net_prio.ifpriomap. The entry is in the form "deviceName priority." It is true that an application can set the priority of its traffic via the setsockopt() system call with SO_PRIORITY, but this is not always possible. Sometimes you cannot change the code of certain applications. Moreover, you may want to let the system administrator decide on priority according to site-specific setup. The netprio kernel module is a solution when using the setsockopt() system call with SO_PRIORITY is not feasible. The netprio module also exports another entry under /sys/fs/cgroup/net_prio, net_prio.prioidx. The net_prio.prioidx entry is a read-only file and contains a unique integer value that the kernel uses as an internal representation of this cgroup.

netprio is implemented in net/core/netprio_cgroup.c.

net_cls is implemented in net/sched/cls_cgroup.c.

The network classifier cgroup provides an interface to tag network packets with a class identifier (classid). Creating a net_cls cgroups instance creates a net_cls.classid control file. This net_cls.classid value is initialized to 0. You can set up rules for this classid with tc, the traffic control command of iproute2.

For more information, see Documentation/cgroups/net_cls.txt.

  * struct phy_device *phydev

The associated PHY device. The phy_device is the Layer 1 (physical layer) device. It is defined in include/linux/phy.h. For many devices, PHY parameters like autonegotiation, speed, or duplex can be configured via the PHY device with ethtool commands. See man 8 ethtool for more info.

  * int group

The group that the network device belongs to. It is initialized with INIT_NETDEV_GROUP (0) by default. The group is exported by sysfs via /sys/class/net/<device>/netdev_group. Network device group filters are used, for example, in netfilter, in net/netfilter/xt_devgroup.c.

Helper method:

  * void dev_set_group(struct net_device *dev, int new_group): Changes the group of the specified device to be the specified group.
  * struct pm_qos_request pm_qos_req

The Power Management Quality Of Service request object, defined in include/linux/pm_qos.h.

For more details about PM QoS, see Documentation/power/pm_qos_interface.txt.

Next I will describe the netdev_priv() method and the alloc_netdev() macro, which are used a lot in network drivers.

The netdev_priv(struct net_device *netdev) method returns a pointer to the private data area that is allocated at the end of the net_device. This area is used by drivers, which define a private network interface structure in order to store private data. For example, in drivers/net/ethernet/intel/e1000e/netdev.c:

static int e1000_open(struct net_device *netdev)
{
struct e1000_adapter *adapter = netdev_priv(netdev);
...
}

The netdev_priv() method is also used for software devices, like the VLAN device. So you have:

static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
return netdev_priv(dev);
}

(net/8021q/vlan.h)

  * The alloc_netdev(sizeof_priv, name, setup) macro is for the allocation and initialization of a network device. It is in fact a wrapper around alloc_netdev_mqs(), with one Tx queue and one Rx queue. sizeof_priv is the size of the private data to allocate space for. The setup method is a callback to initialize the network device. For Ethernet devices, it is usually ether_setup().

For Ethernet devices, you can use the alloc_etherdev() or alloc_etherdev_mq() macros, which eventually invoke alloc_etherdev_mqs(); alloc_etherdev_mqs() is also a wrapper around alloc_netdev_mqs(), with ether_setup() as the setup callback method.

  * Software devices usually define a setup method of their own. So, in PPP you have the ppp_setup() method in drivers/net/ppp/ppp_generic.c, and for VLAN you have vlan_setup(struct net_device *dev) in net/8021q/vlan.h.

## RDMA (Remote DMA)

The following sections describe the RDMA API for the following data structures:

  * RDMA device

  * Protection Domain (PD)

  * eXtended Reliable Connected (XRC)

  * Shared Receive Queue (SRQ)

  * Address Handle (AH)

  * Multicast Groups

  * Completion Queue (CQ)

  * Queue Pair (QP)

  * Memory Window (MW)

  * Memory Region (MR)

## RDMA Device

The following methods are related to the RDMA device.

### The ib_register_client() Method

The ib_register_client() method registers a kernel client that wants to use the RDMA stack. The specified callbacks will be called for every RDMA device that currently exists in the system and for every new device that will be detected or removed by the system (using hot-plug). It will return 0 on success or the errno value with the reason for the failure.

int ib_register_client(struct ib_client *client);

  * client: A structure that describes the attributes of the registration.

#### The ib_client Struct

The device registration attributes are represented by struct ib_client:

struct ib_client {
char *name;
void (*add) (struct ib_device *);
void (*remove)(struct ib_device *);
struct list_head list;
};

  * name: The name of the kernel module to be registered.

  * add: A callback to be called for each RDMA device that exists in the system and for every new RDMA device that will be detected by the kernel.

  * remove: A callback to be called for each RDMA device being removed by the kernel.

### The ib_unregister_client() Method

The ib_unregister_client() method unregisters a kernel module that wants to stop using the RDMA stack.
void ib_unregister_client(struct ib_client *client);

  * client: Should be the same object that was used when ib_register_client() was called.

### The ib_get_client_data() Method

The ib_get_client_data() method returns the client context which was associated with the RDMA device using the ib_set_client_data() method.

void *ib_get_client_data(struct ib_device *device, struct ib_client *client);

  * device: The RDMA device to get the client context from.

  * client: The object that describes the attributes of the registration/unregistration.

### The ib_set_client_data() Method

The ib_set_client_data() method sets a client context to be associated with the RDMA device.

void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data);

  * device: The RDMA device to set the client context with.

  * client: The object that describes the attributes of the registration/unregistration.

  * data: The client context to associate.

### The INIT_IB_EVENT_HANDLER Macro

The INIT_IB_EVENT_HANDLER macro initializes an event handler for the asynchronous events that may occur to the RDMA device. This macro should be used before calling the ib_register_event_handler() method:

#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \
do { \
(_ptr)->device = _device; \
(_ptr)->handler = _handler; \
INIT_LIST_HEAD(&(_ptr)->list); \
} while (0)

  * _ptr: A pointer to the event handler that will be provided to the ib_register_event_handler() method.

  * _device: The RDMA device context; upon its events the callback will be called.

  * _handler: The callback that will be called with every asynchronous event.

### The ib_register_event_handler() Method

The ib_register_event_handler() method registers an RDMA event handler to be called with every asynchronous event. It will return 0 on success or the errno value with the reason for the failure.

int ib_register_event_handler(struct ib_event_handler *event_handler);

  * event_handler: The event handler that was initialized with the macro INIT_IB_EVENT_HANDLER. This callback may occur in interrupt context.

#### The ib_event_handler Struct

The RDMA event handler is represented by struct ib_event_handler:

struct ib_event_handler {
struct ib_device *device;
void (*handler)(struct ib_event_handler *, struct ib_event *);
struct list_head list;
};

#### The ib_event Struct

The event callback is called with the new event that happened to the RDMA device. This event is represented by struct ib_event:

struct ib_event {
struct ib_device *device;
union {
struct ib_cq *cq;
struct ib_qp *qp;
struct ib_srq *srq;
u8 port_num;
} element;
enum ib_event_type event;
};

  * device: The RDMA device on which the asynchronous event occurred.

  * element.cq: If this is a CQ event, the CQ on which the asynchronous event occurred.

  * element.qp: If this is a QP event, the QP on which the asynchronous event occurred.

  * element.srq: If this is an SRQ event, the SRQ on which the asynchronous event occurred.

  * element.port_num: If this is a port event, the port number on which the asynchronous event occurred.

  * event: The type of the asynchronous event that occurred. It can be:

  * IB_EVENT_CQ_ERR: CQ event. An error occurred to the CQ and no more Work Completions will be generated for it.

  * IB_EVENT_QP_FATAL: QP event.
An error occurred to the QP that prevents it from reporting an error through a Work Completion.

  * IB_EVENT_QP_REQ_ERR: QP event. An incoming RDMA request caused a transport error violation in the targeted QP.

  * IB_EVENT_QP_ACCESS_ERR: QP event. An incoming RDMA request caused an access violation in the targeted QP.

  * IB_EVENT_COMM_EST: QP event. A communication established event occurred. An incoming message was received by a QP while it was in the RTR state.

  * IB_EVENT_SQ_DRAINED: QP event. Send Queue drain event. The QP's Send Queue was drained.

  * IB_EVENT_PATH_MIG: QP event. Path migration was completed successfully and the primary path was changed.

  * IB_EVENT_PATH_MIG_ERR: QP event. There was an error when trying to perform path migration.

  * IB_EVENT_DEVICE_FATAL: Device event. There was an error with the RDMA device.

  * IB_EVENT_PORT_ACTIVE: Port event. The port state has become active.

  * IB_EVENT_PORT_ERR: Port event. The port state was active and it is no longer active.

  * IB_EVENT_LID_CHANGE: Port event. The LID of the port was changed.

  * IB_EVENT_PKEY_CHANGE: Port event. A P_Key entry was changed in the port's P_Key table.

  * IB_EVENT_SM_CHANGE: Port event. The Subnet Manager that manages this port was changed.

  * IB_EVENT_SRQ_ERR: SRQ event. An error occurred to the SRQ.

  * IB_EVENT_SRQ_LIMIT_REACHED: SRQ event/SRQ limit event. The number of Receive Requests in the SRQ dropped below the requested watermark.

  * IB_EVENT_QP_LAST_WQE_REACHED: QP event. The last Receive Request was reached from the SRQ, and the QP won't consume any more Receive Requests from it.

  * IB_EVENT_CLIENT_REREGISTER: Port event. The client should reregister to all services from the Subnet Administrator.

  * IB_EVENT_GID_CHANGE: Port event. A GID entry was changed in the port's GID table.

### The ib_unregister_event_handler() Method

The ib_unregister_event_handler() method unregisters an RDMA event handler. It will return 0 on success or the errno value with the reason for the failure.

int ib_unregister_event_handler(struct ib_event_handler *event_handler);

  * event_handler: The event handler to be unregistered. It should be the same object that was registered with ib_register_event_handler().

### The ib_query_device() Method

The ib_query_device() method queries the RDMA device for its attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr);

  * device: The RDMA device to be queried.

  * device_attr: A pointer to an RDMA device attributes structure to be filled.
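The following is a minimal sketch of a kernel RDMA client that ties the methods of this section together: it registers an ib_client, and for each detected device it queries the device attributes (the ib_device_attr structure is described next) and registers an asynchronous event handler. For simplicity the sketch keeps a single global event handler; a real client would allocate a per-device context and associate it with ib_set_client_data(). Error handling is omitted:

```c
#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static struct ib_event_handler my_event_handler;

static void my_handle_event(struct ib_event_handler *handler,
			    struct ib_event *event)
{
	pr_info("async event %d on device %s\n",
		event->event, event->device->name);
}

static void my_add_device(struct ib_device *device)
{
	struct ib_device_attr attr;

	if (ib_query_device(device, &attr))
		return;
	pr_info("%s: max_qp=%d max_cq=%d\n",
		device->name, attr.max_qp, attr.max_cq);

	INIT_IB_EVENT_HANDLER(&my_event_handler, device, my_handle_event);
	ib_register_event_handler(&my_event_handler);
}

static void my_remove_device(struct ib_device *device)
{
	ib_unregister_event_handler(&my_event_handler);
}

static struct ib_client my_client = {
	.name   = "my_rdma_client",
	.add    = my_add_device,
	.remove = my_remove_device,
};

/* In the module init method:  ib_register_client(&my_client);
 * in the module exit method:  ib_unregister_client(&my_client); */
```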
#### The ib_device_attr Struct

The RDMA device attributes are represented by struct ib_device_attr:

struct ib_device_attr {
u64 fw_ver;
__be64 sys_image_guid;
u64 max_mr_size;
u64 page_size_cap;
u32 vendor_id;
u32 vendor_part_id;
u32 hw_ver;
int max_qp;
int max_qp_wr;
int device_cap_flags;
int max_sge;
int max_sge_rd;
int max_cq;
int max_cqe;
int max_mr;
int max_pd;
int max_qp_rd_atom;
int max_ee_rd_atom;
int max_res_rd_atom;
int max_qp_init_rd_atom;
int max_ee_init_rd_atom;
enum ib_atomic_cap atomic_cap;
enum ib_atomic_cap masked_atomic_cap;
int max_ee;
int max_rdd;
int max_mw;
int max_raw_ipv6_qp;
int max_raw_ethy_qp;
int max_mcast_grp;
int max_mcast_qp_attach;
int max_total_mcast_qp_attach;
int max_ah;
int max_fmr;
int max_map_per_fmr;
int max_srq;
int max_srq_wr;
int max_srq_sge;
unsigned int max_fast_reg_page_list_len;
u16 max_pkeys;
u8 local_ca_ack_delay;
};

  * fw_ver: A number which represents the FW version of the RDMA device. It can be evaluated as ZZZZYYXX: the Zs are the major number, the Ys are the minor number, and the Xs are the build number.

  * sys_image_guid: The system image GUID: it has a unique value for each system.

  * max_mr_size: The maximum supported MR size.

  * page_size_cap: A bitwise OR of all supported memory page sizes.

  * vendor_id: The IEEE vendor ID.

  * vendor_part_id: The device's part ID, as supplied by the vendor.

  * hw_ver: The device's HW version, as supplied by the vendor.

  * max_qp: Maximum supported number of QPs.

  * max_qp_wr: Maximum supported number of Work Requests in each non-RD QP.

  * device_cap_flags: Supported capabilities of the RDMA device. It is a bitwise OR of the masks:

  * IB_DEVICE_RESIZE_MAX_WR: The RDMA device supports resizing of the number of Work Requests in a QP.

  * IB_DEVICE_BAD_PKEY_CNTR: The RDMA device supports the ability to count the number of bad P_Keys.

  * IB_DEVICE_BAD_QKEY_CNTR: The RDMA device supports the ability to count the number of bad Q_Keys.

  * IB_DEVICE_RAW_MULTI: The RDMA device supports raw packet multicast.

  * IB_DEVICE_AUTO_PATH_MIG: The RDMA device supports Automatic Path Migration.

  * IB_DEVICE_CHANGE_PHY_PORT: The RDMA device supports changing the QP's primary port number.

  * IB_DEVICE_UD_AV_PORT_ENFORCE: The RDMA device supports enforcement of the port number of UD QPs and Address Handles.

  * IB_DEVICE_CURR_QP_STATE_MOD: The RDMA device supports the current QP state modifier when calling ib_modify_qp().

  * IB_DEVICE_SHUTDOWN_PORT: The RDMA device supports port shutdown.

  * IB_DEVICE_INIT_TYPE: The RDMA device supports setting InitType and InitTypeReply.

  * IB_DEVICE_PORT_ACTIVE_EVENT: The RDMA device supports the generation of the port active asynchronous event.

  * IB_DEVICE_SYS_IMAGE_GUID: The RDMA device supports a system image GUID.

  * IB_DEVICE_RC_RNR_NAK_GEN: The RDMA device supports RNR-NAK generation for RC QPs.

  * IB_DEVICE_SRQ_RESIZE: The RDMA device supports resizing an SRQ.

  * IB_DEVICE_N_NOTIFY_CQ: The RDMA device supports notification when N Work Completions exist in the CQ.

  * IB_DEVICE_LOCAL_DMA_LKEY: The RDMA device supports the Zero STag (in iWARP) and the reserved LKey (in InfiniBand).

  * IB_DEVICE_RESERVED: Reserved bit.

  * IB_DEVICE_MEM_WINDOW: The RDMA device supports Memory Windows.
  * IB_DEVICE_UD_IP_CSUM: The RDMA device supports insertion of UDP and TCP checksums on outgoing UD IPoIB messages and can verify the validity of those checksums for incoming messages.

  * IB_DEVICE_UD_TSO: The RDMA device supports TCP Segmentation Offload.

  * IB_DEVICE_XRC: The RDMA device supports the eXtended Reliable Connected transport.

  * IB_DEVICE_MEM_MGT_EXTENSIONS: The RDMA device supports memory management extensions.

  * IB_DEVICE_BLOCK_MULTICAST_LOOPBACK: The RDMA device supports blocking multicast loopback.

  * IB_DEVICE_MEM_WINDOW_TYPE_2A: The RDMA device supports Memory Windows type 2A: association with a QP number.

  * IB_DEVICE_MEM_WINDOW_TYPE_2B: The RDMA device supports Memory Windows type 2B: association with a QP number and a PD.

  * max_sge: Maximum supported number of scatter/gather elements per Work Request in a non-RD QP.

  * max_sge_rd: Maximum supported number of scatter/gather elements per Work Request in an RD QP.

  * max_cq: Maximum supported number of CQs.

  * max_cqe: Maximum supported number of entries in each CQ.

  * max_mr: Maximum supported number of MRs.

  * max_pd: Maximum supported number of PDs.

  * max_qp_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent to a QP as the target of the operation.

  * max_ee_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent to an EE context as the target of the operation.

  * max_res_rd_atom: Maximum number of incoming RDMA Read and Atomic operations that can be sent to this RDMA device as the target of the operation.

  * max_qp_init_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent from a QP as the initiator of the operation.

  * max_ee_init_rd_atom: Maximum number of RDMA Read and Atomic operations that can be sent from an EE context as the initiator of the operation.

  * atomic_cap: The ability of the device to support atomic operations. Can be:

  * IB_ATOMIC_NONE: The RDMA device doesn't guarantee any atomicity at all.

  * IB_ATOMIC_HCA: The RDMA device guarantees atomicity between QPs in the same device.

  * IB_ATOMIC_GLOB: The RDMA device guarantees atomicity between this device and any other component.

  * masked_atomic_cap: The ability of the device to support masked atomic operations. Possible values are as described for atomic_cap earlier.

  * max_ee: Maximum supported number of EE contexts.

  * max_rdd: Maximum supported number of RDDs.

  * max_mw: Maximum supported number of MWs.

  * max_raw_ipv6_qp: Maximum supported number of Raw IPv6 Datagram QPs.

  * max_raw_ethy_qp: Maximum supported number of Raw Ethertype Datagram QPs.

  * max_mcast_grp: Maximum supported number of multicast groups.

  * max_mcast_qp_attach: Maximum supported number of QPs that can be attached to each multicast group.

  * max_total_mcast_qp_attach: Maximum total number of QPs that can be attached to any multicast group.

  * max_ah: Maximum supported number of AHs.

  * max_fmr: Maximum supported number of FMRs.

  * max_map_per_fmr: Maximum supported number of map operations allowed per FMR.

  * max_srq: Maximum supported number of SRQs.

  * max_srq_wr: Maximum supported number of Work Requests in each SRQ.

  * max_srq_sge: Maximum supported number of scatter/gather elements per Work Request in an SRQ.

  * max_fast_reg_page_list_len: Maximum number of entries in a page list that can be used when registering memory using a fast registration Work Request.
  * max_pkeys: Maximum supported number of P_Keys.

  * local_ca_ack_delay: The local CA ack delay. This value specifies the maximum expected time interval between the local device receiving a message and transmitting the associated ACK or NAK.

### The ib_query_port() Method

The ib_query_port() method queries the RDMA device port's attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

  * port_attr: A pointer to a structure of RDMA port attributes which will be filled.

#### The ib_port_attr Struct

The RDMA port attributes are represented by struct ib_port_attr:

struct ib_port_attr {
enum ib_port_state state;
enum ib_mtu max_mtu;
enum ib_mtu active_mtu;
int gid_tbl_len;
u32 port_cap_flags;
u32 max_msg_sz;
u32 bad_pkey_cntr;
u32 qkey_viol_cntr;
u16 pkey_tbl_len;
u16 lid;
u16 sm_lid;
u8 lmc;
u8 max_vl_num;
u8 sm_sl;
u8 subnet_timeout;
u8 init_type_reply;
u8 active_width;
u8 active_speed;
u8 phys_state;
};

  * state: The logical port state. Can be:

  * IB_PORT_NOP: Reserved value.

  * IB_PORT_DOWN: Logical link is down.

  * IB_PORT_INIT: Logical link is initialized. The physical link is up, but the Subnet Manager hasn't started to configure the port.

  * IB_PORT_ARMED: Logical link is armed. The physical link is up, but the Subnet Manager started, and did not yet complete, configuring the port.

  * IB_PORT_ACTIVE: Logical link is active.

  * IB_PORT_ACTIVE_DEFER: Logical link is active, but the physical link is down. The link tries to recover from this state.

  * max_mtu: The maximum MTU supported by this port. Can be:

  * IB_MTU_256: 256 bytes.

  * IB_MTU_512: 512 bytes.

  * IB_MTU_1024: 1,024 bytes.

  * IB_MTU_2048: 2,048 bytes.

  * IB_MTU_4096: 4,096 bytes.

  * active_mtu: The actual MTU that this port is configured with. Possible values are as for max_mtu, mentioned earlier.

  * gid_tbl_len: The number of entries in the port's GID table.

  * port_cap_flags: The port's supported capabilities. It is a bitwise OR of the masks:

  * IB_PORT_SM: An indication that the SM that manages the subnet is sending packets from this port.

  * IB_PORT_NOTICE_SUP: An indication that this port supports notices.

  * IB_PORT_TRAP_SUP: An indication that this port supports traps.

  * IB_PORT_OPT_IPD_SUP: An indication that this port supports Inter Packet Delay optional values.

  * IB_PORT_AUTO_MIGR_SUP: An indication that this port supports Automatic Path Migration.

  * IB_PORT_SL_MAP_SUP: An indication that this port supports an SL to VL mapping table.

  * IB_PORT_MKEY_NVRAM: An indication that this port supports saving the M_Key attributes in Non-Volatile RAM.

  * IB_PORT_PKEY_NVRAM: An indication that this port supports saving the P_Key table in Non-Volatile RAM.

  * IB_PORT_LED_INFO_SUP: An indication that this port supports turning the LED on and off using management packets.

  * IB_PORT_SM_DISABLED: An indication that there is an SM which isn't active in this port.

  * IB_PORT_SYS_IMAGE_GUID_SUP: An indication that the port supports a system image GUID.

  * IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP: An indication that the SMA on the switch management port will monitor P_Key mismatches on each switch external port.
  * IB_PORT_EXTENDED_SPEEDS_SUP: An indication that the port supports extended speeds (FDR and EDR).

  * IB_PORT_CM_SUP: An indication that this port supports CM.

  * IB_PORT_SNMP_TUNNEL_SUP: An indication that an SNMP tunneling agent is listening on this port.

  * IB_PORT_REINIT_SUP: An indication that this port supports reinitialization of the node.

  * IB_PORT_DEVICE_MGMT_SUP: An indication that this port supports device management.

  * IB_PORT_VENDOR_CLASS_SUP: An indication that a vendor-specific agent is listening on this port.

  * IB_PORT_DR_NOTICE_SUP: An indication that this port supports Direct Route notices.

  * IB_PORT_CAP_MASK_NOTICE_SUP: An indication that this port supports sending a notice if the port's port_cap_flags is changed.

  * IB_PORT_BOOT_MGMT_SUP: An indication that a boot manager agent is listening on this port.

  * IB_PORT_LINK_LATENCY_SUP: An indication that this port supports link round-trip latency measurement.

  * IB_PORT_CLIENT_REG_SUP: An indication that this port is capable of generating the IB_EVENT_CLIENT_REREGISTER asynchronous event.

  * max_msg_sz: The maximum message size supported by this port.

  * bad_pkey_cntr: A counter for the number of bad P_Keys in messages that this port received.

  * qkey_viol_cntr: A counter for the number of Q_Key violations in messages that this port received.

  * pkey_tbl_len: The number of entries in the port's P_Key table.

  * lid: The port's Local Identifier (LID), as assigned by the SM.

  * sm_lid: The LID of the SM.

  * lmc: The LID mask of this port.

  * max_vl_num: Maximum number of Virtual Lanes supported by this port. Can be:

  * 1: 1 VL is supported: VL0

  * 2: 2 VLs are supported: VL0–VL1

  * 3: 4 VLs are supported: VL0–VL3

  * 4: 8 VLs are supported: VL0–VL7

  * 5: 15 VLs are supported: VL0–VL14

  * sm_sl: The SL to be used when sending messages to the SM.

  * subnet_timeout: The maximum expected subnet propagation delay. This duration of time is calculated as 4.096*2^subnet_timeout usec.

  * init_type_reply: The value that the SM configures before moving the port state to IB_PORT_ARMED or IB_PORT_ACTIVE to specify the type of initialization performed.

  * active_width: The port's active width. Can be:

  * IB_WIDTH_1X: Multiple of 1.

  * IB_WIDTH_4X: Multiple of 4.

  * IB_WIDTH_8X: Multiple of 8.

  * IB_WIDTH_12X: Multiple of 12.

  * active_speed: The port's active speed. Can be:

  * IB_SPEED_SDR: Single Data Rate (SDR): 2.5 Gb/sec, 8/10 bit encoding.

  * IB_SPEED_DDR: Double Data Rate (DDR): 5 Gb/sec, 8/10 bit encoding.

  * IB_SPEED_QDR: Quad Data Rate (QDR): 10 Gb/sec, 8/10 bit encoding.

  * IB_SPEED_FDR10: Fourteen Data Rate 10 (FDR10): 10.3125 Gb/sec, 64/66 bit encoding.

  * IB_SPEED_FDR: Fourteen Data Rate (FDR): 14.0625 Gb/sec, 64/66 bit encoding.

  * IB_SPEED_EDR: Enhanced Data Rate (EDR): 25.78125 Gb/sec.

  * phys_state: The physical port state. There isn't any enumeration for this value.

### The rdma_port_get_link_layer() Method

The rdma_port_get_link_layer() method returns the link layer of the RDMA device port. It will return one of the following values:

  * IB_LINK_LAYER_UNSPECIFIED: Unspecified value, usually a legacy value that indicates that this is an InfiniBand link layer.

  * IB_LINK_LAYER_INFINIBAND: The link layer is InfiniBand.

  * IB_LINK_LAYER_ETHERNET: The link layer is Ethernet. This indicates that the port supports RDMA Over Converged Ethernet (RoCE).
enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

### The ib_query_gid() Method

The ib_query_gid() method queries the RDMA device port's GID table. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

  * index: The index in the GID table to be queried.

  * gid: A pointer to the GID union to be filled.

### The ib_query_pkey() Method

The ib_query_pkey() method queries the RDMA device port's P_Key table. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey);

  * device: The RDMA device to be queried.

  * port_num: The port number to be queried.

  * index: The index in the P_Key table to be queried.

  * pkey: A pointer to the P_Key to be filled.

### The ib_modify_device() Method

The ib_modify_device() method modifies the RDMA device attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify);

  * device: The RDMA device to be modified.

  * device_modify_mask: The device attributes to be changed. It is a bitwise OR of the masks:

  * IB_DEVICE_MODIFY_SYS_IMAGE_GUID: Modifies the system image GUID.

  * IB_DEVICE_MODIFY_NODE_DESC: Modifies the node description.

  * device_modify: The RDMA attributes to be modified, as described in the next section.

#### The ib_device_modify Struct

The RDMA device attributes are represented by struct ib_device_modify:

struct ib_device_modify {
u64 sys_image_guid;
char node_desc[64];
};

  * sys_image_guid: A 64-bit value of the system image GUID.

  * node_desc: A NULL-terminated string that describes the node description.

### The ib_modify_port() Method

The ib_modify_port() method modifies the RDMA device port's attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify);

  * device: The RDMA device to be modified.

  * port_num: The port number to be modified.

  * port_modify_mask: The port's attributes to be changed. It is a bitwise OR of the masks:

  * IB_PORT_SHUTDOWN: Moves the port state to IB_PORT_DOWN.

  * IB_PORT_INIT_TYPE: Sets the port InitType value.

  * IB_PORT_RESET_QKEY_CNTR: Resets the port's Q_Key violation counter.

  * port_modify: The port attributes to be modified, as described in the next section.

#### The ib_port_modify Struct

The RDMA port attributes to be modified are represented by struct ib_port_modify:

struct ib_port_modify {
u32 set_port_cap_mask;
u32 clr_port_cap_mask;
u8 init_type;
};

  * set_port_cap_mask: The port capabilities bits to be set.

  * clr_port_cap_mask: The port capabilities bits to be cleared.

  * init_type: The InitType value to be set.

### The ib_find_gid() Method

The ib_find_gid() method finds the port number and the index where a specific GID value exists in the GID table. It will return 0 on success or the errno value with the reason for the failure.
int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index);

  * device: The RDMA device to be queried.

  * gid: A pointer to the GID to search for.

  * port_num: Will be filled with the port number that this GID exists in.

  * index: Will be filled with the index in the GID table that this GID exists in.

### The ib_find_pkey() Method

The ib_find_pkey() method finds the index where a specific P_Key value exists in the P_Key table of a specific port number. It will return 0 on success or the errno value with the reason for the failure.

int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index);

  * device: The RDMA device to be queried.

  * port_num: The port number to search the P_Key in.

  * pkey: The P_Key value to search for.

  * index: Will be filled with the index in the P_Key table where this P_Key exists.

### The rdma_node_get_transport() Method

The rdma_node_get_transport() method returns the RDMA transport type of a specific node type. The available transport types can be:

  * RDMA_TRANSPORT_IB: The transport is InfiniBand.

  * RDMA_TRANSPORT_IWARP: The transport is iWARP.

enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__;

  * node_type: The node type. Can be:

  * RDMA_NODE_IB_CA: Node type is an InfiniBand Channel Adapter.

  * RDMA_NODE_IB_SWITCH: Node type is an InfiniBand Switch.

  * RDMA_NODE_IB_ROUTER: Node type is an InfiniBand Router.

  * RDMA_NODE_RNIC: Node type is an RDMA NIC.

### The ib_mtu_enum_to_int() Method

The ib_mtu_enum_to_int() method returns the number of bytes, as an integer, for an MTU enumeration. It will return a positive value on success or -1 on failure.

static inline int ib_mtu_enum_to_int(enum ib_mtu mtu);

  * mtu: Can be an MTU enumeration, as described earlier.

### The ib_width_enum_to_int() Method

The ib_width_enum_to_int() method returns the width multiple, as an integer, for an IB port width enumeration. It will return a positive value on success or -1 on failure.

static inline int ib_width_enum_to_int(enum ib_port_width width);

  * width: Can be a port width enumeration, as described earlier.

### The ib_rate_to_mult() Method

The ib_rate_to_mult() method returns the multiple of the base rate of 2.5 Gbit/sec, as an integer, for an IB rate enumeration. It will return a positive value on success or -1 on failure.

int ib_rate_to_mult(enum ib_rate rate) __attribute_const__;

  * rate: The rate enumeration to be converted. Can be:

  * IB_RATE_PORT_CURRENT: The current port's rate.

  * IB_RATE_2_5_GBPS: Rate of 2.5 Gbit/sec.

  * IB_RATE_5_GBPS: Rate of 5 Gbit/sec.

  * IB_RATE_10_GBPS: Rate of 10 Gbit/sec.

  * IB_RATE_20_GBPS: Rate of 20 Gbit/sec.

  * IB_RATE_30_GBPS: Rate of 30 Gbit/sec.

  * IB_RATE_40_GBPS: Rate of 40 Gbit/sec.

  * IB_RATE_60_GBPS: Rate of 60 Gbit/sec.

  * IB_RATE_80_GBPS: Rate of 80 Gbit/sec.

  * IB_RATE_120_GBPS: Rate of 120 Gbit/sec.

  * IB_RATE_14_GBPS: Rate of 14 Gbit/sec.

  * IB_RATE_56_GBPS: Rate of 56 Gbit/sec.

  * IB_RATE_112_GBPS: Rate of 112 Gbit/sec.

  * IB_RATE_168_GBPS: Rate of 168 Gbit/sec.

  * IB_RATE_25_GBPS: Rate of 25 Gbit/sec.

  * IB_RATE_100_GBPS: Rate of 100 Gbit/sec.

  * IB_RATE_200_GBPS: Rate of 200 Gbit/sec.

  * IB_RATE_300_GBPS: Rate of 300 Gbit/sec.
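Before turning to the remaining rate-conversion helpers, here is a minimal sketch that ties together the query methods described earlier in this section: it queries the attributes, the link layer, and the first GID and P_Key table entries of a given port. The my_dump_port() name is hypothetical, and error handling is minimal:

```c
#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static void my_dump_port(struct ib_device *device, u8 port_num)
{
	struct ib_port_attr port_attr;
	union ib_gid gid;
	u16 pkey;

	if (ib_query_port(device, port_num, &port_attr))
		return;
	pr_info("%s port %d: state=%d lid=0x%x active_mtu=%d\n",
		device->name, port_num, port_attr.state,
		port_attr.lid, port_attr.active_mtu);

	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET)
		pr_info("port %d supports RoCE\n", port_num);

	/* Entry 0 of the GID table and of the P_Key table: */
	if (!ib_query_gid(device, port_num, 0, &gid))
		pr_info("GID[0] subnet prefix: 0x%llx\n",
			(unsigned long long)be64_to_cpu(gid.global.subnet_prefix));
	if (!ib_query_pkey(device, port_num, 0, &pkey))
		pr_info("P_Key[0]: 0x%x\n", pkey);
}
```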
### The ib_rate_to_mbps() Method

The ib_rate_to_mbps() method returns the number of Mbit/sec, as an integer, for an IB rate enumeration. It will return a positive value on success or -1 on failure.

int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__;

  * rate: The rate enumeration to be converted, as described earlier.

### The mult_to_ib_rate() Method

The mult_to_ib_rate() method returns the IB rate enumeration for a multiple of the base rate of 2.5 Gbit/sec. It will return a positive value on success or -1 on failure.

enum ib_rate mult_to_ib_rate(int mult) __attribute_const__;

  * mult: The rate multiple to be converted, as described earlier.

## Protection Domain (PD)

A PD is an RDMA resource that associates QPs and SRQs with MRs, and AHs with QPs. One can look at a PD as a color; for example, a red MR can work with a red QP, and a red AH can work with a red QP. Using a green AH with a red QP will result in an error.

### The ib_alloc_pd() Method

The ib_alloc_pd() method allocates a PD. It will return a pointer to the newly allocated PD on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_pd *ib_alloc_pd(struct ib_device *device);

  * device: The RDMA device that the PD will be associated with.

### The ib_dealloc_pd() Method

The ib_dealloc_pd() method deallocates a PD. It will return 0 on success or the errno value with the reason for the failure.

int ib_dealloc_pd(struct ib_pd *pd);

  * pd: The PD to be deallocated.

## eXtended Reliable Connected (XRC)

XRC is an IB transport extension that provides better scalability, on the sender side, for Reliable Connected QPs than the original Reliable Connected transport can provide. Using XRC will decrease the number of QPs between two specific cores: when using RC QPs, there is a QP for each core, in each machine. When using XRC, there will be one XRC QP in each host. When sending a message, the sender needs to specify the remote SRQ number that will receive the message.

### The ib_alloc_xrcd() Method

The ib_alloc_xrcd() method allocates an XRC domain. It will return a pointer to the newly created XRC domain on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);

  * device: The RDMA device that this XRC domain will be allocated on.

### The ib_dealloc_xrcd() Method

The ib_dealloc_xrcd() method deallocates an XRC domain. It will return 0 on success or the errno value with the reason for the failure:

int ib_dealloc_xrcd(struct ib_xrcd *xrcd);

  * xrcd: The XRC domain to be deallocated.

## Shared Receive Queue (SRQ)

An SRQ is a resource that helps RDMA to be more scalable. Instead of managing the Receive Requests in the Receive Queues of many QPs, it is possible to manage them in a single Receive Queue that all of them share. This will eliminate starvation in RC QPs or packet drops in unreliable transport types and will help to reduce the total number of posted Receive Requests, thus reducing the consumed memory. Furthermore, unlike a QP, an SRQ can have a watermark to allow a notification if the number of RRs in the SRQ drops below a specific value.

### The ib_srq_attr Struct

The SRQ attributes are represented by struct ib_srq_attr:

struct ib_srq_attr {
u32 max_wr;
u32 max_sge;
u32 srq_limit;
};

  * max_wr: The maximum number of outstanding RRs that this SRQ can hold.
  * max_sge: The maximum number of scatter/gather elements that each RR in the SRQ can hold.

  * srq_limit: The watermark limit that creates an asynchronous event if the number of RRs in the SRQ drops below this value.

### The ib_create_srq() Method

The ib_create_srq() method creates an SRQ. It will return a pointer to the newly created SRQ on success or an ERR_PTR() which specifies the reason for the failure:

struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr);

  * pd: The PD that this SRQ is being associated with.

  * srq_init_attr: The attributes that this SRQ will be created with.

#### The ib_srq_init_attr Struct

The created SRQ attributes are represented by struct ib_srq_init_attr:

struct ib_srq_init_attr {
void (*event_handler)(struct ib_event *, void *);
void *srq_context;
struct ib_srq_attr attr;
enum ib_srq_type srq_type;
union {
struct {
struct ib_xrcd *xrcd;
struct ib_cq *cq;
} xrc;
} ext;
};

  * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event on the SRQ.

  * srq_context: User-defined context that can be associated with the SRQ.

  * attr: The SRQ attributes, as described earlier.

  * srq_type: The type of the SRQ. Can be:

  * IB_SRQT_BASIC: For a regular SRQ.

  * IB_SRQT_XRC: For an XRC SRQ.

  * ext: If srq_type is IB_SRQT_XRC, specifies the XRC domain and the CQ that this SRQ is associated with.

### The ib_modify_srq() Method

The ib_modify_srq() method modifies the attributes of the SRQ. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask);

  * srq: The SRQ to be modified.

  * srq_attr: The SRQ attributes, as described earlier.

  * srq_attr_mask: The SRQ attributes to be changed. It is a bitwise OR of the masks:

  * IB_SRQ_MAX_WR: Modifies the number of RRs in the SRQ (that is, resizes the SRQ). This can be done only if the device supports SRQ resize, that is, if IB_DEVICE_SRQ_RESIZE is set in the device flags.

  * IB_SRQ_LIMIT: Sets the value of the SRQ watermark limit.

### The ib_query_srq() Method

The ib_query_srq() method queries for the current SRQ attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);

  * srq: The SRQ to be queried.

  * srq_attr: The SRQ attributes, as described earlier.

### The ib_destroy_srq() Method

The ib_destroy_srq() method destroys an SRQ. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_srq(struct ib_srq *srq);

  * srq: The SRQ to be destroyed.

### The ib_post_srq_recv() Method

The ib_post_srq_recv() method takes a linked list of Receive Requests and adds them to the SRQ for future processing. Every Receive Request is considered outstanding until a Work Completion is generated after its processing. It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

  * srq: The SRQ that the Receive Requests will be posted to.

  * recv_wr: A linked list of Receive Requests to be posted.
 * bad_recv_wr: If there was an error with the handling of the Receive Requests, this pointer will be filled with the address of the Receive Request that caused this error.

#### The ib_recv_wr Struct

The Receive Request is represented by struct ib_recv_wr:

    struct ib_recv_wr {
        struct ib_recv_wr *next;
        u64 wr_id;
        struct ib_sge *sg_list;
        int num_sge;
    };

 * next: A pointer to the next Receive Request in the list, or NULL if this is the last Receive Request.
 * wr_id: A 64-bit value that is associated with this Receive Request and will be available in the corresponding Work Completion.
 * sg_list: The array of the scatter/gather elements, as described in the next section.
 * num_sge: The number of entries in sg_list. The value zero means that the message to be received is zero bytes long.

#### The ib_sge Struct

The scatter/gather element is represented by struct ib_sge:

    struct ib_sge {
        u64 addr;
        u32 length;
        u32 lkey;
    };

 * addr: The address of the buffer to access.
 * length: The length of the buffer to access.
 * lkey: The Local Key of the Memory Region that this buffer was registered with.

## Address Handle (AH)

AH is an RDMA resource that describes the path from the local port to the remote port of the destination. It is used with UD QPs.

### The ib_ah_attr Struct

The AH attributes are represented by struct ib_ah_attr:

    struct ib_ah_attr {
        struct ib_global_route grh;
        u16 dlid;
        u8 sl;
        u8 src_path_bits;
        u8 static_rate;
        u8 ah_flags;
        u8 port_num;
    };

 * grh: The Global Routing Header attributes that are used for sending messages to another subnet or to a multicast group in the local or remote subnet.
 * dlid: The destination LID.
 * sl: The Service Level that this message will use.
 * src_path_bits: The source path bits to use. Relevant if LMC is used in this port.
 * static_rate: The level of delay that should be applied between sending the messages. It is used when sending a message to a remote node that supports a slower message rate than the local node.
 * ah_flags: The AH flags. It is a bitwise OR of the masks:
    * IB_AH_GRH: GRH is used in this AH.
 * port_num: The local port number that messages will be sent from.

### The ib_create_ah() Method

The ib_create_ah() method creates an AH. It will return a pointer to the newly created AH on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);

 * pd: The PD that this AH is being associated with.
 * ah_attr: The attributes that this AH will be created with.

### The ib_init_ah_from_wc() Method

The ib_init_ah_from_wc() method initializes an AH attribute structure from a Work Completion and a GRH structure. This is done in order to send a reply for an incoming message on a UD QP. It will return 0 on success or the errno value with the reason for the failure.

int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, struct ib_grh *grh, struct ib_ah_attr *ah_attr);

 * device: The RDMA device that the Work Completion came from and that the AH will be created on.
 * port_num: The port number that the Work Completion came from and the AH will be associated with.
 * wc: The Work Completion of the incoming message.
 * grh: The GRH buffer of the incoming message.
 * ah_attr: The attributes of this AH to be filled.
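Putting the AH attributes together, the following is a minimal sketch (illustrative, not taken from a specific driver) of creating an AH toward a unicast peer in the local subnet. The function name and the destination parameters (remote_dlid, port_num) are hypothetical and assumed to have been obtained earlier, for example from a path query:

    /* Illustrative sketch: create an AH for a UD QP toward a peer in the
     * local subnet. pd was allocated with ib_alloc_pd(); remote_dlid and
     * port_num were learned out of band (e.g., from a path query).
     */
    struct ib_ah *create_unicast_ah(struct ib_pd *pd, u16 remote_dlid,
                                    u8 port_num)
    {
        struct ib_ah_attr ah_attr;
        struct ib_ah *ah;

        memset(&ah_attr, 0, sizeof(ah_attr));
        ah_attr.dlid = remote_dlid;   /* destination LID */
        ah_attr.sl = 0;               /* Service Level 0 */
        ah_attr.port_num = port_num;  /* local port to send from */
        /* ah_flags stays 0: no GRH, since the peer is in the local subnet */

        ah = ib_create_ah(pd, &ah_attr);
        return IS_ERR(ah) ? NULL : ah; /* PTR_ERR(ah) holds the reason */
    }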
### The ib_create_ah_from_wc() Method

The ib_create_ah_from_wc() method creates an AH from a Work Completion and a GRH structure. This is done in order to send a reply for an incoming message on a UD QP. It will return a pointer to the newly created AH on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, struct ib_grh *grh, u8 port_num);

 * pd: The PD that this AH is being associated with.
 * wc: The Work Completion of the incoming message.
 * grh: The GRH buffer of the incoming message.
 * port_num: The port number that the Work Completion came from and the AH will be associated with.

### The ib_modify_ah() Method

The ib_modify_ah() method modifies the attributes of the AH. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

 * ah: The AH to be modified.
 * ah_attr: The AH attributes, as described earlier.

### The ib_query_ah() Method

The ib_query_ah() method queries for the current AH attributes. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);

 * ah: The AH to be queried.
 * ah_attr: The AH attributes, as described earlier.

### The ib_destroy_ah() Method

The ib_destroy_ah() method destroys an AH. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_ah(struct ib_ah *ah);

 * ah: The AH to be destroyed.

## Multicast Groups

Multicast groups are a means to send a message from one UD QP to many UD QPs. Every UD QP that wants to get this message needs to be attached to the multicast group.

### The ib_attach_mcast() Method

The ib_attach_mcast() method attaches a UD QP to a multicast group within an RDMA device. It will return 0 on success or the errno value with the reason for the failure.

int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

 * qp: A handle of the UD QP to be attached to the multicast group.
 * gid: The GID of the multicast group that the QP will be added to.
 * lid: The LID of the multicast group that the QP will be added to.

### The ib_detach_mcast() Method

The ib_detach_mcast() method detaches a UD QP from a multicast group within an RDMA device. It will return 0 on success or the errno value with the reason for the failure.

int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

 * qp: A handle of the UD QP to be detached from the multicast group.
 * gid: The GID of the multicast group that the QP will be removed from.
 * lid: The LID of the multicast group that the QP will be removed from.

## Completion Queue (CQ)

A Work Completion specifies that a corresponding Work Request was completed and provides some information about it: its status, the used opcode, its size, and so on. A CQ is an object that consists of Work Completions.

### The ib_create_cq() Method

The ib_create_cq() method creates a CQ. It will return a pointer to the newly created CQ on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, int cqe, int comp_vector);

 * device: The RDMA device that this CQ is being associated with.
 * comp_handler: A pointer to a callback that will be called when a Completion event occurs on the CQ.
 * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event to the CQ.
 * cq_context: A user-defined context that can be associated with the CQ.
 * cqe: The requested number of Work Completions that this CQ can hold.
 * comp_vector: The index of the RDMA device's completion vector to work on. If the IRQ affinity masks of these interrupts are spread across the cores, this value can be used to spread the completion workload over all of the cores.

### The ib_resize_cq() Method

The ib_resize_cq() method changes the size of the CQ to hold at least the requested number of Work Completions, either by increasing or decreasing the CQ size. Even if the user asks for a specific new size, the CQ may end up being resized to a different (larger) value.

int ib_resize_cq(struct ib_cq *cq, int cqe);

 * cq: The CQ to be resized.
 * cqe: The requested number of Work Completions that this CQ can hold. This value cannot be lower than the number of Work Completions that currently exist in the CQ.

### The ib_modify_cq() Method

The ib_modify_cq() method changes the moderation parameters of a CQ. A Completion event will be generated when at least a specific number of Work Completions enter the CQ, or when a timeout expires. Using it may help to reduce the number of interrupts that the RDMA device generates. It will return 0 on success or the -errno value with the reason for the failure.

int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);

 * cq: The CQ to be modified.
 * cq_count: The number of Work Completions that must be added to the CQ, since the last Completion event, in order to trigger a CQ event.
 * cq_period: The number of microseconds that must pass, since the last Completion event, in order to trigger a CQ event.

### The ib_peek_cq() Method

The ib_peek_cq() method returns the number of available Work Completions in the CQ. If the number of Work Completions in the CQ is equal to or greater than wc_cnt, it will return wc_cnt. Otherwise it will return the actual number of Work Completions in the CQ. If an error occurred, it will return the errno value with the reason for the failure.

int ib_peek_cq(struct ib_cq *cq, int wc_cnt);

 * cq: The CQ to peek into.
 * wc_cnt: The number of Work Completions to ask about.

### The ib_req_notify_cq() Method

The ib_req_notify_cq() method requests that a Completion event notification be created. Its return value can be:

 * 0: This means that the notification was requested successfully. If IB_CQ_REPORT_MISSED_EVENTS was used, then a return value of 0 means that there aren't any missed events.
 * A positive value is returned only when IB_CQ_REPORT_MISSED_EVENTS is used and there are missed events. The user should call the ib_poll_cq() method in order to read the Work Completions that exist in the CQ.
 * A negative value is returned when an error occurred. The -errno value is returned, specifying the reason for the failure.

static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);

 * cq: The CQ that this Completion event will be generated for.
 * flags: Information about the Work Completion that will cause the Completion event notification to be created. Can be one of:
    * IB_CQ_NEXT_COMP: The next Work Completion that is added to the CQ, after calling this method, will trigger the CQ event.
    * IB_CQ_SOLICITED: The next Solicited Work Completion that is added to the CQ, after calling this method, will trigger the CQ event.

Both of those values can be bitwise ORed with IB_CQ_REPORT_MISSED_EVENTS in order to request a hint about missed events (that is, Work Completions that were already in the CQ when this method was called).

### The ib_req_ncomp_notif() Method

The ib_req_ncomp_notif() method requests that a Completion event notification be created when the number of Work Completions in the CQ equals wc_cnt. It will return 0 on success, or the errno value with the reason for the failure.

static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt);

 * cq: The CQ that this Completion event will be generated for.
 * wc_cnt: The number of Work Completions that the CQ will hold before a Completion event notification is generated.

### The ib_poll_cq() Method

The ib_poll_cq() method polls Work Completions from a CQ. It reads the Work Completions from the CQ and removes them. The Work Completions are read in the order they were added to the CQ. It will return 0 or a positive number indicating the number of Work Completions that were read, or the -errno value with the reason for the failure.

static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);

 * cq: The CQ to be polled.
 * num_entries: The maximum number of Work Completions to be polled.
 * wc: An array in which the polled Work Completions will be stored.

#### The ib_wc Struct

Every Work Completion is represented by struct ib_wc:

    struct ib_wc {
        u64 wr_id;
        enum ib_wc_status status;
        enum ib_wc_opcode opcode;
        u32 vendor_err;
        u32 byte_len;
        struct ib_qp *qp;
        union {
            __be32 imm_data;
            u32 invalidate_rkey;
        } ex;
        u32 src_qp;
        int wc_flags;
        u16 pkey_index;
        u16 slid;
        u8 sl;
        u8 dlid_path_bits;
        u8 port_num;
    };

 * wr_id: A 64-bit value that was associated with the corresponding Work Request.
 * status: Status of the ended Work Request. Can be:
    * IB_WC_SUCCESS: Operation completed successfully.
    * IB_WC_LOC_LEN_ERR: Local length error. Either the sent message is too big to be handled or the incoming message is bigger than the available Receive Request.
    * IB_WC_LOC_QP_OP_ERR: Local QP operation error. An internal QP consistency error was detected while processing a Work Request.
    * IB_WC_LOC_EEC_OP_ERR: Local EE context operation error. Deprecated, since RD QPs aren't supported.
    * IB_WC_LOC_PROT_ERR: Local protection error. The protection of the Work Request buffers is invalid for the requested operation.
    * IB_WC_WR_FLUSH_ERR: Work Request flushed error. The Work Request was completed while the QP was in the Error state.
    * IB_WC_MW_BIND_ERR: Memory Windows bind error. The Memory Window binding operation failed.
    * IB_WC_BAD_RESP_ERR: Bad response error. An unexpected transport layer opcode was returned by the responder.
    * IB_WC_LOC_ACCESS_ERR: Local access error. A protection error occurred on local buffers during the processing of an RDMA Write With Immediate message.
    * IB_WC_REM_INV_REQ_ERR: Remote invalid request error. The incoming message is invalid.
    * IB_WC_REM_ACCESS_ERR: Remote access error. A protection error occurred during an incoming RDMA operation.
    * IB_WC_REM_OP_ERR: Remote operation error. The incoming operation couldn't be completed successfully.
    * IB_WC_RETRY_EXC_ERR: Transport retry counter exceeded. The remote QP didn't send any Ack or Nack, and the timeout expired after the message retransmissions.
    * IB_WC_RNR_RETRY_EXC_ERR: RNR retry exceeded. The RNR NACK retry count was exceeded.
    * IB_WC_LOC_RDD_VIOL_ERR: Local RDD violation error. Deprecated, since RD QPs aren't supported.
    * IB_WC_REM_INV_RD_REQ_ERR: Remote invalid RD request. Deprecated, since RD QPs aren't supported.
    * IB_WC_REM_ABORT_ERR: Remote aborted error. The responder aborted the operation.
    * IB_WC_INV_EECN_ERR: Invalid EE Context number. Deprecated, since RD QPs aren't supported.
    * IB_WC_INV_EEC_STATE_ERR: Invalid EE context state error. Deprecated, since RD QPs aren't supported.
    * IB_WC_FATAL_ERR: Fatal error.
    * IB_WC_RESP_TIMEOUT_ERR: Response timeout error.
    * IB_WC_GENERAL_ERR: General error. Any other error which isn't covered by one of the earlier errors.
 * opcode: The operation of the corresponding Work Request that was ended with this Work Completion. Can be:
    * IB_WC_SEND: A Send operation was completed on the sender side.
    * IB_WC_RDMA_WRITE: An RDMA Write operation was completed on the sender side.
    * IB_WC_RDMA_READ: An RDMA Read operation was completed on the sender side.
    * IB_WC_COMP_SWAP: A Compare and Swap operation was completed on the sender side.
    * IB_WC_FETCH_ADD: A Fetch and Add operation was completed on the sender side.
    * IB_WC_BIND_MW: A Memory bind operation was completed on the sender side.
    * IB_WC_LSO: A Send operation with Large Send Offload (LSO) was completed on the sender side.
    * IB_WC_LOCAL_INV: A Local invalidate operation was completed on the sender side.
    * IB_WC_FAST_REG_MR: A Fast registration operation was completed on the sender side.
    * IB_WC_MASKED_COMP_SWAP: A Masked Compare and Swap operation was completed on the sender side.
    * IB_WC_MASKED_FETCH_ADD: A Masked Fetch and Add operation was completed on the sender side.
    * IB_WC_RECV: A Receive Request for an incoming Send operation was completed on the receiver side.
    * IB_WC_RECV_RDMA_WITH_IMM: A Receive Request for an incoming RDMA Write with immediate operation was completed on the receiver side.
 * vendor_err: A vendor-specific value that provides extra information about the reason for the error.
 * byte_len: If this Work Completion was created by the completion of a Receive Request, the byte_len value indicates the number of bytes that were received.
 * qp: Handle of the QP that got the Work Completion. It is useful when QPs are associated with an SRQ; this way you can know which QP's incoming message consumed the Receive Request from the SRQ.
 * ex.imm_data: Out-of-band data (32 bits), in network order, that was sent with the message. It is available if IB_WC_WITH_IMM is set in wc_flags.
 * ex.invalidate_rkey: The rkey that was invalidated. It is available if IB_WC_WITH_INVALIDATE is set in wc_flags.
 * src_qp: Source QP number, that is, the QP number that sent this message. Only relevant for UD QPs.
 * wc_flags: Flags that provide information about the Work Completion. It is a bitwise OR of the masks:
    * IB_WC_GRH: Indicator that the received message has a GRH and that the first 40 bytes of the Receive Request buffers contain it. Only relevant for UD QPs.
    * IB_WC_WITH_IMM: Indicator that the received message has immediate data.
    * IB_WC_WITH_INVALIDATE: Indicator that a Send with Invalidate message was received.
    * IB_WC_IP_CSUM_OK: Indicator that the received message passed the IP checksum test done by the RDMA device. This is available only if the RDMA device supports IP checksum offload, that is, if IB_DEVICE_UD_IP_CSUM is set in the device flags.
 * pkey_index: The P_Key index; relevant only for GSI QPs.
 * slid: The source LID of the message. Only relevant for UD QPs.
 * sl: The Service Level of the message. Only relevant for UD QPs.
 * dlid_path_bits: The destination LID path bits. Only relevant for UD QPs.
 * port_num: The port number from which the message came in. Only relevant for Direct Route SMPs on switches.
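To tie ib_poll_cq() and ib_req_notify_cq() together, here is a minimal sketch (illustrative, not from a specific driver) of the common drain-and-rearm loop; the batch size of 16 and the logging are arbitrary choices:

    /* Illustrative sketch: read all available Work Completions, then
     * rearm the notification; loop again if completions slipped in
     * between the last poll and the rearm request.
     */
    static void drain_cq(struct ib_cq *cq)
    {
        struct ib_wc wc[16];
        int n, i;

        do {
            while ((n = ib_poll_cq(cq, 16, wc)) > 0) {
                for (i = 0; i < n; i++) {
                    if (wc[i].status != IB_WC_SUCCESS)
                        pr_err("wr_id %llu failed: status %d\n",
                               (unsigned long long)wc[i].wr_id,
                               wc[i].status);
                    /* ... handle wc[i].opcode here ... */
                }
            }
            /* a positive return value means there are missed events */
        } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
                                  IB_CQ_REPORT_MISSED_EVENTS) > 0);
    }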
### The ib_destroy_cq() Method

The ib_destroy_cq() method destroys a CQ. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_cq(struct ib_cq *cq);

 * cq: The CQ to be destroyed.

## Queue Pair (QP)

QP is a resource that combines two Work Queues together: the Send Queue and the Receive Queue. Each queue acts as a FIFO: WRs that are posted to a Work Queue will be processed in the order of their arrival. However, there isn't any guarantee about the ordering between the two Queues. This is the resource that sends and receives packets.

### The ib_qp_cap Struct

The QP's Work Queue sizes are represented by struct ib_qp_cap:

    struct ib_qp_cap {
        u32 max_send_wr;
        u32 max_recv_wr;
        u32 max_send_sge;
        u32 max_recv_sge;
        u32 max_inline_data;
    };

 * max_send_wr: The maximum number of outstanding Work Requests that this QP can hold in the Send Queue.
 * max_recv_wr: The maximum number of outstanding Work Requests that this QP can hold in the Receive Queue. This value is ignored if the QP is associated with an SRQ.
 * max_send_sge: The maximum number of scatter/gather elements that each Work Request in the Send Queue will be able to hold.
 * max_recv_sge: The maximum number of scatter/gather elements that each Work Request in the Receive Queue will be able to hold.
 * max_inline_data: The maximum message size that can be sent inline.

### The ib_create_qp() Method

The ib_create_qp() method creates a QP. It will return a pointer to the newly created QP on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr);

 * pd: The PD that this QP is being associated with.
 * qp_init_attr: The attributes that this QP will be created with.

#### The ib_qp_init_attr Struct

The created QP attributes are represented by struct ib_qp_init_attr:

    struct ib_qp_init_attr {
        void (*event_handler)(struct ib_event *, void *);
        void *qp_context;
        struct ib_cq *send_cq;
        struct ib_cq *recv_cq;
        struct ib_srq *srq;
        struct ib_xrcd *xrcd;  /* XRC TGT QPs only */
        struct ib_qp_cap cap;
        enum ib_sig_type sq_sig_type;
        enum ib_qp_type qp_type;
        enum ib_qp_create_flags create_flags;
        u8 port_num;  /* special QP types only */
    };

 * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event to the QP.
 * qp_context: User-defined context that can be associated with the QP.
 * send_cq: The CQ that is being associated with the Send Queue of this QP.
 * recv_cq: The CQ that is being associated with the Receive Queue of this QP.
 * srq: The SRQ that is being associated with the Receive Queue of this QP, or NULL if the QP isn't associated with an SRQ.
 * xrcd: The XRC domain that this QP will be associated with. Relevant only if qp_type is IB_QPT_XRC_TGT.
 * cap: A structure that describes the sizes of the Send and Receive Queues. This structure is described earlier.
 * sq_sig_type: The signaling type of the Send Queue. It can be:
    * IB_SIGNAL_ALL_WR: Every Send Request posted to the Send Queue will end with a Work Completion.
    * IB_SIGNAL_REQ_WR: Only Send Requests posted to the Send Queue with an explicit request, that is, with the IB_SEND_SIGNALED flag set, will end with a Work Completion. This is called selective signaling.
 * qp_type: The QP transport type. Can be:
    * IB_QPT_SMI: A Subnet Management Interface QP.
    * IB_QPT_GSI: A General Service Interface QP.
    * IB_QPT_RC: A Reliable Connected QP.
    * IB_QPT_UC: An Unreliable Connected QP.
    * IB_QPT_UD: An Unreliable Datagram QP.
    * IB_QPT_RAW_IPV6: An IPv6 raw datagram QP.
    * IB_QPT_RAW_ETHERTYPE: An EtherType raw datagram QP.
    * IB_QPT_RAW_PACKET: A raw packet QP.
    * IB_QPT_XRC_INI: An XRC-initiator QP.
    * IB_QPT_XRC_TGT: An XRC-target QP.
 * create_flags: QP attribute flags. It is a bitwise OR of the masks:
    * IB_QP_CREATE_IPOIB_UD_LSO: The QP will be used to send IPoIB LSO messages.
    * IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK: Block loopback multicast packets.
 * port_num: The RDMA device port number that this QP is associated with. Only relevant when qp_type is IB_QPT_SMI or IB_QPT_GSI.
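As an illustration of the attributes above, here is a minimal sketch (not from a specific driver) of creating an RC QP that uses one CQ for both the Send and the Receive Queues; the queue sizes are arbitrary, and pd and cq are assumed to have been created earlier:

    /* Illustrative sketch: create an RC QP with modest queue sizes. */
    static struct ib_qp *create_rc_qp(struct ib_pd *pd, struct ib_cq *cq)
    {
        struct ib_qp_init_attr init_attr;
        struct ib_qp *qp;

        memset(&init_attr, 0, sizeof(init_attr));
        init_attr.send_cq = cq;
        init_attr.recv_cq = cq;
        init_attr.qp_type = IB_QPT_RC;
        init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; /* every SR ends with a WC */
        init_attr.cap.max_send_wr = 64;
        init_attr.cap.max_recv_wr = 64;
        init_attr.cap.max_send_sge = 1;
        init_attr.cap.max_recv_sge = 1;

        qp = ib_create_qp(pd, &init_attr);
        return IS_ERR(qp) ? NULL : qp;
    }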
### The ib_modify_qp() Method

The ib_modify_qp() method modifies the attributes of the QP. It will return 0 on success or the errno value with the reason for the failure.

int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask);

 * qp: The QP to be modified.
 * qp_attr: The QP attributes, as described in the next section.
 * qp_attr_mask: The QP attributes to be changed. Each mask specifies the attributes that will be modified in this QP transition, that is, which fields in qp_attr will be used. It is a bitwise OR of the masks:
    * IB_QP_STATE: Modifies the QP state, specified in the qp_state field.
    * IB_QP_CUR_STATE: Modifies the assumed current QP state, specified in the cur_qp_state field.
    * IB_QP_EN_SQD_ASYNC_NOTIFY: Modifies the status of the request for a notification when the QP state is SQD.drained, specified in the en_sqd_async_notify field.
    * IB_QP_ACCESS_FLAGS: Modifies the allowed incoming remote operations, specified in the qp_access_flags field.
    * IB_QP_PKEY_INDEX: Modifies the index in the P_Key table that this QP is associated with in the primary path, specified in the pkey_index field.
    * IB_QP_PORT: Modifies the RDMA device's port number that the QP's primary path is associated with, specified in the port_num field.
    * IB_QP_QKEY: Modifies the Q_Key of the QP, specified in the qkey field.
    * IB_QP_AV: Modifies the Address Vector attributes of the QP, specified in the ah_attr field.
    * IB_QP_PATH_MTU: Modifies the MTU of the path, specified in the path_mtu field.
    * IB_QP_TIMEOUT: Modifies the timeout to wait before retransmission, specified in the timeout field.
    * IB_QP_RETRY_CNT: Modifies the number of retries of the QP for lack of Ack/Nack, specified in the retry_cnt field.
    * IB_QP_RNR_RETRY: Modifies the number of RNR retries of the QP, specified in the rnr_retry field.
    * IB_QP_RQ_PSN: Modifies the start PSN of the received packets, specified in the rq_psn field.
    * IB_QP_MAX_QP_RD_ATOMIC: Modifies the number of RDMA Read and Atomic operations that this QP can process in parallel as an initiator, specified in the max_rd_atomic field.
    * IB_QP_ALT_PATH: Modifies the alternate path of the QP, specified in the alt_ah_attr, alt_pkey_index, alt_port_num, and alt_timeout fields.
    * IB_QP_MIN_RNR_TIMER: Modifies the minimum RNR timer that the QP will report to the remote side in an RNR Nak, specified in the min_rnr_timer field.
    * IB_QP_SQ_PSN: Modifies the start PSN of the sent packets, specified in the sq_psn field.
    * IB_QP_MAX_DEST_RD_ATOMIC: Modifies the number of RDMA Read and Atomic operations that this QP can process in parallel as a destination, specified in the max_dest_rd_atomic field.
    * IB_QP_PATH_MIG_STATE: Modifies the state of the path migration state machine, specified in the path_mig_state field.
    * IB_QP_CAP: Modifies the sizes of the Work Queues in the QP (both the Send and Receive Queues), specified in the cap field.
    * IB_QP_DEST_QPN: Modifies the destination QP number, specified in the dest_qp_num field.

#### The ib_qp_attr Struct

The QP attributes are represented by struct ib_qp_attr:

    struct ib_qp_attr {
        enum ib_qp_state qp_state;
        enum ib_qp_state cur_qp_state;
        enum ib_mtu path_mtu;
        enum ib_mig_state path_mig_state;
        u32 qkey;
        u32 rq_psn;
        u32 sq_psn;
        u32 dest_qp_num;
        int qp_access_flags;
        struct ib_qp_cap cap;
        struct ib_ah_attr ah_attr;
        struct ib_ah_attr alt_ah_attr;
        u16 pkey_index;
        u16 alt_pkey_index;
        u8 en_sqd_async_notify;
        u8 sq_draining;
        u8 max_rd_atomic;
        u8 max_dest_rd_atomic;
        u8 min_rnr_timer;
        u8 port_num;
        u8 timeout;
        u8 retry_cnt;
        u8 rnr_retry;
        u8 alt_port_num;
        u8 alt_timeout;
    };

 * qp_state: The state to move the QP to. Can be:
    * IB_QPS_RESET: Reset state.
    * IB_QPS_INIT: Initialized state.
    * IB_QPS_RTR: Ready To Receive state.
    * IB_QPS_RTS: Ready To Send state.
    * IB_QPS_SQD: Send Queue Drained state.
    * IB_QPS_SQE: Send Queue Error state.
    * IB_QPS_ERR: Error state.
 * cur_qp_state: The assumed current state of the QP. Can take the same values as qp_state.
 * path_mtu: The size of the MTU in the path. Can be:
    * IB_MTU_256: 256 bytes.
    * IB_MTU_512: 512 bytes.
    * IB_MTU_1024: 1,024 bytes.
    * IB_MTU_2048: 2,048 bytes.
    * IB_MTU_4096: 4,096 bytes.
 * path_mig_state: The path migration state machine, used in APM (Automatic Path Migration). Can be:
    * IB_MIG_MIGRATED: Migrated. The state machine of path migration is Migrated (the initial state, or a migration was done).
    * IB_MIG_REARM: Rearm. The state machine of path migration is Rearm (an attempt to coordinate with the remote RC QP to move both the local and remote QPs to the Armed state).
    * IB_MIG_ARMED: Armed. The state machine of path migration is Armed (both the local and remote QPs are ready to perform a path migration).
 * qkey: The Q_Key of the QP.
 * rq_psn: The expected PSN of the first packet in the Receive Queue. The value is 24 bits.
 * sq_psn: The PSN used for the first packet in the Send Queue. The value is 24 bits.
 * dest_qp_num: The QP number on the remote (destination) side. The value is 24 bits.
 * qp_access_flags: The allowed incoming RDMA and Atomic operations. It is a bitwise OR of the masks:
    * IB_ACCESS_REMOTE_WRITE: Incoming RDMA Write operations are allowed.
    * IB_ACCESS_REMOTE_READ: Incoming RDMA Read operations are allowed.
    * IB_ACCESS_REMOTE_ATOMIC: Incoming Atomic operations are allowed.
 * cap: The QP size, that is, the number of Work Requests in the Receive and Send Queues. Modifying this can be done only if the device supports QP resize, that is, if IB_DEVICE_RESIZE_MAX_WR is set in the device flags. This structure is described earlier.
 * ah_attr: Address vector of the primary path of the QP. This structure is described earlier.
 * alt_ah_attr: Address vector of the alternate path of the QP. This structure is described earlier.
 * pkey_index: The P_Key index of the primary path that this QP is associated with.
 * alt_pkey_index: The P_Key index of the alternate path that this QP is associated with.
 * en_sqd_async_notify: If the value isn't zero, it requests that the asynchronous event callback be called when the QP moves to the SQD.drained state.
 * sq_draining: Relevant only for ib_query_qp(). If the value isn't zero, the QP is in the SQD.draining state (and not SQD.drained).
 * max_rd_atomic: The number of RDMA Read and Atomic operations that this QP can process in parallel as an initiator.
 * max_dest_rd_atomic: The number of RDMA Read and Atomic operations that this QP can process in parallel as a destination.
 * min_rnr_timer: The minimum RNR timer that the QP will report to the remote side in an RNR Nak, that is, the delay the remote side should apply before resending the message. The value can be:
    * IB_RNR_TIMER_655_36: Delay of 655.36 milliseconds.
    * IB_RNR_TIMER_000_01: Delay of 0.01 milliseconds.
    * IB_RNR_TIMER_000_02: Delay of 0.02 milliseconds.
    * IB_RNR_TIMER_000_03: Delay of 0.03 milliseconds.
    * IB_RNR_TIMER_000_04: Delay of 0.04 milliseconds.
    * IB_RNR_TIMER_000_06: Delay of 0.06 milliseconds.
    * IB_RNR_TIMER_000_08: Delay of 0.08 milliseconds.
    * IB_RNR_TIMER_000_12: Delay of 0.12 milliseconds.
    * IB_RNR_TIMER_000_16: Delay of 0.16 milliseconds.
    * IB_RNR_TIMER_000_24: Delay of 0.24 milliseconds.
    * IB_RNR_TIMER_000_32: Delay of 0.32 milliseconds.
    * IB_RNR_TIMER_000_48: Delay of 0.48 milliseconds.
    * IB_RNR_TIMER_000_64: Delay of 0.64 milliseconds.
    * IB_RNR_TIMER_000_96: Delay of 0.96 milliseconds.
    * IB_RNR_TIMER_001_28: Delay of 1.28 milliseconds.
    * IB_RNR_TIMER_001_92: Delay of 1.92 milliseconds.
    * IB_RNR_TIMER_002_56: Delay of 2.56 milliseconds.
    * IB_RNR_TIMER_003_84: Delay of 3.84 milliseconds.
    * IB_RNR_TIMER_005_12: Delay of 5.12 milliseconds.
    * IB_RNR_TIMER_007_68: Delay of 7.68 milliseconds.
    * IB_RNR_TIMER_010_24: Delay of 10.24 milliseconds.
    * IB_RNR_TIMER_015_36: Delay of 15.36 milliseconds.
    * IB_RNR_TIMER_020_48: Delay of 20.48 milliseconds.
    * IB_RNR_TIMER_030_72: Delay of 30.72 milliseconds.
    * IB_RNR_TIMER_040_96: Delay of 40.96 milliseconds.
    * IB_RNR_TIMER_061_44: Delay of 61.44 milliseconds.
    * IB_RNR_TIMER_081_92: Delay of 81.92 milliseconds.
    * IB_RNR_TIMER_122_88: Delay of 122.88 milliseconds.
    * IB_RNR_TIMER_163_84: Delay of 163.84 milliseconds.
    * IB_RNR_TIMER_245_76: Delay of 245.76 milliseconds.
    * IB_RNR_TIMER_327_68: Delay of 327.68 milliseconds.
    * IB_RNR_TIMER_491_52: Delay of 491.52 milliseconds.
 * port_num: The RDMA device's port number that this QP is associated with in the primary path.
 * timeout: The timeout to wait before resending the message if the remote side didn't respond with any Ack or Nack in the primary path. This is a 5-bit value; 0 means infinite time, and any other value means that the timeout will be 4.096 * 2 ^ timeout usec.
 * retry_cnt: The number of times to (re)send the message if the remote side didn't respond with any Ack or Nack.
 * rnr_retry: The number of times to (re)send the message if the remote side answered with an RNR Nack. This is a 3-bit value; 7 means infinite retry.
 * alt_port_num: The RDMA device's port number that this QP is associated with in the alternate path.
 * alt_timeout: The timeout to wait before resending the message if the remote side didn't respond with any Ack or Nack in the alternate path. This is a 5-bit value; 0 means infinite time, and any other value means that the timeout will be 4.096 * 2 ^ timeout usec.

### The ib_query_qp() Method

The ib_query_qp() method queries for the current QP attributes. Some of the attributes in qp_attr may change in subsequent calls to ib_query_qp(), for example the state fields. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);

 * qp: The QP to be queried.
 * qp_attr: The QP attributes, as described earlier.
 * qp_attr_mask: The mask of the mandatory attributes to query. Low-level drivers can use it as a hint for the fields to be queried, but they may also ignore it and fill the whole structure.
 * qp_init_attr: The QP init attributes, as described earlier.

### The ib_destroy_qp() Method

The ib_destroy_qp() method destroys a QP. It will return 0 on success or the errno value with the reason for the failure.

int ib_destroy_qp(struct ib_qp *qp);

 * qp: The QP to be destroyed.

### The ib_open_qp() Method

The ib_open_qp() method obtains a reference to an existing sharable QP among multiple processes. The process that created the QP may exit, allowing the transfer of ownership of the QP to another process. It will return a pointer to the sharable QP on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr);

 * xrcd: The XRC domain that the QP will be associated with.
 * qp_open_attr: The attributes of the existing QP to be opened.

#### The ib_qp_open_attr Struct

The shared QP attributes are represented by struct ib_qp_open_attr:

    struct ib_qp_open_attr {
        void (*event_handler)(struct ib_event *, void *);
        void *qp_context;
        u32 qp_num;
        enum ib_qp_type qp_type;
    };

 * event_handler: A pointer to a callback that will be called in case of an affiliated asynchronous event to the QP.
 * qp_context: User-defined context that can be associated with the QP.
 * qp_num: The QP number of the QP to be opened.
 * qp_type: The QP transport type. Only IB_QPT_XRC_TGT is supported.

### The ib_close_qp() Method

The ib_close_qp() method releases an external reference to a QP. The underlying shared QP won't be destroyed until all internal references that were acquired by the ib_open_qp() method are released. It will return 0 on success or the errno value with the reason for the failure.

int ib_close_qp(struct ib_qp *qp);

 * qp: The QP to be closed.

### The ib_post_recv() Method

The ib_post_recv() method takes a linked list of Receive Requests and adds them to the Receive Queue for future processing. Every Receive Request is considered outstanding until a Work Completion is generated after its processing. It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr);

 * qp: The QP that the Receive Requests will be posted to.
 * recv_wr: A linked list of Receive Requests to be posted.
 * bad_recv_wr: If there was an error with the handling of the Receive Requests, this pointer will be filled with the address of the Receive Request that caused this error.
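As an example of how qp_attr and qp_attr_mask work together, here is a minimal sketch (illustrative) of the first ib_modify_qp() transition of an RC QP, from the Reset state to the Init state; later calls, with the path and PSN attributes described above, move the QP to RTR and then to RTS:

    /* Illustrative sketch: move a freshly created RC QP to the INIT state. */
    static int qp_to_init(struct ib_qp *qp, u8 port_num)
    {
        struct ib_qp_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.qp_state = IB_QPS_INIT;
        attr.pkey_index = 0;       /* first entry of the P_Key table */
        attr.port_num = port_num;  /* port of the primary path */
        attr.qp_access_flags = IB_ACCESS_REMOTE_READ |
                               IB_ACCESS_REMOTE_WRITE;

        return ib_modify_qp(qp, &attr,
                            IB_QP_STATE | IB_QP_PKEY_INDEX |
                            IB_QP_PORT | IB_QP_ACCESS_FLAGS);
    }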
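The following minimal sketch (illustrative) posts a single Receive Request; buf_dma is assumed to be a DMA address that was mapped with one of the ib_dma_*() methods described later in this chapter, and lkey is the local key of the MR covering it:

    /* Illustrative sketch: post one Receive Request to a QP. */
    static int post_one_recv(struct ib_qp *qp, u64 buf_dma, u32 len, u32 lkey)
    {
        struct ib_sge sge;
        struct ib_recv_wr wr, *bad_wr;

        sge.addr = buf_dma;
        sge.length = len;
        sge.lkey = lkey;

        memset(&wr, 0, sizeof(wr));
        wr.wr_id = 1;      /* application-chosen cookie, returned in the WC */
        wr.sg_list = &sge;
        wr.num_sge = 1;

        return ib_post_recv(qp, &wr, &bad_wr);
    }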
### The ib_post_send() Method

The ib_post_send() method takes a linked list of Send Requests as an argument and adds them to the Send Queue for future processing. Every Send Request is considered outstanding until a Work Completion is generated after its processing. It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, struct ib_send_wr **bad_send_wr);

 * qp: The QP that the Send Requests will be posted to.
 * send_wr: A linked list of Send Requests to be posted.
 * bad_send_wr: If there was an error with the handling of the Send Requests, this pointer will be filled with the address of the Send Request that caused this error.

#### The ib_send_wr Struct

The Send Request is represented by struct ib_send_wr:

    struct ib_send_wr {
        struct ib_send_wr *next;
        u64 wr_id;
        struct ib_sge *sg_list;
        int num_sge;
        enum ib_wr_opcode opcode;
        int send_flags;
        union {
            __be32 imm_data;
            u32 invalidate_rkey;
        } ex;
        union {
            struct {
                u64 remote_addr;
                u32 rkey;
            } rdma;
            struct {
                u64 remote_addr;
                u64 compare_add;
                u64 swap;
                u64 compare_add_mask;
                u64 swap_mask;
                u32 rkey;
            } atomic;
            struct {
                struct ib_ah *ah;
                void *header;
                int hlen;
                int mss;
                u32 remote_qpn;
                u32 remote_qkey;
                u16 pkey_index;  /* valid for GSI only */
                u8 port_num;     /* valid for DR SMPs on switch only */
            } ud;
            struct {
                u64 iova_start;
                struct ib_fast_reg_page_list *page_list;
                unsigned int page_shift;
                unsigned int page_list_len;
                u32 length;
                int access_flags;
                u32 rkey;
            } fast_reg;
            struct {
                struct ib_mw *mw;
                /* The new rkey for the memory window. */
                u32 rkey;
                struct ib_mw_bind_info bind_info;
            } bind_mw;
        } wr;
        u32 xrc_remote_srq_num;  /* XRC TGT QPs only */
    };

 * next: A pointer to the next Send Request in the list, or NULL if this is the last Send Request.
 * wr_id: A 64-bit value that is associated with this Send Request and will be available in the corresponding Work Completion.
 * sg_list: The array of the scatter/gather elements, as described earlier.
 * num_sge: The number of entries in sg_list. The value zero means that the message size is zero bytes.
 * opcode: The operation to perform. This affects the way that data is transferred, its direction, whether a Receive Request will be consumed on the remote side, and which fields in the Send Request (send_wr) will be used. Can be:
    * IB_WR_RDMA_WRITE: RDMA Write operation.
    * IB_WR_RDMA_WRITE_WITH_IMM: RDMA Write with immediate operation.
    * IB_WR_SEND: Send operation.
    * IB_WR_SEND_WITH_IMM: Send with immediate operation.
    * IB_WR_RDMA_READ: RDMA Read operation.
    * IB_WR_ATOMIC_CMP_AND_SWP: Compare and Swap operation.
    * IB_WR_ATOMIC_FETCH_AND_ADD: Fetch and Add operation.
    * IB_WR_LSO: Send an IPoIB message with LSO (let the RDMA device fragment big SKBs into multiple MSS-sized packets). LSO is an optimization feature that allows using large packets while reducing CPU overhead.
    * IB_WR_SEND_WITH_INV: Send with invalidate operation.
    * IB_WR_RDMA_READ_WITH_INV: RDMA Read with invalidate operation.
    * IB_WR_LOCAL_INV: Local invalidate operation.
    * IB_WR_FAST_REG_MR: Fast MR registration operation.
    * IB_WR_MASKED_ATOMIC_CMP_AND_SWP: Masked Compare and Swap operation.
    * IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: Masked Fetch and Add operation.
    * IB_WR_BIND_MW: Memory bind operation.
 * send_flags: Extra attributes for the Send Request. It is a bitwise OR of the masks:
    * IB_SEND_FENCE: Before performing this operation, wait until the processing of prior Send Requests has ended.
    * IB_SEND_SIGNALED: If the QP was created with selective signaling, the end of the processing of this Send Request will generate a Work Completion.
    * IB_SEND_SOLICITED: Mark that a Solicited event will be created on the remote side.
    * IB_SEND_INLINE: Post this Send Request as inline, that is, let the low-level driver read the memory buffers in sg_list instead of the RDMA device; this may increase the latency.
    * IB_SEND_IP_CSUM: Send an IPoIB message and calculate the IP checksum in HW (checksum offload).
 * ex.imm_data: The immediate data to send. This value is relevant if opcode is IB_WR_SEND_WITH_IMM or IB_WR_RDMA_WRITE_WITH_IMM.
 * ex.invalidate_rkey: The rkey to be invalidated. This value is relevant if opcode is IB_WR_SEND_WITH_INV.

The following union is relevant if opcode is IB_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM, or IB_WR_RDMA_READ:

 * wr.rdma.remote_addr: The remote address that this Send Request is going to access.
 * wr.rdma.rkey: The Remote Key (rkey) of the MR that this Send Request is going to access.

The following union is relevant if opcode is IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, or IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:

 * wr.atomic.remote_addr: The remote address that this Send Request is going to access.
 * wr.atomic.compare_add: If opcode is one of the Fetch and Add operations, this is the value to add to the content of remote_addr. Otherwise, this is the value to compare the content of remote_addr with.
 * wr.atomic.swap: The value to place in remote_addr if its content is equal to compare_add. This value is relevant if opcode is IB_WR_ATOMIC_CMP_AND_SWP or IB_WR_MASKED_ATOMIC_CMP_AND_SWP.
 * wr.atomic.compare_add_mask: If opcode is IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, this is the mask of the bits to change when adding the value of compare_add to the content of remote_addr. Otherwise, this is the mask to use on the content of remote_addr when comparing it with swap.
 * wr.atomic.swap_mask: The mask of the bits to change in the content of remote_addr. Relevant only if opcode is IB_WR_MASKED_ATOMIC_CMP_AND_SWP.
 * wr.atomic.rkey: The rkey of the MR that this Send Request is going to access.

The following union is relevant if the QP type that this Send Request is being posted to is UD:

 * wr.ud.ah: The Address Handle that describes the path to the target node(s).
 * wr.ud.header: A pointer that contains the header. Relevant if opcode is IB_WR_LSO.
 * wr.ud.hlen: The length of wr.ud.header. Relevant if opcode is IB_WR_LSO.
 * wr.ud.mss: The Maximum Segment Size that the message will be fragmented to. Relevant if opcode is IB_WR_LSO.
 * wr.ud.remote_qpn: The remote QP number to send the message to. The enumeration IB_MULTICAST_QPN should be used if sending this message to a multicast group.
 * wr.ud.remote_qkey: The remote Q_Key value to use. If the MSB of this value is set, then the value of the Q_Key will be taken from the QP attributes.
 * wr.ud.pkey_index: The P_Key index that the message will be sent with. Relevant if the QP type is IB_QPT_GSI.
 * wr.ud.port_num: The port number that the message will be sent from. Relevant for Direct Route SMPs on a switch.

The following union is relevant if opcode is IB_WR_FAST_REG_MR:

 * wr.fast_reg.iova_start: The I/O Virtual Address of the newly created FMR.
 * wr.fast_reg.page_list: The list of pages to map in the FMR.
 * wr.fast_reg.page_shift: Log 2 of the size of the "pages" to be mapped.
 * wr.fast_reg.page_list_len: The number of pages in page_list.
 * wr.fast_reg.length: The size, in bytes, of the FMR.
 * wr.fast_reg.access_flags: The allowed operations on this FMR.
 * wr.fast_reg.rkey: The value of the remote key to be assigned to the FMR.

The following union is relevant if opcode is IB_WR_BIND_MW:

 * wr.bind_mw.mw: The MW to be bound.
 * wr.bind_mw.rkey: The value of the remote key to be assigned to the MW.
 * wr.bind_mw.bind_info: The bind attributes, as explained in the next section.

The following member is relevant if the QP type that this Send Request is being posted to is XRC TGT:

 * xrc_remote_srq_num: The remote SRQ number that will receive the message.
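To make the union usage concrete, here is a minimal sketch (illustrative) of posting an RDMA Write Send Request; sge describes a local source buffer, while remote_addr and rkey describe a buffer that the peer exposed:

    /* Illustrative sketch: post an RDMA Write Send Request. */
    static int post_rdma_write(struct ib_qp *qp, struct ib_sge *sge,
                               u64 remote_addr, u32 rkey)
    {
        struct ib_send_wr wr, *bad_wr;

        memset(&wr, 0, sizeof(wr));
        wr.wr_id = 2;                      /* application-chosen cookie */
        wr.sg_list = sge;
        wr.num_sge = 1;
        wr.opcode = IB_WR_RDMA_WRITE;
        wr.send_flags = IB_SEND_SIGNALED;  /* request a Work Completion */
        wr.wr.rdma.remote_addr = remote_addr;
        wr.wr.rdma.rkey = rkey;

        return ib_post_send(qp, &wr, &bad_wr);
    }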
#### The ib_mw_bind_info Struct

The MW binding attributes, for both MW type 1 and type 2, are represented by struct ib_mw_bind_info:

    struct ib_mw_bind_info {
        struct ib_mr *mr;
        u64 addr;
        u64 length;
        int mw_access_flags;
    };

 * mr: The Memory Region that this Memory Window will be bound to.
 * addr: The address that the Memory Window will start at.
 * length: The length, in bytes, of the Memory Window.
 * mw_access_flags: The allowed incoming RDMA and Atomic operations. It is a bitwise OR of the masks:
    * IB_ACCESS_REMOTE_WRITE: Incoming RDMA Write operations are allowed.
    * IB_ACCESS_REMOTE_READ: Incoming RDMA Read operations are allowed.
    * IB_ACCESS_REMOTE_ATOMIC: Incoming Atomic operations are allowed.

## Memory Windows (MW)

Memory Windows are used as a lightweight way to change the allowed permissions of incoming remote operations and to invalidate them.

### The ib_alloc_mw() Method

The ib_alloc_mw() method allocates a Memory Window. It will return a pointer to the newly allocated MW on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);

 * pd: The PD that this MW is being associated with.
 * type: The type of the Memory Window. Can be:
    * IB_MW_TYPE_1: An MW that can be bound using a verb, and supports only the association of a PD.
    * IB_MW_TYPE_2: An MW that can be bound using a Work Request, and supports the association of a QP number only, or of a QP number and a PD.

### The ib_bind_mw() Method

The ib_bind_mw() method binds a Memory Window to a specified Memory Region with a specific address, size, and remote permissions. If there isn't any immediate error, the rkey of the MW will be updated to the new value, but the bind operation may still fail asynchronously (and end with a completion with error). It will return 0 on success or the errno value with the reason for the failure.

static inline int ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind);

 * qp: The QP that the bind WR will be posted to.
 * mw: The MW to bind.
 * mw_bind: The bind attributes, as explained next.

#### The ib_mw_bind Struct

The MW binding attributes for a type 1 MW are represented by struct ib_mw_bind:
    struct ib_mw_bind {
        u64 wr_id;
        int send_flags;
        struct ib_mw_bind_info bind_info;
    };

 * wr_id: A 64-bit value that is associated with this bind Send Request. The value of the Work Request id (wr_id) will be available in the corresponding Work Completion.
 * send_flags: Extra attributes for the bind Send Request, as explained earlier. Only IB_SEND_FENCE and IB_SEND_SIGNALED are supported here.
 * bind_info: More attributes for the bind operation, as explained earlier.

### The ib_dealloc_mw() Method

The ib_dealloc_mw() method deallocates an MW. It will return 0 on success or the errno value with the reason for the failure.

int ib_dealloc_mw(struct ib_mw *mw);

 * mw: The MW to be deallocated.

## Memory Region (MR)

Every memory buffer that is being accessed by the RDMA device needs to be registered. During the registration process, the memory will be pinned (prevented from being swapped out), and the memory translation information (from virtual to physical addresses) will be saved in the RDMA device. After the registration, every Memory Region has two keys: one for local access and one for remote access. Those keys are used when specifying those memory buffers in Work Requests.

### The ib_get_dma_mr() Method

The ib_get_dma_mr() method returns a Memory Region for system memory that is usable for DMA. Creating this MR isn't enough; the ib_dma_*() methods below are needed in order to create or destroy the addresses that the lkey and rkey of this MR will be used with. It will return a pointer to the newly allocated MR on success or an ERR_PTR() which specifies the reason for the failure.

struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);

 * pd: The PD that this MR is being associated with.
 * mr_access_flags: The allowed operations on this MR. Local Write is always supported in this MR. It is a bitwise OR of the masks:
    * IB_ACCESS_LOCAL_WRITE: Local write to this Memory Region is allowed.
    * IB_ACCESS_REMOTE_WRITE: Incoming RDMA Write operations to this Memory Region are allowed.
    * IB_ACCESS_REMOTE_READ: Incoming RDMA Read operations to this Memory Region are allowed.
    * IB_ACCESS_REMOTE_ATOMIC: Incoming Atomic operations to this Memory Region are allowed.
    * IB_ACCESS_MW_BIND: MW binding to this Memory Region is allowed.
    * IB_ZERO_BASED: Indication that the virtual address is zero based.

### The ib_dma_mapping_error() Method

The ib_dma_mapping_error() method checks whether a DMA address that was returned from an ib_dma_*() method indicates a failure. It will return a non-zero value if there was a failure and zero if the operation finished successfully.

static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr);

 * dev: The RDMA device for which the DMA address was created using an ib_dma_*() method.
 * dma_addr: The DMA address to verify.

### The ib_dma_map_single() Method

The ib_dma_map_single() method maps a kernel virtual address to a DMA address. It will return a DMA address that needs to be checked for errors with the ib_dma_mapping_error() method:

static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address will be created.
 * cpu_addr: The kernel virtual address to map for DMA.
 * size: The size, in bytes, of the region to map.
 * direction: The direction of the DMA. Can be:
    * DMA_TO_DEVICE: DMA from the main memory to the device.
    * DMA_FROM_DEVICE: DMA from the device to the main memory.
    * DMA_BIDIRECTIONAL: DMA from the main memory to the device or from the device to the main memory.
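Combining ib_get_dma_mr() with the DMA mapping methods, here is a minimal sketch (illustrative) that registers a DMA MR and then maps a kernel buffer so the device may read it, for example for a Send Request; the function and variable names are hypothetical:

    /* Illustrative sketch: obtain a DMA MR and map a buffer for sending. */
    static int map_for_send(struct ib_device *dev, struct ib_pd *pd,
                            void *buf, u32 len, struct ib_sge *sge)
    {
        struct ib_mr *mr;
        u64 dma_addr;

        mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
        if (IS_ERR(mr))
            return PTR_ERR(mr);

        dma_addr = ib_dma_map_single(dev, buf, len, DMA_TO_DEVICE);
        if (ib_dma_mapping_error(dev, dma_addr)) {
            ib_dereg_mr(mr);  /* described at the end of this section */
            return -ENOMEM;
        }

        sge->addr = dma_addr;  /* DMA address from the mapping */
        sge->length = len;
        sge->lkey = mr->lkey;  /* local key of the DMA MR */
        return 0;
    }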
### The ib_dma_unmap_single() Method

The ib_dma_unmap_single() method unmaps a DMA mapping that was assigned using ib_dma_map_single():

static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to unmap.
 * size: The size, in bytes, of the region to unmap. This value must be the same value that was used in the ib_dma_map_single() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_single() method.

### The ib_dma_map_single_attrs() Method

The ib_dma_map_single_attrs() method maps a kernel virtual address to a DMA address according to the given DMA attributes. It will return a DMA address that needs to be checked for errors with the ib_dma_mapping_error() method.

static inline u64 ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address will be created.
 * cpu_addr: The kernel virtual address to map for DMA.
 * size: The size, in bytes, of the region to map.
 * direction: The direction of the DMA, as described earlier.
 * attrs: The DMA attributes for the mapping. If this value is NULL, this method behaves like the ib_dma_map_single() method.

### The ib_dma_unmap_single_attrs() Method

The ib_dma_unmap_single_attrs() method unmaps a DMA mapping that was assigned using the ib_dma_map_single_attrs() method:

static inline void ib_dma_unmap_single_attrs(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to unmap.
 * size: The size, in bytes, of the region to unmap. This value must be the same value that was used in the ib_dma_map_single_attrs() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_single_attrs() method.
 * attrs: The DMA attributes of the mapping. This value must be the same value that was used in the ib_dma_map_single_attrs() method. If this value is NULL, this method behaves like the ib_dma_unmap_single() method.

### The ib_dma_map_page() Method

The ib_dma_map_page() method maps a physical page to a DMA address. It will return a DMA address that needs to be checked for errors with the ib_dma_mapping_error() method:

static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address will be created.
 * page: The physical page address to map for DMA.
 * offset: The offset within the page that the mapping will start from.
 * size: The size, in bytes, of the region.
 * direction: The direction of the DMA, as described earlier.

### The ib_dma_unmap_page() Method

The ib_dma_unmap_page() method unmaps a DMA mapping that was assigned using the ib_dma_map_page() method:

static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to unmap.
 * size: The size, in bytes, of the region to unmap. This value must be the same value that was used in the ib_dma_map_page() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_page() method.

### The ib_dma_map_sg() Method

The ib_dma_map_sg() method maps a scatter/gather list to a DMA address. It will return a non-zero value on success and 0 on a failure.

static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address will be created.
 * sg: An array of the scatter/gather entries to map.
 * nents: The number of scatter/gather entries in sg.
 * direction: The direction of the DMA, as described earlier.

### The ib_dma_unmap_sg() Method

The ib_dma_unmap_sg() method unmaps a DMA mapping that was assigned using the ib_dma_map_sg() method:

static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);

 * dev: The RDMA device on which the DMA address was created.
 * sg: An array of the scatter/gather entries to unmap. This value must be the same value that was used in the ib_dma_map_sg() method.
 * nents: The number of scatter/gather entries in sg. This value must be the same value that was used in the ib_dma_map_sg() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_sg() method.

### The ib_dma_map_sg_attrs() Method

The ib_dma_map_sg_attrs() method maps a scatter/gather list to a DMA address according to the given DMA attributes. It will return a non-zero value on success and 0 on a failure.

static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address will be created.
 * sg: An array of the scatter/gather entries to map.
 * nents: The number of scatter/gather entries in sg.
 * direction: The direction of the DMA, as described earlier.
 * attrs: The DMA attributes for the mapping. If this value is NULL, this method behaves like the ib_dma_map_sg() method.

### The ib_dma_unmap_sg_attrs() Method

The ib_dma_unmap_sg_attrs() method unmaps a DMA mapping that was assigned using the ib_dma_map_sg_attrs() method:

static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs);

 * dev: The RDMA device on which the DMA address was created.
 * sg: An array of the scatter/gather entries to unmap. This value must be the same value that was used in the ib_dma_map_sg_attrs() method.
 * nents: The number of scatter/gather entries in sg. This value must be the same value that was used in the ib_dma_map_sg_attrs() method.
 * direction: The direction of the DMA. This value must be the same value that was used in the ib_dma_map_sg_attrs() method.
 * attrs: The DMA attributes of the mapping. This value must be the same value that was used in the ib_dma_map_sg_attrs() method. If this value is NULL, this method behaves like the ib_dma_unmap_sg() method.
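The following minimal sketch (illustrative) maps a scatter/gather list and walks the mapped entries with the ib_sg_dma_address() and ib_sg_dma_len() helpers that are described next:

    /* Illustrative sketch: map a scatterlist and print the DMA address
     * and length of each mapped entry.
     */
    static int map_sg_for_dma(struct ib_device *dev, struct scatterlist *sg,
                              int nents)
    {
        struct scatterlist *s;
        int i, mapped;

        mapped = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
        if (!mapped)
            return -ENOMEM;  /* a return value of 0 means failure */

        for_each_sg(sg, s, mapped, i)
            pr_debug("entry %d: addr 0x%llx len %u\n", i,
                     (unsigned long long)ib_sg_dma_address(dev, s),
                     ib_sg_dma_len(dev, s));
        return mapped;
    }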
### The ib_sg_dma_address() Method

The ib_sg_dma_address() method returns the DMA address of a scatter/gather entry.

static inline u64 ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg);

 * dev: The RDMA device on which the DMA address was created.
 * sg: A scatter/gather entry.

### The ib_sg_dma_len() Method

The ib_sg_dma_len() method returns the DMA length of a scatter/gather entry.

static inline unsigned int ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg);

 * dev: The RDMA device on which the DMA address was created.
 * sg: A scatter/gather entry.

### The ib_dma_sync_single_for_cpu() Method

The ib_dma_sync_single_for_cpu() method transfers ownership of a DMA region to the CPU. This method must be called before the CPU accesses a DMA-mapped buffer in order to read or modify its content; it prevents the device from accessing the buffer in the meantime:

static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to sync.
 * size: The size, in bytes, of the region.
 * dir: The direction of the DMA, as described earlier.

### The ib_dma_sync_single_for_device() Method

The ib_dma_sync_single_for_device() method transfers ownership of a DMA region back to the device. This method must be called before the device can access a DMA-mapped buffer again after the ib_dma_sync_single_for_cpu() method was called.

static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir);

 * dev: The RDMA device on which the DMA address was created.
 * addr: The DMA address to sync.
 * size: The size, in bytes, of the region.
 * dir: The direction of the DMA, as described earlier.

### The ib_dma_alloc_coherent() Method

The ib_dma_alloc_coherent() method allocates a memory block that can be accessed by the CPU and maps it for DMA. It will return the virtual address that the CPU can access on success, or NULL in case of a failure:

static inline void *ib_dma_alloc_coherent(struct ib_device *dev, size_t size, u64 *dma_handle, gfp_t flag);

 * dev: The RDMA device on which the DMA address will be created.
 * size: The size, in bytes, of the memory to allocate and map.
 * dma_handle: A pointer that will be filled with the DMA address of the region if the allocation succeeds.
 * flag: Memory allocation flags. Can be:
    * GFP_KERNEL: To allow blocking (not in interrupt context, not holding SMP locks).
    * GFP_ATOMIC: Prevent blocking.

### The ib_dma_free_coherent() Method

The ib_dma_free_coherent() method frees a memory block that was allocated using the ib_dma_alloc_coherent() method:

static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size, void *cpu_addr, u64 dma_handle);

 * dev: The RDMA device on which the DMA address was created.
 * size: The size, in bytes, of the memory region. This value must be the same value that was used in the ib_dma_alloc_coherent() method.
 * cpu_addr: The CPU memory address to free. This value must be the value that was returned by the ib_dma_alloc_coherent() method.
 * dma_handle: The DMA address to free. This value must be the value that was returned by the ib_dma_alloc_coherent() method.
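A short usage sketch (illustrative) of the coherent allocation pair; the buffer size is arbitrary:

    /* Illustrative sketch: allocate a CPU-visible, DMA-mapped buffer,
     * use its DMA address in Work Requests, and free it afterwards.
     */
    u64 dma_handle;
    void *buf = ib_dma_alloc_coherent(dev, 4096, &dma_handle, GFP_KERNEL);

    if (!buf)
        return -ENOMEM;
    /* ... use dma_handle in scatter/gather elements ... */
    ib_dma_free_coherent(dev, 4096, buf, dma_handle);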
### The ib_reg_phys_mr() Method

The ib_reg_phys_mr() method takes a set of physical pages, registers them, and prepares a virtual address that can be accessed by an RDMA device. It will return a pointer to the newly allocated MR on success or an ERR_PTR(), which specifies the reason for the failure.

struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

* pd: The PD that this MR is being associated with.

* phys_buf_array: An array of physical buffers to use in the Memory Region.

* num_phys_buf: The number of physical buffers in phys_buf_array.

* mr_access_flags: The allowed operations on this MR. As specified earlier.

* iova_start: A pointer to the requested I/O Virtual Address to be associated with the Region, which is allowed to begin anywhere within the first physical buffer. The RDMA device will set this value to the actual I/O virtual address of the Region, which may be different from the requested one.

#### The ib_phys_buf Struct

The physical buffer is represented by struct ib_phys_buf.

struct ib_phys_buf {
u64 addr;
u64 size;
};

* addr: The physical address of the buffer.

* size: The size of the buffer.

### The ib_rereg_phys_mr() Method

The ib_rereg_phys_mr() method modifies the attributes of an existing Memory Region. This method can be thought of as a call to the ib_dereg_mr() method followed by a call to the ib_reg_phys_mr() method. Where possible, resources are reused instead of being deallocated and reallocated. It will return 0 on success or the errno value with the reason for the failure:

int ib_rereg_phys_mr(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, struct ib_phys_buf *phys_buf_array, int num_phys_buf, int mr_access_flags, u64 *iova_start);

* mr: The Memory Region to be reregistered.

* mr_rereg_mask: The Memory Region attributes to be changed. It is a bitwise OR of the masks:

  * IB_MR_REREG_TRANS: Modify the memory pages of this Memory Region.

  * IB_MR_REREG_PD: Modify the PD of this Memory Region.

  * IB_MR_REREG_ACCESS: Modify the allowed operations of this Memory Region.

* pd: The new Protection Domain that this Memory Region will be associated with.

* phys_buf_array: The new physical pages to be used.

* num_phys_buf: The number of physical pages to be used.

* mr_access_flags: The new allowed operations of this Memory Region.

* iova_start: The new I/O Virtual Address of this Memory Region.

### The ib_query_mr() Method

The ib_query_mr() method retrieves the attributes of a specific MR. It will return 0 on success or the errno value with the reason for the failure.

int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);

* mr: The MR to be queried.

* mr_attr: The MR attributes, as described in the next section.

The MR attributes are represented by struct ib_mr_attr.

#### The ib_mr_attr Struct

struct ib_mr_attr {
struct ib_pd *pd;
u64 device_virt_addr;
u64 size;
int mr_access_flags;
u32 lkey;
u32 rkey;
};

* pd: The PD that the MR is associated with.

* device_virt_addr: The address of the virtual block that this MR covers.

* size: The size, in bytes, of the Memory Region.

* mr_access_flags: The access permissions of this Memory Region.

* lkey: The local key of this Memory Region.

* rkey: The remote key of this Memory Region.
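A minimal sketch, assuming pd is an allocated Protection Domain and phys_addr holds the physical address of a page-sized buffer, of registering a physical Memory Region, querying its keys, and deregistering it with the ib_dereg_mr() method described next; the function name example_phys_mr() and the chosen access flags are illustrative assumptions:

#include <linux/err.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>

static int example_phys_mr(struct ib_pd *pd, u64 phys_addr)
{
        struct ib_phys_buf buf = {
                .addr = phys_addr,      /* physical address of the buffer */
                .size = 4096,           /* one page */
        };
        u64 iova = phys_addr;           /* requested I/O virtual address */
        struct ib_mr_attr attr;
        struct ib_mr *mr;
        int ret;

        mr = ib_reg_phys_mr(pd, &buf, 1,
                            IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ,
                            &iova);
        if (IS_ERR(mr))
                return PTR_ERR(mr);

        /* Retrieve the MR attributes, including the lkey and rkey. */
        ret = ib_query_mr(mr, &attr);
        if (!ret)
                pr_info("lkey=0x%x rkey=0x%x\n", attr.lkey, attr.rkey);

        return ib_dereg_mr(mr);
}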
### The ib_dereg_mr() Method

The ib_dereg_mr() method deregisters an MR. This method may fail if a Memory Window is bound to it. It will return 0 on success or the errno value with the reason for the failure:

int ib_dereg_mr(struct ib_mr *mr);

* mr: The MR to be deregistered.

# Network Administration

This appendix reviews some of the most popular tools for network administration and debugging. These tools can help a lot in finding solutions to common problems and in developing, debugging, benchmarking, analyzing, troubleshooting, and researching network projects. Most of these tools have very good documentation resources, either with man pages or with wiki pages, and a lot of other information resources about them are on the Internet. Many of them have active mailing lists (for users and developers) and a bug reporting system. Some of the most commonly used tools are described here by specifying their purpose and relevant links, accompanied by several examples. The tools mentioned in this appendix appear in alphabetical order.

## arp

This command is for ARP table management. Examples of usage:

You can display the ARP table by running arp from the command line; arp -n will display the ARP table without name resolution.

You can add static entries to the ARP table by:

arp -s 192.168.2.10 00:e0:4c:11:22:33

The arp utility belongs to the net-tools package. Website: http://net-tools.sourceforge.net .

## arping

A utility to send ARP requests. The -D flag is for Duplicate Address Detection (DAD). The arping utility belongs to the iputils package. Website: http://www.skbuff.net/iputils/ .

## arptables

A userspace tool for configuring rules for a Linux-based ARP rules firewall. Website: http://ebtables.sourceforge.net/ .

## arpwatch

A userspace tool for monitoring ARP traffic. Website: http://ee.lbl.gov/ .

## ApacheBench (ab)

A command-line utility for measuring the performance of HTTP web servers. The ApacheBench tool is part of the Apache open source project. In many distributions (for example, Ubuntu) it is part of the apache2-utils package. Example of usage:

ab -n 100 http://www.google.com/

The -n option is the number of requests to perform for the benchmarking session.

## brctl

A command-line utility for administration of Ethernet bridges, enabling the setup of a bridge configuration. The brctl utility belongs to the bridge-utils package. Examples of usage:

* brctl addbr mybr: Add a bridge named mybr.
* brctl delbr mybr: Delete the bridge named mybr.
* brctl addif mybr eth1: Add the eth1 interface to the bridge.
* brctl delif mybr eth1: Delete the eth1 interface from the bridge.
* brctl show: Show information about the bridge and its attached ports.

The maintainer of the bridge-utils package is Stephen Hemminger. Fetching the git repository can be done by:

git clone git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/bridge-utils.git

Website: http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge .

## conntrack-tools

A set of userspace tools for management of netfilter connection tracking. It consists of a userspace daemon, conntrackd, and a command-line tool, conntrack. Website: http://conntrack-tools.netfilter.org/ .

## crtools

A utility for checkpoint/restore of a process. Website: http://criu.org/Installation .

## ebtables

A userspace tool for configuring rules for a Linux-based bridging firewall. Website: http://ebtables.sourceforge.net/ .

## ether-wake

A utility to send Wake-On-LAN Magic Packets. The ether-wake utility belongs to the net-tools package.

## ethtool

The ethtool utility provides a way to query or control network driver and hardware settings, get statistics, get diagnostic information, and more. With ethtool you can control parameters of Ethernet devices, such as speed, duplex, auto-negotiation, and flow control. Many features of ethtool require support in the network driver code.

Examples:

* Output of ethtool eth0:

Settings for eth0:
Supported ports: [ TP MII ]
Supported link modes: 10baseT/Half 10baseT/Full
100baseT/Half 100baseT/Full
1000baseT/Half 1000baseT/Full
Supported pause frame use: No
Supports auto-negotiation: Yes
Advertised link modes: 10baseT/Half 10baseT/Full
100baseT/Half 100baseT/Full
1000baseT/Half 1000baseT/Full
Advertised pause frame use: Symmetric Receive-only
Advertised auto-negotiation: Yes
Speed: 10Mb/s
Duplex: Half
Port: MII
PHYAD: 0
Transceiver: internal
Auto-negotiation: on
Supports Wake-on: pumbg
Wake-on: g
Current message level: 0x00000033 (51)
drv probe ifdown ifup
Link detected: no

* Getting offload parameters is done by: ethtool -k eth1.
* Setting offload parameters is done by: ethtool -K eth1 offloadParameter.
* Querying the network device for associated driver information is done by: ethtool -i eth1.
* Showing statistics is done by: ethtool -S eth1 (note that not all the network device drivers implement this feature).
* Show permanent hardware (MAC) address: ethtool -P eth0.

The development of ethtool is done by sending patches to the netdev mailing list. The maintainer of ethtool as of this writing is Ben Hutchings. The ethtool project is developed over a git repository. It can be downloaded by: git clone git://git.kernel.org/pub/scm/network/ethtool/ethtool.git.

Website: www.kernel.org/pub/software/network/ethtool/ .

## git

A distributed version control system started by Linus Torvalds. The Linux kernel, as well as many Linux-related projects, is developed with git. One can also use the git send-email command in order to send patches by mail. Website: http://git-scm.com/ .

## hciconfig

A command-line tool for configuring Bluetooth devices. With hciconfig, you can display information such as the Bluetooth interface type (BR/EDR or AMP), its Bluetooth address, its flags, and more. The hciconfig tool belongs to the bluez package. Example:

hciconfig
hci0: Type: BR/EDR Bus: USB
BD Address: 00:02:72:AA:FB:94 ACL MTU: 1021:7 SCO MTU: 64:1
UP RUNNING PSCAN
RX bytes:964 acl:0 sco:0 events:41 errors:0
TX bytes:903 acl:0 sco:0 commands:41 errors:0

Website: http://www.bluez.org/ .

## hcidump

A command-line utility for dumping raw HCI data coming from and going to a Bluetooth device. The hcidump utility belongs to the bluez-hcidump package. Website: http://www.bluez.org/ .

## hcitool

A command-line utility for configuring Bluetooth connections and for sending some special commands to Bluetooth devices. For example, you can scan for nearby Bluetooth devices by: hcitool scan. The hcitool utility belongs to the bluez package.

## ifconfig

The ifconfig command allows you to configure various network interface parameters, including the IP address of the device, the MTU, the MAC address, the Tx queue length (txqueuelen), flags, and more. The ifconfig tool belongs to the net-tools package, which is older than the iproute2 package (discussed later in this appendix). Here are three examples of usage:

* ifconfig eth0 mtu 1300: Change the MTU to 1300.
* ifconfig eth0 txqueuelen 1100: Change the Tx queue length to 1100.
* ifconfig eth0 -arp: Disable the ARP protocol on eth0.

Website: http://net-tools.sourceforge.net .

## ifenslave

A utility for attaching and detaching slave network devices to a bonding device. Bonding puts multiple physical Ethernet devices into a single logical one, which is often termed link aggregation/trunking/link bundling. The source file is in Documentation/networking/ifenslave.c. You can attach eth0, for example, to a bonding device bond0 by:

ifenslave bond0 eth0

The ifenslave utility belongs to the iputils package, maintained by Yoshifuji Hideaki. Website: www.skbuff.net/iputils/ .

## iperf

The iperf project is an open source project that provides a benchmarking tool to measure TCP and UDP bandwidth performance. It allows you to tune various parameters. The iperf tool reports bandwidth, delay jitter, and datagram loss. It was originally developed in C++ by the Distributed Applications Support Team (DAST) at the National Laboratory for Applied Network Research (NLANR). It works in a client-server model. A new implementation from scratch, iperf3, which is not backwards compatible with the original iperf, is available from https://code.google.com/p/iperf/ . iperf3 is said to have a simpler code base, and it can also report the average CPU utilization of the client and the server.

### Using iperf

Following is a simple example of using iperf for measuring TCP performance. On one device (which has an IP address of 192.168.2.104), run the next command, which starts the server side (by default, a TCP socket on port 5001):

iperf -s

On a second device, run the iperf TCP client to connect to the iperf server:

iperf -c 192.168.2.104

On the client side you will see the following:

------------------------------------------------------------
Client connecting to 192.168.2.104, TCP port 5001
TCP window size: 22.9 KByte (default)
------------------------------------------------------------
[ 3] local 192.168.2.200 port 35146 connected with 192.168.2.104 port 5001

The default time interval is 10 seconds.
After 10 seconds, the client will be disconnected, and you will see a message like this on the terminal:

[ ID] Interval Transfer Bandwidth
[ 3] 0.0-10.3 sec 7.62 MBytes 6.20 Mbits/sec

You can tune many parameters of iperf, like these:

* -u: For using a UDP socket.
* -t: For using a different time interval in seconds instead of the default of 10 seconds.
* -T: Sets a TTL for multicast (the default is 1).
* -B: Bind to a host, an interface, or a multicast address.

See man iperf. Website: http://iperf.sourceforge.net/ .

## iproute2

The iproute2 package provides many tools for interaction between the userspace and the kernel networking subsystem. The most well known is the ip command. It is based on netlink sockets (discussed in Chapter 2). With the ip command, you can perform various operations in a wide range of networking areas, and it has numerous options; see man 8 ip. Here are several examples of using the ip command for various tasks:

* Configuration of a network device with ip addr:
  * ip addr add 192.168.0.10/24 dev eth0: Sets an IP address on eth0.
  * ip addr show: Displays the addresses of all network interfaces (both IPv4 and IPv6).

See man ip address.

* Configuration of a network device with ip link:
  * ip link add mybr type bridge: Creates a bridge named mybr.
  * ip link add name myteam type team: Creates a teaming device named myteam. (The teaming device driver aggregates multiple physical Ethernet devices into one logical one and is in fact the new bonding device. The teaming driver is discussed in Chapter 14.)
  * ip link set eth1 mtu 1450: Sets the MTU of eth1 to be 1450.

See man ip link.

* Management of ARP tables (IPv4) and NDISC (IPv6) tables:
  * ip neigh show: Shows both the IPv4 neighbouring table (ARP table) and the IPv6 neighbouring table.
  * ip -6 neigh show: Shows only the IPv6 neighbouring table.
  * ip neigh flush dev eth0: Removes all entries from the neighbouring tables associated with eth0.
  * ip neigh add 192.168.2.20 dev eth2 lladdr 00:11:22:33:44:55 nud permanent: Adds a permanent neighbour entry (parallel to adding static entries in an ARP table).
  * ip neigh change 192.168.2.20 dev eth2 lladdr 55:44:33:22:11:00 nud permanent: Updates a neighbour entry.

See man ip neighbour.

* Management of the parameters for the neighbour tables:
  * ip ntable show: Displays the neighbour tables parameters.
  * ip ntable change name arp_cache locktime 1200 dev eth0: Changes the locktime parameter for the IPv4 neighbouring table associated with eth0.

See man ip ntable.

* Network namespaces management:
  * ip netns add myNamespace: Adds a network namespace named myNamespace.
  * ip netns del myNamespace: Deletes the network namespace named myNamespace.
  * ip netns list: Shows all network namespaces on the host.
  * ip netns monitor: Displays a line on the screen for each network namespace that is added or removed by the ip netns command.

See man ip netns.

* Configuration of multicast addresses:
  * ip maddr show: Shows all multicast addresses on the host (both IPv4 and IPv6).
  * ip maddr add 00:10:02:03:04:05 dev eth1: Adds a multicast address on eth1.

See man ip maddress.

* Monitor netlink messages. For example:
  * ip monitor route: Displays on the screen messages about various network events, like adding or deleting a route.

See man ip monitor.

* Management of routing tables:
  * ip route show: Shows the routing table.
  * ip route flush dev eth1: Removes routing entries associated with eth1 from the routing table.
  * ip route add default via 192.168.2.1: Adds 192.168.2.1 as a default gateway.
  * ip route get 192.168.2.10: Gets the route to 192.168.2.10 and displays it.

See man ip route.

* Management of rules in the RPDB (Routing Policy DataBase). For example:
  * ip rule add tos 0x02 table 200: Adds a rule that sets the routing subsystem to perform a lookup in routing table 200 for packets whose TOS value is 0x02 (TOS is a field in the IPv4 header).
  * ip rule del tos 0x02 table 200: Deletes the specified rule from the RPDB.
  * ip rule show: Displays the rules in the RPDB.

See man ip rule.

* Management of TUN/TAP devices:
  * ip tuntap add tun1 mode tun: Creates a TUN device named tun1.
  * ip tuntap del tun1 mode tun: Deletes a TUN device named tun1.
  * ip tuntap add tap1 mode tap: Creates a TAP device named tap1.
  * ip tuntap del tap1 mode tap: Deletes a TAP device named tap1.

* Management of IPsec policies:
  * ip xfrm policy show: Shows IPsec policies.
  * ip xfrm state show: Shows IPsec states.

See man ip xfrm.

The ss tool is used to dump socket statistics. For example, running

ss -t -a

will show all TCP sockets:

State Recv-Q Send-Q Local Address:Port Peer Address:Port
LISTEN 0 32 *:ftp *:*
LISTEN 0 128 *:ssh *:*
LISTEN 0 128 127.0.0.1:ipp *:*
ESTAB 0 0 192.168.2.200:ssh 192.168.2.104:52089
ESTAB 0 52 192.168.2.200:ssh 192.168.2.104:51352
ESTAB 0 0 192.168.2.200:ssh 192.168.2.104:51523
ESTAB 0 0 192.168.2.200:59532 107.21.231.190:http
LISTEN 0 128 :::ssh :::*
LISTEN 0 128 ::1:ipp :::*
CLOSE-WAIT 1 0 ::1:48723 ::1:ipp

There are other tools in iproute2:

* bridge: Shows/manipulates bridge addresses and devices. For example:
  * bridge fdb show: Displays forwarding entries.

See man bridge.

* genl: Gets information (like id, header size, max attributes, and more) about registered generic netlink families. For example, running genl ctrl list can have this as a result:

Name: nlctrl
ID: 0x10 Version: 0x2 header size: 0 max attribs: 7
commands supported:
#1: ID-0x3
Capabilities (0xe):
can doit; can dumpit; has policy
multicast groups:
#1: ID-0x10 name: notify

* lnstat: Displays Linux network statistics.
* rtmon: Monitors rtnetlink sockets.
* tc: Shows/manipulates traffic control settings. For example:
  * tc qdisc show: Running this command shows which queueing discipline (qdisc) entries are installed, for example:

qdisc pfifo_fast 0: dev eth1 root refcnt 2 bands 3 priomap 1 2 . . .

  * This shows that the pfifo_fast qdisc is associated with the eth1 network device. The pfifo_fast qdisc, which is a classless queueing discipline, is the default qdisc in Linux.
  * tc -s qdisc show dev eth1: Shows statistics of the qdisc associated with eth1.

See man tc.

See: Linux Advanced Routing & Traffic Control HOWTO: www.lartc.org/howto/ .

The development of iproute2 is done by sending patches to the netdev mailing list. The maintainer of iproute2 as of this writing is Stephen Hemminger. iproute2 is developed over a git repository, which can be downloaded by: git clone git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git.

## iptables and ip6tables

The iptables and ip6tables are administration tools for packet filtering and NAT management for IPv4 and IPv6, respectively. With iptables/ip6tables, you can define lists of rules.
Each such rule tells what should be done with the packet (for example, discard it or accept it). Each rule specifies some matching condition for a packet, for example, that it be a UDP packet. Following are some examples of using the iptables command:

* iptables -A INPUT -p tcp --dport=80 -j LOG --log-level 1: The meaning of this rule is that incoming TCP packets with destination port 80 will be dumped to the syslog.
* iptables -L: Lists all rules in the Filter table. (There is no table mentioned in the command, so it accesses the Filter table, which is the default table.)
* iptables -t nat -L: Lists all rules in the NAT table.
* iptables -F: Flushes the selected table.
* iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE: Sets a MASQUERADE rule.

Website: www.netfilter.org/ .

## ipvsadm

A tool for Linux Virtual Server administration. Website: www.linuxvirtualserver.org/software/ipvs.html .

## iw

Shows/manipulates wireless devices and their configuration. The iw package is based on generic netlink sockets (see Chapter 2). For example, you can perform these operations:

* iw dev wlan0 scan: Scans for nearby wireless devices.
* iw wlan0 station dump: Displays statistics about a station.
* iw list: Gets information about a wireless device (such as band information and 802.11n information).
* iw dev wlan0 get power_save: Gets the power save mode.
* iw dev wlan0 set type ibss: Changes the wireless interface mode to be ibss (Ad-Hoc).
* iw dev wlan0 set type mesh: Changes the wireless interface mode to be mesh mode.
* iw dev wlan0 set type monitor: Changes the wireless interface mode to be monitor mode.
* iw dev wlan0 set type managed: Changes the wireless interface mode to be managed mode.

See man iw.

Gitweb: http://git.kernel.org/cgit/linux/kernel/git/jberg/iw.git .

Website: http://wireless.kernel.org/en/users/Documentation/iw .

## iwconfig

The old tool for administering wireless devices. The iwconfig utility belongs to the wireless-tools package and is based on IOCTLs. Website: www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html .

## libreswan Project

An IPsec software solution that forked from openswan version 2.6.38. Website: http://libreswan.org/ .

## l2ping

A command-line utility for sending L2CAP echo requests and receiving answers over a Bluetooth device. The l2ping utility belongs to the bluez package. Website: www.bluez.org/ .

## lowpan-tools

A set of utilities to manage the Linux LoWPAN stack. Website: http://sourceforge.net/projects/linux-zigbee/files/linux-zigbee-sources/0.3/ .

## lshw

A utility that displays information about the hardware configuration of the machine. Website: http://ezix.org/project/wiki/HardwareLiSter .

## lscpu

A utility for displaying information about the CPUs on the system. It is based on information from /proc/cpuinfo and sysfs. The lscpu utility belongs to the util-linux package.

## lspci

A utility for displaying information about PCI buses in the system and devices connected to them. Sometimes you need to get some information about a PCI network device with the lspci command. The lspci utility belongs to the pciutils package. Website: http://mj.ucw.cz/sw/pciutils/ .

## mrouted

A multicast routing daemon, implementing the IPv4 Distance Vector Multicast Routing Protocol (DVMRP), which is specified in RFC 1075 from 1988. Website: http://troglobit.com/mrouted.html .

## nc

A command-line utility that reads and writes data across networks. The nc utility belongs to the nmap-ncat package. Website: http://nmap.org/ .

## ngrep

A command-line tool, based on the well-known grep command, that allows you to specify extended expressions to match against data payloads of packets. It recognizes TCP, UDP, and ICMP across Ethernet, PPP, SLIP, FDDI, and null interfaces. Website: http://ngrep.sourceforge.net/ .

## netperf

Netperf is a networking benchmarking tool. Website: www.netperf.org/netperf/ .

## netsniff-ng

netsniff-ng is an open source networking toolkit that, among other things, can help in analyzing network traffic, performing stress tests, generating packets at a very high speed, and more. It uses the PF_PACKET zero-copy RINGs (TX and RX). Among the tools it provides are the following:

* netsniff-ng is a fast zero-copy analyzer and pcap capturing and replaying tool. The netsniff-ng tool is Linux-specific and does not support other operating systems, unlike many of the tools mentioned in this appendix. Example: Running netsniff-ng --in eth1 --out dump.pcap -s -b 0 creates a pcap file that can be read by wireshark or by tcpdump. The -s flag is for silence, and the -b 0 is for binding to CPU 0. See man netsniff-ng.
* trafgen is a zero-copy high-performance network packet traffic generator utility.
* ifpps is a small utility that periodically provides top-like networking and system statistics from the kernel. ifpps gathers its data directly from procfs files.
* bpfc is a small Berkeley Packet Filter assembler and compiler.

Fetching the git repository: git clone git://github.com/borkmann/netsniff-ng.git. Website: http://netsniff-ng.org/ .

## netstat

The netstat tool enables you to print multicast memberships, routing tables, network connections, interface statistics, state of sockets, and more. The netstat tool belongs to the net-tools package. Useful flags:

* netstat -s: Displays summary statistics for each protocol.
* netstat -g: Displays multicast group membership information for IPv4 and IPv6.
* netstat -r: Shows the kernel IP routing table.
* netstat -nl: Shows the listening sockets (the -n flag is for showing numerical addresses instead of trying to determine symbolic host, port, or user names).
* netstat -aw: Shows all raw sockets.
* netstat -ax: Shows all Unix sockets.
* netstat -at: Shows all TCP sockets.
* netstat -au: Shows all UDP sockets.

Website: http://net-tools.sourceforge.net .

## nmap (Network Mapper)

Nmap is an open source security project that provides a network exploration and probing tool and a security/port scanner. It has features like port scanning (detecting the open ports on target hosts), OS detection, detecting MAC addresses, and more. For example,

nmap www.google.com

can give output such as:

Starting Nmap 6.00 ( http://nmap.org ) at 2013-09-26 16:37 IDT
Nmap scan report for www.google.com (212.179.154.227)
Host is up (0.013s latency).
Other addresses for www.google.com (not scanned): 212.179.154.221 212.179.154.251 212.179.154.232 212.179.154.237 212.179.154.216 212.179.154.231 212.179.154.241 212.179.154.247 212.179.154.222 212.179.154.226 212.179.154.236 212.179.154.246 212.179.154.212 212.179.154.217 212.179.154.242
Not shown: 998 filtered ports
PORT STATE SERVICE
80/tcp open http
443/tcp open https
Nmap done: 1 IP address (1 host up) scanned in 5.24 seconds

The nping utility of nmap can be used to generate raw packets for ARP poisoning, networking stress tests, and Denial of Service attacks, as well as to test connectivity like the ordinary ping utility. You can use the nping utility for setting IP options in generated traffic. See http://nmap.org/book/nping-man-ip-options.html . Website: http://nmap.org/ .

## openswan

An open source project implementing an IPsec-based VPN solution. It is based on the FreeS/WAN project. Website: www.openswan.org/projects/openswan .

## OpenVPN

An open source project implementing VPN based on SSL/TLS. Website: www.openvpn.net/ .

## packeth

A packet generator tool for Ethernet. The tool has both a GUI and a CLI. Website: http://packeth.sourceforge.net/packeth/Home.html .

## ping

The well-known utility for testing connectivity by sending ICMP ECHO request messages. Here are four useful options that are also mentioned in this book:

* -Q tos: Enables setting Quality of Service bits in an ICMP packet. Mentioned in this appendix in the explanation about tshark filters.
* -R: Sets the Record Route IP option (discussed in Chapter 4).
* -T: Sets the timestamp IP option (discussed in Chapter 4).
* -f: Flood ping.

See man ping for more command-line options. The ping utility belongs to the iputils package. Website: www.skbuff.net/iputils/ .

## pimd

An open source lightweight stand-alone Protocol Independent Multicast - Sparse Mode (PIM-SM) v2 multicast daemon. Maintained by Joachim Nilsson. See http://troglobit.com/pimd.html . git repository: https://github.com/troglobit/pimd/ .

## poptop

PPTP server for Linux. Website: http://poptop.sourceforge.net/dox/ .

## ppp

An open source PPP daemon. git repository: git://ozlabs.org/~paulus/ppp.git. Website: http://ppp.samba.org/download.html .

## pktgen

The pktgen kernel module (net/core/pktgen.c) can generate packets at very high speed. Monitoring and controlling is done by writing to /proc/net/pktgen entries. For "HOWTO for the linux packet generator," see Documentation/networking/pktgen.txt. (A minimal configuration sketch in C appears below, following the RP-PPPoE section.)

## radvd

This is a Router Advertisement Daemon for IPv6. It is an open source project maintained by Reuben Hawkins. It can be used for IPv6 stateless autoconfiguration and for renumbering. Website: www.litech.org/radvd/ . git repository: https://github.com/reubenhwk/radvd .

## route

A command-line tool for routing tables management. It belongs to the net-tools package, which is based on IOCTLs and which is older than the iproute2 package. Examples:

* route -n: Shows the routing table without name resolving.
* route add default gw 192.168.1.1: Adds 192.168.1.1 as a default gateway.
* route -C: Displays the routing cache (keep in mind that the IPv4 routing cache was removed in kernel 3.6; see the "IPv4 Routing Cache" section in Chapter 5).

See man route.

## RP-PPPoE

An open source PPP over Ethernet (PPPoE) client for Linux and Solaris systems. Website: www.roaringpenguin.com/products/pppoe .
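As referenced in the pktgen section above, the following is a minimal sketch of driving the pktgen /proc interface from a small C program; the interface name eth1, the thread file kpktgend_0, and the chosen parameter values are illustrative assumptions, and the command strings follow Documentation/networking/pktgen.txt:

/* pktgen_demo.c - assumes the pktgen module is loaded (modprobe pktgen). */
#include <stdio.h>
#include <stdlib.h>

/* Write one pktgen command string to a /proc/net/pktgen file. */
static void pgset(const char *path, const char *cmd)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%s\n", cmd);
        fclose(f);
}

int main(void)
{
        /* Bind eth1 to the first pktgen kernel thread. */
        pgset("/proc/net/pktgen/kpktgend_0", "rem_device_all");
        pgset("/proc/net/pktgen/kpktgend_0", "add_device eth1");

        /* Configure the packets to be generated on eth1. */
        pgset("/proc/net/pktgen/eth1", "count 100000");
        pgset("/proc/net/pktgen/eth1", "pkt_size 300");
        pgset("/proc/net/pktgen/eth1", "dst 192.168.2.104");
        pgset("/proc/net/pktgen/eth1", "dst_mac 00:e0:4c:11:22:33");

        /* Start transmission (blocks until the threads finish). */
        pgset("/proc/net/pktgen/pgctrl", "start");
        return 0;
}

The same command strings can, of course, also be written with simple echo redirections from a shell.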
## sar

A command-line tool to collect and report statistics about system activity. It is part of the sysstat package. As an example, running the following command will display the CPU statistics four times at 1-second intervals, with the average at the end:

sar 1 4
Linux 3.6.10-4.fc18.x86_64 (a) 10/22/2013 _x86_64_ (2 CPU)
07:47:10 PM CPU %user %nice %system %iowait %steal %idle
07:47:11 PM all 0.00 0.00 0.00 0.00 0.00 100.00
07:47:12 PM all 0.00 0.00 0.00 0.00 0.00 100.00
07:47:13 PM all 0.00 0.00 0.00 0.00 0.00 100.00
07:47:14 PM all 0.00 0.00 0.50 0.00 0.00 99.50
Average: all 0.00 0.00 0.13 0.00 0.00 99.87

Website: http://sebastien.godard.pagesperso-orange.fr/ .

## smcroute

A command-line tool for multicast routing manipulation. Website: www.cschill.de/smcroute/ .

## snort

An open source project that provides a network intrusion detection system (IDS) and a network intrusion prevention system (IPS). Website: www.snort.org/ .

## suricata

An open source project that provides an IDS/IPS and a network security monitoring engine. Website: http://suricata-ids.org/ .

## strongSwan

An open source project that implements IPsec solutions for Linux, Android, and other operating systems. Both IKEv1 and IKEv2 are implemented. The maintainer is Professor Andreas Steffen. Website: www.strongswan.org/ .

## sysctl

The sysctl utility displays kernel parameters (including network parameters) at runtime. It can also set kernel parameters. For example, sysctl -a shows all kernel parameters. The sysctl utility belongs to the procps-ng package.

## taskset

A command-line utility for setting or retrieving a process's CPU affinity. The taskset utility is from the util-linux package.

## tcpdump

Tcpdump is an open source command-line protocol analyzer, available from www.tcpdump.org . It is based on a C/C++ network traffic capture library called libpcap. Like wireshark, it can write its results to a file, read them from a file, and it supports filtering. Unlike wireshark, it does not have a front-end GUI. However, its output files can be read by wireshark. Example of sniffing with tcpdump:

tcpdump -i eth1

Website: www.tcpdump.org .

## top

The top utility provides a real-time view of the system (parameters like memory usage, CPU usage, and more) and a system summary. This utility is part of the procps-ng package. Website: https://gitorious.org/procps .

## tracepath

The tracepath command traces a path to a destination address, discovering the MTU along this path. For IPv6 destination addresses, you can use tracepath6. The tracepath utility belongs to the iputils package. Website: www.skbuff.net/iputils/ .

## traceroute

Prints the path that packets traverse to some destination. The traceroute utility uses the IP protocol's Time To Live (TTL) field to cause hosts on the packet path to return an ICMP TIME EXCEEDED response. The traceroute utility is discussed in Chapter 3, which deals with the ICMP protocol. Website: http://traceroute.sourceforge.net .

## tshark

The tshark utility provides a command-line packet analyzer. It is part of the wireshark package. It has many command-line options. For example, you can write the output to a file with the -w option. You can set various filters for packet filtering with tshark, some of which can be complex filters (as you will soon see). Example of setting a filter for capturing only ICMPv4 packets:

tshark -R icmp
Capturing on eth1
17.609101 192.168.2.200 -> 81.218.16.241 ICMP 98 Echo (ping) request id=0x0dc6, seq=1/256, ttl=64
17.617101 81.218.16.241 -> 192.168.2.200 ICMP 98 Echo (ping) reply id=0x0dc6, seq=1/256, ttl=58

You can also set a filter on the value of a field in the IPv4 header. For example, the following command sets a filter on the DS field in the IPv4 header:

tshark -R "ip.dsfield==0x2"

If from a second terminal you send traffic with the DS field set to 0x2 in the IPv4 header (such traffic can be sent, for example, with ping -Q 0x2 destinationAddress), it will be displayed onscreen by tshark.

Example of filtering by source MAC address:

tshark ether src host 00:e0:4c:11:22:33

Example of filtering for UDP packets whose ports are in the port range 6000-8000:

tshark -R udp portrange 6000-8000

Example of setting a filter for capturing traffic where the source IP address is 192.168.2.200 and the port is 80 (it does not have to be TCP traffic only, because no filter is set here on a specific protocol):

tshark -i eth1 -f "src host 192.168.2.200 and port 80"

## tunctl

tunctl is an older tool for creating TUN/TAP devices. It is available from http://tunctl.sourceforge.net . Note that you can also create or remove a TUN/TAP device with the ip command (see the iproute2 section earlier in this appendix) and with the openvpn command-line tool of the openvpn package:

openvpn --mktun --dev tun1
openvpn --rmtun --dev tun1

## udevadm

You can get the network device type by running udevadm on its sysfs entry. For example, if the device has this entry under sysfs:

/sys/devices/virtual/net/eth1.100

then you can find that its DEVTYPE is vlan:

udevadm info -q all -p /sys/devices/virtual/net/eth1.100/
P: /devices/virtual/net/eth1.100
E: COMMENT=net device ()
E: DEVPATH=/devices/virtual/net/eth1.100
E: DEVTYPE=vlan
E: IFINDEX=4
E: INTERFACE=eth1.100
E: MATCHADDR=00:e0:4c:53:44:58
E: MATCHDEVID=0x0
E: MATCHIFTYPE=1
E: SUBSYSTEM=net
E: UDEV_LOG=3
E: USEC_INITIALIZED=28392625695

udevadm belongs to the udev package. Website: www.kernel.org/pub/linux/utils/kernel/hotplug/udev.html .

## unshare

The unshare utility enables you to create a namespace and run a program within that namespace, unshared from its parent. The unshare utility belongs to the util-linux package. For various command-line options of the unshare utility, see man unshare. Examples of usage:

unshare -u /bin/bash

This will create a UTS namespace.

unshare --net /bin/bash

This will create a new network namespace, in which a bash process will be started.

Gitweb: http://git.kernel.org/cgit/utils/util-linux/util-linux.git . Website: http://userweb.kernel.org/~kzak/util-linux/ .

## vconfig

The vconfig utility enables you to configure VLAN (802.1q) interfaces. Examples of usage:

* vconfig add eth2 100: Adds a VLAN interface. This will create a VLAN interface, eth2.100.
* vconfig rem eth2.100: Removes the eth2.100 VLAN interface.
* Note that you can also add and delete VLAN interfaces with the ip command, for example: ip link add link eth0 name eth0.100 type vlan id 100.
* vconfig set_egress_map eth2.100 0 4: Maps SKB priority 0 to VLAN priority 4, so that outgoing packets whose SKB priority is 0 will be tagged with VLAN priority 4. The default VLAN priority is 0.
* vconfig set_ingress_map eth2.100 1 5: Maps VLAN priority 5 to SKB priority 1, so that incoming packets with VLAN priority 5 will be queued with SKB priority 1. The default SKB priority is 0.

See man vconfig.

Note that if VLAN support is compiled as a kernel module, then you must load the VLAN kernel module before trying to add the VLAN interface, by modprobe 8021q. Website: www.candelatech.com/~greear/vlan.html .

## wpa_supplicant

Open source software that provides a wireless supplicant for Linux and other OSs. It supports WPA and WPA2. Website: http://hostap.epitest.fi/wpa_supplicant/ .

## wireshark

The wireshark project provides a free and open source analyzer ("sniffer"). It has two flavors: a GTK+ based GUI front end and a command-line utility, tshark (mentioned earlier in this appendix). It is available on many operating systems and evolves dynamically: when new features are added to existing protocols and new protocols are added, new parsers ("dissectors") are added or existing ones are modified. Wireshark has many features:

* Enables defining a wide range of filters (ports, destination or source address, protocol identifier, fields in headers, and more).
* Enables sorting the results according to various parameters (protocol type, time, and so on).
* Saves the sniffer output to a file and reads sniffer output from a file.
* Reads/writes many different capture file formats: tcpdump (libpcap), Pcap NG, and more.
* Capture Filters and Display Filters.

Activating the wireshark or tshark sniffer puts the network interface into promiscuous mode to enable it to handle packets that are not destined to the local host. A lot of information is available in the man pages: man wireshark and man tshark. You can find more than 75 sniff samples of different protocols at http://wiki.wireshark.org/SampleCaptures . Wireshark users mailing list: www.wireshark.org/mailman/listinfo/wireshark-users . Website: www.wireshark.org . Wiki: http://wiki.wireshark.org/ .

## XORP

An open source project implementing various routing protocols, like BGP, IGMP, OLSR, OSPF, PIM, and RIP. The name XORP is derived from eXtensible Open Router Platform. Website: www.xorp.org/ .

# Glossary

The following glossary terms are covered in this book.

ACL—Asynchronous Connection-oriented Link. A Bluetooth protocol.

ADB—Android Debug Bridge.

AVDTP—Audio/Video Distribution Transport Protocol. A Bluetooth protocol.

AEAD—Authenticated Encryption with Associated Data.

AES-NI—AES instruction set.

AH—Authentication Header protocol. Used in IPsec, has protocol number 51.

AID—Association ID. A unique number that a wireless client gets when it associates to an Access Point. It is assigned by the Access Point, and it is in the range 1–2007.

AMP—Alternate MAC/PHY.

AMPDU—Aggregated Mac Protocol Data Unit. A type of packet aggregation in IEEE 802.11n.

AMSDU—Aggregated Mac Service Data Unit. A type of packet aggregation in IEEE 802.11n.

AOSP—Android Open Source Project.

AP—Access Point. In wireless networks, a wireless device to which wireless clients associate and which enables them to connect to a wired network.

API—Application Programming Interface.
A set of methods and data structures that define the interface to a software layer, such as an interface for a library.

ABRO—Authoritative Border Router Option. Added for Neighbour Discovery Optimization for IPv6. See RFC 6775.

ABS—Android Builders Summit.

ARO—Address Registration Option. Added for Neighbour Discovery Optimization for IPv6. See RFC 6775.

ARP—Address Resolution Protocol. A protocol used to map a network address (such as an IPv4 address) to a link layer address (like a 48-bit Ethernet address).

ARPD—ARP daemon. A userspace daemon that implements the ARP functionality.

Ashmem—Android shared memory.

ASM—Any-Source Multicast. In the any-source model, you do not specify interest in receiving multicast traffic from a single particular source address or from a set of addresses.

BA—Block Acknowledgement mechanism used in IEEE 802.11n.

BGP—Border Gateway Protocol. A core routing protocol.

BLE—Bluetooth Low Energy.

BNEP—Bluetooth Network Encapsulation Protocol.

BTH—Base Transport Header. An InfiniBand header of 12 bytes. It specifies the source and destination QPs, the operation, packet sequence number, and partition.

CM—Communication Manager in the InfiniBand stack.

CIDR—Classless Inter-Domain Routing. A way to allocate Internet addresses used in inter-domain routing.

CQ—Completion Queue (InfiniBand).

CRIU—Checkpoint/Restore In Userspace. CRIU is a software tool, mainly implemented in userspace, with which you can freeze a running process and checkpoint it to a filesystem as a collection of files. You can then use these files to restore and run the application from the point where it was frozen. See http://criu.org/Main_Page .

CSMA/CD—Carrier Sense Multiple Access/Collision Detection. A Media Access Control method used in Ethernet networks.

CSMA/CA—Carrier Sense Multiple Access/Collision Avoidance. A Media Access Control method used in wireless networks.

CT—Connection Tracking. A netfilter layer that is the basis for NAT.

DAD—Duplicate Address Detection. A mechanism that helps to detect the existence of duplicate L3 addresses on different hosts on a LAN.

DAC—Duplicate Address Confirmation. An ICMPv6 type, added in RFC 6775, with numeric value 158.

DAR—Duplicate Address Request. An ICMPv6 type, added in RFC 6775, with numeric value 157.

DCCP—Datagram Congestion Control Protocol. An unreliable, congestion-controlled transport layer protocol. The use of DCCP would make sense, for instance, in applications that require low delays and where a small degree of data loss is permitted, like telephony and streaming media applications.

DHCP—Dynamic Host Configuration Protocol. A protocol for configuring network device parameters like an IP address, a default route, and one or more DNS server addresses.

DMA—Direct Memory Access.

DNAT—Destination NAT. A NAT that changes the destination address.

DNS—Domain Name System. A system for translating domain names to IP addresses.

DSCP—Differentiated Services Code Point. A classifying mechanism.

DVMRP—Distance Vector Multicast Routing Protocol. A protocol for routing multicast datagrams. Suitable for use within an autonomous system. Defined in RFC 1075 from 1988.

ECN—Explicit Congestion Notification. See RFC 3168, "The Addition of Explicit Congestion Notification (ECN) to IP."

EDR—Enhanced Data Rate.

EGP—Exterior Gateway Protocol. A routing protocol that is now considered obsolete. It was first formalized in RFC 827 in 1982.

ERTM—Enhanced Retransmission Mode. A reliable protocol with error and flow control, used in Bluetooth.

ESP—Encapsulating Security Payload. Used in IPsec, has protocol number 50.

ETH—Extended Transport Header. An InfiniBand header with size from 4 to 28 bytes. This header represents an extra family of headers that may be present depending on the class of the service and the used operation.

ETSI—European Telecommunications Standards Institute.

FCS—Frame Check Sequence.

FIB—Forwarding Information Base. The database that contains the routing tables information.

FMR—Fast Memory Region (InfiniBand).

FSF—Free Software Foundation.

FTP—File Transfer Protocol. A protocol for transferring files between two hosts, based on TCP.

GCC—GNU Compiler Collection.

GID—Global Identifier.

GMP—Group Management Protocol. A term that refers to both IGMP and MLD. See RFC 4604, section 1.

GRE—Generic Routing Encapsulation. A tunneling protocol.

GRH—Global Routing Header. An InfiniBand header of 40 bytes. It describes the source and destination port using GIDs, and its format is identical to the IPv6 header.

GRO—Generic Receive Offload. A technique with which incoming packets are merged at reception time into a bigger packet to improve performance.

GSO—Generic Segmentation Offload. A technique with which outgoing packets are segmented not in the transport layer but as close as possible to the network driver or in the network driver itself.

GUID—Global Unique Identifier.

HAL—Hardware Abstraction Layer.

HCA—Host Channel Adapter.

HCI—Host Controller Interface. Used, for example, in Bluetooth, PCI, and more.

HDP—Health Device Profile. Used by Bluetooth.

HFP—Hands-Free Profile. Used by Bluetooth.

HoL Blocking—Head-of-line blocking. A performance-limiting phenomenon that occurs when a line of packets is held up by the first packet, for example, in multiple requests in HTTP pipelining.

HPC—High Performance Computing. Management of computer resources in a way that gives high performance for heavy tasks such as solving large-scale problems in science, engineering, or economics.

HS—High Speed.

HTTP—Hypertext Transfer Protocol. The basic protocol for accessing the World Wide Web.

HWMP—Hybrid Wireless Mesh Protocol. A routing protocol used in wireless Mesh networks that consists of two types of routing: on-demand routing and proactive routing.

iWARP—Internet Wide Area RDMA Protocol.

iSER—iSCSI extension for RDMA.

IANA—Internet Assigned Numbers Authority. Responsible for IP addressing, global coordination of the DNS Root, and other IP-related symbols and numbers. Operated by the Internet Corporation for Assigned Names and Numbers (ICANN).

IBTA—InfiniBand Trade Association.

ICMP—Internet Control Message Protocol. An IP protocol for control and informational messages. The well-known ping utility is based on ICMP. The ICMP protocol is known to be used in various types of security DoS attacks, like the Smurf attack.

ICE—Interactive Connectivity Establishment. Specified in RFC 5245. A protocol for NAT traversal.

ICRC—Invariant CRC. An InfiniBand header of 4 bytes. Covers all fields that should not be changed as the packet travels in the subnet.

IDS—Intrusion Detection System.

IoT—Internet of Things. Networking of everyday objects.

IEEE—Institute of Electrical and Electronics Engineers.

IGMP—Internet Group Management Protocol. Multicast group memberships protocol.

IKE—Internet Key Exchange. A protocol for setting up an IPsec Security Association.

IOMMU—I/O Memory Management Unit.

IP—Internet Protocol. The primary addressing and routing protocol for the Internet. IPv4 was first specified in RFC 791 from 1981, and IPv6 was first specified in RFC 1883 from 1995.

IPoIB—IP over InfiniBand.

IPS—Intrusion Prevention System.

ISAKMP—Internet Security Association & Key Management Protocol.

IOCTL—Input/Output Control. A system call that provides access from userspace to the kernel.

IPC—Inter Process Communication. There are many different mechanisms for IPC, such as shared memory, semaphores, message queues, and more.

IPCOMP—IP Payload Compression Protocol. A compressing protocol intended to reduce the size of data sent over a slow network connection. Using IPComp increases the overall communication performance between two network nodes.

IPsec—IP security. A set of protocols developed by the IETF for secure exchange of packets over the IP protocol. IPsec is mandatory in IPv6 according to the IPv6 spec and optional in IPv4, though many operating systems implemented it also in IPv4. IPsec uses two encryption modes: Transport and Tunnel.

IPVS—IP Virtual Server. A Linux kernel load balancing infrastructure, supports IPv4 and IPv6. See http://www.linuxvirtualserver.org/software/ipvs.html .

ISR—Interrupt Service Routine. An interrupt handler that is invoked when an interrupt is received.

ISM—Industrial, scientific, and medical radio band.

jumbo frames—Packets with size up to 9K. Some network interfaces allow using an MTU of up to 9K. Using jumbo frames can improve network performance in some cases, such as in bulk data transfers.

KVM—Kernel-based Virtual Machine. A Linux virtualization project.

LACP—Link Aggregation Control Protocol.

LAN—Local Area Network. A network that connects a limited area, such as an office building.

LID—Local Identifier. A 16-bit value assigned to every subnet port by the Subnet Manager (InfiniBand).

L2CAP—Logical Link Control and Adaptation Protocol. Used in Bluetooth.

L2TP—Layer 2 Tunneling Protocol, used by VPNs. L2TPv3 is specified in RFC 3931 (RFC 5641 has some updates).

LKML—Linux Kernel Mailing List.

LLCP—Logical Link Control Protocol. Used by NFC.

LLN—Low-power and Lossy Network.

LoWPAN—Low-power Wireless Personal Area Network.

LMP—Link Management Protocol. Controls the radio link between two Bluetooth devices.

LPM—Longest Prefix Match. An algorithm used by the routing subsystem.

LRH—Local Routing Header. An InfiniBand header of 8 bytes. It identifies the local source and destination ports of the packet. It also specifies the requested QoS attributes (SL and VL) of the message.

LRO—Large Receive Offload.

LR-WPAN—Low-Rate Wireless Personal Area Network. Used in IEEE 802.15.4.

LSB—Least significant bit.

LSRR—Loose Source Record Route.

LTE—Long Term Evolution.

MAC—Media Access Control. A sublayer of the Data Link Layer (L2) of the OSI model.

MAD—Management Datagram (InfiniBand).

MFC—Multicast Forwarding Cache. A data structure in the kernel that consists of multicast forwarding entries.

MIB—Management Information Base.

MLD—Multicast Listener Discovery protocol. Enables each IPv6 router to discover the presence of multicast listeners. The MLD protocol is specified in RFC 3810, from 2004.

MLME—MAC Layer Management Entity. A component in the IEEE 802.11 management layer responsible for operations such as scanning, authentication, association, and reassociation.

MR—Memory Region (InfiniBand).

MSF—Multicast Source Filtering. The feature to set filters so that multicast traffic from sources other than the expected ones will be dropped.

MSI—Message Signaled Interrupts.

MSS—Maximum Segment Size. A parameter of the TCP protocol.

MTU—Maximum Transmission Unit. The size of the largest packet that a network protocol can transmit.

MW—Memory Window (InfiniBand).

NAP—Network Access Point.

NAPI—New API. A technique by which network drivers are not interrupt-driven but use polling. NAPI is discussed in Chapter 1.

NAT—Network Address Translation. A layer responsible for modifying IP headers. In Linux, support for IPv6 NAT was merged in kernel 3.7.

NAT-T—NAT traversal.

NCI—NFC Controller Interface.

ND / NDISC—Neighbour Discovery protocol. Used in IPv6. Among its tasks: discovering network nodes on the same link, autoconfiguration of addresses, finding the Link Layer addresses of other nodes, and maintaining reachability information about other nodes.

NFC—Near Field Communication.

NDEF—NFC Data Exchange Format.

NIC—Network Interface Card, also known as Network Interface Controller or Network Adapter. The hardware network device.

NUMA—Non-Uniform Memory Access.

NPP—NDEF Push Protocol.

NPAR—NIC Partitioning. A technology that enables you to split up network card (NIC) traffic in partitions.

NUD—Network Unreachability Detection. A mechanism responsible for determining whether a neighbour can be reached.

OBEX—Object Exchange. A protocol for exchange of binary objects between devices, used in Bluetooth.

OEM—Original Equipment Manufacturer.

OFA—OpenFabrics Alliance.

OCF—Open Cryptography Framework.

OHA—Open Handset Alliance.

OOTB—Out of the Blue packet (a term of the SCTP protocol). A packet is an OOTB packet if it is correctly formed (that is, no checksum error), but the receiver is not able to identify the SCTP association to which the packet belongs (see section 8.4 in RFC 4960).

OPP—Object Push Profile. Used by Bluetooth.

OSI Model—Open Systems Interconnection model.

OSPF—Open Shortest Path First. An interior gateway routing protocol developed for IP networks.

PADI—PPPoE Active Discovery Initiation.

PADO—PPPoE Active Discovery Offer.

PADR—PPPoE Active Discovery Request.

PADS—PPPoE Active Discovery Session.

PADT—PPPoE Active Discovery Terminate.

PAN—Personal Area Networking. A profile used in Bluetooth.

PCI—Peripheral Component Interconnect. A bus for attaching devices. Many network interface cards are PCI devices.

PD—Protection Domain.

PHDC—Personal Health Device Communication. Used by NFC.

PID—Process Identifier.

PIM—Protocol Independent Multicast protocol. A multicast routing protocol.

PIM-SM—Protocol Independent Multicast—Sparse Mode.

PLME—Physical Layer Management Entity in IEEE 802.11.

PM—Power Management.

PPP—Point-to-Point Protocol. A data link protocol for direct communication between two hosts.

PPPoE—PPP over Ethernet. The PPPoE protocol is specified in RFC 2516 from 1999.

PERR—Path Error. A message that informs about some failure in wireless Mesh network routing.

PREP—Path Reply. A unicast packet sent as a reply to a PREQ message in a wireless Mesh network.

PREQ—Path Request. A broadcast packet sent when looking for some address in a wireless Mesh network.
+ +PSK—Preshared Key. + +Qdisc—Queuing Disciplines. + +QP—Queue Pair (InfinBand). + +RA—Router Alert. One of the IPv4 options. It notifies transit routers to more closely examine the contents of an IP packet. It is used by many protocols, such as IGMP, MLD, and more. + +RANN—Root Announcement. A broadcast packet sent periodically by a Root Mesh point in a wireless Mesh network. + +RARP—Reverse Address Resolution Protocol. A protocol used to find the mapping between a link layer address (like a 48-bit Ethernet address) to a network address (like an IPv4 address). + +RC—A QP transport type in InfiniBand. + +RDMA—Remote Direct Memory Access. A direct memory access from one host to another. + +RDS—Reliable Datagram Socket. A reliable connectionless protocol developed by Oracle. + +RFC—Request For Comments. A document that specifies Internet specifications, communications protocols, procedures, and events. The standardization process of RFCs is documented at http://tools.ietf.org/html/rfc2026 , "The Internet Standards Process." + +RFID—Radio Frequency ID. + +RFCOMM—Radio Frequency Communications protocol. Used in Bluetooth. + +RFS—Receive Flow Steering. + +RIP—Routing Information Protocol: A distance-vector routing protocol. + +RoCE—RDMA over Converged Ethernet. + +RP—Rendezvous Point. + +RPL—IPv6 Routing Protocol for Low-Power and Lossy Networks. The RPL protocol is specified in RFC 6550. + +RPDB—Routing Policy DataBase. + +RPF—Reverse Path Filter. A technique intended to prevent source address spoofing. + +RPC—Remote Procedure Call. + +RPS—Receive Packet Steering. + +RS—Router Solicitations. + +RSA—A cryptography algorithm. RSA stands for Ron Rivest, Adi Shamir, and Leonard Adleman, the people who developed it. + +RTP—Real-time Transport Protocol. A protocol for transmitting audio and video over IP networks. + +RTR—Ready To Receive. A state in InfiniBand QP State Machine. + +RTS—Ready To Send. A state in InfiniBand QP State Machine. + +SA—Security Association. A logical relationship between two hosts that consists of various parameters, such as cryptographic key, cryptographic algorithm, SPI, and more. + +SACK—Selective Acknowledgments. See RFC 2018, "TCP Selective Acknowledgment Options," from 1996. + +SAD—Security Association Database. + +SAR—Segmentation and Reassembly. + +SBC—Session Border Controllers. + +SCO—Synchronous Connection Oriented link. A Bluetooth protocol. + +SDP—Service Discovery Protocol. Used in Bluetooth. + +SCTP—Stream Control Transmission Protocol. A transport protocol that has features of both UDP and TCP. + +SE—Security Element (NFC). + +SIG—Special Interest Group. + +SIP—Session Initiation Protocol. A signaling protocol for VoIP, intended for creating and modifying VoIP sessions. + +SLAAC—Stateless Address autoconfiguration. Specified in RFC 4862. + +SKB—Socket Buffer. A kernel data structure representing a network packet (implemented by the sk_buff structure, include/linux/skbuff.h). + +SL—Service Level. The QoS in InfiniBand is implemented using the SL to VL mapping and the resources for each VL. + +SLAAC—Stateless Address Autoconfiguration. + +SM—Subnet Manager. + +SMA—Subnet Management Agent. + +SME—System Management Entity in IEEE 802.11. + +SMP—Symmetrical Multiprocessing. An architecture where two or more identical processors are connected to a single shared main memory. + +SNAT—Source NAT. A NAT that changes the source address. + +SNEP—Simple NDEF Exchange Protocol (SNEP) for exchanging NDEF-formatted data. + +SNMP—Simple Network Management Protocol. 
+ +SPI—Security Parameter Index. Used by IPsec. + +SPD—Security Policy Database. + +SQD—Send Queue Drained. A state in InfiniBand QP State Machine. + +SQE—Send Queue Error. A state in InfiniBand QP State Machine. + +SRP—SCSI RDMA Protocol. + +SR-IOV—Single Root I/O Virtualization. A specification that allows a PCIe device to appear to be multiple separate physical PCIe devices. + +SRQ—Shared Receive Queue (InfiniBand). + +SSM—Source Specific Multicast. + +STUN—Session Traversal Utilities for NAT. + +SSP—Secure Simple Pairing. A security feature required by Bluetooth v2.1. + +TCP—Transmission Control Protocol. The TCP protocol is the most commonly used transport protocol on the Internet today. Many protocols run on top of TCP, including FTP, HTTP, and more. TCP is specified in RFC 793 from 1981, and in the years since there have been many updates, variations, and additions to the base protocol. + +TIPC—Transparent Inter-process Communication protocol. See http://tipc.sourceforge.net/. + +TOS—Type Of Service. + +TSO—TCP Segmentation Offload. + +TTL—Time To Live. A counter in the IPv4 header (its counterpart in IPv6 is called Hop Limit) that is decremented in each forwarding device. When this counter reaches 0, an ICMP Time Exceeded message is sent back, and the packet is discarded. Both the ttl member of the IPv4 header and the hop_limit member of the IPv6 header are 8-bit fields. + +TURN—Traversal Using Relays around NAT. + +UC—Unreliable Connected. A QP transport type in InfiniBand. + +UD—Unreliable Datagram. A QP transport type in InfiniBand. + +UDP—User Datagram Protocol. UDP is an unreliable protocol, as there is no guarantee that packets will be delivered to upper-layer protocols. There is no handshaking phase in UDP, in contrast to TCP. The UDP header is simple and consists of only 4 fields: source port, destination port, checksum, and length. + +USAGI—UniverSAl playGround for Ipv6. A project that developed IPv6 and IPsec (for both IPv4 and IPv6) stacks for the Linux kernel. + +UTS—Unix Time-sharing System. + +VCRC—Variant CRC. A 2-byte InfiniBand header that covers all the fields of the packet. + +VETH—Virtual Ethernet. A network driver that enables communication between two network devices in different network namespaces. + +VoIP—Voice Over IP. + +VFS—Virtual File System. + +VL—Virtual Lanes. A mechanism for creating multiple virtual links over a single physical link. + +VLAN—Virtual Local Area Network. + +VPN—Virtual Private Network. + +VXLAN—Virtual Extensible Local Area Network. VXLAN is a standard protocol to transfer Layer 2 Ethernet packets over UDP. VXLAN is needed because there are cases where firewalls block tunnels and allow, for example, only TCP/UDP traffic. + +WDS—Wireless Distribution System. + +WLAN—Wireless LAN. + +WOL—Wake On LAN. + +WSN—Wireless Sensor Networks. + +XRC—eXtended Reliable Connected. A QP transport type in InfiniBand. + +XFRM—IPsec Transformer. A Linux kernel framework for handling IPsec transformations. The two most fundamental data structures of the XFRM framework are the XFRM policy and the XFRM state. 
+Rami Rosen, Linux Kernel Networking: Implementation and Theory, DOI 10.1007/978-1-4302-6197-1, © Apress 2014 + +Index + +A + +Access point (AP) + +Address registration option (ARO) + +Address resolution protocol (ARP) + +arp_constructor() method + +arp_create() method + +arp_filter() method + +arphdr structure + +arp_ignore() + +arp_process() method + +arp_rcv() method + +arp_send() method + +daemon + +dst_neigh_output() method + +ethernet packet + +inet_addr_onlink() method + +inet_select_addr() method + +MAC addresses + +neigh_lookup() + +neigh_resolve_output() method + +NF_HOOK() macro + +pneigh_enqueue() method + +solicit() method + +AES instruction set (AES-NI) + +Aggregated MAC protocol data unit (AMPDU) + +Aggregated MAC service data unit (AMSDU) + +Alternate MAC/PHY (AMP) + +Android + +internal resources + +networking + +android debug bridge (ADB) + +Bluetooth + +near field communication (NFC) + +netfilter + +security privileges and networking + +Android debug bridge (ADB) + +Android open source project (AOSP) + +Any-source multicast (ASM) + +Application programming interface (API) + +ARP protocol. + +See Address resolution protocol (ARP) + +Association ID (AID) + +Audio/video distribution transport protocol (AVDTP) + +Authentication header protocol (AH) + +Authoritative border router option (ABRO) + +B + +Base Transport Header (BTH) + +Beacons + +Block Acknowledgement (BA) + +Block Ack Request (BAR) + +Bluetooth Low Energy (BLE) + +Bluetooth Network Encapsulation Protocol (BNEP) + +Bluetooth protocol + +ACL packets + +Bluetooth profiles + +Bluetooth stack + +HCI connection + +Bluetooth Network Encapsulation Protocol (BNEP) + +logical link control and adaptation protocol (L2CAP) + +HCI layer, struct hci_dev + +host controller interface (HCI) + +L2CAP/SCO layers + +link controller + +logical link control and adaptation protocol (L2CAP) features + +personal area networks (PANs) + +radio frequency communications (RFCOMM) + +service discovery protocol (SDP) + +special interest group (SIG) + +synchronous connection-oriented (SCO) + +tools + +Board Support Packages (BSPs) + +Border Gateway Protocol (BGP) + +Busy poll sockets + +busy_poll controls + +busy_read controls + +ndo_busy_poll callback + +performance + +SO_BUSY_POLL socket option + +tuning and configuration + +C + +Carrier Sense Multiple Access/Collision Avoidance (CSMA/CA) + +Carrier Sense Multiple Access/Collision Detection (CSMA/CD) + +Cgroups + +cls_cgroup classifier + +device controller + +implementation + +cgroup_subsys structure + +css_set object + +register_filesystem() method + +release_agent + +libcg library + +memory controller + +mounting cgroup subsystems + +net_prio Module + +Checkpoint/Restore In Userspace (CRIU) + +Chunk types + +Classless Inter-Domain Routing (CIDR) + +Common Development and Distribution License (CDDL) + +Communication Manager (CM) + +Completion Queue (CQ) + +Connection tracking + +callbacks + +dst structure + +entries + +ipv4_confirm() method + +network namespace object + +nf_conn structure description + +nf_ct_timeout_lookup() method + +reference counter + +resolve_normal_ct() method + +specific packet() method + +extensions + +hook callbacks + +DNAT rule + +ipv4_conntrack_in() + +NAT and netfilter hooks + +nf_nat_ipv4_in() + +hooks + +initialization + +IPTables + +Filter table rule + +log-level modifier + +LOG target + +network namespace object + +parts + +IPv4 NAT module + +local host delivery + +NAT + +NAT hook callbacks + +nf_conntrack method + +nf_conntrack_tuple structure + +NF_INET_PRE_ROUTING 
hook + +packet forwarding + +Constructor + +Control packets + +CSMA/CA + +D + +Datagram Congestion Control Protocol (DCCP) + +and NAT + +development of + +header + +initialization + +packet types + +receiving packets + +sending packets + +socket initialization + +Datagram sockets + +Data links sockets + +Data packets + +Dccp_init_sock() method + +DCCP. + +See Datagram Congestion Control Protocol (DCCP) + +Dccp_v4_rcv() method + +Delayed ACK timer + +Destination NAT (DNAT) + +Distance Vector Multicast Routing Protocol (DVMRP) + +Domain Name System (DNS) + +Duplicate Address Confirmation (DAC) + +Duplicate Address Detection (DAD) + +Duplicate Address Request (DAR) + +Dynamic Host Configuration Protocol (DHCP) + +Dynamic Host Configuration Protocol version 6 (DHCPv6) + +E + +Encapsulating Security Payload (ESP) + +Enhanced data rate (EDR) + +Enhanced Retransmission Mode (ERTM) + +ESP protocol + +Authentication Data + +ESP format + +initialization + +Padding + +Payload Data + +Security Parameter Index + +Sequence Number + +Extended Service Set (ESS) + +Extended Transport Header (ETH) + +Exterior Gateway Protocol (EGP) + +F + +Failover + +Fast Memory Region (FMR) + +Fib_select_multipath() method + +File Transfer Protocol (FTP) + +Forwarding Information Base (FIB) + +Free Software Foundation (FSF) + +G + +General Public License (GPL) + +Generic netlink protocol + +acpi subsystem + +command identifier + +ctrl_getfamily() method + +flags + +generic netlink messages + +genl_ops structure + +genl_pernet_init() method + +genl_sock pointer + +hostapd package + +internal_flags + +multicast group + +netlink_kernel_create() method + +NFC subsystem + +nl_send_auto() + +policy + +socket monitoring interface + +CRIU projects + +sock_diag_handler + +sock_diag_register() + +ss tool + +UNIX diag module + +wireless subsystem + +wireless-tools + +Generic Receive Offload (GRO) packets + +Generic Segmentation Offload (GSO) + +Genl_connect() method + +Git trees + +Global IDentifier (GID) + +Global Routing Header (GRH) + +Group Management Protocol (GMP) + +H + +Head-of-Line (HoL) blocking + +HEARTBEAT mechanism + +High Performance Computing (HPC) + +High Throughput Task Group (TGn) + +AMPDU aggregation + +AMSDU aggregation + +Block Ack Request (BAR) + +del_timer_sync() + +vendors + +Host Channel Adapter (HCA) + +Hybrid Wireless Mesh Protocol (HWMP) + +I, J + +ICMP protocol. 
+ +See Internet control message protocol (ICMP) + +ICMPv4 messages + +categories + +destination unreachable + +ICMP_FRAG_NEEDED code + +ICMP_PORT_UNREACH code + +ICMP_PROT_UNREACH code + +icmp_reply() method + +icmp_send() method + +ICMP_SR_FAILED code + +header + +conditions + +DHCP + +icmp_bxm structure + +icmp_control objects + +icmp_control structure + +icmp_discard() + +icmp_echo() method + +ICMP_QUENCH message + +icmp_redirect() + +ICMP sockets/ping sockets + +ip_local_deliver_finish() method + +NTP + +ping_rcv() method + +raw_local_deliver() + +struct icmphdr + +timestamps + +TTL + +icmp_echo() method + +inet_init() method + +IP broadcast or IP multicast address + +ip_local_deliver_finish() method + +ping and traceroute utility + +ping_rcv() method + +ICMPv4 redirect message + +ip_do_redirect() method + +ip_forward() method + +ip_rt_send_redirect() method + +mkroute_input() method + +ICMPv6 messages + +icmpv6_rcv() method + +destination unreachable + +ICMP_FRAG_NEEDED code + +ICMPV6_EXC_FRAGTIME code + +ICMPV6_EXC_HOPLIMIT code + +parameter problem + +port unreachable + +header + +icmpv6_init() method + +icmpv6_notify() method + +igmp6_event_report() + +ND messages + +pskb_may_pull() method + +IEEE 802.15.4 + +ieee802154_dev object + +ieee802154_ops object + +low-rate wireless personal area networks (LR-WPANs) + +medium access control (MAC) + +wireless sensor networks (WSNs) + +IKE. + +See Internet Key Exchange (IKE) + +Inet_create() method + +InfiniBand subsystem + +addressing + +Communication Manager + +features + +hardware components + +methods + +packet headers + +(see Packet headers) + +RDMA + +(see RDMA device; Remote Direct Memory Access (RDMA)) + +Subnet Administrator + +Subnet Management Agent + +InfiniBand Trade Association (IBTA) + +Internet Assigned Numbers Authority (IANA) + +Internet control message protocol (ICMP) + +definition + +ICMPv4 messages + +(see ICMPv4 messages) + +ICMPv6 messages + +(see ICMPv6 messages) + +ping sockets + +Internet Key Exchange (IKE) + +Internet Key Exchange Protocol Version 2 (IKEv2) + +Internet of Things (IoT) + +Internet Protocol (IP) + +Internet Protocol security (IPsec) subsystem + +cryptography + +definition + +ESP protocol + +Authentication Data + +ESP format + +initialization + +Padding + +Payload Data + +Security Parameter Index + +Sequence Number + +IKE + +methods + +NAT traversal + +Main Mode, IKE + +SBCs + +TCP/UDP header + +VoIP NAT-traversal + +transport mode + +receiving IPv4 ESP packet + +transmitting IPv4 ESP packet + +VPN technology + +XFRM framework + +dummy bundle + +flow_cache_lookup() method + +netns_xfrm structure + +Security Association (SA) + +security policy + +(see Security policy) + +xfrm_init() method + +xfrm_lookup() method + +xfrm_route_forward() method + +XFRM SNMP MIB counters + +Internet service provider (ISP) + +Internet Wide Area RDMA Protocol (iWARP) + +Inter Process Communication (IPC) + +Ip_cmsg_send() method + +Ip_mc_leave_group() method + +Ipmr_rules_init() method + +IP Payload Compression Protocol (IPCOMP) + +IPsec subsystem. 
+ +See Internet protocol security (IPsec) subsystem + +IPv4 protocol + +defragmentation + +hash function + +ip_defrag() method + +ip_expire() method + +ip_forward() method + +ip_frag_queue() + +ip_frag_reasm() method + +ipq_kill() method + +dst_input() method + +dst_output() method + +fragmentation + +fast path fragmentation + +ip_fragment() method + +slow path fragmentation + +fragmentation needed code + +header + +fragment offset + +id field + +internet header length + +L4 protocol + +struct iphdr + +Time To Live + +total length + +Type of Service + +initialization + +internet header length + +ip_append_data() method + +ip_fast_csum() method + +ip_forward_options() method + +IP_HDRINCL socket option + +ip_local_deliver_finish() method + +IP options + +copied flag + +IPOPT_CIPSO option + +IPOPT_END option + +ip_options_fragment() method + +IPOPT_LSRR option + +IPOPT_NOOP option + +IPOPT_SEC option + +linux symbol + +memset() function + +Multibyte option + +option class + +option number + +optptr pointer + +record route option + +(see Record route option) + +Single byte option + +timestamp option + +while loop + +ip_options_build() method + +ip_queue_xmit() method + +ip_rcv_finish() method + +ip_rcv() method + +ip_route_input_noref() method + +ip_route_output_ports() + +MSG_PROBE flag + +multicast packets + +netfilter hooks + +receiving path (Rx) + +routing subsystem + +RPF + +RTCF_DOREDIRECT flag + +skb_dst() + +skb_push() method + +strict route flag + +transport layer + +TTL count exceeded code + +IPv4 routing cache + +Rx Path + +Tx Path + +IPv6 header + +destination address + +extension headers + +Authentication Header + +Destination Options header + +ESP + +Fragment Options header + +Hop-by-Hop Options header + +protocol handler + +Routing Options header + +upper-layer protocol + +flow_lbl + +hop_limit + +ip_decrease_ttl() method + +nexthdr + +payload_len + +source address + +traffic class/priority + +version + +IPv6 protocol + +addresses + +Anycast + +ARP protocol + +Global Unicast + +in6_addr structure + +IPv4-compatible format + +link-local unicast address + +multicast address + +multicast address + +(see Multicast address) + +Site local addresses + +Unicast + +autoconfiguration + +definition + +DHCPv6 + +interface flag + +preferred lifetime + +RA + +router solicitation + +valid lifetime + +features + +in6_addr structure + +inet6_add_protocol() method + +inet6_dev structure + +inet6_init() method + +INET6_PROTO_NOPOLICY flag + +ip6_append_data() method + +ip6_forward() method + +ip6_input() method + +ip6_rcv_finish() method + +ip6_xmit() method + +IPv6 header + +(see IPv6 header) + +ipv6_is_mld() method + +ipv6_rcv() method + +Linux symbol and value + +macros + +methods + +MLD + +(see Multicast Listener Discovery (MLD)) + +multicast packets + +ip6_input_finish() method + +ip6_mc_input() method + +ip6_mr_input() method + +ipv6_chk_mcast_addr() method + +routing + +routing tables + +Rx path + +SKB + +IP Virtual Server (IPVS) + +K + +Keep Alive timer + +Kernel netlink sockets + +callbacks + +EPERM error + +input callback + +netlink_bind() + +netlink_kernel_create() prototype + +netlink_lookup() method + +rtmsg_ifinfo() method + +rtnetlink_net_init() method + +rtnetlink_rcv() method + +rtnl_register() + +KLIPS stack + +L + +Large Receive Offload (LRO) packets + +Linux API + +net_device structure + +(see Net_device structure) + +RDMA + +(see Remote Direct Memory Access (RDMA)) + +sk_buff Structure + +Bluetooth protocol + +checksum values + +connection tracking + 
+dev member + +dropcounter + +dst_entry struct + +eth_type_trans() method + +handling buffers + +headroom and tailroom + +ip_queue_xmit() method + +IP virtual server + +link layer + +netfilter packet trace flag + +network layer + +PMTUD + +preceding rule + +secmark field + +security path pointer + +setsockopt() + +skb_clone() method + +skb_pfmemalloc() function + +skb_shared_info struct + +sock_create_kern() method + +timestamp + +transport layer + +VLAN protocol + +Linux Kernel Mailing List (LKML) + +Linux neighbouring subsystem + +arp_netdev_event() method + +ARP protocol + +(see Address resolution protocol (ARP)) + +Ethernet + +macros + +methods + +NDISC Protocol + +(see Neighbour Discovery (NDISC) protocol) + +neighbour solicitations + +neighbour structure + +dead flag + +neigh_parms object + +neigh_resolve_output() method + +neigh_timer_handler() method + +NUD state + +primary_key + +reference counter + +neigh_create() method + +neigh_statistics structure + +neigh_table structure + +arp_hash() method + +arp_rcv() method + +asynchronous garbage collector handler + +constructor + +function pointers + +IPv4 procfs + +ndisc_init() method + +neigh_alloc() method + +neigh_table_init_no_netlink() method + +pdestructor method + +phash_buckets + +proxy_timer + +sizeof + +thresholds + +neighbour unreachability detection states + +vs. userspace + +Linux network stack + +development model + +git trees + +IPv4/IPv6 + +network device drivers + +(see Network device drivers) + +Open Systems Interconnection (OSI) model + +application layer + +data link layer + +network layer + +physical layer + +presentation layer + +protocol layer/transport layer + +session layer + +protocol rules + +TCP/UDP listening sockets + +Linux routing subsystem + +Linux wireless stack + +development trees + +Mac802.11 subsystem + +(see Mac802.11 subsystem) + +methods + +MLME + +(see Management Layer (MLME)) + +network topologies + +IBSS/Ad Hoc Mode + +infrastructure BSS mode + +power save mode + +entering + +exiting + +multicast/broadcast buffer + +PS-Poll packets + +Rx Flags and Linux symbol + +Local IDentifier (LID) + +Local key (lkey) + +Local Routing Header (LRH) + +Logical link control and adaptation protocol (L2CAP) + +6LoWPAN + +implementation + +initialization + +adaptation layer + +PHY layer + +neighbor discovery optimization + +6LoWPAN context option (6CO) + +Address Registration Option (ARO) + +authoritative border router option (ABRO) + +duplicate address detection (DAD) messages + +Low-rate wireless personal area networks (LR-WPANs) + +M + +Mac802.11 subsystem + +802.11 amendment types + +802.11 vs. 
802.3 wired Ethernet + +add_interface() method + +Ad Hoc (IBSS) mode + +AP mode + +architecture + +configure_filter() + +debugfs + +fragmentation + +header + +addresses + +frame control + +HT control field + +ieee80211_hdr structure + +Network allocation vector + +QoS Control + +sequence control + +ieee80211_alloc_hw() method + +management layer + +Mesh mode + +mesh networking + +advantages + +Full Mesh + +HWMP Protocol + +Partial Mesh + +Set Up + +Monitor mode + +remove_interface() + +Rx Path function + +start() method + +Station infrastructure mode + +stop() + +TGn + +(see High Throughput Task Group (TGn)) + +tx() function + +Tx Path + +Wireless Distribution System (WDS) mode + +WLANs + +Management Layer (MLME) + +association + +authentication + +components + +reassociation + +scanning + +Management packets + +Memory windows + +ib_alloc_mw() method + +ib_bind_mw() method + +ib_dealloc_mw() method + +Mesh networking + +advantages + +Full Mesh + +HWMP Protocol + +Partial Mesh + +Set Up + +Message Signaled Interrupts (MSIs) + +Mroute_sk pointer + +MSF + +filters + +group_filter structure + +igmp6_event_query() method + +mld2_grec structure + +MLDv1 message types + +multicast traffic + +parameters + +setsockopt() method + +Msghdr structure + +Multicast address + +Linux symbol and value + +MLD + +ndisc_send_na() method + +Multicast Forwarding Cache (MFC) + +Multicast Listener Discovery (MLD) + +ASM model + +dev_forward_change() method + +GMP + +Hop-by-Hop header + +ipv6_add_dev() method + +IPV6_ADD_MEMBERSHIP socket + +IPV6_JOIN_GROUP socket + +mld2_grec structure + +MLDv2 protocol + +MSF + +filters + +group_filter structure + +igmp6_event_query() method + +mld2_grec structure + +MLDv1 message types + +multicast traffic + +parameters + +setsockopt() method + +router join + +setsockopt() + +Multicast routing + +CIDR + +fib_rules_lookup() method + +IGMP protocol + +IGMPv1 (RFC 1112) + +IGMPv2 (RFC 2236) + +IGMPv3 (RFC 3376, updated by RFC 4604) + +ipmr_forward_finish() method + +ip_mr_forward() method + +ip_mroute_setsockopt() method + +ipmr_queue_xmit() method + +MFC + +mr_table structure, routing table + +PIM protocol + +IPv4 Multicast Rx Path + +ip_call_ra_chain() method + +ipmr_cache_alloc_unres() + +ipmr_cache_find() method + +ipmr_cache_unresolved() method + +ip_mr_forward() + +ip_mr_input() method + +ipmr_rt_fib_lookup() method + +raw_rcv() method + +setsockopt() method + +thresholds + +topology + +unicast IPv4 traffic + +vifc_flags + +vif_device structure + +Multicast Source Filtering (MSF) + +Multipath routing + +N + +Native Netkey stack + +NDISC protocol. 
+ +See Neighbour Discovery (NDISC) protocol + +Near field communication (NFC) + +Android + +communication and operation modes + +devices + +drivers API + +Kernel architecture + +nfc_allocate_device() method + +probe() callback + +probe() method + +host-controller interfaces + +initialization + +netlink API + +NFC tags + +overview + +sockets + +LLCP sockets + +raw sockets + +subsystem + +userspace architecture + +Neigh_add() method + +Neighbour Discovery (NDISC) protocol + +duplicate address detection + +addrconf_dad_start() method + +ICMPv6 message types + +ipv6_addr_any() method + +ndisc_rcv() method + +ndisc_recv_na() + +ndisc_recv_ns() method + +ndisc_send_na() method + +ndisc_send_ns() method + +ndisc_solicit() + +nud_state + +override flag + +router flag + +solicited flag + +Neighbour discovery (ND) messages + +Neighbour structure + +dead flag + +neigh_parms object + +neigh_resolve_output() method + +neigh_timer_handler() method + +NUD state + +primary_key + +reference counter + +Neigh_delete() method + +Net_device structure + +allmulti counter + +boolean flag + +definition + +dev_uc_init() method + +enum + +Ethernet addresses + +eth_hw_addr_random() method + +features + +flag + +hardware address assignment type + +header_ops struct + +Interrupt Request (IRQ) + +int flags + +int priv_flags + +kobject structure + +message signaled interrupts + +MTU + +NAPI stands + +neigh_alloc() method + +netdev_ops structure + +netdev_run_todo() method + +NETIF_F_GRO + +NETIF_F_HIGHDMA + +NETIF_F_HW_VLAN_CTAG_RX + +NETIF_F_NETNS_LOCAL + +NETIF_F_VLAN_CHALLENGED + +network namespaces + +network partitioning + +promiscuity counter + +protocol-specific pointers + +Qdisc + +qdisc of pfifo_fast + +rx_handler + +Rx queues + +SET_ETHTOOL_OPS + +short gflags + +state flag + +Tx queue + +union + +VLAN devices + +watchdog timer + +Netfilter subsystem + +connection tracking + +(see Connection tracking) + +frameworks + +IP sets + +iptables + +iptables types + +IPVS + +IPv4 and ipv6 network namespace + +methods + +netfilter hooks + +NF_INET_FORWARD + +NF_INET_LOCAL_IN + +NF_INET_LOCAL_OUT + +NF_INET_POST_ROUTING + +NF_INET_PRE_ROUTING + +parameters + +registration + +return value + +Netlink sockets + +advantages + +BSD-style sockets + +generic netlink protocol + +(see Generic netlink protocol) + +IPC mechanism + +kernel netlink sockets + +(see Kernel netlink sockets) + +libnl library + +netlink_kernel_create() method + +netlink message header + +attribute validation policy + +generic netlink message + +nlmsg_flags field + +nlmsg_len + +sequence number + +struct nlmsghdr + +TLV format + +types + +NETLINK_ROUTE messages + +routing table + +sockaddr_nl structure + +TCP/IP networking + +Network Address Translation (NAT) + +Network administration + +ApacheBench + +arping + +ARP table management + +arptables + +arpwatch + +brctl + +conntrack-tools + +crtools + +ebtables + +ether-wake + +ethtool + +git + +hciconfig + +hcidump + +hcitool + +ifconfig command + +ifenslave + +iperf + +iproute2 package + +iptables and ip6tables + +ipvsadm + +iwconfig tool + +iw package + +l2ping + +libreswan Project + +lowpan-tools + +lscpu + +lshw + +lspci + +mrouted + +netperf tool + +netsniff-ng + +netstat tool + +ngrep tool + +nmap + +nmap-ncat package + +openswan + +OpenVPN + +packeth + +pimd + +ping + +pktgen + +poptop + +ppp daemon + +radvd + +route tool + +RP-PPPoE + +sar tool + +smcroute + +snort + +suricata + +sysctl utility + +taskset + +tcpdump + +top utility + +tracepath command + +traceroute utility + +tshark utility 
+ +tunctl tool + +udevadm + +unshare utility + +vconfig utility + +wireshark + +wpa_supplicant + +XORP + +Network Allocation Vector (NAV) + +Network device drivers + +IPsec policy + +NAPI + +netfilter subsystem + +nf_register_hooks() method + +promiscuity counter + +socket buffer + +datagram and stream sockets + +Ethernet packet + +eth_type_trans() method + +ICMP protocol + +ip_rcv_finish() method + +IPv4 packet + +ipv6_rcv() method + +netdev_alloc_skb() method + +RDMA + +structure + +topologies + +transport protocols + +virtualization + +wireless subsystem + +structure + +traversal + +TTL Count Exceeded + +VPN solutions + +Network driver + +Network namespaces + +implementation + +data structures + +net structure + +management + +communication + +ip netns command + +network interface + +namespaces implementation + +clone() system call + +clone_uts_ns() method + +copy_net_ns() method + +copy_utsname() method + +create_nsproxy() method + +exit_task_namespaces() method + +get_net_ns_by_fd() method + +get_net_ns_by_pid() method + +IPC namespaces + +ip netns command + +mnt_namespace + +network namespaces + +nsproxy structure + +PID namespaces + +setns() system call + +unshare() system call + +user_namespace + +UTS namespaces + +uts_namespace + +proc_do_uts_string() method + +sethostname() + +Network topologies + +IBSS/Ad Hoc Mode + +infrastructure BSS mode + +Next Hop Resolution Protocol (NHRP) + +Non-Broadcast Multiple Access (NBMA) + +Notification chains + +call_netdevice_notifier() method + +network device events + +notifier_chain_register() method + +register_netdevice_notifier() method + +rtmsg_ifinfo() method + +subsystems + +O + +Open Cryptography Framework (OCF) + +Open Systems Interconnection (OSI) model + +application layer + +data link layer + +network layer + +physical layer + +presentation layer + +protocol layer/transport layer + +session layer + +Out of the Blue packet (OOTB) + +P + +Packet headers + +Base Transport Header + +Extended Transport Header + +Global Routing Header + +Immediate data + +Invariant CRC + +Local Routing Header + +Payload + +Variant CRC + +Peripheral Component Interconnect (PCI) subsystem + +configuration space + +pci_driver structure + +struct pci_dev structure + +Wake-On-LAN (WOL) + +Persistent timer. 
+ +See Zero window probe timer + +Personal area networks (PANs) + +Ping sockets + +Policy routing + +definition + +fib_default_rules_init() method + +fib_lookup() method + +fib_rules module, implementation + +rules + +PPPoE protocol + +internet service provider (ISP) + +initialization, PPPoX sockets + +link control protocol (LCP) + +password authentication protocol (PAP) + +PPPoE active discovery initiation (PADI) + +PPPoE active discovery offer (PADO) + +PPPoE active discovery request (PADR) + +PPPoE active discovery session (PADS) + +PPPoE active discovery terminate (PADT) + +PPPoE header + +sending and receiving packets + +Primary_key + +Protection domain (PD) + +address handle + +Fast Memory Region (FMR) Pool + +ib_alloc_pd() method + +ib_dealloc_pd() method + +memory region (MR) + +memory window + +QP + +(see Queue Pair (QP)) + +SRQ + +(see Shared Receive Queue (SRQ)) + +Q + +Queue Key (Q_Key) + +Queue pair (QP) + +attributes + +ib_close_qp() method + +ib_create_qp() method + +ib_modify_qp() + +ib_post_recv() + +ib_post_send() method + +MW binding attributes + +struct ib_send_wr + +ib_query_qp() method + +selective signaling + +state machine + +Error state + +ib_modify_qp() method + +ib_query_qp() method + +Initialized state + +Ready To Receive (RTR) state + +Ready To Send (RTS) state + +Reset state + +Send Queue Drained (SQD) state + +SQE state + +struct ib_qp_cap + +struct ib_qp_open_attr + +transport types + +Quick Mode + +R + +Radio Frequency Communications protocol (RFCOMM) + +Raw sockets + +RDMA device. + +See also Remote Direct Memory Access (RDMA) + +Real-time Transport Protocol (RTP) + +Receive path (Rx) + +Record route option + +for loop + +ip_options_compile() + +ip_options structure + +ip_rcv_options() method + +optptr pointer + +parameter problem + +router alert + +SSRR + +stream ID + +Reliably delivered message + +Remote Direct Memory Access (RDMA) + +address handle + +attributes + +ib_create_ah_from_wc() method + +ib_create_ah() method + +ib_destroy_ah() method + +ib_init_ah_from_wc() + +ib_modify_ah() method + +ib_query_ah() + +advantages + +CPU offload + +High Bandwidth + +Kernel bypass + +Low latency + +Zero copy + +attributes + +completion queue + +first-in, first-out (FIFO) + +ib_create_cq() method + +ib_destroy_cq() + +ib_modify_cq() method + +ib_peek_cq() method + +ib_poll_cq() + +ib_req_ncomp_notif() + +ib_req_notify_cq() method + +ib_resize_cq() + +QP + +(see Queue Pair (QP)) + +struct ib_wc + +device modification + +event handler + +eXtended Reliable Connected + +ib_alloc_xrcd() method + +ib_dealloc_xrcd_cq() method + +hierarchy + +ib_attach_mcast() method + +ib_detach_mcast() + +ib_find_gid() + +ib_find_pkey() method + +ib_get_client_data() method + +ib_modify_port() method + +ib_mtu_to_int() + +ib_query_device() method + +ib_query_gid() + +ib_query_pkey() + +ib_query_port() + +ib_rate_to_mbps() method + +ib_rate_to_mult() + +ib_register_client() method + +ib_register_event_handler() + +ib_set_client_data() method + +ib_unregister_client() method + +ib_width_enum_to_int() + +include/rdma/ib_verbs.h + +INIT_IB_EVENT_HANDLER macro + +memory region + +CPU accesses + +ib_dereg_mr() method + +ib_dma_alloc_coherent() method + +ib_dma_free_coherent() method + +ib_dma_map_page() method + +ib_dma_mapping_error() + +ib_dma_map_sg_attr() + +ib_dma_map_sg() method + +ib_dma_map_single() method + +ib_dma_unmap_page() method + +ib_dma_unmap_sg() method + +ib_dma_unmap_single() + +ib_dma_unmap_single_attrs() method + +ib_get_dma_mr() + 
+ib_mr_attr struct + +ib_reg_phys_mr() method + +ib_rereg_phys_mr() method + +ib_sg_dma_len() method + +kernel virtual address + +physical buffer + +memory windows + +ib_alloc_mw() method + +ib_bind_mw() method + +ib_dealloc_mw() method + +multicast groups + +network protocols + +node type + +operation types + +PD + +(see Protection domain (PD)) + +port attributes + +protection domain + +ib_alloc_pd() method + +ib_dealloc_pd() + +QP + +(see Queue pair (QP)) + +rdma_node_get_transport() + +rdma_port_get_link_layer() method + +request processing flow + +retry flow + +RNR Flow + +SRQ + +(see Shared Receive Queue (SRQ)) + +stack architecture + +struct ib_client + +struct ib_event + +Userspace vs. Kernel-Level RDMA API + +Remote key (rkey) + +Retransmit timer + +Retry flow + +Reverse Path Filter (RPF) + +RNR Flow + +Root Announcement (RANN) + +Router + +Router Advertisement (RA) + +Router Alert (RA) + +Routing subsystem + +FIB + +fib_table structure + +caching + +fib_alias object + +fib_info + +fib_nh_exceptions + +nexthop + +policy routing + +forwarding packets + +forwarding router + +IP rule selectors + +lookup + +fib_lookup() method + +flowi4 object + +rtable structure + +macros + +MFC_HASH + +VIF_EXISTS + +methods + +multicast routing + +(see Multicast routing) + +multipath routing + +policy routing + +definition + +fib_default_rules_init() method + +fib_lookup() method + +fib_rules module, implementation + +rules + +procfs multicast + +redirect message + +route flags + +route metrics + +route types + +routing + +rtmsg_ifinfo() method + +rtnl_notify() + +S + +SCTP. + +See Stream Control Transmission Protocol (SCTP) + +Security Association (SA) + +Security policy + +action + +current lifetime + +definition + +polq queue + +SPD + +xfrm_policy structure + +reference counter + +xfrm_policy_timer() method + +Security Policy Database (SPD) + +Sequenced packet stream + +Service Level (SL) + +Session Initiation Protocol (SIP) + +Setsockopt() method + +Shared Receive Queue (SRQ) + +attributes + +ib_create_srq() method + +ib_destroy_srq() method + +ib_modify_srq() method + +ib_post_srq_recv() method + +ib_query_srq() + +limit asynchronous event + +QP + +scatter/gather element + +struct ib_recv_wr + +Sock_create() method + +Socket Buffer (SKB) + +Socketcall() method + +Sockets + +API + +accept() + +bind() + +connect() + +datagram + +data links + +DCCP + +listen() + +raw + +recv() + +reliably delivered message + +send() + +sequenced packet stream + +socket() + +stream + +creation + +msghdr structure + +socket() system call + +implementation + +parameters of + +return value of + +struct socket + +structure + +Sock_map_fd() method + +Sock structure + +Stream Control Transmission Protocol (SCTP) + +association + +members + +multiple addresses, addition/removal of + +representation + +setting up + +chunk + +chunk header + +common header + +features + +HEARTBEAT mechanism + +initialization + +multihoming + +multistreaming + +receiving packets + +registration + +sending packets + +Stream sockets + +Strict source record route (SSRR) + +Struct sock + +Switch + +Sys_socket() method + +T + +TCP. 
+ +See Transmission Control Protocol (TCP) + +Tcp_init_sock() method + +TCP/IP networking + +Time To Live (TTL) + +Traditional receive flow vs. Busy Poll Sockets receive flow + +Transmission Control Protocol (TCP) + +connection setup + +description + +flags + +header + +initialization + +prot_ops objects + +receiving packets + +sending packets + +socket initialization + +timers + +Transport layer protocols + +DCCP + +(see Datagram Congestion Control Protocol (DCCP)) + +macros + +methods + +SCTP + +(see Stream Control Transmission Protocol (SCTP)) + +TCP + +connection setup + +description + +header + +initialization + +receiving packets + +sending packets + +timers + +UDP + +(see User Datagram Protocol (UDP)) + +Type-Length-Value (TLV) format + +U + +User Datagram Protocol (UDP) + +description + +header + +initialization + +prot_ops objects + +receiving packets + +sending packets + +V + +Virtual Ethernet (VETH) + +Virtual Extensible Local Area Network (VXLAN) + +Virtual Lanes (VL) + +Virtual private network (VPN) + +W + +Wireless local area networks (WLANs) + +X, Y + +XFRM framework + +dummy bundle + +flow_cache_lookup() method + +netns_xfrm structure + +Security Association (SA) + +security policy + +(see Security policy) + +xfrm_init() method + +xfrm_lookup() method + +xfrm_route_forward() method + +Z + +Zero window probe timer + diff --git a/kag/examples/csqa/builder/data/machine_learning_with_spark.txt b/kag/examples/csqa/builder/data/machine_learning_with_spark.txt new file mode 100644 index 00000000..bc2864f4 --- /dev/null +++ b/kag/examples/csqa/builder/data/machine_learning_with_spark.txt @@ -0,0 +1,8844 @@ +Machine Learning with Spark + +# Table of Contents + +Machine Learning with Spark + +Credits + +About the Author + +Acknowledgments + +About the Reviewers + +www.PacktPub.com + +Support files, eBooks, discount offers, and more + +Why subscribe? + +Free access for Packt account holders + +Preface + +What this book covers + +What you need for this book + +Who this book is for + +Conventions + +Reader feedback + +Customer support + +Downloading the example code + +Errata + +Piracy + +Questions + +1. Getting Up and Running with Spark + +Installing and setting up Spark locally + +Spark clusters + +The Spark programming model + +SparkContext and SparkConf + +The Spark shell + +Resilient Distributed Datasets + +Creating RDDs + +Spark operations + +Caching RDDs + +Broadcast variables and accumulators + +The first step to a Spark program in Scala + +The first step to a Spark program in Java + +The first step to a Spark program in Python + +Getting Spark running on Amazon EC2 + +Launching an EC2 Spark cluster + +Summary + +2. Designing a Machine Learning System + +Introducing MovieStream + +Business use cases for a machine learning system + +Personalization + +Targeted marketing and customer segmentation + +Predictive modeling and analytics + +Types of machine learning models + +The components of a data-driven machine learning system + +Data ingestion and storage + +Data cleansing and transformation + +Model training and testing loop + +Model deployment and integration + +Model monitoring and feedback + +Batch versus real time + +An architecture for a machine learning system + +Practical exercise + +Summary + +3. 
Obtaining, Processing, and Preparing Data with Spark + +Accessing publicly available datasets + +The MovieLens 100k dataset + +Exploring and visualizing your data + +Exploring the user dataset + +Exploring the movie dataset + +Exploring the rating dataset + +Processing and transforming your data + +Filling in bad or missing data + +Extracting useful features from your data + +Numerical features + +Categorical features + +Derived features + +Transforming timestamps into categorical features + +Text features + +Simple text feature extraction + +Normalizing features + +Using MLlib for feature normalization + +Using packages for feature extraction + +Summary + +4. Building a Recommendation Engine with Spark + +Types of recommendation models + +Content-based filtering + +Collaborative filtering + +Matrix factorization + +Explicit matrix factorization + +Implicit matrix factorization + +Alternating least squares + +Extracting the right features from your data + +Extracting features from the MovieLens 100k dataset + +Training the recommendation model + +Training a model on the MovieLens 100k dataset + +Training a model using implicit feedback data + +Using the recommendation model + +User recommendations + +Generating movie recommendations from the MovieLens 100k dataset + +Inspecting the recommendations + +Item recommendations + +Generating similar movies for the MovieLens 100k dataset + +Inspecting the similar items + +Evaluating the performance of recommendation models + +Mean Squared Error + +Mean average precision at K + +Using MLlib's built-in evaluation functions + +RMSE and MSE + +MAP + +Summary + +5. Building a Classification Model with Spark + +Types of classification models + +Linear models + +Logistic regression + +Linear support vector machines + +The naïve Bayes model + +Decision trees + +Extracting the right features from your data + +Extracting features from the Kaggle/StumbleUpon evergreen classification dataset + +Training classification models + +Training a classification model on the Kaggle/StumbleUpon evergreen classification dataset + +Using classification models + +Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset + +Evaluating the performance of classification models + +Accuracy and prediction error + +Precision and recall + +ROC curve and AUC + +Improving model performance and tuning parameters + +Feature standardization + +Additional features + +Using the correct form of data + +Tuning model parameters + +Linear models + +Iterations + +Step size + +Regularization + +Decision trees + +Tuning tree depth and impurity + +The naïve Bayes model + +Cross-validation + +Summary + +6. 
Building a Regression Model with Spark + +Types of regression models + +Least squares regression + +Decision trees for regression + +Extracting the right features from your data + +Extracting features from the bike sharing dataset + +Creating feature vectors for the linear model + +Creating feature vectors for the decision tree + +Training and using regression models + +Training a regression model on the bike sharing dataset + +Evaluating the performance of regression models + +Mean Squared Error and Root Mean Squared Error + +Mean Absolute Error + +Root Mean Squared Log Error + +The R-squared coefficient + +Computing performance metrics on the bike sharing dataset + +Linear model + +Decision tree + +Improving model performance and tuning parameters + +Transforming the target variable + +Impact of training on log-transformed targets + +Tuning model parameters + +Creating training and testing sets to evaluate parameters + +The impact of parameter settings for linear models + +Iterations + +Step size + +L2 regularization + +L1 regularization + +Intercept + +The impact of parameter settings for the decision tree + +Tree depth + +Maximum bins + +Summary + +7. Building a Clustering Model with Spark + +Types of clustering models + +K-means clustering + +Initialization methods + +Variants + +Mixture models + +Hierarchical clustering + +Extracting the right features from your data + +Extracting features from the MovieLens dataset + +Extracting movie genre labels + +Training the recommendation model + +Normalization + +Training a clustering model + +Training a clustering model on the MovieLens dataset + +Making predictions using a clustering model + +Interpreting cluster predictions on the MovieLens dataset + +Interpreting the movie clusters + +Evaluating the performance of clustering models + +Internal evaluation metrics + +External evaluation metrics + +Computing performance metrics on the MovieLens dataset + +Tuning parameters for clustering models + +Selecting K through cross-validation + +Summary + +8. Dimensionality Reduction with Spark + +Types of dimensionality reduction + +Principal Components Analysis + +Singular Value Decomposition + +Relationship with matrix factorization + +Clustering as dimensionality reduction + +Extracting the right features from your data + +Extracting features from the LFW dataset + +Exploring the face data + +Visualizing the face data + +Extracting facial images as vectors + +Loading images + +Converting to grayscale and resizing the images + +Extracting feature vectors + +Normalization + +Training a dimensionality reduction model + +Running PCA on the LFW dataset + +Visualizing the Eigenfaces + +Interpreting the Eigenfaces + +Using a dimensionality reduction model + +Projecting data using PCA on the LFW dataset + +The relationship between PCA and SVD + +Evaluating dimensionality reduction models + +Evaluating k for SVD on the LFW dataset + +Summary + +9. Advanced Text Processing with Spark + +What's so special about text data? 
+ +Extracting the right features from your data + +Term weighting schemes + +Feature hashing + +Extracting the TF-IDF features from the 20 Newsgroups dataset + +Exploring the 20 Newsgroups data + +Applying basic tokenization + +Improving our tokenization + +Removing stop words + +Excluding terms based on frequency + +A note about stemming + +Training a TF-IDF model + +Analyzing the TF-IDF weightings + +Using a TF-IDF model + +Document similarity with the 20 Newsgroups dataset and TF-IDF features + +Training a text classifier on the 20 Newsgroups dataset using TF-IDF + +Evaluating the impact of text processing + +Comparing raw features with processed TF-IDF features on the 20 Newsgroups dataset + +Word2Vec models + +Word2Vec on the 20 Newsgroups dataset + +Summary + +10. Real-time Machine Learning with Spark Streaming + +Online learning + +Stream processing + +An introduction to Spark Streaming + +Input sources + +Transformations + +Keeping track of state + +General transformations + +Actions + +Window operators + +Caching and fault tolerance with Spark Streaming + +Creating a Spark Streaming application + +The producer application + +Creating a basic streaming application + +Streaming analytics + +Stateful streaming + +Online learning with Spark Streaming + +Streaming regression + +A simple streaming regression program + +Creating a streaming data producer + +Creating a streaming regression model + +Streaming K-means + +Online model evaluation + +Comparing model performance with Spark Streaming + +Summary + +Index + +# **Machine Learning with Spark** + +* * * + +# Machine Learning with Spark + +Copyright (C) 2015 Packt Publishing + +All rights reserved. No part of this book may be reproduced, stored in a retrieval system, or transmitted in any form or by any means, without the prior written permission of the publisher, except in the case of brief quotations embedded in critical articles or reviews. + +Every effort has been made in the preparation of this book to ensure the accuracy of the information presented. However, the information contained in this book is sold without warranty, either express or implied. Neither the author, nor Packt Publishing, and its dealers and distributors will be held liable for any damages caused or alleged to be caused directly or indirectly by this book. + +Packt Publishing has endeavored to provide trademark information about all of the companies and products mentioned in this book by the appropriate use of capitals. However, Packt Publishing cannot guarantee the accuracy of this information. + +First published: February 2015 + +Production reference: 1170215 + +Published by Packt Publishing Ltd. + +Livery Place + +35 Livery Street + +Birmingham B3 2PB, UK. 
+ +ISBN 978-1-78328-851-9 + +www.packtpub.com + +Cover image by Akshay Paunikar (``) + +# Credits + + **Author** + +Nick Pentreath + + **Reviewers** + +Andrea Mostosi + +Hao Ren + +Krishna Sankar + + **Commissioning Editor** + +Rebecca Youe + + **Acquisition Editor** + +Rebecca Youe + + **Content Development Editor** + +Susmita Sabat + + **Technical Editors** + +Vivek Arora + +Pankaj Kadam + + **Copy Editor** + +Karuna Narayanan + + **Project Coordinator** + +Milton Dsouza + + **Proofreaders** + +Simran Bhogal + +Maria Gould + +Ameesha Green + +Paul Hindle + + **Indexer** + +Priya Sane + + **Graphics** + +Sheetal Aute + +Abhinash Sahu + + **Production Coordinator** + +Nitesh Thakur + + **Cover Work** + +Nitesh Thakur + +# About the Author + + **Nick Pentreath** has a background in financial markets, machine learning, and software development. He has worked at Goldman Sachs Group, Inc.; as a research scientist at the online ad targeting start-up Cognitive Match Limited, London; and led the Data Science and Analytics team at Mxit, Africa's largest social network. + +He is a cofounder of Graphflow, a big data and machine learning company focused on user-centric recommendations and customer intelligence. He is passionate about combining commercial focus with machine learning and cutting-edge technology to build intelligent systems that learn from data to add value to the bottom line. + +Nick is a member of the Apache Spark Project Management Committee. + +# Acknowledgments + +Writing this book has been quite a rollercoaster ride over the past year, with many ups and downs, late nights, and working weekends. It has also been extremely rewarding to combine my passion for machine learning with my love of the Apache Spark project, and I hope to bring some of this out in this book. + +I would like to thank the Packt Publishing team for all their assistance throughout the writing and editing process: Rebecca, Susmita, Sudhir, Amey, Neil, Vivek, Pankaj, and everyone who worked on the book. + +Thanks also go to Debora Donato at StumbleUpon for assistance with data- and legal-related queries. + +Writing a book like this can be a somewhat lonely process, so it is incredibly helpful to get the feedback of reviewers to understand whether one is headed in the right direction (and what course adjustments need to be made). I'm deeply grateful to Andrea Mostosi, Hao Ren, and Krishna Sankar for taking the time to provide such detailed and critical feedback. + +I could not have gotten through this project without the unwavering support of all my family and friends, especially my wonderful wife, Tammy, who will be glad to have me back in the evenings and on weekends once again. Thank you all! + +Finally, thanks to all of you reading this; I hope you find it useful! + +# About the Reviewers + + **Andrea Mostosi** is a technology enthusiast. An innovation lover since he was a child, he started a professional job in 2003 and worked on several projects, playing almost every role in the computer science environment. He is currently the CTO at The Fool, a company that tries to make sense of web and social data. During his free time, he likes traveling, running, cooking, biking, and coding. + + **** + +I would like to thank my geek friends: Simone M, Daniele V, Luca T, Luigi P, Michele N, Luca O, Luca B, Diego C, and Fabio B. They are the smartest people I know, and comparing myself with them has always pushed me to be better. 
+ + **Hao Ren** is a software developer who is passionate about Scala, distributed systems, machine learning, and Apache Spark. He was an exchange student at EPFL when he learned about Scala in 2012. He is currently working in Paris as a backend and data engineer for ClaraVista--a company that focuses on high-performance marketing. His work responsibility is to build a Spark-based platform for purchase prediction and a new recommender system. + +Besides programming, he enjoys running, swimming, and playing basketball and badminton. You can learn more at his blog . + + **Krishna Sankar** is a chief data scientist at BlackArrow, where he is focusing on enhancing user experience via inference, intelligence, and interfaces. Earlier stints include working as a principal architect and data scientist at Tata America International Corporation, director of data science at a bioinformatics start-up company, and as a distinguished engineer at Cisco Systems, Inc. He has spoken at various conferences about data science (), machine learning (), and social media analysis (). He has also been a guest lecturer at the Naval Postgraduate School. He has written a few books on Java, wireless LAN security, Web 2.0, and now on Spark. His other passion is LEGO robotics. Earlier in April, he was at the St. Louis FLL World Competition as a robots design judge. + +# www.PacktPub.com + +# Support files, eBooks, discount offers, and more + +For support files and downloads related to your book, please visit www.PacktPub.com. + +Did you know that Packt offers eBook versions of every book published, with PDF and ePub files available? You can upgrade to the eBook version at www.PacktPub.com and as a print book customer, you are entitled to a discount on the eBook copy. Get in touch with us at `` for more details. + +At www.PacktPub.com, you can also read a collection of free technical articles, sign up for a range of free newsletters and receive exclusive discounts and offers on Packt books and eBooks. + + + +Do you need instant solutions to your IT questions? PacktLib is Packt's online digital book library. Here, you can search, access, and read Packt's entire library of books. + +## Why subscribe? + + * Fully searchable across every book published by Packt + * Copy and paste, print, and bookmark content + * On demand and accessible via a web browser + +## Free access for Packt account holders + +If you have an account with Packt at www.PacktPub.com, you can use this to access PacktLib today and view 9 entirely free books. Simply use your login credentials for immediate access. + +# Preface + +In recent years, the volume of data being collected, stored, and analyzed has exploded, in particular in relation to the activity on the Web and mobile devices, as well as data from the physical world collected via sensor networks. While previously large-scale data storage, processing, analysis, and modeling was the domain of the largest institutions such as Google, Yahoo!, Facebook, and Twitter, increasingly, many organizations are being faced with the challenge of how to handle a massive amount of data. + +When faced with this quantity of data and the common requirement to utilize it in real time, human-powered systems quickly become infeasible. This has led to a rise in the so-called big data and machine learning systems that learn from this data to make automated decisions. 
+ +In answer to the challenge of dealing with ever larger-scale data without any prohibitive cost, new open source technologies emerged at companies such as Google, Yahoo!, Amazon, and Facebook, which aimed at making it easier to handle massive data volumes by distributing data storage and computation across a cluster of computers. + +The most widespread of these is Apache Hadoop, which made it significantly easier and cheaper to both store large amounts of data (via the Hadoop Distributed File System, or HDFS) and run computations on this data (via Hadoop MapReduce, a framework to perform computation tasks in parallel across many nodes in a computer cluster). + +However, MapReduce has some important shortcomings, including high overheads to launch each job and reliance on storing intermediate data and results of the computation to disk, both of which make Hadoop relatively ill-suited for use cases of an iterative or low-latency nature. Apache Spark is a new framework for distributed computing that is designed from the ground up to be optimized for low-latency tasks and to store intermediate data and results in memory, thus addressing some of the major drawbacks of the Hadoop framework. Spark provides a clean, functional, and easy-to-understand API to write applications and is fully compatible with the Hadoop ecosystem. + +Furthermore, Spark provides native APIs in Scala, Java, and Python. The Scala and Python APIs allow all the benefits of the Scala or Python language, respectively, to be used directly in Spark applications, including using the relevant interpreter for real-time, interactive exploration. Spark itself now provides a toolkit (called MLlib) of distributed machine learning and data mining models that is under heavy development and already contains high-quality, scalable, and efficient algorithms for many common machine learning tasks, some of which we will delve into in this book. + +Applying machine learning techniques to massive datasets is challenging, primarily because most well-known machine learning algorithms are not designed for parallel architectures. In many cases, designing such algorithms is not an easy task. The nature of machine learning models is generally iterative, hence the strong appeal of Spark for this use case. While there are many competing frameworks for parallel computing, Spark is one of the few that combines speed, scalability, in-memory processing, and fault tolerance with ease of programming and a flexible, expressive, and powerful API design. + +Throughout this book, we will focus on real-world applications of machine learning technology. While we may briefly delve into some theoretical aspects of machine learning algorithms, the book will generally take a practical, applied approach with a focus on using examples and code to illustrate how to effectively use the features of Spark and MLlib, as well as other well-known and freely available packages for machine learning and data analysis, to create a useful machine learning system. + +# What this book covers + +Chapter 1, _Getting Up and Running with Spark_ , shows how to install and set up a local development environment for the Spark framework as well as how to create a Spark cluster in the cloud using Amazon EC2. The Spark programming model and API will be introduced, and a simple Spark application will be created using each of Scala, Java, and Python. + +Chapter 2, _Designing a Machine Learning System_ , presents an example of a real-world use case for a machine learning system. 
We will design a high-level architecture for an intelligent system in Spark based on this illustrative use case. + +Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , details how to go about obtaining data for use in a machine learning system, in particular from various freely and publicly available sources. We will learn how to process, clean, and transform the raw data into features that may be used in machine learning models, using available tools, libraries, and Spark's functionality. + +Chapter 4, _Building a Recommendation Engine with Spark_ , deals with creating a recommendation model based on the collaborative filtering approach. This model will be used to recommend items to a given user as well as create lists of items that are similar to a given item. Standard metrics to evaluate the performance of a recommendation model will be covered here. + +Chapter 5, _Building a Classification Model with Spark_ , details how to create a model for binary classification as well as how to utilize standard performance-evaluation metrics for classification tasks. + +Chapter 6, _Building a Regression Model with Spark_ , shows how to create a model for regression, extending the classification model created in Chapter 5, _Building a Classification Model with Spark_. Evaluation metrics for the performance of regression models will be detailed here. + +Chapter 7, _Building a Clustering Model with Spark_ , explores how to create a clustering model as well as how to use related evaluation methodologies. You will learn how to analyze and visualize the clusters generated. + +Chapter 8, _Dimensionality Reduction with Spark_ , takes us through methods to extract the underlying structure from and reduce the dimensionality of our data. You will learn some common dimensionality-reduction techniques and how to apply and analyze them, as well as how to use the resulting data representation as input to another machine learning model. + +Chapter 9, _Advanced Text Processing with Spark_ , introduces approaches to deal with large-scale text data, including techniques for feature extraction from text and dealing with the very high-dimensional features typical in text data. + +Chapter 10, _Real-time Machine Learning with Spark Streaming_ , provides an overview of Spark Streaming and how it fits in with the online and incremental learning approaches to apply machine learning on data streams. + +# What you need for this book + +Throughout this book, we assume that you have some basic experience with programming in Scala, Java, or Python and have some basic knowledge of machine learning, statistics, and data analysis. + +# Who this book is for + +This book is aimed at entry-level to intermediate data scientists, data analysts, software engineers, and practitioners involved in machine learning or data mining with an interest in large-scale machine learning approaches, but who are not necessarily familiar with Spark. You may have some experience of statistics or machine learning software (perhaps including MATLAB, scikit-learn, Mahout, R, Weka, and so on) or distributed systems (perhaps including some exposure to Hadoop). + +# Conventions + +In this book, you will find a number of styles of text that distinguish between different kinds of information. Here are some examples of these styles, and an explanation of their meaning. 
Code words in text, database table names, folder names, filenames, file extensions, pathnames, dummy URLs, user input, and Twitter handles are shown as follows: "Spark places user scripts to run Spark in the `bin` directory."

A block of code is set as follows:

    val conf = new SparkConf()
      .setAppName("Test Spark App")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

Any command-line input or output is written as follows:

    **> tar xfvz spark-1.2.0-bin-hadoop2.4.tgz**
    **> cd spark-1.2.0-bin-hadoop2.4**

**New terms** and **important words** are shown in bold. Words that you see on the screen, in menus or dialog boxes for example, appear in the text like this: "These can be obtained from the AWS homepage by clicking **Account** | **Security Credentials** | **Access Credentials**."

### Note

Warnings or important notes appear in a box like this.

### Tip

Tips and tricks appear like this.

# Reader feedback

Feedback from our readers is always welcome. Let us know what you think about this book--what you liked or may have disliked. Reader feedback is important for us to develop titles that you really get the most out of.

To send us general feedback, simply send an e-mail to ``, and mention the book title in the subject of your message.

If there is a topic that you have expertise in and you are interested in either writing or contributing to a book, see our author guide on www.packtpub.com/authors.

# Customer support

Now that you are the proud owner of a Packt book, we have a number of things to help you to get the most from your purchase.

## Downloading the example code

You can download the example code files for all Packt books you have purchased from your account at . If you purchased this book elsewhere, you can visit and register to have the files e-mailed directly to you.

## Errata

Although we have taken every care to ensure the accuracy of our content, mistakes do happen. If you find a mistake in one of our books--maybe a mistake in the text or the code--we would be grateful if you would report this to us. By doing so, you can save other readers from frustration and help us improve subsequent versions of this book. If you find any errata, please report them by visiting , selecting your book, clicking on the **Errata Submission Form** link, and entering the details of your errata. Once your errata are verified, your submission will be accepted and the errata will be uploaded to our website or added to the list of existing errata under the Errata section of that title.

To view the previously submitted errata, go to and enter the name of the book in the search field. The required information will appear under the **Errata** section.

## Piracy

Piracy of copyright material on the Internet is an ongoing problem across all media. At Packt, we take the protection of our copyright and licenses very seriously. If you come across any illegal copies of our works, in any form, on the Internet, please provide us with the location address or website name immediately so that we can pursue a remedy.

Please contact us at `` with a link to the suspected pirated material.

We appreciate your help in protecting our authors, and our ability to bring you valuable content.

## Questions

You can contact us at `` if you are having a problem with any aspect of the book, and we will do our best to address it.

# Chapter 1. Getting Up and Running with Spark
Apache Spark is a framework for distributed computing; this framework aims to make it simpler to write programs that run in parallel across many nodes in a cluster of computers. It tries to abstract the tasks of resource scheduling, job submission, execution, tracking, and communication between nodes, as well as the low-level operations that are inherent in parallel data processing. It also provides a higher-level API to work with distributed data. In this way, it is similar to other distributed processing frameworks such as Apache Hadoop; however, the underlying architecture is somewhat different.

Spark began as a research project at the University of California, Berkeley. The project was focused on the use case of distributed machine learning algorithms. Hence, Spark is designed from the ground up for high performance in applications of an iterative nature, where the same data is accessed multiple times. This performance is achieved primarily through caching datasets in memory, combined with low latency and overhead to launch parallel computation tasks. Together with other features such as fault tolerance, flexible distributed-memory data structures, and a powerful functional API, Spark has proved to be broadly useful for a wide range of large-scale data processing tasks, over and above machine learning and iterative analytics.

### Note

For more background on Spark, including the research papers underlying Spark's development, see the project's history page at .

Spark runs in four modes:

  * The standalone local mode, where all Spark processes are run within the same **Java Virtual Machine** ( **JVM** ) process
  * The standalone cluster mode, using Spark's own built-in job-scheduling framework
  * Using Mesos, a popular open source cluster-computing framework
  * Using YARN (commonly referred to as NextGen MapReduce), a Hadoop-related cluster-computing and resource-scheduling framework

In this chapter, we will:

  * Download the Spark binaries and set up a development environment that runs in Spark's standalone local mode. This environment will be used throughout the rest of the book to run the example code.
  * Explore Spark's programming model and API using Spark's interactive console.
  * Write our first Spark program in Scala, Java, and Python.
  * Set up a Spark cluster using Amazon's **Elastic Compute Cloud** ( **EC2** ) platform, which can be used for larger data sizes and heavier computational requirements than running in the local mode allows.

### Tip

Spark can also be run on Amazon's Elastic MapReduce service using custom bootstrap action scripts, but this is beyond the scope of this book. The following article is a good reference guide: .

At the time of writing this book, the article covers running Spark Version 1.1.0.

If you have previous experience in setting up Spark and are familiar with the basics of writing a Spark program, feel free to skip this chapter.

# Installing and setting up Spark locally

Spark can be run using the built-in standalone cluster scheduler in the local mode. This means that all the Spark processes are run within the same JVM--effectively, a single, multithreaded instance of Spark. The local mode is very useful for prototyping, development, debugging, and testing. However, this mode can also be useful in real-world scenarios to perform parallel computation across multiple cores on a single computer.
As Spark's local mode is fully compatible with the cluster mode, programs written and tested locally can be run on a cluster with just a few additional steps.

The first step in setting up Spark locally is to download the latest version (at the time of writing this book, the version is 1.2.0). The download page of the Spark project website, found at , contains links to download various versions as well as to obtain the latest source code via GitHub.

### Tip

The Spark project documentation website at is a comprehensive resource to learn more about Spark. We highly recommend that you explore it!

Spark needs to be built against a specific version of Hadoop in order to access **Hadoop Distributed File System** ( **HDFS** ) as well as standard and custom Hadoop input sources. The download page provides prebuilt binary packages for Hadoop 1, CDH4 (Cloudera's Hadoop Distribution), MapR's Hadoop distribution, and Hadoop 2 (YARN). Unless you wish to build Spark against a specific Hadoop version, we recommend that you download the prebuilt Hadoop 2.4 package from an Apache mirror using this link: .

Spark requires the Scala programming language (version 2.10.4 at the time of writing this book) in order to run. Fortunately, the prebuilt binary package comes with the Scala runtime packages included, so you don't need to install Scala separately in order to get started. However, you will need to have a **Java Runtime Environment** ( **JRE** ) or **Java Development Kit** ( **JDK** ) installed (see the software and hardware list in this book's code bundle for installation instructions).

Once you have downloaded the Spark binary package, unpack the contents of the package and change into the newly created directory by running the following commands:

    **> tar xfvz spark-1.2.0-bin-hadoop2.4.tgz**
    **> cd spark-1.2.0-bin-hadoop2.4**

Spark places user scripts to run Spark in the `bin` directory. You can test whether everything is working correctly by running one of the example programs included in Spark:

    **>./bin/run-example org.apache.spark.examples.SparkPi**

This will run the example in Spark's local standalone mode. In this mode, all the Spark processes are run within the same JVM, and Spark uses multiple threads for parallel processing. By default, the preceding example uses a number of threads equal to the number of cores available on your system. Once the program is finished running, you should see something similar to the following lines near the end of the output:

    **...**
    **14/11/27 20:58:47 INFO SparkContext: Job finished: reduce at SparkPi.scala:35, took 0.723269 s**
    **Pi is roughly 3.1465**
    **...**

To configure the level of parallelism in the local mode, you can pass in a `master` parameter of the `local[N]` form, where `N` is the number of threads to use. For example, to use only two threads, run the following command instead:

    **> MASTER=local[2] ./bin/run-example org.apache.spark.examples.SparkPi**

# Spark clusters

A Spark cluster is made up of two types of processes: a driver program and multiple executors. In the local mode, all these processes are run within the same JVM. In a cluster, these processes are usually run on separate nodes.
For example, a typical cluster that runs in Spark's standalone mode (that is, using Spark's built-in cluster-management modules) will have:

  * A master node that runs the Spark standalone master process as well as the driver program
  * A number of worker nodes, each running an executor process

While we will be using Spark's local standalone mode throughout this book to illustrate concepts and examples, the same Spark code that we write can be run on a Spark cluster. In the preceding example, if we run the code on a Spark standalone cluster, we could simply pass in the URL for the master node as follows:

    **> MASTER=spark://IP:PORT ./bin/run-example org.apache.spark.examples.SparkPi**

Here, `IP` is the IP address, and `PORT` is the port of the Spark master. This tells Spark to run the program on the cluster where the Spark master process is running.

A full treatment of Spark's cluster management and deployment is beyond the scope of this book. However, we will briefly teach you how to set up and use an Amazon EC2 cluster later in this chapter.

### Note

For an overview of the Spark cluster-application deployment, take a look at the following links:

  * 
  * 

# The Spark programming model

Before we delve into a high-level overview of Spark's design, we will introduce the `SparkContext` object as well as the Spark shell, which we will use to interactively explore the basics of the Spark programming model.

### Tip

While this section provides a brief overview and examples of using Spark, we recommend that you read the following documentation to get a detailed understanding:

  * Spark Quick Start: 
  * _Spark Programming guide_ , which covers Scala, Java, and Python: 

## SparkContext and SparkConf

The starting point of writing any Spark program is `SparkContext` (or `JavaSparkContext` in Java). `SparkContext` is initialized with an instance of a `SparkConf` object, which contains various Spark cluster-configuration settings (for example, the URL of the master node).

Once initialized, we will use the various methods found in the `SparkContext` object to create and manipulate distributed datasets and shared variables. The Spark shell (available in both Scala and Python, but unfortunately not in Java) takes care of this context initialization for us, but the following lines of code show an example of creating a context running in the local mode in Scala:

    val conf = new SparkConf()
      .setAppName("Test Spark App")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

This creates a context running in the local mode with four threads, with the name of the application set to `Test Spark App`. If we wish to use default configuration values, we could also call the following simple constructor for our `SparkContext` object, which works in exactly the same way:

    val sc = new SparkContext("local[4]", "Test Spark App")

### Tip

 **Downloading the example code**

You can download the example code files for all Packt books you have purchased from your account at . If you purchased this book elsewhere, you can visit and register to have the files e-mailed directly to you.

## The Spark shell

Spark supports writing programs interactively using either the Scala or Python REPL (that is, the **Read-Eval-Print-Loop** , or interactive shell). The shell provides instant feedback as we enter code, as this code is immediately evaluated. In the Scala shell, the return result and its type are also displayed after a piece of code is run.
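For instance, entering a short expression at the Scala prompt immediately echoes the resulting value along with its inferred type. The following is a minimal illustration (it uses a plain Scala collection rather than Spark, so it behaves the same in any Scala REPL):

    scala> val lengths = List("a", "ab", "abc").map(line => line.size)
    lengths: List[Int] = List(1, 2, 3)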
To use the Spark shell with Scala, simply run `./bin/spark-shell` from the Spark base directory. This will launch the Scala shell and initialize `SparkContext`, which is available to us as the Scala value `sc`. Your console output should look similar to the following screenshot:

To use the Python shell with Spark, simply run the `./bin/pyspark` command. Like the Scala shell, the Python `SparkContext` object should be available as the Python variable `sc`. You should see an output similar to the one shown in this screenshot:

## Resilient Distributed Datasets

The core of Spark is a concept called the **Resilient Distributed Dataset** ( **RDD** ). An RDD is a collection of "records" (strictly speaking, objects of some type) that is distributed or partitioned across many nodes in a cluster (for the purposes of the Spark local mode, the single multithreaded process can be thought of in the same way). An RDD in Spark is fault-tolerant; this means that if a given node or task fails (for some reason other than erroneous user code, such as hardware failure, loss of communication, and so on), the RDD can be reconstructed automatically on the remaining nodes and the job will still complete.

### Creating RDDs

RDDs can be created from existing collections, for example, in the Scala Spark shell that you launched earlier:

    val collection = List("a", "b", "c", "d", "e")
    val rddFromCollection = sc.parallelize(collection)

RDDs can also be created from Hadoop-based input sources, including the local filesystem, HDFS, and Amazon S3. A Hadoop-based RDD can utilize any input format that implements the Hadoop `InputFormat` interface, including text files, other standard Hadoop formats, HBase, Cassandra, and many more. The following code is an example of creating an RDD from a text file located on the local filesystem:

    val rddFromTextFile = sc.textFile("LICENSE")

The preceding `textFile` method returns an RDD where each record is a `String` object that represents one line of the text file.

### Spark operations

Once we have created an RDD, we have a distributed collection of records that we can manipulate. In Spark's programming model, operations are split into transformations and actions. Generally speaking, a transformation operation applies some function to all the records in the dataset, changing the records in some way. An action typically runs some computation or aggregation operation and returns the result to the driver program where `SparkContext` is running.

Spark operations are functional in style. For programmers familiar with functional programming in Scala or Python, these operations should seem natural. For those without experience in functional programming, don't worry; the Spark API is relatively easy to learn.

One of the most common transformations that you will use in Spark programs is the `map` operator. This applies a function to each record of an RDD, thus _mapping_ the input to some new output. For example, the following code fragment takes the RDD we created from a local text file and applies the `size` function to each record in the RDD. Remember that we created an RDD of `Strings`.
Using `map`, we can transform each string to an integer, thus returning an RDD of `Ints`:

    val intsFromStringsRDD = rddFromTextFile.map(line => line.size)

You should see output similar to the following line in your shell; this indicates the type of the RDD:

    **intsFromStringsRDD: org.apache.spark.rdd.RDD[Int] = MappedRDD[5] at map at <console>:14**

In the preceding code, we saw the `=>` syntax used. This is the Scala syntax for an anonymous function, which is a function that is not a named method (that is, one defined using the `def` keyword in Scala or Python, for example).

### Note

While a detailed treatment of anonymous functions is beyond the scope of this book, they are used extensively in Spark code in Scala and Python, as well as in Java 8 (both in examples and real-world applications), so it is useful to cover a few practicalities.

The `line => line.size` syntax means that we are applying a function where the input variable is to the left of the `=>` operator, and the output is the result of the code to the right of the `=>` operator. In this case, the input is `line`, and the output is the result of calling `line.size`. In Scala, this function that maps a string to an integer is expressed as `String => Int`.

This syntax saves us from having to separately define functions every time we use methods such as `map`; this is useful when the function is simple and will only be used once, as in this example.

Now, we can apply a common action operation, `count`, to return the number of records in our RDD:

    intsFromStringsRDD.count

The result should look something like the following console output:

    **14/01/29 23:28:28 INFO SparkContext: Starting job: count at <console>:17**
    **...**
    **14/01/29 23:28:28 INFO SparkContext: Job finished: count at <console>:17, took 0.019227 s**
    **res4: Long = 398**

Perhaps we want to find the average length of each line in this text file. We can first use the `sum` function to add up all the lengths of all the records and then divide the sum by the number of records:

    val sumOfRecords = intsFromStringsRDD.sum
    val numRecords = intsFromStringsRDD.count
    val aveLengthOfRecord = sumOfRecords / numRecords

The result will be as follows:

    **aveLengthOfRecord: Double = 52.06030150753769**

Spark operations, in most cases, return a new RDD; the exception is most actions, which return the result of a computation (such as `Long` for `count` and `Double` for `sum` in the preceding example). This means that we can naturally chain together operations to make our program flow more concise and expressive. For example, the same result as the one in the preceding line of code can be achieved using the following code:

    val aveLengthOfRecordChained = rddFromTextFile.map(line => line.size).sum / rddFromTextFile.count

An important point to note is that Spark transformations are lazy. That is, invoking a transformation on an RDD does not immediately trigger a computation. Instead, transformations are chained together and are effectively only computed when an action is called. This allows Spark to be more efficient by only returning results to the driver when necessary so that the majority of operations are performed in parallel on the cluster.

This means that if your Spark program never uses an action operation, it will never trigger an actual computation, and you will not get any results.
For example, the following code will simply return a new RDD that represents the chain of transformations:

    val transformedRDD = rddFromTextFile.map(line => line.size).filter(size => size > 10).map(size => size * 2)

This returns the following result in the console:

    **transformedRDD: org.apache.spark.rdd.RDD[Int] = MappedRDD[8] at map at <console>:14**

Notice that no actual computation happens and no result is returned. If we now call an action, such as `sum`, on the resulting RDD, the computation will be triggered:

    val computation = transformedRDD.sum

You will now see that a Spark job is run, and it results in the following console output:

    **...**
    **14/11/27 21:48:21 INFO SparkContext: Job finished: sum at <console>:16, took 0.193513 s**
    **computation: Double = 60468.0**

### Tip

The complete list of transformations and actions possible on RDDs, as well as a set of more detailed examples, is available in the Spark programming guide (located at ), while the Scala API documentation is located at .

### Caching RDDs

One of the most powerful features of Spark is the ability to cache data in memory across a cluster. This is achieved through use of the `cache` method on an RDD:

    rddFromTextFile.cache

Calling `cache` on an RDD tells Spark that the RDD should be kept in memory. The first time an action is called on the RDD that initiates a computation, the data is read from its source and put into memory. Hence, the first time such an operation is called, the time it takes to run the task is partly dependent on the time it takes to read the data from the input source. However, when the data is accessed the next time (for example, in subsequent queries in analytics or iterations in a machine learning model), the data can be read directly from memory, thus avoiding expensive I/O operations and speeding up the computation, in many cases, by a significant factor.

If we now call the `count` or `sum` function on our cached RDD, we will see that the RDD is loaded into memory:

    val aveLengthOfRecordChained = rddFromTextFile.map(line => line.size).sum / rddFromTextFile.count

Indeed, in the following output, we see that the dataset was cached in memory on the first call, taking up approximately 62 KB and leaving us with around 297 MB of memory free:

    **...**
    **14/01/30 06:59:27 INFO MemoryStore: ensureFreeSpace(63454) called with curMem=32960, maxMem=311387750**
    **14/01/30 06:59:27 INFO MemoryStore: Block rdd_2_0 stored as values to memory (estimated size 62.0 KB, free 296.9 MB)**
    **14/01/30 06:59:27 INFO BlockManagerMasterActor$BlockManagerInfo: Added rdd_2_0 in memory on 10.0.0.3:55089 (size: 62.0 KB, free: 296.9 MB)**
    **...**

Now, we will call the same function again:

    val aveLengthOfRecordChainedFromCached = rddFromTextFile.map(line => line.size).sum / rddFromTextFile.count

We will see from the console output that the cached data is read directly from memory:

    **...**
    **14/01/30 06:59:34 INFO BlockManager: Found block rdd_2_0 locally**
    **...**

### Tip

Spark also allows more fine-grained control over caching behavior. You can use the `persist` method to specify what approach Spark uses to cache data. More information on `RDD` caching can be found here: .

## Broadcast variables and accumulators

Another core feature of Spark is the ability to create two special types of variables: broadcast variables and accumulators.
A **broadcast variable** is a _read-only_ variable that is made available from the driver program that runs the `SparkContext` object to the nodes that will execute the computation. This is very useful in applications that need to make the same data available to the worker nodes in an efficient manner, such as machine learning algorithms. Spark makes creating broadcast variables as simple as calling a method on `SparkContext` as follows:

    val broadcastAList = sc.broadcast(List("a", "b", "c", "d", "e"))

The console output shows that the broadcast variable was stored in memory, taking up approximately 488 bytes, and it also shows that we still have roughly 297 MB available to us:

    **14/01/30 07:13:32 INFO MemoryStore: ensureFreeSpace(488) called with curMem=96414, maxMem=311387750**
    **14/01/30 07:13:32 INFO MemoryStore: Block broadcast_1 stored as values to memory (estimated size 488.0 B, free 296.9 MB)**
    **broadcastAList: org.apache.spark.broadcast.Broadcast[List[String]] = Broadcast(1)**

A broadcast variable can be accessed from nodes other than the driver program that created it (that is, the worker nodes) by calling `value` on the variable:

    sc.parallelize(List("1", "2", "3")).map(x => broadcastAList.value ++ x).collect

This code creates a new RDD with three records from a collection (in this case, a Scala `List`) of `("1", "2", "3")`. In the `map` function, we return a new collection, with the relevant record from our new RDD appended to `broadcastAList`, our broadcast variable.

Notice that we used the `collect` method in the preceding code. This is a Spark _action_ that returns the entire RDD to the driver as a Scala (or Python or Java) collection.

We will often use `collect` when we wish to apply further processing to our results locally within the driver program.

### Note

Note that `collect` should generally only be used in cases where we really want to return the full result set to the driver and perform further processing. If we try to call `collect` on a very large dataset, we might run out of memory on the driver and crash our program.

It is preferable to perform as much heavy-duty processing on our Spark cluster as possible, preventing the driver from becoming a bottleneck. In many cases, however, collecting results to the driver is necessary, such as during iterations in many machine learning models.

On inspecting the result, we will see that for each of the three records in our new RDD, we now have a record that is our original broadcasted `List`, with the new element appended to it (that is, there is now either `"1"`, `"2"`, or `"3"` at the end):

    **...**
    **14/01/31 10:15:39 INFO SparkContext: Job finished: collect at <console>:15, took 0.025806 s**
    **res6: Array[List[Any]] = Array(List(a, b, c, d, e, 1), List(a, b, c, d, e, 2), List(a, b, c, d, e, 3))**

An **accumulator** is also a variable that is broadcast to the worker nodes. The key difference between a broadcast variable and an accumulator is that while the broadcast variable is read-only, the accumulator can be added to. There are limitations to this; in particular, the addition must be an associative operation so that the global accumulated value can be correctly computed in parallel and returned to the driver program. Each worker node can only access and add to its own local accumulator value, and only the driver program can access the global value. Accumulators are also accessed within the Spark code using the `value` method.
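As a minimal sketch of how this works in practice (reusing the `sc` and `rddFromTextFile` values from earlier in this chapter), the following lines create an accumulator on the driver, add to it from within a Spark operation running on the workers, and finally read the global value back on the driver:

    // count the empty lines in our text file using an accumulator
    val emptyLines = sc.accumulator(0)
    rddFromTextFile.foreach { line =>
      // this runs on the workers; each task adds to its local accumulator value
      if (line.isEmpty) emptyLines += 1
    }
    // only the driver can read the final, global value
    println("Empty lines: " + emptyLines.value)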
### Tip

For more details on broadcast variables and accumulators, see the _Shared Variables_ section of the _Spark Programming Guide_ : .

# The first step to a Spark program in Scala

We will now use the ideas we introduced in the previous section to write a basic Spark program to manipulate a dataset. We will start with Scala and then write the same program in Java and Python. Our program will be based on exploring some data from an online store that records which users have purchased which products. The data is contained in a **comma-separated-value** ( **CSV** ) file called `UserPurchaseHistory.csv`, and the contents are shown in the following snippet. The first column of the CSV is the username, the second column is the product name, and the final column is the price:

    **John,iPhone Cover,9.99**
    **John,Headphones,5.49**
    **Jack,iPhone Cover,9.99**
    **Jill,Samsung Galaxy Cover,8.95**
    **Bob,iPad Cover,5.49**

For our Scala program, we need to create two files: our Scala code and our project build configuration file for the **Scala Build Tool** ( **SBT** ). For ease of use, we recommend that you download the sample project code called `scala-spark-app` for this chapter. This code also contains the CSV file under the `data` directory. You will need SBT installed on your system in order to run this example program (we use version 0.13.1 at the time of writing this book).

### Tip

Setting up SBT is beyond the scope of this book; however, you can find more information at .

Our SBT configuration file, `build.sbt`, looks like this (note that the empty lines between each line of code are required):

    name := "scala-spark-app"

    version := "1.0"

    scalaVersion := "2.10.4"

    libraryDependencies += "org.apache.spark" %% "spark-core" % "1.2.0"

The last line adds the dependency on Spark to our project.

Our Scala program is contained in the `ScalaApp.scala` file. We will walk through the program piece by piece. First, we need to import the required Spark classes:

    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._

    /**
     * A simple Spark app in Scala
     */
    object ScalaApp {

In our main method, we need to initialize our `SparkContext` object and use this to access our CSV data file with the `textFile` method.
We will then map the raw text by splitting the string on the delimiter character (a comma in this case) and extracting the relevant records for username, product, and price: + + def main(args: Array[String]) { + val sc = new SparkContext("local[2]", "First Spark App") + // we take the raw data in CSV format and convert it into a set of records of the form (user, product, price) + val data = sc.textFile("data/UserPurchaseHistory.csv") + .map(line => line.split(",")) + .map(purchaseRecord => (purchaseRecord(0), purchaseRecord(1), purchaseRecord(2))) + +Now that we have an RDD, where each record is made up of `(user, product, price)`, we can compute various interesting metrics for our store, such as the following ones: + + * The total number of purchases + * The number of unique users who purchased + * Our total revenue + * Our most popular product + +Let's compute the preceding metrics: + + // let's count the number of purchases + val numPurchases = data.count() + // let's count how many unique users made purchases + val uniqueUsers = data.map{ case (user, product, price) => user }.distinct().count() + // let's sum up our total revenue + val totalRevenue = data.map{ case (user, product, price) => price.toDouble }.sum() + // let's find our most popular product + val productsByPopularity = data + .map{ case (user, product, price) => (product, 1) } + .reduceByKey(_ + _) + .collect() + .sortBy(-_._2) + val mostPopular = productsByPopularity(0) + +This last piece of code to compute the most popular product is an example of the _Map/Reduce_ pattern made popular by Hadoop. First, we mapped our records of `(user, product, price)` to the records of `(product, 1)`. Then, we performed a `reduceByKey` operation, where we summed up the 1s for each unique product. + +Once we have this transformed RDD, which contains the number of purchases for each product, we will call `collect`, which returns the results of the computation to the driver program as a local Scala collection. We will then sort these counts locally (note that in practice, if the amount of data is large, we will perform the sorting in parallel, usually with a Spark operation such as `sortByKey`). + +Finally, we will print out the results of our computations to the console: + + println("Total purchases: " + numPurchases) + println("Unique users: " + uniqueUsers) + println("Total revenue: " + totalRevenue) + println("Most popular product: %s with %d purchases".format(mostPopular._1, mostPopular._2)) + } + } + +We can run this program by running `sbt run` in the project's base directory or by running the program in your Scala IDE if you are using one. The output should look similar to the following: + + **...** + **[info] Compiling 1 Scala source to ...** + **[info] Running ScalaApp** + **...** + **14/01/30 10:54:40 INFO spark.SparkContext: Job finished: collect at ScalaApp.scala:25, took 0.045181 s** + **Total purchases: 5** + **Unique users: 4** + **Total revenue: 39.91** + **Most popular product: iPhone Cover with 2 purchases** + +We can see that we have five purchases from four different users with a total revenue of 39.91. Our most popular product is an iPhone cover with 2 purchases. + +# The first step to a Spark program in Java + +The Java API is very similar in principle to the Scala API. However, while Scala can call the Java code quite easily, in some cases, it is not possible to call the Scala code from Java. 
This is particularly the case when such Scala code makes use of certain Scala features such as implicit conversions, default parameters, and the Scala reflection API.

Spark makes heavy use of these features in general, so it is necessary to have a separate API specifically for Java that includes Java versions of the common classes. Hence, `SparkContext` becomes `JavaSparkContext`, and `RDD` becomes `JavaRDD`.

Java versions prior to version 8 do not support anonymous functions and do not have succinct syntax for functional-style programming, so functions in the Spark Java API must implement one of Spark's `Function` interfaces, each of which declares a `call` method. While it is significantly more verbose, we will often create one-off anonymous classes that implement these interfaces and the `call` method to pass to our Spark operations, achieving much the same effect as anonymous functions in Scala.

Spark provides support for Java 8's anonymous function (or _lambda_ ) syntax. Using this syntax makes a Spark program written in Java 8 look very close to the equivalent Scala program.

In Scala, an RDD of key/value pairs provides special operators (such as `reduceByKey` and `saveAsSequenceFile`, for example) that are accessed automatically via implicit conversions. In Java, special types of `JavaRDD` classes are required in order to access similar functions. These include `JavaPairRDD` to work with key/value pairs and `JavaDoubleRDD` to work with numerical records.

### Tip

In this section, we covered the standard Java API syntax. For more details and examples related to working with RDDs in Java as well as the Java 8 lambda syntax, see the Java sections of the _Spark Programming Guide_ found at .

We will see examples of most of these differences in the following Java program, which is included in the example code of this chapter in the directory named `java-spark-app`. The code directory also contains the CSV data file under the `data` subdirectory.

We will build and run this project with the Maven build tool, which we assume you have installed on your system.

### Tip

Installing and setting up Maven is beyond the scope of this book. Usually, Maven can easily be installed using the package manager on your Linux system or Homebrew or MacPorts on Mac OS X.

Detailed installation instructions can be found here: .

The project contains a Java file called `JavaApp.java`, which contains our program code:

    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.DoubleFunction;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.Function2;
    import org.apache.spark.api.java.function.PairFunction;
    import scala.Tuple2;

    import java.util.Collections;
    import java.util.Comparator;
    import java.util.List;

    /**
     * A simple Spark app in Java
     */
    public class JavaApp {

        public static void main(String[] args) {

As in our Scala example, we first need to initialize our context. Notice that we will use the `JavaSparkContext` class here instead of the `SparkContext` class that we used earlier. We will use the `JavaSparkContext` class in the same way to access our data using `textFile` and then split each row into the required fields.
Note how we used an anonymous class to define a split function that performs the string processing:

    JavaSparkContext sc = new JavaSparkContext("local[2]", "First Spark App");
    // we take the raw data in CSV format and convert it into a set of records of the form (user, product, price)
    JavaRDD<String[]> data = sc.textFile("data/UserPurchaseHistory.csv")
        .map(new Function<String, String[]>() {
            @Override
            public String[] call(String s) throws Exception {
                return s.split(",");
            }
        });

Now, we can compute the same metrics as we did in our Scala example. Note how some methods are the same (for example, `distinct` and `count`) for the Java and Scala APIs. Also note the use of the anonymous classes that we pass to the `map` and `mapToDouble` functions:

    // let's count the number of purchases
    long numPurchases = data.count();
    // let's count how many unique users made purchases
    long uniqueUsers = data.map(new Function<String[], String>() {
        @Override
        public String call(String[] strings) throws Exception {
            return strings[0];
        }
    }).distinct().count();
    // let's sum up our total revenue
    double totalRevenue = data.mapToDouble(new DoubleFunction<String[]>() {
        @Override
        public double call(String[] strings) throws Exception {
            return Double.parseDouble(strings[2]);
        }
    }).sum();

In the following lines of code, we can see that the approach to compute the most popular product is the same as that in the Scala example. The extra code might seem complex, but it is mostly related to the Java code required to create the anonymous functions. The actual functionality is the same:

    // let's find our most popular product
    // first we map the data to records of (product, 1) using a PairFunction
    // and the Tuple2 class.
    // then we call a reduceByKey operation with a Function2, which is essentially the sum function
    List<Tuple2<String, Integer>> pairs = data.mapToPair(new PairFunction<String[], String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String[] strings) throws Exception {
            return new Tuple2<String, Integer>(strings[1], 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer integer, Integer integer2) throws Exception {
            return integer + integer2;
        }
    }).collect();
    // finally, we sort the result. Note that we need to create a Comparator
    // that reverses the sort order.
    Collections.sort(pairs, new Comparator<Tuple2<String, Integer>>() {
        @Override
        public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
            return -(o1._2() - o2._2());
        }
    });
    String mostPopular = pairs.get(0)._1();
    int purchases = pairs.get(0)._2();
    System.out.println("Total purchases: " + numPurchases);
    System.out.println("Unique users: " + uniqueUsers);
    System.out.println("Total revenue: " + totalRevenue);
    System.out.println(String.format("Most popular product: %s with %d purchases", mostPopular, purchases));
        }
    }

As can be seen, the general structure is similar to the Scala version, apart from the extra boilerplate code to declare variables and functions via anonymous inner classes. It is a good exercise to work through both examples and compare the same lines of Scala code to those in Java to understand how the same result is achieved in each language.
This program can be run with the following command executed from the project's base directory:

    **> mvn exec:java -Dexec.mainClass="JavaApp"**

You will see output that looks very similar to the Scala version, with the results of the computation identical:

    **...**
    **14/01/30 17:02:43 INFO spark.SparkContext: Job finished: collect at JavaApp.java:46, took 0.039167 s**
    **Total purchases: 5**
    **Unique users: 4**
    **Total revenue: 39.91**
    **Most popular product: iPhone Cover with 2 purchases**

# The first step to a Spark program in Python

Spark's Python API exposes virtually all the functionalities of Spark's Scala API in the Python language. There are some features that are not yet supported (for example, graph processing with GraphX and a few API methods here and there). See the Python section of the _Spark Programming Guide_ () for more details.

Following on from the preceding examples, we will now write a Python version. We assume that you have Python version 2.6 or higher installed on your system (for example, most Linux and Mac OS X systems come with Python preinstalled).

The example program is included in the sample code for this chapter, in the directory named `python-spark-app`, which also contains the CSV data file under the `data` subdirectory. The project contains a script, `pythonapp.py`, provided here:

    """A simple Spark app in Python"""
    from pyspark import SparkContext

    sc = SparkContext("local[2]", "First Spark App")
    # we take the raw data in CSV format and convert it into a set of records of the form (user, product, price)
    data = sc.textFile("data/UserPurchaseHistory.csv").map(lambda line: line.split(",")).map(lambda record: (record[0], record[1], record[2]))
    # let's count the number of purchases
    numPurchases = data.count()
    # let's count how many unique users made purchases
    uniqueUsers = data.map(lambda record: record[0]).distinct().count()
    # let's sum up our total revenue
    totalRevenue = data.map(lambda record: float(record[2])).sum()
    # let's find our most popular product
    products = data.map(lambda record: (record[1], 1.0)).reduceByKey(lambda a, b: a + b).collect()
    mostPopular = sorted(products, key=lambda x: x[1], reverse=True)[0]

    print "Total purchases: %d" % numPurchases
    print "Unique users: %d" % uniqueUsers
    print "Total revenue: %2.2f" % totalRevenue
    print "Most popular product: %s with %d purchases" % (mostPopular[0], mostPopular[1])

If you compare the Scala and Python versions of our program, you will see that, generally, the syntax looks very similar. One key difference is how we express anonymous functions (also called `lambda` functions; hence, the use of this keyword in the Python syntax). In Scala, we've seen that an anonymous function mapping an input `x` to an output `y` is expressed as `x => y`, while in Python, it is `lambda x: y`. In the `reduceByKey` call in the preceding code, we are applying an anonymous function that maps two inputs, `a` and `b`, generally of the same type, to an output. In this case, the function that we apply is the _plus_ function; hence, `lambda a, b: a + b`.

The best way to run the script is to run the following command from the base directory of the sample project:

    **> $SPARK_HOME/bin/spark-submit pythonapp.py**

Here, the `SPARK_HOME` variable should be replaced with the path of the directory in which you originally unpacked the Spark prebuilt binary package at the start of this chapter.
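If you have not already set this variable, you can export it for the current shell session and then run the script. The following is a minimal sketch, assuming you unpacked the package into your home directory; adjust the path to wherever you extracted it:

    **> export SPARK_HOME=~/spark-1.2.0-bin-hadoop2.4**
    **> $SPARK_HOME/bin/spark-submit pythonapp.py**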
Upon running the script, you should see output similar to that of the Scala and Java examples, with the results of our computation being the same:

    **...**
    **14/01/30 11:43:47 INFO SparkContext: Job finished: collect at pythonapp.py:14, took 0.050251 s**
    **Total purchases: 5**
    **Unique users: 4**
    **Total revenue: 39.91**
    **Most popular product: iPhone Cover with 2 purchases**

# Getting Spark running on Amazon EC2

The Spark project provides scripts to run a Spark cluster in the cloud on Amazon's EC2 service. These scripts are located in the `ec2` directory. You can run the `spark-ec2` script contained in this directory with the following command:

    **>./ec2/spark-ec2**

Running it in this way without an argument will show the help output:

    **Usage: spark-ec2 [options] <action>**
    **<action> can be: launch, destroy, login, stop, start, get-master**

    **Options:**
    **...**

Before creating a Spark EC2 cluster, you will need to ensure you have an Amazon account.

### Tip

If you don't have an Amazon Web Services account, you can sign up at .

The AWS console is available at .

You will also need to create an Amazon EC2 key pair and retrieve the relevant security credentials. The Spark documentation for EC2 (available at ) explains the requirements:

> _Create an Amazon EC2 key pair for yourself. This can be done by logging into your Amazon Web Services account through the AWS console, clicking on_ **Key Pairs** _on the left sidebar, and creating and downloading a key. Make sure that you set the permissions for the private key file to 600 (that is, only you can read and write it) so that`ssh` will work._
>
> _Whenever you want to use the_ `spark-ec2` _script, set the environment variables_ `AWS_ACCESS_KEY_ID` _and_ `AWS_SECRET_ACCESS_KEY` _to your Amazon EC2 access key ID and secret access key, respectively. These can be obtained from the AWS homepage by clicking_ **Account** | **Security Credentials** | **Access Credentials**.

When creating a key pair, choose a name that is easy to remember. We will simply use `spark` for the key pair name. The key pair file itself will be called `spark.pem`. As mentioned earlier, ensure that the key pair file permissions are set appropriately and that the environment variables for the AWS credentials are exported using the following commands:

    **> chmod 600 spark.pem**
    **> export AWS_ACCESS_KEY_ID="..."**
    **> export AWS_SECRET_ACCESS_KEY="..."**

You should also be careful to keep your downloaded key pair file safe and not lose it, as it can only be downloaded once when it is created!

Note that launching an Amazon EC2 cluster in the following section will _incur costs_ to your AWS account.

## Launching an EC2 Spark cluster

We're now ready to launch a small Spark cluster by changing into the `ec2` directory and then running the cluster launch command:

    **> cd ec2**
    **>./spark-ec2 -k spark -i spark.pem -s 1 --instance-type m3.medium --hadoop-major-version 2 launch test-cluster**

This will launch a new Spark cluster called `test-cluster` with one master and one slave node of instance type `m3.medium`. This cluster will be launched with a Spark version built for Hadoop 2. The key pair name we used is `spark`, and the key pair file is `spark.pem` (if you gave the files different names or have an existing AWS key pair, use that name instead).

It might take quite a while for the cluster to fully launch and initialize.
You should see something like this screenshot immediately after running the launch command:

If the cluster has launched successfully, you should eventually see console output similar to the following screenshot:

To test whether we can connect to our new cluster, we can run the following command:

    **> ssh -i spark.pem root@ec2-54-227-127-14.compute-1.amazonaws.com**

Remember to replace the public domain name of the master node (the address after `root@` in the preceding command) with the correct Amazon EC2 public domain name that will be shown in your console output after launching the cluster.

You can also retrieve your cluster's master public domain name by running this line of code:

    **>./spark-ec2 -i spark.pem get-master test-cluster**

After successfully running the `ssh` command, you will be connected to your Spark master node in EC2, and your terminal output should match the following screenshot:

We can test whether our cluster is correctly set up with Spark by changing into the Spark directory and running an example in the local mode:

    **> cd spark**
    **> MASTER=local[2] ./bin/run-example SparkPi**

You should see output similar to running the same command on your local computer:

    **...**
    **14/01/30 20:20:21 INFO SparkContext: Job finished: reduce at SparkPi.scala:35, took 0.864044012 s**
    **Pi is roughly 3.14032**
    **...**

Now that we have an actual cluster with multiple nodes, we can test Spark in the cluster mode. We can run the same example on the cluster, using our one slave node, by passing in the master URL instead of the local version:

    **> MASTER=spark://ec2-54-227-127-14.compute-1.amazonaws.com:7077 ./bin/run-example SparkPi**

### Tip

Note that you will need to substitute the preceding master domain name with the correct domain name for your specific cluster.

Again, the output should be similar to running the example locally; however, the log messages will show that your driver program has connected to the Spark master:

    **...**
    **14/01/30 20:26:17 INFO client.Client$ClientActor: Connecting to master spark://ec2-54-220-189-136.eu-west-1.compute.amazonaws.com:7077**
    **14/01/30 20:26:17 INFO cluster.SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20140130202617-0001**
    **14/01/30 20:26:17 INFO client.Client$ClientActor: Executor added: app-20140130202617-0001/0 on worker-20140130201049-ip-10-34-137-45.eu-west-1.compute.internal-57119 (ip-10-34-137-45.eu-west-1.compute.internal:57119) with 1 cores**
    **14/01/30 20:26:17 INFO cluster.SparkDeploySchedulerBackend: Granted executor ID app-20140130202617-0001/0 on hostPort ip-10-34-137-45.eu-west-1.compute.internal:57119 with 1 cores, 2.4 GB RAM**
    **14/01/30 20:26:17 INFO client.Client$ClientActor: Executor updated: app-20140130202617-0001/0 is now RUNNING**
    **14/01/30 20:26:18 INFO spark.SparkContext: Starting job: reduce at SparkPi.scala:39**
    **...**

Feel free to experiment with your cluster. Try out the interactive console in Scala, for example:

    **> ./bin/spark-shell --master spark://ec2-54-227-127-14.compute-1.amazonaws.com:7077**

Once you've finished, type `exit` to leave the console. You can also try the PySpark console by running the following command:

    **> ./bin/pyspark --master spark://ec2-54-227-127-14.compute-1.amazonaws.com:7077**

You can use the Spark Master web interface to see the applications registered with the master.
To load the Master Web UI, navigate to `ec2-54-227-127-14.compute-1.amazonaws.com:8080` (again, remember to replace this domain name with your own master domain name). You should see something similar to the following screenshot showing the example you ran as well as the two console applications you launched:

Remember that _you will be charged by Amazon_ for usage of the cluster. Don't forget to stop or terminate this test cluster once you're done with it. To do this, you can first exit the `ssh` session by typing `exit` to return to your own local system and then run the following command:

    **>./ec2/spark-ec2 -k spark -i spark.pem destroy test-cluster**

You should see the following output:

    **Are you sure you want to destroy the cluster test-cluster?**
    **The following instances will be terminated:**
    **Searching for existing cluster test-cluster...**
    **Found 1 master(s), 1 slaves**
    **> ec2-54-227-127-14.compute-1.amazonaws.com**
    **> ec2-54-91-61-225.compute-1.amazonaws.com**
    **ALL DATA ON ALL NODES WILL BE LOST!!**
    **Destroy cluster test-cluster (y/N): y**
    **Terminating master...**
    **Terminating slaves...**

Type _y_ and then press _Enter_ to destroy the cluster.

Congratulations! You've just set up a Spark cluster in the cloud, run a fully parallel example program on this cluster, and terminated it. If you would like to try out any of the example code in the subsequent chapters (or your own Spark programs) on a cluster, feel free to experiment with the Spark EC2 scripts and launch a cluster of your chosen size and instance profile (just be mindful of the costs and remember to shut it down when you're done!).

# Summary

In this chapter, we covered how to set up Spark locally on our own computer as well as in the cloud as a cluster running on Amazon EC2. You learned the basics of Spark's programming model and API using the interactive Scala console, and we wrote the same basic Spark program in Scala, Java, and Python.

In the next chapter, we will consider how to go about using Spark to create a machine learning system.

# Chapter 2. Designing a Machine Learning System

In this chapter, we will design a high-level architecture for an intelligent, distributed machine learning system that uses Spark as its core computation engine. The problem we will focus on will be taking the existing architecture for a web-based business and redesigning it to use automated machine learning systems to power key areas of the business. In this chapter, we will:

  * Introduce our hypothetical business scenario
  * Provide an overview of the current architecture
  * Explore various ways in which machine learning systems can enhance or replace certain business functions
  * Provide a new architecture based on these ideas

A modern large-scale data environment includes the following requirements:

  * It must integrate with other components of the system, especially with data collection and storage systems, analytics and reporting, and frontend applications.
  * It should be easily scalable and independent of the rest of the architecture. Ideally, this should be in the form of horizontal as well as vertical scalability.
  * It should allow efficient computation with respect to the type of workload in mind, that is, machine learning and iterative analytics applications.
  * If possible, it should support both batch and real-time workloads.

As a framework, Spark meets these criteria.
However, we must ensure that the machine learning systems designed on Spark also meet these criteria. It is no good implementing an algorithm that ends up having bottlenecks that cause our system to fail to meet one or more of these requirements.

# Introducing MovieStream

To better illustrate the design of our architecture, we will introduce a practical scenario. Let's assume that we have just been appointed to head the data science team of MovieStream, a fictitious Internet business that streams movies and television shows to its users.

MovieStream is growing rapidly, adding both users and titles to its catalogue. The current MovieStream system is outlined in the following diagram:

MovieStream's current architecture

As we can see in the preceding diagram, currently, MovieStream's content editorial team is responsible for deciding which movies and shows are promoted and shown on the various parts of the site. They are also responsible for creating the content for MovieStream's bulk marketing campaigns, which include e-mail and other direct marketing channels. Currently, MovieStream collects basic data on what titles are viewed by users on an aggregate basis and has access to some demographic data collected from users when they sign up to the service. In addition, they have access to some basic metadata about the titles in their catalogue.

The MovieStream team is stretched thin due to their rapid growth, and they can't keep up with the number of new releases and the growing activity of their users. The CEO of MovieStream has heard a lot about big data, machine learning, and artificial intelligence, and would like us to build a machine learning system for MovieStream that can handle many of the functions currently handled by the content team in an automated manner.

# Business use cases for a machine learning system

Perhaps the first question we should answer is, "Why use machine learning at all?" Why doesn't MovieStream simply continue with human-driven decisions? There are many reasons to use machine learning (and certainly some reasons not to), but the most important ones are mentioned here:

  * The scale of data involved means that full human involvement quickly becomes infeasible as MovieStream grows
  * Model-driven approaches such as machine learning and statistics can often benefit from uncovering patterns that cannot be seen by humans (due to the size and complexity of the datasets)
  * Model-driven approaches can avoid human and emotional biases (as long as the correct processes are carefully applied)

However, there is no reason why both model-driven and human-driven processes and decision making cannot coexist. For example, many machine learning systems rely on receiving labeled data in order to train models. Often, labeling such data is costly, time consuming, and requires human input. A good example of this is classifying textual data into categories or assigning a sentiment indicator to the text. Many real-world systems use some form of human-driven system to generate labels for such data (or at least part of it) to provide training data to models. These models are then used to make predictions in the live system at a larger scale.

In the context of MovieStream, we need not fear that our machine learning system will make the content team redundant.
Indeed, we will see that our aim is to lift the burden of time-consuming tasks at which machine learning might perform better, while providing tools that allow the team to better understand the users and content. This might, for example, help them in selecting which new content to acquire for the catalogue (which involves a significant amount of cost and is therefore a critical aspect of the business).

## Personalization

Perhaps one of the most important potential applications of machine learning in MovieStream's business is personalization. Generally speaking, personalization refers to adapting the experience of a user and the content presented to them based on various factors, which might include user behavior data as well as external factors.

**Recommendations** are essentially a subset of personalization. Recommendation generally refers to presenting a user with a list of items that we hope the user will be interested in. Recommendations might be used in web pages (for example, recommending related products), via e-mails or other direct marketing channels, via mobile apps, and so on.

Personalization is very similar to recommendations, but while recommendations are usually focused on an _explicit_ presentation of products or content to the user, personalization is more generic and, often, more _implicit_. For example, applying personalization to search on the MovieStream site might allow us to adapt the search results for a given user, based on the data available about that user. This might include recommendation-based data (in the case of a search for products or content) but might also include various other factors such as geolocation and past search history. It might not be apparent to the user that the search results are adapted to their specific profile; this is why personalization tends to be more implicit.

## Targeted marketing and customer segmentation

In a manner similar to recommendations, targeted marketing uses a model to select what to offer to which users. While recommendations and personalization are generally focused on a one-to-one situation, segmentation approaches try to assign users to groups based on their characteristics and, possibly, behavioral data. The approach might be fairly simple or might involve a machine learning model such as clustering. Either way, the result is a set of segment assignments that might allow us to understand the broad characteristics of each group of users, what makes them similar to each other within a group, and what makes them different from others in different groups.

This could help MovieStream to better understand the drivers of user behavior and might also allow a broader targeting approach where groups are targeted as opposed to (or more likely, in addition to) direct one-to-one targeting with personalization.

These methods can also help when we don't necessarily have labeled data available (as is the case with certain user and content profile data) but we still wish to perform more focused targeting than a complete _one-size-fits-all_ approach.

## Predictive modeling and analytics

A third area where machine learning can be applied is in predictive analytics. This is a very broad term, and in some ways, it encompasses recommendations, personalization, and targeting too. In this context, since recommendations and segmentation are somewhat distinct, we use the term **predictive modeling** to refer to other models that seek to make predictions.
An example of this could be a model to predict the potential viewing activity and revenue of new titles before any data is available on how popular the title might be. MovieStream can use past activity and revenue data, together with content attributes, to create a **regression model** that can be used to make predictions for brand new titles.

As another example, we can use a **classification model** to automatically assign tags, keywords, or categories to new titles for which we only have partial data.

# Types of machine learning models

While we have highlighted a few use cases for machine learning in the context of the preceding MovieStream example, there are many other examples, some of which we will touch on in the relevant chapters when we introduce each machine learning task.

However, we can broadly divide the preceding use cases and methods into two categories of machine learning:

 * **Supervised learning** : These types of models use _labeled_ data to learn. Recommendation engines, regression, and classification are examples of supervised learning methods. The labels in these models can be user-movie ratings (for recommendation), movie tags (in the case of the preceding classification example), or revenue figures (for regression). We will cover supervised learning models in Chapter 4, _Building a Recommendation Engine with Spark_, Chapter 5, _Building a Classification Model with Spark_, and Chapter 6, _Building a Regression Model with Spark_.
 * **Unsupervised learning** : When a model does not require labeled data, we refer to this as unsupervised learning. These types of models try to learn or extract some underlying structure in the data or reduce the data down to its most important features. Clustering, dimensionality reduction, and some forms of feature extraction, such as text processing, are all unsupervised techniques and will be dealt with in Chapter 7, _Building a Clustering Model with Spark_, Chapter 8, _Dimensionality Reduction with Spark_, and Chapter 9, _Advanced Text Processing with Spark_.

# The components of a data-driven machine learning system

The high-level components of our machine learning system are outlined in the following diagram. The diagram illustrates the machine learning pipeline: we obtain data and store it; we then transform it into a form that is usable as input to a machine learning model; we train, test, and refine our model; and then, we deploy the final model to our production system. The process is then repeated as new data is generated.

A general machine learning pipeline

## Data ingestion and storage

The first step in our machine learning pipeline will be taking in the data that we require for training our models. Like that of many other businesses, MovieStream's data is typically generated by user activity, other systems (this is commonly referred to as machine-generated data), and external sources (for example, the time of day and weather during a particular user's visit to the site).

This data can be ingested in various ways, for example, by gathering user activity data from browser and mobile application event logs or by accessing external web APIs to collect data on geolocation or weather; the sketch that follows illustrates the idea.
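The following is a minimal, hypothetical sketch of what a raw user-activity event might look like and how it could be parsed during ingestion. The log format and the field names here are purely illustrative assumptions, not part of any MovieStream system:

    # a hypothetical raw event log line (the format and fields are illustrative only)
    raw_event = "2015-03-01T20:15:00|user=1234|event=view|movie=567"

    def parse_event(line):
        # split the pipe-delimited line into its component fields
        timestamp, user, event, movie = line.split("|")
        return {
            "timestamp": timestamp,
            "user_id": int(user.split("=")[1]),
            "event": event.split("=")[1],
            "movie_id": int(movie.split("=")[1]),
        }

    print parse_event(raw_event)

In practice, a parser like this would be applied to an entire log file using a `map` transformation over an RDD of log lines.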
Once the collection mechanisms are in place, the data usually needs to be stored. This includes the raw data, data resulting from intermediate processing, and the final model results to be used in production.

Data storage can be complex and involve a wide variety of systems, including HDFS, Amazon S3, and other filesystems; SQL databases such as MySQL or PostgreSQL; distributed NoSQL data stores such as HBase, Cassandra, and DynamoDB; search engines such as Solr or Elasticsearch; and streaming data systems such as Kafka, Flume, or Amazon Kinesis.

For the purposes of this book, we will assume that the relevant data is available to us, so we will focus on the processing and modeling steps of the pipeline.

## Data cleansing and transformation

The majority of machine learning models operate on features, which are typically numerical representations of the input variables that will be used for the model.

While we might want to spend the majority of our time exploring machine learning models, data collected via various systems and sources in the preceding ingestion step is, in most cases, in a raw form. For example, we might log user events such as details of when a user views the information page for a movie, when they watch a movie, or when they provide some other feedback. We might also collect external information such as the location of the user (as provided through their IP address, for example). These event logs will typically contain some combination of textual and numeric information about the event (and also, perhaps, other forms of data such as images or audio).

In order to use this raw data in our models, in almost all cases, we need to perform preprocessing, which might include:

 * **Filtering data** : Let's assume that we want to create a model from a subset of the raw data, such as only the most recent few months of activity data or only events that match certain criteria.
 * **Dealing with missing, incomplete, or corrupted data** : Many real-world datasets are incomplete in some way. This might include data that is missing (for example, due to a missing user input) or data that is incorrect or flawed (for example, due to an error in data ingestion or storage, technical issues or bugs, or software or hardware failure). We might need to filter out bad data or alternatively decide on a method to fill in missing data points (such as using the average value from the dataset for missing points, for example).
 * **Dealing with potential anomalies, errors, and outliers** : Erroneous or outlier data might skew the results of model training, so we might wish to filter these cases out or use techniques that are able to deal with outliers.
 * **Joining together disparate data sources** : For example, we might need to match up the event data for each user with different internal data sources, such as user profiles, as well as external data, such as geolocation, weather, and economic data.
 * **Aggregating data** : Certain models might require input data that is aggregated in some way, such as computing the sum of a number of different event types per user.

Once we have performed initial preprocessing on our data, we often need to transform the data into a representation that is suitable for machine learning models. For many model types, this representation will take the form of a vector or matrix structure that contains numerical data. Common challenges during data transformation and feature extraction include:

 * Taking categorical data (such as country for geolocation or category for a movie) and encoding it in a numerical representation.
 * Extracting useful features from text data.
 * Dealing with image or audio data.
 * Converting numerical data into categorical data to reduce the number of values a variable can take on. An example of this is converting a variable for age into buckets (such as 25-35, 45-55, and so on).
 * Transforming numerical features; for example, applying a log transformation to a numerical variable can help deal with variables that take on a very large range of values.
 * Normalizing and standardizing numerical features so that all the different input variables for a model have a consistent scale. Many machine learning models require standardized input to work properly.
 * Engineering new features by combining or transforming existing variables; this is known as feature engineering. For example, we can create a new variable that is the average of some other data, such as the average number of times a user watches a movie.

We will cover all of these techniques through the examples in this book.

These data-cleansing, exploration, aggregation, and transformation steps can be carried out using Spark's core API functions as well as the SparkSQL engine, not to mention external Scala, Java, or Python libraries. We can take advantage of Spark's Hadoop compatibility to read data from and write data to the various storage systems mentioned earlier.

## Model training and testing loop

Once we have our training data in a form that is suitable for our model, we can proceed with the model's training and testing phase. During this phase, we are primarily concerned with **model selection**. This can refer to choosing the best modeling approach for our task, or the best parameter settings for a given model. In fact, the term model selection often refers to both of these processes, as, in many cases, we might wish to try out various models and select the best performing model (with the best performing parameter settings for each model). It is also common to explore the application of combinations of different models (known as **ensemble methods**) in this phase.

This is typically a fairly straightforward process of running our chosen model on our training dataset and testing its performance on a test dataset, that is, a set of data held out for evaluation that the model has not seen during the training phase. This process is referred to as **cross-validation**; a minimal sketch of such a hold-out split appears at the end of this section.

However, due to the large scale of data we are typically working with, it is often useful to carry out this initial train-test loop on a smaller representative sample of our full dataset or perform model selection using parallel methods where possible.

For this part of the pipeline, Spark's built-in machine learning library, MLlib, is a perfect fit. We will focus most of our attention in this book on the model training, evaluation, and cross-validation steps for various machine learning techniques, using MLlib and Spark's core features.
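As a minimal sketch of the hold-out idea, assume `examples_rdd` is a hypothetical RDD of training examples; the 80/20 split, the seed, and the variable names here are arbitrary choices for illustration:

    # sample roughly 80% of the data (without replacement) for training
    train_data = examples_rdd.sample(False, 0.8, 42)
    # hold out the remaining ~20% for evaluating the trained model
    test_data = examples_rdd.subtract(train_data)
    print "Training set size: %d, test set size: %d" % (train_data.count(), test_data.count())

The model is then fit on `train_data` only, and its performance is measured on `test_data`, which it has never seen.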
## Model deployment and integration

Once we have found the optimal model based on the train-test loop, we might still face the task of deploying the model to a production system so that it can be used to make actionable predictions.

Usually, this process involves exporting the trained model to a central data store from where the production-serving system can obtain the latest version. Thus, the live system _refreshes_ the model periodically as a new model is trained.

## Model monitoring and feedback

It is critically important to monitor the performance of our machine learning system in production. Once we deploy our optimal trained model, we wish to understand how it is doing in the "wild". Is it performing as we expect on new, unseen data? Is its accuracy good enough? The reality is that regardless of how much model selection and tuning we do in the earlier phases, the only way to measure true performance is to observe what happens in our production system.

Also, bear in mind that model accuracy and predictive performance are only one aspect of a real-world system. Usually, we are concerned with other metrics related to business performance (for example, revenue and profitability) or user experience (such as the time spent on our site and how active our users are overall). In most cases, we cannot easily map model-predictive performance to these business metrics. The accuracy of a recommendation or targeting system might be important, but it relates only indirectly to the true metrics we are concerned about, namely whether we are improving user experience, activity, and ultimately, revenue.

So, in real-world systems, we should monitor both model-accuracy metrics and business metrics. If possible, we should be able to experiment with different models running in production to allow us to optimize against these business metrics by making changes to the models. This is often done using live split tests. However, doing this correctly is not an easy task, and live testing and experimentation are expensive, in the sense that mistakes, poor performance, and the use of baseline models (which provide a control against which we test our production models) can negatively impact user experience and revenue.

Another important aspect of this phase is **model feedback**. This is the process where the predictions of our model feed through into user behavior; this, in turn, feeds back into our model. In a real-world system, our models are essentially influencing their own future training data by impacting decision-making and potential user behavior.

For example, if we have deployed a recommendation system, then, by making recommendations, we might be influencing user behavior because we are only allowing users a limited selection of choices. We hope that this selection is relevant due to our model; however, this feedback loop, in turn, can influence our model's training data. This, in turn, feeds back into real-world performance. It is possible to get into an ever-narrowing feedback loop; ultimately, this can negatively affect both model accuracy and our important business metrics.

Fortunately, there are mechanisms by which we can try to limit the potential negative impact of this feedback loop. These include providing some unbiased training data by having a small portion of data come from users who are not exposed to our models, and being principled in the way we balance _exploration_ (learning more about our data) and _exploitation_ (using what we have learned to improve our system's performance).

We will briefly cover some aspects of real-time monitoring and model updates in Chapter 10, _Real-time Machine Learning with Spark Streaming_.

## Batch versus real time

In the previous sections, we outlined the common batch processing approach, where the model is periodically retrained using all of the data or a subset of it. As the preceding pipeline takes some time to complete, it might not be possible to use this approach to update models immediately as new data arrives.
While we will be mostly covering batch machine learning approaches in this book, there is a class of machine learning algorithms known as **online learning**; these models update immediately as new data is fed into them, thus enabling a real-time system. A common example is an online optimization algorithm for a linear model, such as stochastic gradient descent, which updates the model one example at a time (a minimal sketch of this update appears at the end of this section). The advantages of these methods are that the system can react very quickly to new information and also that the system can adapt to changes in the underlying behavior (that is, if the characteristics and distribution of the input data are changing over time, which is almost always the case in real-world situations).

However, online-learning models come with their own unique challenges in a production context. For example, it might be difficult to ingest and transform data in real time. It can also be complex to properly perform model selection in a purely online setting. Latency of the online training and the model selection and deployment phases might be too high for true real-time requirements (for example, in online advertising, latency requirements are measured in single-digit milliseconds). Finally, batch-oriented frameworks might make it awkward to handle real-time processes of a streaming nature.

Fortunately, Spark's real-time stream processing component, **Spark Streaming**, is a good potential fit for real-time machine learning workflows. We will explore Spark Streaming and online learning in Chapter 10, _Real-time Machine Learning with Spark Streaming_.

Due to the complexities inherent in a true real-time machine learning system, in practice, many systems target near real-time operations. This is essentially a hybrid approach where models are not necessarily updated immediately as new data arrives; instead, the new data is collected into mini-batches of a small set of training data. These mini-batches can be fed to an online-learning algorithm. In many cases, this approach is combined with a periodic batch process that might recompute the model on the entire data set and perform more complex processing and model selection. This can help ensure that the real-time model does not degrade over time.

Another similar approach involves making approximate updates to a more complex model as new data arrives while recomputing the entire model in a batch process periodically. In this way, the model can learn from new data, with a short delay (usually measured in seconds or, perhaps, a few minutes), but will become more and more inaccurate over time due to the approximation applied. The periodic recomputation takes care of this by retraining the model on all available data.
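The following is a minimal sketch of the online stochastic gradient descent update mentioned previously, for a linear model with a squared-error loss. The learning rate, the data, and the function names are illustrative assumptions only:

    import numpy as np

    def sgd_update(weights, x, y, learning_rate=0.01):
        # gradient of the squared-error loss for a single (x, y) example
        error = np.dot(weights, x) - y
        return weights - learning_rate * error * x

    # a hypothetical stream of (feature vector, target) examples
    stream = [(np.array([1.0, 2.0]), 3.0), (np.array([0.5, 1.5]), 2.0)]
    w = np.zeros(2)
    for x, y in stream:
        w = sgd_update(w, x, y)  # the model is refreshed after every single example

The key property is that each update touches only one example, so the model can be kept current as events arrive, rather than waiting for a full batch retraining cycle.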
# An architecture for a machine learning system

Now that we have explored how our machine learning system might work in the context of MovieStream, we can outline a possible architecture for our system:

MovieStream's future architecture

As we can see, our system incorporates the machine learning pipeline outlined in the preceding diagram; this system also includes:

 * Collecting data about users, their behavior, and our content titles
 * Transforming this data into features
 * Training our models, including our training-testing and model-selection phases
 * Deploying the trained models to both our live model-serving system as well as using these models for offline processes
 * Feeding back the model results into the MovieStream website through recommendation and targeting pages
 * Feeding back the model results into MovieStream's personalized marketing channels
 * Using the offline models to provide tools to MovieStream's various teams to better understand user behavior, characteristics of the content catalogue, and drivers of revenue for the business

## Practical exercise

Imagine that you now need to provide input to the frontend and infrastructure engineering team about the data that your machine learning system will need. Prepare a brief for them on how they should structure the data-collection mechanisms. Write down some examples of what the raw data might look like (for example, web logs, event logs, and so on) and how it should flow through the system. Take into account the following aspects:

 * What data sources will be required
 * What format should the data be in
 * How often should data be collected, processed, potentially aggregated, and stored
 * What data storage will you use to ensure scalability

# Summary

In this chapter, you learned about the components inherent in a data-driven, automated machine learning system. We also outlined how a possible high-level architecture for such a system might look in a real-world situation.

In the next chapter, we will discuss how to obtain publicly available datasets for common machine learning tasks. We will also explore general concepts related to processing, cleaning, and transforming data so that it can be used to train a machine learning model.

# Chapter 3. Obtaining, Processing, and Preparing Data with Spark

Machine learning is an extremely broad field, and these days, applications can be found across areas that include web and mobile applications, Internet of Things and sensor networks, financial services, healthcare, and various scientific fields, to name just a few.

Therefore, the range of data available for potential use in machine learning is enormous. In this book, we will focus mostly on business applications. In this context, the data available often consists of data internal to an organization (such as transactional data for a financial services company) as well as external data sources (such as financial asset price data for the same financial services company).

For example, recall from Chapter 2, _Designing a Machine Learning System_, that the main internal source of data for our hypothetical Internet business, MovieStream, consists of data on the movies available on the site, the users of the service, and their behavior.
This includes data about movies and other content (for example, title, categories, description, images, actors, and directors), user information (for example, demographics, location, and so on), and user activity data (for example, web page views, title previews and views, ratings, reviews, and social data such as _likes_, _shares_, and social network profiles on services including Facebook and Twitter).

External data sources in this example might include weather and geolocation services, third-party movie ratings and review sites such as _IMDB_ and _Rotten Tomatoes_, and so on.

Generally speaking, it is quite difficult to obtain data of an internal nature for real-world services and businesses, as it is commercially sensitive (in particular, data on purchasing activity, user or customer behavior, and revenue) and of great potential value to the organization concerned. This is also why it is often the most useful and interesting data on which to apply machine learning; a good machine learning model that can make accurate predictions can be highly valuable (witness the success of machine learning competitions such as the _Netflix Prize_ and _Kaggle_).

In this book, we will make use of datasets that are publicly available to illustrate concepts around data processing and the training of machine learning models.

In this chapter, we will:

 * Briefly cover the types of data typically used in machine learning.
 * Provide examples of where to obtain interesting datasets, often publicly available on the Internet. We will use some of these datasets throughout the book to illustrate the use of the models we introduce.
 * Discover how to process, clean, explore, and visualize our data.
 * Introduce various techniques to transform our raw data into features that can be used as input to machine learning algorithms.
 * Learn how to normalize input features using external libraries as well as Spark's built-in functionality.

# Accessing publicly available datasets

Fortunately, while commercially sensitive data can be hard to come by, there are still a number of useful datasets available publicly. Many of these are often used as benchmark datasets for specific types of machine learning problems. Examples of common data sources include:

 * **UCI Machine Learning Repository** : This is a collection of almost 300 datasets of various types and sizes for tasks including classification, regression, clustering, and recommender systems. The list is available at .
 * **Amazon AWS public datasets** : This is a set of often very large datasets that can be accessed via Amazon S3. These datasets include the Human Genome Project, the Common Crawl web corpus, Wikipedia data, and Google Books Ngrams. Information on these datasets can be found at .
 * **Kaggle** : This is a collection of datasets used in machine learning competitions run by Kaggle. Areas include classification, regression, ranking, recommender systems, and image analysis. These datasets can be found under the _Competitions_ section at .
 * **KDnuggets** : This has a detailed list of public datasets, including some of those mentioned earlier. The list is available at .

### Tip

There are many other resources for finding public datasets, depending on the specific domain and machine learning task. You might also have access to some interesting academic or commercial data of your own!
To illustrate a few key concepts related to data processing, transformation, and feature extraction in Spark, we will download a commonly used dataset for movie recommendations; this dataset is known as the **MovieLens** dataset. As it is applicable to recommender systems as well as potentially other machine learning tasks, it serves as a useful example dataset.

### Note

Spark's machine learning library, MLlib, has been under heavy development since its inception, and unlike the Spark core, it is still not in a fully stable state with regard to its overall API and design.

As of Spark Version 1.2.0, a new, experimental API for MLlib has been released under the `ml` package (whereas the current library resides under the `mllib` package). This new API aims to enhance the APIs and interfaces for models as well as feature extraction and transformation so as to make it easier to build pipelines that chain together steps that include feature extraction, normalization, dataset transformations, model training, and cross-validation.

In the upcoming chapters, we will only cover the existing, more developed MLlib API, since the new API is still experimental and may be subject to major changes in the next few Spark releases. Over time, the various feature-processing techniques and models that we will cover will simply be ported to the new API; however, the core concepts and most underlying code will remain largely unchanged.

## The MovieLens 100k dataset

The MovieLens 100k dataset is a set of 100,000 data points related to ratings given by a set of users to a set of movies. It also contains movie metadata and user profiles. While it is a small dataset, you can quickly download it and run Spark code on it. This makes it ideal for illustrative purposes.

You can download the dataset from .

Once you have downloaded the data, unzip it using your terminal:

    **> unzip ml-100k.zip**
    **inflating: ml-100k/allbut.pl**
    **inflating: ml-100k/mku.sh**
    **inflating: ml-100k/README**
    **...**
    **inflating: ml-100k/ub.base**
    **inflating: ml-100k/ub.test**

This will create a directory called `ml-100k`. Change into this directory and examine the contents. The important files are `u.user` (user profiles), `u.item` (movie metadata), and `u.data` (the ratings given by users to movies):

    **> cd ml-100k**

The `README` file contains more information on the dataset, including the variables present in each data file. We can use the `head` command to examine the contents of the various files.

For example, we can see that the `u.user` file contains the `user id`, `age`, `gender`, `occupation`, and `ZIP code` fields, separated by a pipe (the `|` character):

    **> head -5 u.user**
    **1|24|M|technician|85711**
    **2|53|F|other|94043**
    **3|23|M|writer|32067**
    **4|24|M|technician|43537**
    **5|33|F|other|15213**

The `u.item` file contains the `movie id`, `title`, `release date`, and `IMDB link` fields and a set of fields related to movie category data.
It is also separated by a `|` character:

    **> head -5 u.item**
    **1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0**
    **2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0**
    **3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0**
    **4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0**
    **5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0**

Finally, the `u.data` file contains the `user id`, `movie id`, `rating (1-5 scale)`, and `timestamp` fields and is separated by a tab (the `\t` character):

    **> head -5 u.data**
    **196 242 3 881250949**
    **186 302 3 891717742**
    **22 377 1 878887116**
    **244 51 2 880606923**
    **166 346 1 886397596**

# Exploring and visualizing your data

Now that we have our data available, let's fire up an interactive Spark console and explore it! For this section, we will use Python and the PySpark shell, as we are going to use the IPython interactive console and the matplotlib plotting library to process and visualize our data.

### Note

IPython is an advanced, interactive shell for Python. It includes a useful set of features called pylab, which includes NumPy and SciPy for numerical computing and matplotlib for interactive plotting and visualization.

We recommend that you use the latest version of IPython (2.3.1 at the time of writing this book). To install IPython for your platform, follow the instructions available at . If this is the first time you are using IPython, you can find a tutorial at .

You will need to install all the packages listed earlier in order to work through the code in this chapter. Instructions to install the packages can be found in the code bundle. If you are starting out with Python or are unfamiliar with the process of installing these packages, we strongly recommend that you use a prebuilt scientific Python installation such as Anaconda (available at ) or Enthought (available at ). These make the installation process much easier and include everything you will need to follow the example code.

The PySpark console allows you to set which Python executable is used to run the shell. We can choose to use IPython, as opposed to the standard Python shell, when launching our PySpark console. We can also pass in additional options to IPython, including telling it to launch with the pylab functionality enabled.

We can do this by running the following command from the Spark home directory (that is, the same directory that we used previously to explore the Spark interactive console):

    **> IPYTHON=1 IPYTHON_OPTS="--pylab" ./bin/pyspark**

You will see the PySpark console start up, showing output similar to the following screenshot:

The PySpark console using IPython

### Tip

Notice the `IPython 2.3.1 -- An enhanced Interactive Python` and `Using matplotlib backend: MacOSX` lines; they indicate that both the IPython and pylab functionalities are being used by the PySpark shell.

You might see a slightly different output, depending on your operating system and software versions.

Now that we have our IPython console open, we can start to explore the MovieLens dataset and do some basic analysis.
### Note

You can follow along with this chapter by entering the code examples into your IPython console. IPython also provides an HTML-enabled Notebook application. It provides some enhanced functionality over the standard IPython console, such as inline graphics for plotting, HTML markup functionality, and the ability to run cells of code independently.

The images used in this chapter were generated using the IPython Notebook, so don't worry if yours look a little bit different in style, as long as they contain the same content! You can also use the Notebook for the code in this chapter, if you prefer. In addition to the Python code for this chapter, we have provided a version saved in the IPython Notebook format, which you can load into your own IPython Notebook.

Check out the instructions on how to use the IPython Notebook at .

## Exploring the user dataset

First, we will analyze the characteristics of MovieLens users. Enter the following lines into your console (where `PATH` refers to the base directory in which you performed the `unzip` command to unzip the preceding MovieLens 100k dataset):

    user_data = sc.textFile("/PATH/ml-100k/u.user")
    user_data.first()

You should see output similar to this:

    **u'1|24|M|technician|85711'**

As we can see, this is the first line of our user data file, separated by the `"|"` character.

### Tip

The `first` function is similar to `collect`, but it only returns the first element of the RDD to the driver. We can also use `take(k)` to collect only the first _k_ elements of the RDD to the driver.

Let's transform the data by splitting each line around the `"|"` character. This will give us an RDD where each record is a Python list that contains the user ID, age, gender, occupation, and ZIP code fields.

We will then count the number of users, genders, occupations, and ZIP codes. We can achieve this by running the following code in the console, line by line. Note that we do not cache the data, as it is unnecessary for this small dataset size:

    user_fields = user_data.map(lambda line: line.split("|"))
    num_users = user_fields.map(lambda fields: fields[0]).count()
    num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
    num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
    num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
    print "Users: %d, genders: %d, occupations: %d, ZIP codes: %d" % (num_users, num_genders, num_occupations, num_zipcodes)

You will see the following output:

    **Users: 943, genders: 2, occupations: 21, ZIP codes: 795**

Next, we will create a histogram to analyze the distribution of user ages, using matplotlib's `hist` function:

    ages = user_fields.map(lambda x: int(x[1])).collect()
    hist(ages, bins=20, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

We passed in the `ages` array, together with the number of `bins` for our histogram (`20` in this case), to the `hist` function. Using the `normed=True` argument, we also specified that we want the histogram to be normalized so that each bucket represents the percentage of the overall data that falls into that bucket.

You will see an image containing the histogram chart, which looks something like the one shown here. As we can see, the ages of MovieLens users are somewhat skewed towards younger viewers. A large number of users are between the ages of about 15 and 35.
Distribution of user ages

We might also want to explore the relative frequencies of the various occupations of our users. We can do this using the following code snippet. First, we will use the MapReduce approach introduced previously to count the occurrences of each occupation in the dataset. Then, we will use `matplotlib` to display a bar chart of occupation counts, using the `bar` function.

Since part of our data is textual (the occupation descriptions), we will need to manipulate it a little to get it to work with the `bar` function:

    count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()
    x_axis1 = np.array([c[0] for c in count_by_occupation])
    y_axis1 = np.array([c[1] for c in count_by_occupation])

Once we have collected the RDD of counts per occupation, we will convert it into two arrays for the _x_ axis (the occupations) and the _y_ axis (the counts) of our chart. The `collect` function returns the count data to us in no particular order. We need to sort the count data so that our bar chart is ordered from the lowest to the highest count.

We will achieve this by first creating two `numpy` arrays and then using the `argsort` method of `numpy` to select the elements from each array, ordered by the count data in an ascending fashion. Notice that here, we will sort both the _x_ and _y_ axis arrays by the _y_ axis (that is, by the counts):

    x_axis = x_axis1[np.argsort(y_axis1)]
    y_axis = y_axis1[np.argsort(y_axis1)]

Once we have the _x_ and _y_ axis data for our chart, we will create the bar chart with the occupations as labels on the _x_ axis and the counts as the values on the _y_ axis. We will also add a few lines, such as the `plt.xticks(rotation=30)` code, to display a better-looking chart:

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    plt.bar(pos, y_axis, width, color='lightblue')
    plt.xticks(rotation=30)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

The image you have generated should look like the one here. It appears that the most prevalent occupations are **student**, **other**, **educator**, **administrator**, **engineer**, and **programmer**.

Distribution of user occupations

Spark provides a convenience method on RDDs called `countByValue`; this method counts the occurrences of each unique value in the RDD and returns it to the driver as a Python `dict` (or a Scala or Java `Map`). We can create the `count_by_occupation` variable using this method:

    count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()
    print "Map-reduce approach:"
    print dict(count_by_occupation2)
    print ""
    print "countByValue approach:"
    print dict(count_by_occupation)

You should see that the results are the same for each approach.

## Exploring the movie dataset

Next, we will investigate a few properties of the movie catalogue.
We can inspect a row of the movie data file, as we did for the user data earlier, and then count the number of movies:

    movie_data = sc.textFile("/PATH/ml-100k/u.item")
    print movie_data.first()
    num_movies = movie_data.count()
    print "Movies: %d" % num_movies

You will see the following output on your console:

    **1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0**
    **Movies: 1682**

In the same manner as we did for user ages and occupations earlier, we can plot the distribution of movie age, that is, the year of release relative to the current date (note that for this dataset, the current year is 1998).

In the following code block, we can see that we need a small function called `convert_year` to handle errors in the parsing of the `release date` field. This is due to some bad data in one line of the movie data:

    def convert_year(x):
        try:
            return int(x[-4:])
        except:
            # there is a 'bad' data point with a blank year,
            # which we set to 1900 and will filter out later
            return 1900

Once we have our utility function to parse the year of release, we can apply it to the movie data using a `map` transformation:

    movie_fields = movie_data.map(lambda lines: lines.split("|"))
    years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))

Since we have assigned the value `1900` to any error in parsing, we can filter these bad values out of the resulting data using Spark's `filter` transformation:

    years_filtered = years.filter(lambda x: x != 1900)

This is a good example of how real-world datasets can often be messy and require a more in-depth approach to parsing data. In fact, this also illustrates why data exploration is so important, as many of these issues in data integrity and quality are picked up during this phase.

After filtering out bad data, we will transform the list of movie release years into movie ages by subtracting each year of release from the current year, use `countByValue` to compute the counts for each movie age, and finally, plot our histogram of movie ages (again, using the `hist` function, where the `values` variable contains the values of the result from `countByValue` and the `bins` variable contains the keys):

    movie_ages = years_filtered.map(lambda yr: 1998 - yr).countByValue()
    values = movie_ages.values()
    bins = movie_ages.keys()
    hist(values, bins=bins, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

You will see an image similar to the one here; it illustrates that most of the movies were released in the last few years before 1998:

Distribution of movie ages

## Exploring the rating dataset

Let's now take a look at the ratings data:

    rating_data_raw = sc.textFile("/PATH/ml-100k/u.data")
    print rating_data_raw.first()
    num_ratings = rating_data_raw.count()
    print "Ratings: %d" % num_ratings

This gives us the following result:

    **196 242 3 881250949**
    **Ratings: 100000**

There are 100,000 ratings, and unlike the user and movie datasets, these records are split with a tab character (`"\t"`). As you might have guessed, we'd probably want to compute some basic summary statistics and frequency histograms for the rating values.
Let's do this now:

    rating_data = rating_data_raw.map(lambda line: line.split("\t"))
    ratings = rating_data.map(lambda fields: int(fields[2]))
    max_rating = ratings.reduce(lambda x, y: max(x, y))
    min_rating = ratings.reduce(lambda x, y: min(x, y))
    mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
    median_rating = np.median(ratings.collect())
    ratings_per_user = num_ratings / num_users
    ratings_per_movie = num_ratings / num_movies
    print "Min rating: %d" % min_rating
    print "Max rating: %d" % max_rating
    print "Average rating: %2.2f" % mean_rating
    print "Median rating: %d" % median_rating
    print "Average # of ratings per user: %2.2f" % ratings_per_user
    print "Average # of ratings per movie: %2.2f" % ratings_per_movie

After running these lines on your console, you will see output similar to the following result:

    **Min rating: 1**
    **Max rating: 5**
    **Average rating: 3.53**
    **Median rating: 4**
    **Average # of ratings per user: 106.00**
    **Average # of ratings per movie: 59.00**

We can see that the minimum rating is 1, while the maximum rating is 5. This is in line with what we expect, since the ratings are on a scale of 1 to 5.

Spark also provides a `stats` function for RDDs containing numeric data (such as `ratings` in this case); it computes similar summary statistics:

    ratings.stats()

Here is the output:

    **(count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5.0, min: 1.0)**

Looking at the results, the average rating given by a user to a movie is around 3.5 and the median rating is 4, so we might expect that the distribution of ratings will be skewed towards slightly higher ratings. Let's see whether this is true by creating a bar chart of rating values using a similar procedure as we did for occupations:

    count_by_rating = ratings.countByValue()
    x_axis = np.array(count_by_rating.keys())
    y_axis = np.array([float(c) for c in count_by_rating.values()])
    # we normalize the y-axis here to percentages
    y_axis_normed = y_axis / y_axis.sum()
    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    plt.bar(pos, y_axis_normed, width, color='lightblue')
    plt.xticks(rotation=30)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

The preceding code should produce the following chart:

Distribution of rating values

In line with what we might have expected after seeing some summary statistics, it is clear that the distribution of ratings is skewed towards average to high ratings.

We can also look at the distribution of the number of ratings made by each user. Recall that we previously computed the `rating_data` RDD used in the preceding code by splitting the ratings with the tab character. We will now use the `rating_data` variable again in the following code.

To compute the distribution of ratings per user, we will first extract the user ID as the key and the rating as the value from the `rating_data` RDD.
We will then group the ratings by user ID using Spark's `groupByKey` function:

    user_ratings_grouped = rating_data.map(lambda fields: (int(fields[0]), int(fields[2]))).\
        groupByKey()

Next, for each key (user ID), we will find the size of the set of ratings; this will give us the number of ratings for that user:

    user_ratings_byuser = user_ratings_grouped.map(lambda (k, v): (k, len(v)))
    user_ratings_byuser.take(5)

We can inspect the resulting RDD by taking a few records from it; this should give us an RDD of (user ID, number of ratings) pairs:

    **[(1, 272), (2, 62), (3, 54), (4, 24), (5, 175)]**

Finally, we will plot the histogram of the number of ratings per user using our favorite `hist` function:

    user_ratings_byuser_local = user_ratings_byuser.map(lambda (k, v): v).collect()
    hist(user_ratings_byuser_local, bins=200, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

Your chart should look similar to the following screenshot. We can see that most of the users give fewer than 100 ratings. The distribution of the ratings shows, however, that there is a fairly large number of users who provide hundreds of ratings.

Distribution of ratings per user

We leave it to you to perform a similar analysis to create a histogram plot for the number of ratings given to each movie. Perhaps, if you're feeling adventurous, you could also extract a dataset of movie ratings by date (taken from the timestamps in the last column of the rating dataset) and chart a time series of the total number of ratings, the number of unique users who gave a rating, and the number of unique movies rated, for each day.

# Processing and transforming your data

Now that we have done some initial exploratory analysis of our dataset and we know a little more about the characteristics of our users and movies, what do we do next?

In order to make the raw data usable in a machine learning algorithm, we first need to clean it up and possibly transform it in various ways before extracting useful features from the transformed data. The transformation and feature extraction steps are closely linked, and in some cases, certain transformations are themselves a case of feature extraction.

We have already seen an example of the need to clean data in the movie dataset. Generally, real-world datasets contain bad data, missing data points, and outliers. Ideally, we would correct bad data; however, this is often not possible, as many datasets derive from some form of collection process that cannot be repeated (this is the case, for example, in web activity data and sensor data). Missing values and outliers are also common and can be dealt with in a manner similar to bad data. Overall, the broad options are as follows:

 * **Filter out or remove records with bad or missing values** : This is sometimes unavoidable; however, this means losing the good part of a bad or missing record.
 * **Fill in bad or missing data** : We can try to assign a value to bad or missing data based on the rest of the data we have available. Approaches can include assigning a zero value, assigning the global mean or median, interpolating nearby or similar data points (usually, in a time-series dataset), and so on. Deciding on the correct approach is often a tricky task and depends on the data, situation, and one's own experience.
 * **Apply robust techniques to outliers** : The main issue with outliers is that they might be correct values, even though they are extreme.
They might also be errors. It is often very difficult to know which case you are dealing with. Outliers can also be removed or filled in, although fortunately, there are statistical techniques (such as robust regression) to handle outliers and extreme values.
 * **Apply transformations to potential outliers** : Another approach for outliers or extreme values is to apply transformations, such as a logarithmic or Gaussian kernel transformation, to features that have potential outliers or that display large ranges of potential values. These types of transformations have the effect of dampening the impact of large changes in the scale of a variable and can turn a nonlinear relationship into a linear one.

## Filling in bad or missing data

We have already seen an example of filtering out bad data. Following on from the preceding code, the following code snippet applies the fill-in approach to the bad release date record by assigning a value to the data point that is equal to the median year of release:

    years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()
    years_pre_processed_array = np.array(years_pre_processed)

First, we will compute the mean and median year of release after selecting all the year of release data, _except_ the bad data point. We will then use the `numpy` function, `where`, to find the index of the bad value in `years_pre_processed_array` (recall that we assigned the value `1900` to this data point). Finally, we will use this index to assign the median release year to the bad value:

    mean_year = np.mean(years_pre_processed_array[years_pre_processed_array != 1900])
    median_year = np.median(years_pre_processed_array[years_pre_processed_array != 1900])
    index_bad_data = np.where(years_pre_processed_array == 1900)[0][0]
    years_pre_processed_array[index_bad_data] = median_year
    print "Mean year of release: %d" % mean_year
    print "Median year of release: %d" % median_year
    print "Index of '1900' after assigning median: %s" % np.where(years_pre_processed_array == 1900)[0]

You should expect to see the following output:

    **Mean year of release: 1989**
    **Median year of release: 1995**
    **Index of '1900' after assigning median: []**

We computed both the mean and the median year of release here. As can be seen from the output, the median release year is noticeably higher than the mean because of the skewed distribution of the release years. While it is not always straightforward to decide on precisely which fill-in value to use for a given situation, in this case, it is certainly feasible to use the median due to this skew.

### Tip

Note that the preceding code example is, strictly speaking, not very scalable, as it requires collecting all the data to the driver. We can use Spark's `mean` function for numeric RDDs to compute the mean, but there is no median function available currently. We can solve this by creating our own or by computing the median on a sample of the dataset created using the `sample` function (we will see more of this in the upcoming chapters); a minimal sketch of the sampling idea follows.
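As a rough illustration of that approach (the sampling fraction and seed here are arbitrary choices, and `approx_median_year` is a hypothetical variable name), we could estimate the median from a sample so that only the sampled values are collected to the driver:

    # sample ~10% of the parsed release years, excluding the bad data point
    sampled_years = movie_fields.map(lambda fields: fields[2]).\
        map(lambda x: convert_year(x)).\
        filter(lambda x: x != 1900).\
        sample(False, 0.1, 42).collect()
    approx_median_year = np.median(sampled_years)
    print "Approximate median year of release: %d" % approx_median_year

This keeps driver memory usage bounded at the cost of some accuracy in the estimate.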
# Extracting useful features from your data

Once we have completed the initial exploration, processing, and cleaning of our data, we are ready to get down to the business of extracting actual features from the data, with which our machine learning model can be trained.

**Features** refer to the variables that we use to train our model. Each row of data contains various information that we would like to extract into a training example. Almost all machine learning models ultimately work on numerical representations in the form of a **vector** ; hence, we need to convert raw data into numbers.

Features broadly fall into a few categories, which are as follows:

 * **Numerical features** : These features are typically real or integer numbers, for example, the user age that we used in an example earlier.
 * **Categorical features** : These features refer to variables that can take one of a set of possible states at any given time. Examples from our dataset might include a user's gender or occupation or movie categories.
 * **Text features** : These are features derived from the text content in the data, for example, movie titles, descriptions, or reviews.
 * **Other features** : Most other types of features are ultimately represented numerically. For example, images, video, and audio can be represented as sets of numerical data. Geographical locations can be represented as latitude and longitude or geohash data.

Here we will cover numerical, categorical, and text features.

## Numerical features

What is the difference between any old number and a numerical feature? Well, in reality, any numerical data can be used as an input variable. However, in a machine learning model, we learn a vector of weights, one for each feature. The weights play a role in mapping feature values to an outcome or target variable (in the case of supervised learning models).

Thus, we want to use features that make sense, that is, where the model can learn the relationship between feature values and the target variable. For example, age might be a reasonable feature. Perhaps there is a direct relationship between increasing age and a certain outcome. Similarly, height is a good example of a numerical feature that can be used directly.

We will often see that numerical features are less useful in their raw form, but can be turned into representations that are more useful. Location is an example of such a case. Using raw locations (say, latitude and longitude) might not be that useful unless our data is very dense indeed, since our model might not be able to learn about a useful relationship between the raw location and an outcome. However, a relationship might exist between some aggregated or binned representation of the location (for example, a city or country) and the outcome.

## Categorical features

Categorical features cannot be used as input in their raw form, as they are not numbers; instead, they are members of a set of possible values that the variable can take. In the example mentioned earlier, user occupation is a categorical variable that can take the value of student, programmer, and so on.

Such categorical variables are also known as **nominal** variables when there is no concept of order between their values. By contrast, when there is a concept of order between the values (such as the ratings mentioned earlier, where a rating of 5 is conceptually higher or better than a rating of 1), we refer to them as **ordinal** variables.

To transform categorical variables into a numerical representation, we can use a common approach known as **1-of-k** encoding. An approach such as 1-of-k encoding is required to represent nominal variables in a way that makes sense for machine learning tasks. Ordinal variables might be used in their raw form but are often encoded in the same way as nominal variables.

Assume that there are k possible values that the variable can take.
If we assign each possible value an index from the set of 1 to k, then we can represent a given state of the variable using a binary vector of length k; here, all entries are zero, except the entry at the index that corresponds to the given state of the variable. This entry is set to one. + +For example, we can collect all the possible states of the `occupation` variable: + + all_occupations = user_fields.map(lambda fields: fields[3]).distinct().collect() + all_occupations.sort() + +We can then assign index values to each possible occupation in turn (note that we start from zero, since Python, Scala, and Java arrays all use zero-based indices): + + idx = 0 + all_occupations_dict = {} + for o in all_occupations: + all_occupations_dict[o] = idx + idx +=1 + # try a few examples to see what "1-of-k" encoding is assigned + print "Encoding of 'doctor': %d" % all_occupations_dict['doctor'] + print "Encoding of 'programmer': %d" % all_occupations_dict['programmer'] + +You will see the following output: + + **Encoding of 'doctor': 2** + **Encoding of 'programmer': 14** + +Finally, we can encode the value of `programmer`. We will start by creating a `numpy` array of a length that is equal to the number of possible occupations (k in this case) and filling it with zeros. We will use the `zeros` function of `numpy` to create this array. + +We will then extract the index of the word `programmer` and assign a value of `1` to the array value at this index: + + K = len(all_occupations_dict) + binary_x = np.zeros(K) + k_programmer = all_occupations_dict['programmer'] + binary_x[k_programmer] = 1 + print "Binary feature vector: %s" % binary_x + print "Length of binary vector: %d" % K + +This will give us the resulting binary feature vector of length `21`: + + **Binary feature vector: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]** + **Length of binary vector: 21** + +## Derived features + +As we mentioned earlier, it is often useful to compute a derived feature from one or more available variables. We hope that the derived feature can add more information than only using the variable in its raw form. + +For instance, we can compute the average rating given by each user to all the movies they rated. This would be a feature that could provide a _user-specific_ intercept in our model (in fact, this is a commonly used approach in recommendation models). We have taken the raw rating data and created a new feature that can allow us to learn a better model. + +Examples of features derived from raw data include computing average values, median values, variances, sums, differences, maximums or minimums, and counts. We have already seen a case of this when we created a new `movie age` feature from the year of release of the movie and the current year. Often, the idea behind using these transformations is to summarize the numerical data in some way that might make it easier for a model to learn. + +It is also common to transform numerical features into categorical features, for example, by binning features. Common examples of this include variables such as age, geolocation, and time. + +### Transforming timestamps into categorical features + +To illustrate how to derive categorical features from numerical data, we will use the times of the ratings given by users to movies. These are in the form of Unix timestamps. We can use Python's `datetime` module to extract the date and time from the timestamp and, in turn, extract the `hour` of the day. 
This will result in an RDD of the hour of the day for each rating. + +We will need a function to extract a `datetime` representation of the rating timestamp (in seconds); we will create this function now: + + def extract_datetime(ts): + import datetime + return datetime.datetime.fromtimestamp(ts) + +We will again use the `rating_data` RDD that we computed in the earlier examples as our starting point. + +First, we will use a `map` transformation to extract the timestamp field, converting it to a Python `int` datatype. We will then apply our `extract_datetime` function to each timestamp and extract the hour from the resulting `datetime` object: + + timestamps = rating_data.map(lambda fields: int(fields[3])) + hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour) + hour_of_day.take(5) + +If we take the first five records of the resulting RDD, we will see the following output: + + **[17, 21, 9, 7, 7]** + +We have transformed the raw time data into a categorical feature that represents the hour of the day in which the rating was given. + +Now, say that we decide this is too coarse a representation. Perhaps we want to further refine the transformation. We can assign each hour-of-the-day value into a defined bucket that represents a time of day. + +For example, we can say that morning is from 7 a.m. to 11 a.m., while lunch is from 12 p.m. to 2 p.m., and so on. Using these buckets, we can create a function to assign a time of day, given the hour of the day as input (note that the night bucket wraps past midnight, so we concatenate two ranges to cover 11 p.m. through 6 a.m.): + + def assign_tod(hr): + times_of_day = { + 'morning' : range(7, 12), + 'lunch' : range(12, 14), + 'afternoon' : range(14, 18), + 'evening' : range(18, 23), + 'night' : range(23, 24) + range(0, 7) + } + for k, v in times_of_day.iteritems(): + if hr in v: + return k + +Now, we will apply the `assign_tod` function to the hour of each rating event contained in the `hour_of_day` RDD: + + time_of_day = hour_of_day.map(lambda hr: assign_tod(hr)) + time_of_day.take(5) + +If we again take the first five records of this new RDD, we will see the following transformed values: + + **['afternoon', 'evening', 'morning', 'morning', 'morning']** + +We have now transformed the timestamp variable (which can take on thousands of values and is probably not useful to a model in its raw form) into hours (taking on 24 values) and then into a time of day (taking on five possible values). Now that we have a categorical feature, we can use the same 1-of-k encoding method outlined earlier to generate a binary feature vector. + +## Text features + +In some ways, text features are a form of categorical and derived features. Let's take the example of the description for a movie (which we do not have in our dataset). Here, the raw text could not be used directly, even as a categorical feature, since there are virtually unlimited possible combinations of words that could occur if each piece of text was a possible value. Our model would almost never see two occurrences of the same feature and would not be able to learn effectively. Therefore, we would like to turn raw text into a form that is more amenable to machine learning. + +There are numerous ways of dealing with text, and the field of natural language processing is dedicated to processing, representing, and modeling textual content. A full treatment is beyond the scope of this book, but we will introduce a simple and standard approach for text-feature extraction; this approach is known as the **bag-of-words** representation.
+ +The bag-of-words approach treats a piece of text content as a set of the words, and possibly numbers, in the text (these are often referred to as terms). The process of the bag-of-words approach is as follows: + + * **Tokenization** : First, some form of tokenization is applied to the text to split it into a set of tokens (generally words, numbers, and so on). An example of this is simple whitespace tokenization, which splits the text on each space and might remove punctuation and other characters that are not alphabetical or numerical. + * **Stop word removal** : Next, it is usual to remove very common words such as "the", "and", and "but" (these are known as **stop words** ). + * **Stemming** : The next step can include stemming, which refers to taking a term and reducing it to its base form or stem. A common example is plural terms becoming singular (for example, dogs becomes dog and so on). There are many approaches to stemming, and text-processing libraries often contain various stemming algorithms. + * **Vectorization** : The final step is turning the processed terms into a vector representation. The simplest form is, perhaps, a binary vector representation, where we assign a value of one if a term exists in the text and zero if it does not. This is essentially identical to the categorical 1-of-k encoding we encountered earlier. Like 1-of-k encoding, this requires a dictionary of terms mapping a given term to an index number. As you might gather, there are potentially millions of individual possible terms (even after stop word removal and stemming). Hence, it becomes critical to use a sparse vector representation where only the fact that a term is present is stored, to save memory and disk space as well as compute time. + +### Note + +In Chapter 9, _Advanced Text Processing with Spark_ , we will cover more complex text processing and feature extraction, including methods to weight terms; these methods go beyond the basic binary encoding we saw earlier. + +### Simple text feature extraction + +To show an example of extracting textual features in the binary vector representation, we can use the movie titles that we have available. + +First, we will create a function to strip away the year of release for each movie, if the year is present, leaving only the title of the movie. + +We will use Python's regular expression module, `re`, to search for the year between parentheses in the movie titles. If we find a match with this regular expression, we will extract only the title up to the index of the first match (that is, the index in the title string of the opening parenthesis). 
This is done with the following `raw[:grps.start()]` code snippet: + + def extract_title(raw): + import re + # this regular expression finds the text (the year) between parentheses + grps = re.search("\((\w+)\)", raw) + if grps: + # we take only the title part, and strip the trailing whitespace from the remaining text, below + return raw[:grps.start()].strip() + else: + return raw + +Next, we will extract the raw movie titles from the `movie_fields` RDD: + + raw_titles = movie_fields.map(lambda fields: fields[1]) + +We can test out our `extract_title` function on the first five raw titles as follows: + + for raw_title in raw_titles.take(5): + print extract_title(raw_title) + +We can verify that our function works by inspecting the results, which should look like this: + + **Toy Story** + **GoldenEye** + **Four Rooms** + **Get Shorty** + **Copycat** + +We will then apply our function to the raw titles and apply a tokenization scheme to the extracted titles to convert them to terms. We will use the simple whitespace tokenization we covered earlier: + + movie_titles = raw_titles.map(lambda m: extract_title(m)) + # next we tokenize the titles into terms. We'll use simple whitespace tokenization + title_terms = movie_titles.map(lambda t: t.split(" ")) + print title_terms.take(5) + +Applying this simple tokenization gives the following result: + + **[[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]** + +We can see that we have split each title on spaces so that each word becomes a token. + +### Tip + +Here, we do not cover details such as converting text to lowercase, removing non-word or non-numerical characters such as punctuation and special characters, removing stop words, and stemming. These steps will be important in a real-world application. We will cover many of these topics in Chapter 9, _Advanced Text Processing with Spark_. + +This additional processing can be done fairly simply using string functions, regular expressions, and the Spark API (apart from stemming). Perhaps you would like to give it a try! + +In order to assign each term to an index in our vector, we need to create the term dictionary, which maps each term to an integer index. + +First, we will use Spark's `flatMap` function (shown in the following code snippet) to expand the list of strings in each record of the `title_terms` RDD into a new RDD of strings, called `all_terms`, where each record is a term. + +We can then collect all the unique terms and assign indexes in exactly the same way that we did for the 1-of-k encoding of user occupations earlier: + + # next we would like to collect all the possible terms, in order to build our dictionary of term <-> index mappings + all_terms = title_terms.flatMap(lambda x: x).distinct().collect() + # create a new dictionary to hold the terms, and assign the "1-of-k" indexes + idx = 0 + all_terms_dict = {} + for term in all_terms: + all_terms_dict[term] = idx + idx +=1 + +We can print out the total number of unique terms and test out our term mapping on a few terms: + + print "Total number of terms: %d" % len(all_terms_dict) + print "Index of term 'Dead': %d" % all_terms_dict['Dead'] + print "Index of term 'Rooms': %d" % all_terms_dict['Rooms'] + +This will result in the following output: + + **Total number of terms: 2645** + **Index of term 'Dead': 147** + **Index of term 'Rooms': 1963** + +We can also achieve the same result more efficiently using Spark's `zipWithIndex` function.
This function takes an RDD of values and merges them together with an index to create a new RDD of key-value pairs, where the key will be the term and the value will be the index in the term dictionary. We will use `collectAsMap` to collect the key-value RDD to the driver as a Python `dict`: + + all_terms_dict2 = title_terms.flatMap(lambda x: x).distinct().zipWithIndex().collectAsMap() + print "Index of term 'Dead': %d" % all_terms_dict2['Dead'] + print "Index of term 'Rooms': %d" % all_terms_dict2['Rooms'] + +The output is as follows: + + **Index of term 'Dead': 147** + **Index of term 'Rooms': 1963** + +The final step is to create a function that converts a set of terms into a sparse vector representation. To do this, we will create an empty sparse matrix with one row and a number of columns equal to the total number of terms in our dictionary. We will then step through each term in the input list of terms and check whether this term is in our term dictionary. If it is, we assign a value of 1 to the vector at the index that corresponds to the term in our dictionary mapping: + + # this function takes a list of terms and encodes it as a scipy sparse vector using an approach + # similar to the 1-of-k encoding + def create_vector(terms, term_dict): + from scipy import sparse as sp + num_terms = len(term_dict) + x = sp.csc_matrix((1, num_terms)) + for t in terms: + if t in term_dict: + idx = term_dict[t] + x[0, idx] = 1 + return x + +Once we have our function, we will apply it to each record in our RDD of extracted terms: + + all_terms_bcast = sc.broadcast(all_terms_dict) + term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value)) + term_vectors.take(5) + +We can then inspect the first few records of our new RDD of sparse vectors: + + **[ <1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 2 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 1 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 2 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 2 stored elements in Compressed Sparse Column format>,** + **<1x2645 sparse matrix of type '<type 'numpy.float64'>'** + **with 1 stored elements in Compressed Sparse Column format>]** + +We can see that each movie title has now been transformed into a sparse vector. We can see that the titles where we extracted two terms have two non-zero entries in the vector, titles where we extracted only one term have one non-zero entry, and so on. + +### Tip + +Note the use of Spark's `broadcast` method in the preceding example code to create a broadcast variable that contains the term dictionary. In real-world applications, such term dictionaries can be extremely large, so using a broadcast variable is advisable. + +## Normalizing features + +Once the features have been extracted into the form of a vector, a common preprocessing step is to normalize the numerical data. The idea behind this is to transform each numerical feature in a way that scales it to a standard size. We can perform different kinds of normalization, which are as follows: + + * **Normalize a feature** : This is usually a transformation applied to an individual feature across the dataset, for example, subtracting the mean ( _centering_ the feature) or applying the standard normal transformation (such that the feature has a mean of zero and a standard deviation of 1).
+ * **Normalize a feature vector** : This is usually a transformation applied to all features in a given row of the dataset such that the resulting feature vector has a normalized length. That is, we will ensure that each feature in the vector is scaled such that the vector has a norm of 1 (typically, on an L1 or L2 norm). + +We will use the second case as an example. We can use the `norm` function in `numpy.linalg` to achieve the vector normalization by first computing the L2 norm of a random vector and then dividing each element in the vector by this norm to create our normalized vector: + + np.random.seed(42) + x = np.random.randn(10) + norm_x_2 = np.linalg.norm(x) + normalized_x = x / norm_x_2 + print "x:\n%s" % x + print "2-Norm of x: %2.4f" % norm_x_2 + print "Normalized x:\n%s" % normalized_x + print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x) + +This should give the following result (note that in the preceding code snippet, we set the random seed equal to 42 so that the result will always be the same): + + **x: [ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696 1.57921282 0.76743473 -0.46947439 0.54256004]** + **2-Norm of x: 2.5908** + **Normalized x: [ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237 0.60954584 0.29621508 -0.1812081 0.20941776]** + **2-Norm of normalized_x: 1.0000** + +### Using MLlib for feature normalization + +Spark provides some built-in functions for feature scaling and standardization in its MLlib machine learning library. These include `StandardScaler`, which applies the standard normal transformation, and `Normalizer`, which applies the same feature vector normalization we showed you in our preceding example code. + +We will explore the use of these methods in the upcoming chapters, but for now, let's simply compare the results of using MLlib's `Normalizer` to our own results: + + from pyspark.mllib.feature import Normalizer + normalizer = Normalizer() + vector = sc.parallelize([x]) + +After importing the required class, we will instantiate `Normalizer` (by default, it will use the L2 norm as we did earlier). Note that as in most situations in Spark, we need to provide `Normalizer` with an RDD as input (an RDD that contains `numpy` arrays or MLlib vectors); hence, we will create a single-element RDD from our vector `x` for illustrative purposes. + +We will then use the `transform` function of `Normalizer` on our RDD. Since the RDD only has one vector in it, we will return our vector to the driver by calling `first` and finally by calling the `toArray` function to convert the vector back into a `numpy` array: + + normalized_x_mllib = normalizer.transform(vector).first().toArray() + +Finally, we can print out the same details as we did previously, comparing the results: + + print "x:\n%s" % x + print "2-Norm of x: %2.4f" % norm_x_2 + print "Normalized x MLlib:\n%s" % normalized_x_mllib + print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib) + +You will end up with exactly the same normalized vector as the one we computed with our own code. However, using MLlib's built-in methods is certainly more convenient and efficient than writing our own functions! + +## Using packages for feature extraction + +While we have covered many different approaches to feature extraction, it will be rather painful to have to create the code to perform these common tasks each and every time.
Certainly, we can create our own reusable code libraries for this purpose; however, fortunately, we can rely on the existing tools and packages. + +Since Spark supports Scala, Java, and Python bindings, we can use packages available in these languages that provide sophisticated tools to process and extract features and represent them as vectors. A few examples of packages for feature extraction include scikit-learn, gensim, scikit-image, matplotlib, and NLTK in Python; OpenNLP in Java; and Breeze and Chalk in Scala. In fact, Breeze has been part of Spark MLlib since version 1.0, and we will see how to use some Breeze functionality for linear algebra in the later chapters. + +# Summary + +In this chapter, we saw how to find common, publicly-available datasets that can be used to test various machine learning models. You learned how to load, process, and clean data, as well as how to apply common techniques to transform raw data into feature vectors that can be used as training examples for our models. + +In the next chapter, you will learn the basics of recommender systems and explore how to create a recommendation model, use the model to make predictions, and evaluate the model. + +# Chapter 4. Building a Recommendation Engine with Spark + +Now that you have learned the basics of data processing and feature extraction, we will move on to explore individual machine learning models in detail, starting with recommendation engines. + +Recommendation engines are probably among the best types of machine learning model known to the general public. Even if people do not know exactly what a recommendation engine is, they have most likely experienced one through the use of popular websites such as Amazon, Netflix, YouTube, Twitter, LinkedIn, and Facebook. Recommendations are a core part of all these businesses, and in some cases, they drive significant percentages of their revenue. + +The idea behind recommendation engines is to predict what people might like and to uncover relationships between items to aid in the discovery process (in this way, it is similar and, in fact, often complementary to search engines, which also play a role in discovery). However, unlike search engines, recommendation engines try to present people with relevant content that they did not necessarily search for or that they might not even have heard of. + +Typically, a recommendation engine tries to model the connections between users and some type of item. In our MovieStream scenario from Chapter 2, _Designing a Machine Learning System_ , for example, we could use a recommendation engine to show our users movies that they might enjoy. If we can do this well, we could keep our users engaged using our service, which is good for both our users and us. Similarly, if we can do a good job of showing our users movies related to a given movie, we could aid in discovery and navigation on our site, again improving our users' experience, engagement, and the relevance of our content to them. + +However, recommendation engines are not limited to movies, books, or products. The techniques we will explore in this chapter can be applied to just about any user-to-item relationship as well as user-to-user connections, such as those found on social networks, allowing us to make recommendations such as people you may know or who to follow. + +Recommendation engines are most effective in two general scenarios (which are not mutually exclusive). 
They are explained here: + + * **Large number of available options for users** : When there are a very large number of available items, it becomes increasingly difficult for the user to find something they want. Searching can help when the user knows what they are looking for, but often, the right item might be something previously unknown to them. In this case, being recommended relevant items that the user may not already know about can help them discover new items. + * **A significant degree of personal taste involved** : When personal taste plays a large role in selection, recommendation models, which often utilize a wisdom of the crowd approach, can be helpful in discovering items based on the behavior of others that have similar taste profiles. + +In this chapter, we will: + + * Introduce the various types of recommendation engines + * Build a recommendation model using data about user preferences + * Use the trained model to compute recommendations for a given user as well as compute similar items for a given item (that is, related items) + * Apply standard evaluation metrics to the model that we created to measure how well it performs in terms of predictive capability + +# Types of recommendation models + +Recommender systems are widely studied, and there are many approaches used, but there are two that are probably most prevalent: content-based filtering and collaborative filtering. Recently, other approaches such as ranking models have also gained in popularity. In practice, many approaches are hybrids, incorporating elements of many different methods into a model or combination of models. + +## Content-based filtering + +Content-based methods try to use the content or attributes of an item, together with some notion of similarity between two pieces of content, to generate items similar to a given item. These attributes are often textual content (such as titles, names, tags, and other metadata attached to an item), or in the case of media, they could include other features of the item, such as attributes extracted from audio and video content. + +In a similar manner, user recommendations can be generated based on attributes of users or user profiles, which are then matched to item attributes using the same measure of similarity. For example, a user can be represented by the combined attributes of the items they have interacted with. This becomes their user profile, which is then compared to item attributes to find items that match the user profile. + +## Collaborative filtering + +Collaborative filtering is a form of wisdom of the crowd approach where the set of preferences of many users with respect to items is used to generate estimated preferences of users for items with which they have not yet interacted. The idea behind this is the notion of similarity. + +In a user-based approach, if two users have exhibited similar preferences (that is, patterns of interacting with the same items in broadly the same way), then we would assume that they are similar to each other in terms of taste. To generate recommendations for unknown items for a given user, we can use the known preferences of other users that exhibit similar behavior. We can do this by selecting a set of similar users and computing some form of combined score based on the items they have shown a preference for. The overall logic is that if other users with similar tastes like a set of items, these items would tend to be good candidates for recommendation.
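+ +To make this combined score concrete, here is a minimal sketch (in Python with `numpy`, using made-up neighbor ratings and hypothetical similarity values rather than anything computed from a real dataset) of one common weighting scheme, where each neighbor's rating is weighted by that neighbor's similarity to the target user: + + import numpy as np + + # hypothetical ratings by three users similar to our target user + # (rows are neighbors, columns are items; 0 means not rated) + neighbor_ratings = np.array([[5.0, 4.0, 0.0], [4.0, 0.0, 3.0], [5.0, 5.0, 4.0]]) + # hypothetical similarities between the target user and each neighbor + similarities = np.array([0.9, 0.8, 0.4]) + # weight each rating by the similarity of the neighbor that gave it, then + # divide by the total similarity of the neighbors that rated each item + rated = neighbor_ratings > 0 + weighted_sums = (neighbor_ratings * similarities[:, np.newaxis]).sum(axis=0) + similarity_sums = (rated * similarities[:, np.newaxis]).sum(axis=0) + print "Estimated scores per item: %s" % (weighted_sums / similarity_sums) + +Real systems differ in how they select the neighbors and combine the scores; this is only one plausible choice.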
+ +We can also take an item-based approach that computes some measure of similarity between items. This is usually based on the existing user-item preferences or ratings. Items that tend to be rated the same by similar users will be classed as similar under this approach. Once we have these similarities, we can represent a user in terms of the items they have interacted with and find items that are similar to these known items, which we can then recommend to the user. Again, a set of items similar to the known items is used to generate a combined score that estimates the user's preference for an unknown item. + +The user- and item-based approaches are usually referred to as nearest-neighbor models, since the estimated scores are computed based on the set of most similar users or items (that is, their neighbors). + +Finally, there are many model-based methods that attempt to model the user-item preferences themselves so that new preferences can be estimated directly by applying the model to unknown user-item combinations. + +### Matrix factorization + +Since Spark's recommendation models currently only include an implementation of matrix factorization, we will focus our attention on this class of models. This focus is with good reason, however: these types of models have consistently been shown to perform extremely well in collaborative filtering and were among the best models in well-known competitions such as the Netflix prize. + +### Note + +For more information on and a brief overview of the performance of the best algorithms for the Netflix prize, see . + +#### Explicit matrix factorization + +When we deal with data that consists of preferences of users that are provided by the users themselves, we refer to it as explicit preference data. This includes, for example, ratings, thumbs up, likes, and so on that are given by users to items. + +We can take these ratings and form a two-dimensional matrix with users as rows and items as columns. Each entry represents a rating given by a user to a certain item. Since in most cases, each user has only interacted with a relatively small set of items, this matrix has only a few non-zero entries (that is, it is very sparse). + +As a simple example, let's assume that we have the following user ratings for a set of movies: + + **Tom, Star Wars, 5** + **Jane, Titanic, 4** + **Bill, Batman, 3** + **Jane, Star Wars, 2** + **Bill, Titanic, 3** + +We will form the following ratings matrix: + +A simple movie-rating matrix + +Matrix factorization (or matrix completion) attempts to directly model this user-item matrix by representing it as a product of two smaller matrices of lower dimension. Thus, it is a dimensionality-reduction technique. If we have **U** users and **I** items, then our user-item matrix is of dimension U x I and might look something like the one shown in the following diagram: + +A sparse ratings matrix + +If we want to find a lower dimension (low-rank) approximation to our user-item matrix with the dimension **k** , we would end up with two matrices: one for users of size U x k and one for items of size I x k. These are known as factor matrices. If we multiply these two factor matrices, we would reconstruct an approximate version of the original ratings matrix.
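+ +As a rough illustration of the shapes involved, here is a minimal `numpy` sketch with randomly generated factor matrices (illustrative only, not actual model output): + + import numpy as np + + np.random.seed(42) + U, I, k = 5, 4, 2 # 5 users, 4 items, and 2 latent factors + user_factors = np.random.randn(U, k) # user-factor matrix of size U x k + item_factors = np.random.randn(I, k) # item-factor matrix of size I x k + # multiplying the factors reconstructs an approximate U x I ratings matrix + approx_ratings = np.dot(user_factors, item_factors.T) + print "Shape of reconstructed matrix: %s" % str(approx_ratings.shape) + # the predicted score for a user u and item i is a single dot product + print "Predicted score for user 0, item 1: %.4f" % np.dot(user_factors[0], item_factors[1]) +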
Note that while the original ratings matrix is typically very sparse, each factor matrix is dense, as shown in the following diagram: + +The user- and item-factor matrices + +These models are often also called latent feature models, as we are trying to discover some form of hidden features (which are represented by the factor matrices) that account for the structure of behavior inherent in the user-item rating matrix. While the latent features or factors are not directly interpretable, they might, perhaps, represent things such as the tendency of a user to like movies from a certain director, genre, style, or group of actors, for example. + +As we are directly modeling the user-item matrix, the prediction in these models is relatively straightforward: to compute a predicted rating for a user and item, we compute the vector dot product between the relevant row of the user-factor matrix (that is, the user's factor vector) and the relevant row of the item-factor matrix (that is, the item's factor vector). + +This is illustrated with the highlighted vectors in the following diagram: + +Computing recommendations from user- and item-factor vectors + +To find out the similarity between two items, we can use the same measures of similarity as we would use in the nearest-neighbor models, except that we can use the factor vectors directly by computing the similarity between two item-factor vectors, as illustrated in the following diagram: + +Computing similarity with item-factor vectors + +The benefit of factorization models is the relative ease of computing recommendations once the model is created. However, for very large user and item sets, this can become a challenge as it requires storage and computation across potentially many millions of user- and item-factor vectors. Another advantage, as mentioned earlier, is that they tend to offer very good performance. + +### Note + +Projects such as Oryx () and Prediction.io () focus on model serving for large-scale models, including recommenders based on matrix factorization. + +On the downside, factorization models are relatively more complex to understand and interpret compared to nearest-neighbor models and are often more computationally intensive during the model's training phase. + +#### Implicit matrix factorization + +So far, we have dealt with explicit preferences such as ratings. However, much of the preference data that we might be able to collect is implicit feedback, where the preferences between a user and item are not given to us, but are, instead, implied from the interactions they might have with an item. Examples include binary data (such as whether a user viewed a movie, whether they purchased a product, and so on) as well as count data (such as the number of times a user watched a movie). + +There are many different approaches to deal with implicit data. MLlib implements a particular approach that treats the input rating matrix as two matrices: a binary preference matrix, **P** , and a matrix of confidence weights, **C**. + +For example, let's assume that the user-movie ratings we saw previously were, in fact, the number of times each user had viewed that movie. The two matrices would look something like the ones shown in the following screenshot. Here, the matrix **P** informs us that a movie was viewed by a user, and the matrix **C** represents the confidence weighting, in the form of the view counts; generally, the more a user has watched a movie, the higher the confidence that they actually like it.
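+ +As a small sketch of how such a pair of matrices could be derived (in `numpy`, with made-up view counts; this mirrors the description above rather than MLlib's internal representation): + + import numpy as np + + # hypothetical view counts (rows are users, columns are movies; + # 0 means the user never viewed that movie) + view_counts = np.array([[5, 0, 1], [0, 3, 0], [2, 1, 0]]) + P = (view_counts > 0).astype(int) # binary preference matrix + C = view_counts # confidence weights taken directly from the view counts + print "P:\n%s" % P + print "C:\n%s" % C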
+ +Representation of an implicit preference and confidence matrix + +The implicit model still creates a user- and item-factor matrix. In this case, however, the matrix that the model is attempting to approximate is not the overall ratings matrix but the preference matrix P. If we compute a recommendation by calculating the dot product of a user- and item-factor vector, the score will not be an estimate of a rating directly. It will rather be an estimate of the preference of a user for an item (though the scores are not strictly between 0 and 1, they will generally be fairly close to that range). + +#### Alternating least squares + + **Alternating Least Squares** ( **ALS** ) is an optimization technique to solve matrix factorization problems; this technique is powerful, achieves good performance, and has proven to be relatively easy to implement in a parallel fashion. Hence, it is well suited for platforms such as Spark. At the time of writing this book, it is the only recommendation model implemented in MLlib. + +ALS works by iteratively solving a series of least squares regression problems. In each iteration, one of the user- or item-factor matrices is treated as fixed, while the other one is updated using the fixed factor and the rating data. Then, the factor matrix that was solved for is, in turn, treated as fixed, while the other one is updated. This process continues until the model has converged (or for a fixed number of iterations). + +### Note + +Spark's documentation for collaborative filtering contains references to the papers that underlie the ALS algorithms implemented for both the explicit and implicit data cases. You can view the documentation at . + +# Extracting the right features from your data + +In this section, we will use explicit rating data, without additional user or item metadata or other information related to the user-item interactions. Hence, the features that we need as inputs are simply the user IDs, movie IDs, and the ratings assigned to each user and movie pair. + +## Extracting features from the MovieLens 100k dataset + +Start the Spark shell in the Spark base directory, ensuring that you provide enough memory via the `--driver-memory` option: + + **>./bin/spark-shell --driver-memory 4g** + +In this example, we will use the same MovieLens dataset that we used in the previous chapter. Use the directory in which you placed the MovieLens 100k dataset as the input path in the following code. + +First, let's inspect the raw ratings dataset: + + val rawData = sc.textFile("/PATH/ml-100k/u.data") + rawData.first() + +You will see output similar to these lines of code: + + **14/03/30 11:42:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform...
using builtin-java classes where applicable** + **14/03/30 11:42:41 WARN LoadSnappy: Snappy native library not loaded** + **14/03/30 11:42:41 INFO FileInputFormat: Total input paths to process : 1** + **14/03/30 11:42:41 INFO SparkContext: Starting job: first at :15** + **14/03/30 11:42:41 INFO DAGScheduler: Got job 0 (first at :15) with 1 output partitions (allowLocal=true)** + **14/03/30 11:42:41 INFO DAGScheduler: Final stage: Stage 0 (first at :15)** + **14/03/30 11:42:41 INFO DAGScheduler: Parents of final stage: List()** + **14/03/30 11:42:41 INFO DAGScheduler: Missing parents: List()** + **14/03/30 11:42:41 INFO DAGScheduler: Computing the requested partition locally** + **14/03/30 11:42:41 INFO HadoopRDD: Input split: file:/Users/Nick/workspace/datasets/ml-100k/u.data:0+1979173** + **14/03/30 11:42:41 INFO SparkContext: Job finished: first at :15, took 0.030533 s** + **res0: String = 196 242 3 881250949** + +Recall that this dataset consisted of the `user id`, `movie id`, `rating`, and `timestamp` fields separated by a tab (`"\t"`) character. We don't need the time when the rating was made to train our model, so let's simply extract the first three fields: + + val rawRatings = rawData.map(_.split("\t").take(3)) + +We will first split each record on the `"\t"` character, which gives us an `Array[String]`. We will then use Scala's `take` function to keep only the first `3` elements of the array, which correspond to `user id`, `movie id`, and `rating`, respectively. + +We can inspect the first record of our new RDD by calling `rawRatings.first()`, which collects just the first record of the RDD back to the driver program. This will result in the following output: + + **14/03/30 12:24:00 INFO SparkContext: Starting job: first at :21** + **14/03/30 12:24:00 INFO DAGScheduler: Got job 1 (first at :21) with 1 output partitions (allowLocal=true)** + **14/03/30 12:24:00 INFO DAGScheduler: Final stage: Stage 1 (first at :21)** + **14/03/30 12:24:00 INFO DAGScheduler: Parents of final stage: List()** + **14/03/30 12:24:00 INFO DAGScheduler: Missing parents: List()** + **14/03/30 12:24:00 INFO DAGScheduler: Computing the requested partition locally** + **14/03/30 12:24:00 INFO HadoopRDD: Input split: file:/Users/Nick/workspace/datasets/ml-100k/u.data:0+1979173** + **14/03/30 12:24:00 INFO SparkContext: Job finished: first at :21, took 0.00391 s** + **res6: Array[String] = Array(196, 242, 3)** + +We will use Spark's MLlib library to train our model. Let's take a look at what methods are available for us to use and what input is required. First, import the `ALS` model from MLlib: + + import org.apache.spark.mllib.recommendation.ALS + +On the console, we can inspect the available methods on the ALS object using tab completion. Type in `ALS.` (note the dot) and then press the _Tab_ key. You should see the autocompletion of the methods: + + **ALS.** + **asInstanceOf isInstanceOf main toString train trainImplicit** + +The method we want to use is `train`. If we type `ALS.train` and hit _Enter_ , we will get an error.
However, this error will tell us what the method signature looks like: + + **ALS.train** + **<console>:12: error: ambiguous reference to overloaded definition,** + **both method train in object ALS of type (ratings: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating], rank: Int** + **, iterations: Int)org.apache.spark.mllib.recommendation.MatrixFactorizationModel** + **and method train in object ALS of type (ratings: org.apache.spark.rdd.RDD[org.apache.spark.mllib.recommendation.Rating], rank: Int, iterations: Int, lambda: Double)org.apache.spark.mllib.recommendation.MatrixFactorizationModel** + **match expected type ?** + **ALS.train** + **^** + +So, we can see that at a minimum, we need to provide the input arguments, `ratings`, `rank`, and `iterations`. The second method also requires an argument called `lambda`. We'll cover `rank`, `iterations`, and `lambda` shortly, but let's take a look at the `ratings` argument first. Let's import the `Rating` class that it references and use a similar approach to find out what an instance of `Rating` requires, by typing in `Rating()` and hitting _Enter_ : + + **import org.apache.spark.mllib.recommendation.Rating** + **Rating()** + **<console>:13: error: not enough arguments for method apply: (user: Int, product: Int, rating: Double)org.apache.spark.mllib.recommendation.Rating in object Rating.** + **Unspecified value parameters user, product, rating.** + **Rating()** + **^** + +As we can see from the preceding output, we need to provide the `ALS` model with an RDD that consists of `Rating` records. A `Rating` class, in turn, is just a wrapper around `user id`, `movie id` (called `product` here), and the actual `rating` arguments. We'll create our rating dataset using the `map` method, transforming the array of IDs and ratings into a `Rating` object: + + val ratings = rawRatings.map { case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) } + +### Note + +Notice that we need to use `toInt` or `toDouble` to convert the raw rating data (which was extracted as `Strings` from the text file) to `Int` or `Double` numeric inputs. Also, note the use of a `case` statement that allows us to extract the relevant variable names and use them directly (this saves us from having to use something like `val user = ratings(0)`). + +For more on Scala case statements and pattern matching as used here, take a look at . + +We now have an `RDD[Rating]` that we can verify by calling: + + **ratings.first()** + **14/03/30 12:32:48 INFO SparkContext: Starting job: first at :24** + **14/03/30 12:32:48 INFO DAGScheduler: Got job 2 (first at :24) with 1 output partitions (allowLocal=true)** + **14/03/30 12:32:48 INFO DAGScheduler: Final stage: Stage 2 (first at :24)** + **14/03/30 12:32:48 INFO DAGScheduler: Parents of final stage: List()** + **14/03/30 12:32:48 INFO DAGScheduler: Missing parents: List()** + **14/03/30 12:32:48 INFO DAGScheduler: Computing the requested partition locally** + **14/03/30 12:32:48 INFO HadoopRDD: Input split: file:/Users/Nick/workspace/datasets/ml-100k/u.data:0+1979173** + **14/03/30 12:32:48 INFO SparkContext: Job finished: first at :24, took 0.003752 s** + **res8: org.apache.spark.mllib.recommendation.Rating = Rating(196,242,3.0)** + +# Training the recommendation model + +Once we have extracted these simple features from our raw data, we are ready to proceed with model training; MLlib takes care of this for us.
All we have to do is provide the correctly-parsed input RDD we just created as well as our chosen model parameters. + +## Training a model on the MovieLens 100k dataset + +We're now ready to train our model! The other inputs required for our model are as follows: + + * `rank`: This refers to the number of factors in our ALS model, that is, the number of hidden features in our low-rank approximation matrices. Generally, the greater the number of factors, the better, but this has a direct impact on memory usage, both for computation and to store models for serving, particularly for large numbers of users or items. Hence, this is often a trade-off in real-world use cases. A rank in the range of 10 to 200 is usually reasonable. + * `iterations`: This refers to the number of iterations to run. While each iteration in `ALS` is guaranteed to decrease the reconstruction error of the ratings matrix, `ALS` models will converge to a reasonably good solution after relatively few iterations. So, we don't need to run for too many iterations in most cases (around 10 is often a good default). + * `lambda`: This parameter controls the regularization of our model. Thus, `lambda` controls overfitting. The higher the value of `lambda`, the more regularization is applied. What constitutes a sensible value is very dependent on the size, nature, and sparsity of the underlying data, and as with almost all machine learning models, the regularization parameter is something that should be tuned using out-of-sample test data and cross-validation approaches. + +We'll use a `rank` of `50`, `10` iterations, and a `lambda` parameter of `0.01` to illustrate how to train our model: + + val model = ALS.train(ratings, 50, 10, 0.01) + +This returns a `MatrixFactorizationModel` object, which contains the user and item factors in the form of an RDD of `(id, factor)` pairs. These are called `userFeatures` and `productFeatures`, respectively. For example: + + model.userFeatures + +You will see the following output: + + **res14: org.apache.spark.rdd.RDD[(Int, Array[Double])] = FlatMappedRDD[659] at flatMap at ALS.scala:231** + +We can see that the factors are in the form of an `Array[Double]`. + +Note that the operations used in MLlib's `ALS` implementation are lazy transformations, so the actual computation will only be performed once we call some sort of action on the resulting `RDDs` of the user and item factors.
We can force the computation using a Spark action such as `count`: + + **model.userFeatures.count** + +This will trigger the computation, and we will see quite a bit of output text similar to the following lines of code: + + **14/03/30 13:10:40 INFO SparkContext: Starting job: count at :26** + **14/03/30 13:10:40 INFO DAGScheduler: Registering RDD 665 (map at ALS.scala:147)** + **14/03/30 13:10:40 INFO DAGScheduler: Registering RDD 664 (map at ALS.scala:146)** + **14/03/30 13:10:40 INFO DAGScheduler: Registering RDD 674 (mapPartitionsWithIndex at ALS.scala:164)** + **...** + **14/03/30 13:10:45 INFO SparkContext: Job finished: count at :26, took 5.068255 s** + **res16: Long = 943** + +If we call `count` for the movie factors, we will see the following output: + + **model.productFeatures.count** + **14/03/30 13:15:21 INFO SparkContext: Starting job: count at :26** + **14/03/30 13:15:21 INFO DAGScheduler: Got job 10 (count at :26) with 1 output partitions (allowLocal=false)** + **14/03/30 13:15:21 INFO DAGScheduler: Final stage: Stage 165 (count at :26)** + **14/03/30 13:15:21 INFO DAGScheduler: Parents of final stage: List(Stage 169, Stage 166)** + **14/03/30 13:15:21 INFO DAGScheduler: Missing parents: List()** + **14/03/30 13:15:21 INFO DAGScheduler: Submitting Stage 165 (FlatMappedRDD[883] at flatMap at ALS.scala:231), which has no missing parents** + **14/03/30 13:15:21 INFO DAGScheduler: Submitting 1 missing tasks from Stage 165 (FlatMappedRDD[883] at flatMap at ALS.scala:231)** + **...** + **14/03/30 13:15:21 INFO SparkContext: Job finished: count at :26, took 0.030044 s** + **res21: Long = 1682** + +As expected, we have a factor array for each user (`943` factors) and movie (`1682` factors). + +### Training a model using implicit feedback data + +The standard matrix factorization approach in MLlib deals with explicit ratings. To work with implicit data, you can use the `trainImplicit` method. It is called in a manner similar to the standard `train` method. There is an additional parameter, `alpha`, that can be set (and in the same way, the regularization parameter, `lambda`, should be selected via testing and cross-validation methods). + +The `alpha` parameter controls the baseline level of confidence weighting applied. A higher level of `alpha` tends to make the model more confident about the fact that missing data equates to no preference for the relevant user-item pair. + +### Note + +As an exercise, try to take the existing MovieLens dataset and convert it into an implicit dataset. One possible approach is to convert it to binary feedback (0s and 1s) by applying a threshold on the ratings at some level. + +Another approach could be to convert the rating values into confidence weights (for example, perhaps, low ratings could imply zero weights, or even negative weights, which are supported by MLlib's implementation). + +Train a model on this dataset and compare the results of the following section with those generated by your implicit model. + +# Using the recommendation model + +Now that we have our trained model, we're ready to use it to make predictions. These predictions typically take one of two forms: recommendations for a given user and related or similar items for a given item. + +## User recommendations + +In this case, we would like to generate recommended items for a given user. This usually takes the form of a _top-K_ list, that is, the _K_ items that our model predicts will have the highest probability of the user liking them.
This is done by computing the predicted score for each item and ranking the list based on this score. + +The exact method to perform this computation depends on the model involved. For example, in user-based approaches, the ratings of similar users on items are used to compute the recommendations for a user, while in an item-based approach, the computation is based on the similarity of items the user has rated to the candidate items. + +In matrix factorization, because we are modeling the ratings matrix directly, the predicted score can be computed as the vector dot product between a user-factor vector and an item-factor vector. + +### Generating movie recommendations from the MovieLens 100k dataset + +As MLlib's recommendation model is based on matrix factorization, we can use the factor matrices computed by our model to compute predicted scores (or ratings) for a user. We will focus on the explicit rating case using MovieLens data; however, the approach is the same when using the implicit model. + +The `MatrixFactorizationModel` class has a convenient `predict` method that will compute a predicted score for a given user and item combination: + + val predictedRating = model.predict(789, 123) + +The output is as follows: + + **14/03/30 16:10:10 INFO SparkContext: Starting job: lookup at MatrixFactorizationModel.scala:45** + **14/03/30 16:10:10 INFO DAGScheduler: Got job 30 (lookup at MatrixFactorizationModel.scala:45) with 1 output partitions (allowLocal=false)** + **...** + **14/03/30 16:10:10 INFO SparkContext: Job finished: lookup at MatrixFactorizationModel.scala:46, took 0.023077 s** + **predictedRating: Double = 3.128545693368485** + +As we can see, this model predicts a rating of `3.12` for user `789` and movie `123`. + +### Tip + +Note that you might see different results than those shown in this section because the `ALS` model is initialized randomly. So, different runs of the model will lead to different solutions. + +The `predict` method can also take an RDD of `(user, item)` IDs as the input and will generate predictions for each of these. We can use this method to make predictions for many users and items at the same time. + +To generate the _top-K_ recommended items for a user, `MatrixFactorizationModel` provides a convenience method called `recommendProducts`. This takes two arguments: `user` and `num`, where `user` is the user ID, and `num` is the number of items to recommend. + +It returns the top `num` items ranked in the order of the predicted score. Here, the scores are computed as the dot product between the user-factor vector and each item-factor vector. + +Let's generate the top `10` recommended items for user `789`: + + val userId = 789 + val K = 10 + val topKRecs = model.recommendProducts(userId, K) + +We now have a set of predicted ratings for each movie for user `789`. 
If we print this out, we could inspect the top 10 recommendations for this user: + + println(topKRecs.mkString("\n")) + +You should see the following output on your console: + + **Rating(789,715,5.931851273771102)** + **Rating(789,12,5.582301095666215)** + **Rating(789,959,5.516272981542168)** + **Rating(789,42,5.458065302395629)** + **Rating(789,584,5.449949837103569)** + **Rating(789,750,5.348768847643657)** + **Rating(789,663,5.30832117499004)** + **Rating(789,134,5.278933936827717)** + **Rating(789,156,5.250959077906759)** + **Rating(789,432,5.169863417126231)** + +#### Inspecting the recommendations + +We can give these recommendations a sense check by taking a quick look at the titles of the movies a user has rated and the recommended movies. First, we need to load the movie data (which is one of the datasets we explored in the previous chapter). We'll collect this data as a `Map[Int, String]`, mapping the movie ID to the title: + + val movies = sc.textFile("/PATH/ml-100k/u.item") + val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt,array(1))).collectAsMap() + titles(123) + +The preceding code will produce the following output: + + **res68: String = Frighteners, The (1996)** + +For our user `789`, we can find out what movies they have rated, take the `10` movies with the highest rating, and then check the titles. We will do this now by first using the `keyBy` Spark function to create an RDD of key-value pairs from our `ratings` RDD, where the key will be the user ID. We will then use the `lookup` function to return just the ratings for this key (that is, that particular user ID) to the driver: + + val moviesForUser = ratings.keyBy(_.user).lookup(789) + +Let's see how many movies this user has rated. This will be the `size` of the `moviesForUser` collection: + + println(moviesForUser.size) + +We will see that this user has rated `33` movies. + +Next, we will take the 10 movies with the highest ratings by sorting the `moviesForUser` collection using the `rating` field of the `Rating` object.
We will then extract the movie title for the relevant product ID attached to the `Rating` class from our mapping of movie titles and print out the top `10` titles with their ratings: + + moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.product), rating.rating)).foreach(println) + +You will see the following output displayed: + + **(Godfather, The (1972),5.0)** + **(Trainspotting (1996),5.0)** + **(Dead Man Walking (1995),5.0)** + **(Star Wars (1977),5.0)** + **(Swingers (1996),5.0)** + **(Leaving Las Vegas (1995),5.0)** + **(Bound (1996),5.0)** + **(Fargo (1996),5.0)** + **(Last Supper, The (1995),5.0)** + **(Private Parts (1997),4.0)** + +Now, let's take a look at the top 10 recommendations for this user and see what the titles are using the same approach as the one we used earlier (note that the recommendations are already sorted): + + topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println) + +The output is as follows: + + **(To Die For (1995),5.931851273771102)** + **(Usual Suspects, The (1995),5.582301095666215)** + **(Dazed and Confused (1993),5.516272981542168)** + **(Clerks (1994),5.458065302395629)** + **(Secret Garden, The (1993),5.449949837103569)** + **(Amistad (1997),5.348768847643657)** + **(Being There (1979),5.30832117499004)** + **(Citizen Kane (1941),5.278933936827717)** + **(Reservoir Dogs (1992),5.250959077906759)** + **(Fantasia (1940),5.169863417126231)** + +We leave it to you to decide whether these recommendations make sense. + +## Item recommendations + +Item recommendations are about answering the following question: for a certain item, what are the items most similar to it? Here, the precise definition of similarity is dependent on the model involved. In most cases, similarity is computed by comparing the vector representation of two items using some similarity measure. Common similarity measures include Pearson correlation and cosine similarity for real-valued vectors and Jaccard similarity for binary vectors. + +### Generating similar movies for the MovieLens 100k dataset + +The current `MatrixFactorizationModel` API does not directly support item-to-item similarity computations. Therefore, we will need to create our own code to do this. + +We will use the cosine similarity metric, and we will use the jblas linear algebra library (a dependency of MLlib) to compute the required vector dot products. This is similar to how the existing `predict` and `recommendProducts` methods work, except that we will use cosine similarity as opposed to just the dot product. + +We would like to compare the factor vector of our chosen item with each of the other items, using our similarity metric. In order to perform linear algebra computations, we will first need to create a vector object out of the factor vectors, which are in the form of an `Array[Double]`. The `JBLAS` class, `DoubleMatrix`, takes an `Array[Double]` as the constructor argument as follows: + + import org.jblas.DoubleMatrix + val aMatrix = new DoubleMatrix(Array(1.0, 2.0, 3.0)) + +Here is the output of the preceding code: + + **aMatrix: org.jblas.DoubleMatrix = [1.000000; 2.000000; 3.000000]** + +### Tip + +Note that using jblas, vectors are represented as a one-dimensional `DoubleMatrix` class, while matrices are a two-dimensional `DoubleMatrix` class. + +We will need a method to compute the cosine similarity between two vectors. Cosine similarity is a measure of the angle between two vectors in an _n_ -dimensional space. 
It is computed by first calculating the dot product between the vectors, and then dividing the result by the product of the norms (or lengths) of the two vectors; specifically, the L2-norm is used. In this way, cosine similarity is a normalized dot product:

    cosineSimilarity(v1, v2) = (v1 · v2) / (||v1|| * ||v2||)

The cosine similarity measure takes on values between -1 and 1. A value of 1 implies that the two vectors are perfectly aligned (completely similar), while a value of 0 implies that they are orthogonal (that is, there is no similarity). This measure is useful because it also captures negative similarity: a value of -1 implies that not only are the vectors not similar, they point in exactly opposite directions.

Let's create our `cosineSimilarity` function here:

    def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
      vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
    }

### Tip

Note that we defined a return type of `Double` for this function. We are not required to do this, since Scala features type inference. However, it can often be useful to document return types for Scala functions.

Let's try it out on one of our item factors for item `567`. We will need to collect an item factor from our model; we will do this using the `lookup` method, in a similar way to how we collected the ratings for a specific user earlier. In the following lines of code, we also use the `head` function, since `lookup` returns an array of values, and we only need the first value (in fact, there will only be one value, which is the factor vector for this item).

Since this will be an `Array[Double]`, we will then need to create a `DoubleMatrix` object from it and compute the cosine similarity with itself:

    val itemId = 567
    val itemFactor = model.productFeatures.lookup(itemId).head
    val itemVector = new DoubleMatrix(itemFactor)
    cosineSimilarity(itemVector, itemVector)

A similarity metric should measure how close, in some sense, two vectors are to each other. Here, we can see that our cosine similarity metric tells us that this item vector is identical to itself, which is what we would expect:

    **res113: Double = 1.0**

Now, we are ready to apply our similarity metric to each item:

    val sims = model.productFeatures.map{ case (id, factor) =>
      val factorVector = new DoubleMatrix(factor)
      val sim = cosineSimilarity(factorVector, itemVector)
      (id, sim)
    }

Next, we can compute the top 10 most similar items by sorting the items by their similarity scores:

    // recall we defined K = 10 earlier
    val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })

In the preceding code snippet, we used Spark's `top` function, which is an efficient way to compute _top-K_ results in a distributed fashion, instead of using `collect` to return all the data to the driver and sorting it locally (remember that we could be dealing with millions of users and items in the case of recommendation models).

We need to tell Spark how to sort the `(item id, similarity score)` pairs in the `sims` RDD. To do this, we will pass an extra argument to `top`, which is a Scala `Ordering` object that tells Spark that it should sort by the value in the key-value pair (that is, sort by `similarity`).
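
For contrast, here is a minimal sketch of the naive approach the preceding paragraph warns against; the variable name is our own, and this is only practical when the item set is small:

    // Pull every (id, similarity) pair back to the driver and sort locally.
    // Workable for the 1682 MovieLens movies, but not for millions of items.
    val sortedLocally = sims.collect().sortBy { case (_, sim) => -sim }.take(10)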

Finally, we can print the 10 items with the highest computed similarity metric to our given item:

    println(sortedSims.take(10).mkString("\n"))

You will see output like the following one:

    **(567,1.0000000000000002)**
    **(1471,0.6932331537649621)**
    **(670,0.6898690594544726)**
    **(201,0.6897964975027041)**
    **(343,0.6891221044611473)**
    **(563,0.6864214133620066)**
    **(294,0.6812075443259535)**
    **(413,0.6754663844488256)**
    **(184,0.6702643811753909)**
    **(109,0.6594872765176396)**

Not surprisingly, we can see that the top-ranked similar item is our chosen item itself (its score differs from 1.0 only by floating-point rounding error). The rest are the other items in our set of items, ranked in order of our similarity metric.

#### Inspecting the similar items

Let's see what the title of our chosen movie is:

    println(titles(itemId))

The preceding code will print the following output:

    **Wes Craven's New Nightmare (1994)**

As we did for user recommendations, we can sense check our item-to-item similarity computations and take a look at the titles of the most similar movies. This time, we will take the top 11 results and keep elements 1 to 11 of the list, skipping element 0, which is the given movie itself:

    val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
    sortedSims2.slice(1, 11).map{ case (id, sim) => (titles(id), sim) }.mkString("\n")

You will see the movie titles and scores displayed similar to this output:

    **(Hideaway (1995),0.6932331537649621)**
    **(Body Snatchers (1993),0.6898690594544726)**
    **(Evil Dead II (1987),0.6897964975027041)**
    **(Alien: Resurrection (1997),0.6891221044611473)**
    **(Stephen King's The Langoliers (1995),0.6864214133620066)**
    **(Liar Liar (1997),0.6812075443259535)**
    **(Tales from the Crypt Presents: Bordello of Blood (1996),0.6754663844488256)**
    **(Army of Darkness (1993),0.6702643811753909)**
    **(Mystery Science Theater 3000: The Movie (1996),0.6594872765176396)**
    **(Scream (1996),0.6538249646863378)**

### Tip

Once again, note that you might see quite different results due to random model initialization.

Now that you have computed similar items using cosine similarity, see if you can do the same with the user-factor vectors to compute similar users for a given user.

# Evaluating the performance of recommendation models

How do we know whether the model we have trained is a good model? We need to be able to evaluate its predictive performance in some way. **Evaluation metrics** are measures of a model's predictive capability or accuracy. Some are direct measures of how well a model predicts the target variable (such as Mean Squared Error), while others are concerned with how well the model performs at predicting things that might not be directly optimized in the model, but are often closer to what we care about in the real world (such as mean average precision).

Evaluation metrics provide a standardized way of comparing the performance of the same model with different parameter settings, and of comparing performance across different models. Using these metrics, we can perform model selection to choose the best-performing model from the set of models we wish to evaluate.

Here, we will show you how to calculate two common evaluation metrics used in recommender systems and collaborative filtering models: Mean Squared Error and mean average precision at K.

## Mean Squared Error

The **Mean Squared Error** ( **MSE** ) is a direct measure of the reconstruction error of the user-item rating matrix.
It is also the objective function being minimized in certain models, specifically many matrix-factorization techniques, including `ALS`. As such, it is commonly used in explicit ratings settings.

It is defined as the sum of the squared errors divided by the number of observations. The squared error, in turn, is the square of the difference between the predicted rating for a given user-item pair and the actual rating.

We will use our user `789` as an example. Let's take the first rating for this user from the `moviesForUser` set of `Ratings` that we previously computed:

    val actualRating = moviesForUser.take(1)(0)

Here is the output:

    **actualRating: org.apache.spark.mllib.recommendation.Rating = Rating(789,1012,4.0)**

We will see that the rating for this user-item combination is 4. Next, we will compute the model's predicted rating:

    val predictedRating = model.predict(789, actualRating.product)

The output of the model's predicted rating is as follows:

    **...**
    **14/04/13 13:01:15 INFO SparkContext: Job finished: lookup at MatrixFactorizationModel.scala:46, took 0.025404 s**
    **predictedRating: Double = 4.001005374200248**

We will see that the predicted rating is about 4, which is very close to the actual rating. Finally, we will compute the squared error between the actual rating and the predicted rating:

    val squaredError = math.pow(predictedRating - actualRating.rating, 2.0)

The preceding code will output the squared error:

    **squaredError: Double = 1.010777282523947E-6**

So, in order to compute the overall MSE for the dataset, we need to compute this squared error for each `(user, movie, actual rating, predicted rating)` entry, sum them up, and divide by the number of ratings. We will do this in the following code snippet.

### Tip

Note that the following code is adapted from the Apache Spark programming guide for ALS at .

First, we will extract the user and product IDs from the `ratings` RDD and make predictions for each user-item pair using `model.predict`. We will use the user-item pair as the key and the predicted rating as the value:

    val usersProducts = ratings.map{ case Rating(user, product, rating) => (user, product)}
    val predictions = model.predict(usersProducts).map{
      case Rating(user, product, rating) => ((user, product), rating)
    }

Next, we extract the actual ratings by also mapping the `ratings` RDD so that the user-item pair is the key and the actual rating is the value. Now that we have two RDDs with the same form of key, we can join them together to create a new RDD with the actual and predicted ratings for each user-item combination:

    val ratingsAndPredictions = ratings.map{
      case Rating(user, product, rating) => ((user, product), rating)
    }.join(predictions)

Finally, we will compute the MSE by summing up the squared errors using `reduce` and dividing by the number of records obtained from the `count` method:

    val MSE = ratingsAndPredictions.map{
      case ((user, product), (actual, predicted)) => math.pow((actual - predicted), 2)
    }.reduce(_ + _) / ratingsAndPredictions.count
    println("Mean Squared Error = " + MSE)

The output is as follows:

    **Mean Squared Error = 0.08231947642632852**

It is common to use the **Root Mean Squared Error** ( **RMSE** ), which is just the square root of the MSE metric. This is somewhat more interpretable, as it is in the same units as the underlying data (that is, the ratings in this case).
It is equivalent to the standard deviation of the differences between the predicted and actual ratings. We can compute it simply as follows:

    val RMSE = math.sqrt(MSE)
    println("Root Mean Squared Error = " + RMSE)

The preceding code will print the Root Mean Squared Error:

    **Root Mean Squared Error = 0.2869137090247319**

## Mean average precision at K

**Mean average precision at K** ( **MAPK** ) is the mean of the **average precision at K** ( **APK** ) metric across all instances in the dataset. APK is a metric commonly used in information retrieval; it measures the average relevance of a set of _top-K_ documents presented in response to a query. For each query instance, we will compare the set of _top-K_ results with the set of actual relevant documents (that is, a ground truth set of relevant documents for the query).

In the APK metric, the order of the result set matters: the APK score is higher when the result documents are both relevant and presented nearer the top of the results. It is, thus, a good metric for recommender systems, in that we typically compute the _top-K_ recommended items for each user and present these to the user. Of course, we prefer models where the items with the highest predicted scores (which are presented at the top of the list of recommendations) are, in fact, the most relevant items for the user. APK and other ranking-based metrics are also more appropriate evaluation measures for implicit datasets; here, MSE makes less sense.

In order to evaluate our model, we can use APK, where each user is the equivalent of a query, and the set of _top-K_ recommended items is the document result set. The relevant documents (that is, the ground truth) in this case are the set of items that a user interacted with. Hence, APK attempts to measure how good our model is at predicting items that a user will find relevant and choose to interact with.

### Note

The code for the following average precision computation is based on .

More information on MAPK can be found at .

Our function to compute the APK is shown here:

    def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
      val predK = predicted.take(k)
      var score = 0.0
      var numHits = 0.0
      for ((p, i) <- predK.zipWithIndex) {
        if (actual.contains(p)) {
          numHits += 1.0
          score += numHits / (i.toDouble + 1.0)
        }
      }
      if (actual.isEmpty) {
        1.0
      } else {
        score / scala.math.min(actual.size, k).toDouble
      }
    }

As you can see, this takes as input a list of `actual` item IDs that are associated with the user, and another list of `predicted` item IDs, which are our model's estimates of what will be relevant to the user.

We can compute the APK metric for our example user `789` as follows.
First, we will extract the actual movie IDs for the user:

    val actualMovies = moviesForUser.map(_.product)

The output is as follows:

    **actualMovies: Seq[Int] = ArrayBuffer(1012, 127, 475, 93, 1161, 286, 293, 9, 50, 294, 181, 1, 1008, 508, 284, 1017, 137, 111, 742, 248, 249, 1007, 591, 150, 276, 151, 129, 100, 741, 288, 762, 628, 124)**

We will then use the movie recommendations we made previously to compute the APK score using `K = 10`:

    val predictedMovies = topKRecs.map(_.product)

Here is the output:

    **predictedMovies: Array[Int] = Array(27, 497, 633, 827, 602, 849, 401, 584, 1035, 1014)**

The following code will produce the average precision:

    val apk10 = avgPrecisionK(actualMovies, predictedMovies, 10)

The preceding code will print:

    **apk10: Double = 0.0**

In this case, we can see that our model is not doing a very good job of predicting relevant movies for this user, as the APK score is 0.

In order to compute the APK for each user and average them to compute the overall MAPK, we will need to generate the list of recommendations for each user in our dataset. While this can be fairly intensive on a large scale, we can distribute the computation using our Spark functionality. However, one limitation is that each worker must have the full item-factor matrix available so that it can compute the dot product between the relevant user vector and all item vectors. This can be a problem when the number of items is extremely high, as the item matrix must fit in the memory of one machine.

### Tip

There is actually no easy way around this limitation. One possible approach is to only compute recommendations for a subset of items from the total item set, using approximate techniques such as Locality Sensitive Hashing ().

We will now see how to go about this. First, we will collect the item factors and form a `DoubleMatrix` object from them:

    val itemFactors = model.productFeatures.map { case (id, factor) => factor }.collect()
    val itemMatrix = new DoubleMatrix(itemFactors)
    println(itemMatrix.rows, itemMatrix.columns)

The output of the preceding code is as follows:

    **(1682,50)**

This gives us a matrix with `1682` rows and `50` columns, as we would expect from `1682` movies with a factor dimension of `50`. Next, we will distribute the item matrix as a broadcast variable so that it is available on each worker node:

    val imBroadcast = sc.broadcast(itemMatrix)

You will see the output as follows:

    **14/04/13 21:02:01 INFO MemoryStore: ensureFreeSpace(672960) called with curMem=4006896, maxMem=311387750**
    **14/04/13 21:02:01 INFO MemoryStore: Block broadcast_21 stored as values to memory (estimated size 657.2 KB, free 292.5 MB)**
    **imBroadcast: org.apache.spark.broadcast.Broadcast[org.jblas.DoubleMatrix] = Broadcast(21)**

Now we are ready to compute the recommendations for each user. We will do this by applying a `map` function to each user factor, within which we will perform a matrix multiplication between the user-factor vector and the movie-factor matrix. The result is a vector (of length `1682`, that is, the number of movies we have) with the predicted rating for each movie.
We will then sort these predictions by the predicted rating:

    val allRecs = model.userFeatures.map{ case (userId, array) =>
      val userVector = new DoubleMatrix(array)
      val scores = imBroadcast.value.mmul(userVector)
      val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
      val recommendedIds = sortedWithId.map(_._2 + 1).toSeq
      (userId, recommendedIds)
    }

You will see the following on the screen:

    **allRecs: org.apache.spark.rdd.RDD[(Int, Seq[Int])] = MappedRDD[269] at map at <console>:29**

As we can see, we now have an RDD that contains a list of movie IDs for each user ID. These movie IDs are sorted in order of the estimated rating.

### Tip

Note that we needed to add 1 to the returned movie IDs (the `_._2 + 1` expression in the preceding code snippet), as the item-factor matrix is 0-indexed, while our movie IDs start at `1`.

We also need the list of movie IDs for each user to pass into our APK function as the `actual` argument. We already have the `ratings` RDD ready, so we can extract just the user and movie IDs from it.

If we use Spark's `groupBy` operator, we will get an RDD that contains a list of `(userid, movieid)` pairs for each user ID (as the user ID is the key on which we perform the `groupBy` operation):

    val userMovies = ratings.map{ case Rating(user, product, rating) => (user, product) }.groupBy(_._1)

The output of the preceding code is as follows:

    **userMovies: org.apache.spark.rdd.RDD[(Int, Seq[(Int, Int)])] = MapPartitionsRDD[277] at groupBy at <console>:21**

Finally, we can use Spark's `join` operator to join these two RDDs together on the user ID key. Then, for each user, we have the list of actual and predicted movie IDs that we can pass to our APK function. In a manner similar to how we computed MSE, we will sum each of these APK scores using a `reduce` action and divide by the number of users (that is, the count of the `allRecs` RDD):

    val K = 10
    val MAPK = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2).toSeq
      avgPrecisionK(actual, predicted, K)
    }.reduce(_ + _) / allRecs.count
    println("Mean Average Precision at K = " + MAPK)

The preceding code will print the mean average precision at K as follows:

    **Mean Average Precision at K = 0.030486963254725705**

Our model achieves a fairly low MAPK. However, note that typical values for recommendation tasks are usually relatively low, especially if the item set is extremely large.

Try out a few parameter settings for `lambda` and `rank` (and `alpha` if you are using the implicit version of ALS), and see whether you can find a model that performs better based on the RMSE and MAPK evaluation metrics.

## Using MLlib's built-in evaluation functions

While we have computed MSE, RMSE, and MAPK from scratch, and it is a useful learning exercise to do so, MLlib provides convenience functions to do this for us in the `RegressionMetrics` and `RankingMetrics` classes.

### RMSE and MSE

First, we will compute the MSE and RMSE metrics using `RegressionMetrics`. We will instantiate a `RegressionMetrics` instance by passing in an RDD of key-value pairs that represent the predicted and true values for each data point, as shown in the following code snippet.
Here, we will again use the `ratingsAndPredictions` RDD we computed in our earlier example:

    import org.apache.spark.mllib.evaluation.RegressionMetrics
    val predictedAndTrue = ratingsAndPredictions.map { case ((user, product), (predicted, actual)) => (predicted, actual) }
    val regressionMetrics = new RegressionMetrics(predictedAndTrue)

We can then access various metrics, including MSE and RMSE. We will print out these metrics here:

    println("Mean Squared Error = " + regressionMetrics.meanSquaredError)
    println("Root Mean Squared Error = " + regressionMetrics.rootMeanSquaredError)

You will see that the output for MSE and RMSE is exactly the same as the metrics we computed earlier:

    **Mean Squared Error = 0.08231947642632852**
    **Root Mean Squared Error = 0.2869137090247319**

### MAP

As we did for MSE and RMSE, we can compute ranking-based evaluation metrics using MLlib's `RankingMetrics` class. Similar to our own average precision function, we need to pass in an RDD of key-value pairs, where the key is an `Array` of predicted item IDs for a user, while the value is an array of actual item IDs.

The implementation of the average precision at K function in `RankingMetrics` is slightly different from ours, so we will get different results. However, the computation of the overall mean average precision (MAP, which does not use a cutoff at K) is the same as our function if we select `K` to be very high (say, at least as high as the number of items in our item set).

First, we will calculate MAP using `RankingMetrics`:

    import org.apache.spark.mllib.evaluation.RankingMetrics
    val predictedAndTrueForRanking = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2)
      (predicted.toArray, actual.toArray)
    }
    val rankingMetrics = new RankingMetrics(predictedAndTrueForRanking)
    println("Mean Average Precision = " + rankingMetrics.meanAveragePrecision)

You will see the following output:

    **Mean Average Precision = 0.07171412913757183**

Next, we will use our function to compute the MAP in exactly the same way as we did previously, except that we set `K` to a very high value, say `2000`:

    val MAPK2000 = allRecs.join(userMovies).map{ case (userId, (predicted, actualWithIds)) =>
      val actual = actualWithIds.map(_._2).toSeq
      avgPrecisionK(actual, predicted, 2000)
    }.reduce(_ + _) / allRecs.count
    println("Mean Average Precision = " + MAPK2000)

You will see that the MAP from our own function is the same as the one computed using `RankingMetrics`:

    **Mean Average Precision = 0.07171412913757186**

### Note

We will not cover cross-validation in this chapter, as we will provide a detailed treatment in the next few chapters. However, note that the same techniques for cross-validation that are explored in the upcoming chapters can be used to evaluate recommendation models, using performance metrics such as MSE, RMSE, and MAP, which we covered in this section.

# Summary

In this chapter, we used Spark's MLlib library to train a collaborative filtering recommendation model, and you learned how to use this model to make predictions for the items that a given user might have a preference for. We also used our model to find items that are similar or related to a given item. Finally, we explored common metrics to evaluate the predictive capability of our recommendation model.

In the next chapter, you will learn how to use Spark to train a model to classify your data and to use standard evaluation mechanisms to gauge the performance of your model.

# Chapter 5. Building a Classification Model with Spark

In this chapter, you will learn the basics of classification models and how they can be used in a variety of contexts. Classification generically refers to classifying things into distinct categories or classes. In the case of a classification model, we typically wish to assign classes based on a set of features. The features might represent variables related to an item or object, an event or context, or some combination of these.

The simplest form of classification is when we have two classes; this is referred to as binary classification. One of the classes is usually labeled as the positive class (assigned a label of 1), while the other is labeled as the negative class (assigned a label of -1 or, sometimes, 0).

A simple example with two classes is shown in the following figure. The input features in this case have two dimensions, and the feature values are represented on the _x_ and _y_ axes in the figure.

Our task is to train a model that can classify new data points in this two-dimensional space as either one class (red) or the other (blue).

A simple binary classification problem

If we have more than two classes, we refer to this as multiclass classification, and classes are typically labeled using integer numbers starting at 0 (for example, five different classes would range from label 0 to 4). An example is shown in the following figure. Again, the input features are assumed to be two-dimensional for ease of illustration.

A simple multiclass classification problem

Classification is a form of supervised learning where we train a model with training examples that include known targets or outcomes of interest (that is, the model is supervised with these example outcomes). Classification models can be used in many situations, but a few common examples include:

  * Predicting the probability of Internet users clicking on an online advert; here, the classes are binary in nature (that is, click or no click)
  * Detecting fraud; again, in this case, the classes are commonly binary (fraud or no fraud)
  * Predicting defaults on loans (binary)
  * Classifying images, video, or sounds (most often multiclass, with potentially very many different classes)
  * Assigning categories or tags to news articles, web pages, or other content (multiclass)
  * Discovering e-mail and web spam, network intrusions, and other malicious behavior (binary or multiclass)
  * Detecting failure situations, for example, in computer systems or networks
  * Ranking customers or users in order of the probability that they might purchase a product or use a service (this can be framed as classification by predicting probabilities and then ranking in descending order)
  * Predicting customers or users who might stop using a product, service, or provider (called churn)

These are just a few possible use cases. In fact, it is probably safe to say that classification is one of the most widely used machine learning and statistical techniques in modern businesses, especially online businesses.

In this chapter, we will:

  * Discuss the types of classification models available in MLlib
  * Use Spark to extract the appropriate features from raw input data
  * Train a number of classification models using MLlib
  * Make predictions with our classification models
  * Apply a number of standard evaluation techniques to assess the predictive performance of our models
  * Illustrate how to improve model performance using some of the feature-extraction approaches from Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_
  * Explore the impact of parameter tuning on model performance and learn how to use cross-validation to select the most optimal model parameters

# Types of classification models

We will explore three common classification models available in Spark: linear models, decision trees, and naive Bayes models. Linear models, while less complex, are relatively easy to scale to very large datasets. Decision trees are a powerful nonlinear technique that can be a little more difficult to scale up (fortunately, MLlib takes care of this for us!) and more computationally intensive to train, but delivers leading performance in many situations. Naive Bayes models are simpler, but are easy to train efficiently and parallelize (in fact, they require only one pass over the dataset). They can also give reasonable performance in many cases when appropriate feature engineering is used. A naive Bayes model also provides a good baseline model against which we can measure the performance of other models.

Currently, Spark's MLlib library supports binary classification for linear models, decision trees, and naive Bayes models, and multiclass classification for decision trees and naive Bayes models. In this book, for simplicity in illustrating the examples, we will focus on the binary case.

## Linear models

The core idea of linear models (or generalized linear models) is that we model the predicted outcome of interest (often called the target or dependent variable) as a function of a simple linear predictor applied to the input variables (also referred to as features or independent variables):

    y = f(w^T x)

Here, _y_ is the target variable, _w_ is the vector of parameters (known as the weight vector), and _x_ is the vector of input features.

_w^T x_ is the linear predictor (or vector dot product) of the weight vector _w_ and feature vector _x_. To this linear predictor, we apply a function _f_ (called the link function).

Linear models can, in fact, be used for both classification and regression, simply by changing the link function. Standard linear regression (covered in the next chapter) uses an identity link (that is, _y = w^T x_ directly), while binary classification uses alternative link functions, as discussed here.

Let's take a look at the example of online advertising. In this case, the target variable would be 0 (often assigned the class label of -1 in mathematical treatments) if no click was observed for a given advert displayed on a web page (called an impression). The target variable would be 1 if a click occurred. The feature vector for each impression would consist of variables related to the impression event (such as features relating to the user, web page, advert and advertiser, and various other factors relating to the context of the event, such as the type of device used, the time of the day, and geolocation). A small code sketch of such a linear predictor follows.
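
To make this concrete, here is a minimal, self-contained sketch of how a linear model turns a feature vector into a prediction using a logistic link; the weights, feature values, and variable names are made up purely for illustration and are not MLlib code:

    // Hypothetical weight vector w and feature vector x for one impression.
    val w = Array(0.5, -0.2, 0.1)
    val x = Array(1.0, 3.0, -1.0)
    // The linear predictor w^T x is a simple dot product.
    val linearPredictor = w.zip(x).map { case (wi, xi) => wi * xi }.sum
    // A logistic link function f maps the raw score to a probability in (0, 1).
    val f = (z: Double) => 1.0 / (1.0 + math.exp(-z))
    val predictedProbability = f(linearPredictor)

Applying a threshold (say, 0.5) to `predictedProbability` would then turn this probability into a binary class prediction.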

Thus, we would like to find a model that maps a given input feature vector (advert impression) to a predicted outcome (click or not). To make a prediction for a new data point, we will take the new feature vector (which is unseen, and hence, we do not know what the target variable is) and compute the dot product with our weight vector. We will then apply the relevant link function, and the result is our predicted outcome (after applying a threshold to the prediction, in the case of some models).

Given a set of input data in the form of feature vectors and target variables, we would like to find the weight vector that is the best fit for the data, in the sense that we minimize some error between what our model predicts and the actual outcomes observed. This process is called **model fitting** , **training** , or **optimization**.

More formally, we seek to find the weight vector that minimizes the sum, over all the training examples, of the loss (or error) computed from some loss function. The loss function takes the weight vector, feature vector, and the actual outcome for a given training example as input, and outputs the loss. In fact, the loss function itself is effectively specified by the link function; hence, for a given type of classification or regression (that is, a given link function), there is a corresponding loss function.

### Tip

For further details on linear models and loss functions, see the linear methods section related to binary classification in the _Spark Programming Guide_ at . Also, see the Wikipedia entry for generalized linear models at .

While a detailed treatment of linear models and loss functions is beyond the scope of this book, MLlib provides two loss functions suitable for binary classification (you can learn more about them from the Spark documentation). The first one is the logistic loss, which equates to a model known as **logistic regression** , while the second one is the hinge loss, which is equivalent to a linear **Support Vector Machine** ( **SVM** ). Note that the SVM does not strictly fall into the statistical framework of generalized linear models, but it can be used in the same way, as it essentially specifies a loss and link function.

In the following image, we show the logistic loss and hinge loss relative to the actual zero-one loss. The zero-one loss is the true loss for binary classification--it is either zero if the model predicts correctly or one if the model predicts incorrectly. The reason it is not actually used is that it is not a differentiable loss function, so it is not possible to easily compute a gradient, and it is thus very difficult to optimize.

The other loss functions are approximations to the zero-one loss that make optimization possible.

The logistic, hinge, and zero-one loss functions

### Note

The preceding loss diagram is adapted from the scikit-learn example at .

### Logistic regression

Logistic regression is a probabilistic model--that is, its predictions are bounded between 0 and 1, and for binary classification they equate to the model's estimate of the probability of the data point belonging to the positive class. Logistic regression is one of the most widely used linear classification models.

As mentioned earlier, the link function used in logistic regression is the logit link:

    1 / (1 + exp(-w^T x))

The related loss function for logistic regression is the logistic loss:

    log(1 + exp(-y w^T x))

Here, _y_ is the actual target variable (either _1_ for the positive class or _-1_ for the negative class).

### Linear support vector machines

SVM is a powerful and popular technique for regression and classification. Unlike logistic regression, it is not a probabilistic model, but predicts classes based on whether the model evaluation is positive or negative.

The SVM link function is the identity link, so the predicted outcome is:

    y = w^T x

Hence, if the evaluation of _w^T x_ is greater than or equal to a threshold of 0, the SVM will assign the data point to class 1; otherwise, the SVM will assign it to class 0 (this threshold is a model parameter of SVM and can be adjusted).

The loss function for SVM is known as the **hinge loss** and is defined as:

    max(0, 1 - y w^T x)

SVM is a maximum margin classifier--it tries to find a weight vector such that the classes are separated as much as possible. It has been shown to perform well on many classification tasks, and the linear variant can scale to very large datasets.

### Note

SVMs have a large amount of theory behind them, which is beyond the scope of this book, but you can visit and for more details.

In the following image, we have plotted the different decision functions for logistic regression (the blue line) and linear SVM (the red line), based on the simple binary classification example explained earlier.

You can see that the SVM effectively focuses on the points that lie closest to the decision function (the margin lines are shown with red dashes):

Decision functions for logistic regression and linear SVM for binary classification

## The naive Bayes model

Naive Bayes is a probabilistic model that makes predictions by computing the probability that a data point belongs to a given class. A naive Bayes model assumes that each feature makes an independent contribution to the probability assigned to a class (that is, it assumes conditional independence between features).

Due to this assumption, the probability of each class given the features becomes a function of the product of the conditional probabilities of each feature occurring given the class, multiplied by the prior probability of the class. This makes training the model tractable and relatively straightforward. The class prior probabilities and feature conditional probabilities are all estimated from the frequencies present in the dataset. Classification is performed by selecting the most probable class, given the features and class probabilities.

An assumption is also made about the feature distributions (the parameters of which are estimated from the data). MLlib implements multinomial naive Bayes, which assumes that the feature distribution is a multinomial distribution that represents non-negative frequency counts of the features.

It is suitable for binary features (for example, _1-of-k_ encoded categorical features) and is commonly used for text and document classification (where, as we have seen in Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , the bag-of-words vector is a typical feature representation).

### Note

Take a look at the _MLlib - Naive Bayes_ section in the Spark documentation at for more information.

The Wikipedia page at has a more detailed explanation of the mathematical formulation.
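
To express the preceding description in symbols (this is the standard naive Bayes formulation, written in our own notation), with class prior P(c) and per-feature conditional probabilities P(x_i | c), the score for a class given features x_1, ..., x_n is:

    P(c | x_1, ..., x_n) ∝ P(c) * P(x_1 | c) * P(x_2 | c) * ... * P(x_n | c)

Classification then simply selects the class c with the highest score.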

Here, we have shown the decision function of naive Bayes on our simple binary classification example:

Decision function of naive Bayes for binary classification

## Decision trees

The decision tree model is a powerful, nonprobabilistic technique that can capture more complex nonlinear patterns and feature interactions. Decision trees have been shown to perform well on many tasks, are relatively easy to understand and interpret, can handle categorical and numerical features, and do not require input data to be scaled or standardized. They are well suited to be included in ensemble methods (for example, ensembles of decision tree models, which are called decision forests).

The decision tree model constructs a tree where the leaves represent a class assignment to class 0 or 1, and the branches represent decisions based on feature values. In the following figure, we show a simple decision tree where the binary outcome is **Stay at home** or **Go to the beach**. The features are the weather conditions outside.

A simple decision tree

The decision tree algorithm is a top-down approach that begins at a root node (or feature), and then selects a feature at each step that gives the best split of the dataset, as measured by the information gain of this split. The information gain is computed from the node impurity (which is the extent to which the labels at the node are similar, or homogeneous) minus the weighted sum of the impurities of the two child nodes that would be created by the split. For classification tasks, there are two measures that can be used to select the best split: Gini impurity and entropy.

### Note

See the _MLlib - Decision Tree_ section in the _Spark Programming Guide_ at for further details on the decision tree algorithm and impurity measures for classification.

In the following screenshot, we have plotted the decision boundary for the decision tree model, as we did for the other models earlier. We can see that the decision tree is able to fit complex, nonlinear models.

Decision function for a decision tree for binary classification

# Extracting the right features from your data

You might recall from Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , that the majority of machine learning models operate on numerical data in the form of feature vectors. In addition, for supervised learning methods such as classification and regression, we need to provide the target variable (or variables, in the case of multiclass situations) together with the feature vector.

Classification models in MLlib operate on instances of `LabeledPoint`, which is a wrapper around the target variable (called the **label** ) and the **feature vector** :

    case class LabeledPoint(label: Double, features: Vector)

While in most classification examples you will come across existing datasets that are already in vector format, in practice, you will usually start with raw data that needs to be transformed into features. As we have already seen, this can involve preprocessing and transformation, such as binning numerical features, scaling and normalizing features, and using _1-of-k_ encodings for categorical features.

## Extracting features from the Kaggle/StumbleUpon evergreen classification dataset

In this chapter, we will use a different dataset from the one we used for our recommendation model, as the MovieLens data doesn't have much for us to work with in terms of a classification problem. We will use a dataset from a competition on Kaggle.
The dataset was provided by StumbleUpon, and the problem relates to classifying whether a given web page is ephemeral (that is, short lived and will cease being popular soon) or evergreen (that is, persistently popular) on their web content recommendation pages.

### Note

The dataset used here can be downloaded from .

Download the training data (`train.tsv`)--you will need to accept the terms and conditions before downloading the dataset.

You can find more information about the competition at .

Before we begin, it will be easier for us to work with the data in Spark if we remove the column name header from the first line of the file. Change to the directory in which you downloaded the data (referred to as `PATH` here) and run the following command to remove the first line and pipe the result to a new file called `train_noheader.tsv`:

    **> sed 1d train.tsv > train_noheader.tsv**

Now, we are ready to start up our Spark shell (remember to run this command from your Spark installation directory):

    **> ./bin/spark-shell --driver-memory 4g**

You can type in the code that follows for the remainder of this chapter directly into your Spark shell.

In a manner similar to what we did in the earlier chapters, we will load the raw training data into an RDD and inspect it:

    val rawData = sc.textFile("/PATH/train_noheader.tsv")
    val records = rawData.map(line => line.split("\t"))
    records.first()

You will see the following on the screen:

    **Array[String] = Array("http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html", "4042", ...**

You can check the fields that are available by reading through the overview on the dataset page above. The first two columns contain the URL and ID of the page. The next column contains some raw textual content, and the column after that contains the category assigned to the page. The next 22 columns contain numeric or categorical features of various kinds. The final column contains the target--1 is evergreen, while 0 is non-evergreen.

We'll start off with a simple approach of using only the available numeric features directly. As each categorical variable is binary, we already have a _1-of-k_ encoding for these variables, so we don't need to do any further feature extraction.

Due to the way the data is formatted, we will have to do a bit of data cleaning during our initial processing by trimming out the extra quotation characters (`"`). There are also missing values in the dataset; they are denoted by the `"?"` character. In this case, we will simply assign a zero value to these missing values:

    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.linalg.Vectors
    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

In the preceding code, we extracted the label variable from the last column and an array of features for columns 5 to 25, after cleaning and dealing with missing values. We converted the label to an `Int` value and the features to an `Array[Double]`. Finally, we wrapped the label and features in a `LabeledPoint` instance, converting the features into an MLlib `Vector`.

We will also cache the data and count the number of data points:

    data.cache
    val numData = data.count

You will see that the value of `numData` is 7395.
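
Before moving on, it can be worth a quick sanity check that the parsing worked as intended; the following snippet is our own addition, not part of the original walkthrough:

    // Inspect the first parsed record: the label should be 0 or 1, and the
    // feature vector should contain the 22 numeric feature values.
    val firstPoint = data.first()
    println(firstPoint.label)
    println(firstPoint.features.size)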

We will explore the dataset in more detail a little later, but we will tell you now that there are some negative feature values in the numeric data. As we saw earlier, the naive Bayes model requires non-negative features and will throw an error if it encounters negative values. So, for now, we will create a version of our input feature vectors for the naive Bayes model by setting any negative feature values to zero:

    val nbData = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble).map(d => if (d < 0) 0.0 else d)
      LabeledPoint(label, Vectors.dense(features))
    }

# Training classification models

Now that we have extracted some basic features from our dataset and created our input RDD, we are ready to train a number of models. To compare the performance and use of different models, we will train a model using logistic regression, SVM, naive Bayes, and a decision tree. You will notice that training each model looks nearly identical, although each has its own specific model parameters that can be set. MLlib sets sensible defaults in most cases, but in practice, the best parameter settings should be selected using evaluation techniques, which we will cover later in this chapter.

## Training a classification model on the Kaggle/StumbleUpon evergreen classification dataset

We can now apply the models from MLlib to our input data. First, we need to import the required classes and set up some minimal input parameters for each model. For logistic regression and SVM, this is the number of iterations, while for the decision tree model, it is the maximum tree depth:

    import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
    import org.apache.spark.mllib.classification.SVMWithSGD
    import org.apache.spark.mllib.classification.NaiveBayes
    import org.apache.spark.mllib.tree.DecisionTree
    import org.apache.spark.mllib.tree.configuration.Algo
    import org.apache.spark.mllib.tree.impurity.Entropy
    val numIterations = 10
    val maxTreeDepth = 5

Now, we will train each model in turn. First, we will train logistic regression:

    val lrModel = LogisticRegressionWithSGD.train(data, numIterations)

    **...**
    **14/12/06 13:41:47 INFO DAGScheduler: Job 81 finished: reduce at RDDFunctions.scala:112, took 0.011968 s**
    **14/12/06 13:41:47 INFO GradientDescent: GradientDescent.runMiniBatchSGD finished. Last 10 stochastic losses 0.6931471805599474, 1196521.395699124, Infinity, 1861127.002201189, Infinity, 2639638.049627607, Infinity, Infinity, Infinity, Infinity**
    **lrModel: org.apache.spark.mllib.classification.LogisticRegressionModel = (weights=[-0.11372778986947886,-0.511619752777837,**
    **...**

Next up, we will train an SVM model:

    val svmModel = SVMWithSGD.train(data, numIterations)

You will see the following output:

    **...**
    **14/12/06 13:43:08 INFO DAGScheduler: Job 94 finished: reduce at RDDFunctions.scala:112, took 0.007192 s**
    **14/12/06 13:43:08 INFO GradientDescent: GradientDescent.runMiniBatchSGD finished. Last 10 stochastic losses 1.0, 2398226.619666797, 2196192.9647478117, 3057987.2024311484, 271452.9038284356, 3158131.191895948, 1041799.350498323, 1507522.941537049, 1754560.9909073508, 136866.76745605646**
    **svmModel: org.apache.spark.mllib.classification.SVMModel = (weights=[-0.12218838697834929,-0.5275107581589767,**
    **...**

Then, we will train the naive Bayes model; remember to use your special non-negative feature dataset:

    val nbModel = NaiveBayes.train(nbData)

The following is the output:

    **...**
    **14/12/06 13:44:48 INFO DAGScheduler: Job 95 finished: collect at NaiveBayes.scala:120, took 0.441273 s**
    **nbModel: org.apache.spark.mllib.classification.NaiveBayesModel = org.apache.spark.mllib.classification.NaiveBayesModel@666ac612**
    **...**

Finally, we will train our decision tree:

    val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)

The output is as follows:

    **...**
    **14/12/06 13:46:03 INFO DAGScheduler: Job 104 finished: collectAsMap at DecisionTree.scala:653, took 0.031338 s**
    **...**
    **total: 0.343024**
    **findSplitsBins: 0.119499**
    **findBestSplits: 0.200352**
    **chooseSplits: 0.199705**
    **dtModel: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 5 with 61 nodes**
    **...**

Notice that we set the mode, or `Algo`, of the decision tree to `Classification`, and we used the `Entropy` impurity measure.

# Using classification models

We now have four models trained on our input labels and features. Next, we will see how to use these models to make predictions on our dataset. For now, we will use the same training data to illustrate the `predict` method of each model.

## Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset

We will use our logistic regression model as an example (the other models are used in the same way):

    val dataPoint = data.first
    val prediction = lrModel.predict(dataPoint.features)

The following is the output:

    **prediction: Double = 1.0**

We can see that for the first data point in our training dataset, the model predicted a label of `1` (that is, evergreen). Let's examine the true label for this data point:

    val trueLabel = dataPoint.label

You can see the following output:

    **trueLabel: Double = 0.0**

So, in this case, our model got it wrong!

We can also make predictions in bulk by passing in an `RDD[Vector]` as input:

    val predictions = lrModel.predict(data.map(lp => lp.features))
    predictions.take(5)

The following is the output:

    **Array[Double] = Array(1.0, 1.0, 1.0, 1.0, 1.0)**

# Evaluating the performance of classification models

When we make predictions using our model, as we did earlier, how do we know whether the predictions are good or not? We need to be able to evaluate how well our model performs. Evaluation metrics commonly used in binary classification include prediction accuracy and error, precision and recall, the area under the precision-recall curve, the **receiver operating characteristic** ( **ROC** ) curve, the **area under ROC curve** ( **AUC** ), and the F-measure.

## Accuracy and prediction error

The prediction error for binary classification is possibly the simplest measure available. It is the number of training examples that are misclassified, divided by the total number of examples. Similarly, accuracy is the number of correctly classified examples divided by the total number of examples.
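
In symbols, these two definitions are simply:

    error = #misclassified examples / #total examples
    accuracy = #correctly classified examples / #total examples = 1 - error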

We can calculate the accuracy of our models on our training data by making predictions on each input feature and comparing them to the true label. We will sum up the number of correctly classified instances and divide this by the total number of data points to get the average classification accuracy:

    val lrTotalCorrect = data.map { point =>
      if (lrModel.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracy = lrTotalCorrect / data.count

The output is as follows:

    **lrAccuracy: Double = 0.5146720757268425**

This gives us 51.5 percent accuracy, which doesn't look particularly impressive! Our model got only half of the training examples correct, which seems to be about as good as random chance.

### Note

Note that the predictions made by the model are not naturally exactly 1 or 0. The output is usually a real number that must be turned into a class prediction. This is done through the use of a threshold in the classifier's decision or scoring function.

For example, binary logistic regression is a probabilistic model that returns the estimated probability of class 1 in its scoring function. Thus, a decision threshold of 0.5 is typical. That is, if the estimated probability of being in class 1 is higher than 50 percent, the model decides to classify the point as class 1; otherwise, it will be classified as class 0.

Note that the threshold itself is effectively a model parameter that can be tuned in some models. It also plays a role in evaluation measures, as we will see now.

What about the other models? Let's compute the accuracy for the other three:

    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum
    val nbTotalCorrect = nbData.map { point =>
      if (nbModel.predict(point.features) == point.label) 1 else 0
    }.sum

Note that the decision tree prediction threshold needs to be applied explicitly, as shown here:

    val dtTotalCorrect = data.map { point =>
      val score = dtModel.predict(point.features)
      val predicted = if (score > 0.5) 1 else 0
      if (predicted == point.label) 1 else 0
    }.sum

We can now inspect the accuracy for the other three models.

First, the SVM model:

    val svmAccuracy = svmTotalCorrect / numData

Here is the output for the SVM model:

    **svmAccuracy: Double = 0.5146720757268425**

Next, our naive Bayes model:

    val nbAccuracy = nbTotalCorrect / numData

The output is as follows:

    **nbAccuracy: Double = 0.5803921568627451**

Finally, we compute the accuracy for the decision tree:

    val dtAccuracy = dtTotalCorrect / numData

And the output is:

    **dtAccuracy: Double = 0.6482758620689655**

We can see that both SVM and naive Bayes also performed quite poorly. The decision tree model is better, with 65 percent accuracy, but this is still not particularly high.

## Precision and recall

In information retrieval, precision is a commonly used measure of the quality of the results, while recall is a measure of the completeness of the results.

In the binary classification context, precision is defined as the number of true positives (that is, the number of examples correctly predicted as class 1) divided by the sum of true positives and false positives (that is, the number of examples that were incorrectly predicted as class 1). Thus, we can see that a precision of 1.0 (or 100 percent) is achieved if every example predicted by the classifier to be class 1 is, in fact, in class 1 (that is, there are no false positives).
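
Written as a formula, using TP and FP as shorthand for the number of true positives and false positives:

    precision = TP / (TP + FP)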

Recall is defined as the number of true positives divided by the sum of true positives and false negatives (that is, the number of examples that were in class 1, but were predicted as class 0 by the model). We can see that a recall of 1.0 (or 100 percent) is achieved if the model doesn't miss any examples that were in class 1 (that is, there are no false negatives).

Generally, precision and recall are inversely related; often, higher precision is related to lower recall, and vice versa. To illustrate this, assume that we built a model that always predicted class 1. In this case, the model predictions would have no false negatives, because the model always predicts 1; it will not miss any of class 1. Thus, the recall will be 1.0 for this model. On the other hand, the false positive rate could be very high, meaning the precision would be low (this depends on the exact distribution of the classes in the dataset).

Precision and recall are not particularly useful as standalone metrics, but are typically used together to form an aggregate or averaged metric. Precision and recall are also dependent on the threshold selected for the model.

Intuitively, with a threshold set low enough, a model will always predict class 1. Hence, it will have a recall of 1, but most likely, it will have low precision. At a high enough threshold, the model will always predict class 0. The model will then have a recall of 0, since it cannot achieve any true positives and will likely have many false negatives. Furthermore, its precision score will be undefined, as it will achieve zero true positives and zero false positives.

The **precision-recall** ( **PR** ) curve shown in the following figure plots precision against recall outcomes for a given model, as the decision threshold of the classifier is changed. The area under this PR curve is referred to as the average precision. Intuitively, an area under the PR curve of 1.0 will equate to a perfect classifier that will achieve 100 percent in both precision and recall.

Precision-recall curve

### Tip

See and for more details on precision, recall, and area under the PR curve.

## ROC curve and AUC

The **ROC** curve is a concept similar to the PR curve. It is a graphical illustration of the true positive rate against the false positive rate for a classifier.

The **true positive rate** ( **TPR** ) is the number of true positives divided by the sum of true positives and false negatives. In other words, it is the ratio of true positives to all positive examples. This is the same as the recall we saw earlier, and it is also commonly referred to as sensitivity.

The **false positive rate** ( **FPR** ) is the number of false positives divided by the sum of false positives and **true negatives** (that is, the number of examples correctly predicted as class 0). In other words, it is the ratio of false positives to all negative examples.

In a manner similar to precision and recall, the ROC curve (plotted in the following figure) represents the classifier's performance tradeoff of TPR against FPR, for different decision thresholds. Each point on the curve represents a different threshold in the decision function for the classifier.

The ROC curve

The area under the ROC curve (commonly referred to as AUC) summarizes this tradeoff in a single average value. Again, an AUC of 1.0 will represent a perfect classifier. An area of 0.5 is referred to as the random score. Thus, a model that achieves an AUC of 0.5 is no better than randomly guessing.
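
For reference, the rate definitions above can be written compactly as follows, with TN and FN denoting the number of true and false negatives:

    recall = TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)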
### Note

As both the area under the PR curve and the area under the ROC curve are effectively normalized (with a minimum of 0 and a maximum of 1), we can use these measures to compare models with differing parameter settings and even compare completely different models. Thus, these metrics are popular for model evaluation and selection purposes.

MLlib comes with a set of built-in routines to compute the area under the PR and ROC curves for binary classification. Here, we will compute these metrics for each of our models:

    import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    val metrics = Seq(lrModel, svmModel).map { model =>
      val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    }

As we did previously when training the naive Bayes model and computing its accuracy, we need to use the special `nbData` version of the dataset that we created to compute the classification metrics:

    val nbMetrics = Seq(nbModel).map { model =>
      val scoreAndLabels = nbData.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    }

Note that because the `DecisionTreeModel` model does not implement the `ClassificationModel` interface that is implemented by the other three models, we need to compute the results separately for this model in the following code:

    val dtMetrics = Seq(dtModel).map { model =>
      val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)
    }
    val allMetrics = metrics ++ nbMetrics ++ dtMetrics
    allMetrics.foreach { case (m, pr, roc) =>
      println(f"$m, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")
    }

Your output will look similar to the one here:

    **LogisticRegressionModel, Area under PR: 75.6759%, Area under ROC: 50.1418%**
    **SVMModel, Area under PR: 75.6759%, Area under ROC: 50.1418%**
    **NaiveBayesModel, Area under PR: 68.0851%, Area under ROC: 58.3559%**
    **DecisionTreeModel, Area under PR: 74.3081%, Area under ROC: 64.8837%**

We can see that all models achieve broadly similar results for the average precision metric.

Logistic regression and SVM achieve an AUC of around 0.5. This indicates that they do no better than random chance! Our naive Bayes and decision tree models fare a little better, achieving an AUC of 0.58 and 0.65, respectively. Still, this is not a very good result in terms of binary classification performance.

### Note

While we don't cover multiclass classification here, MLlib provides a similar evaluation class called `MulticlassMetrics`, which provides averaged versions of many common metrics.

# Improving model performance and tuning parameters

So, what went wrong? Why have our sophisticated models achieved nothing better than random chance? Is there a problem with our models?

Recall that we started out by just throwing the data at our model. In fact, we didn't even throw all our data at the model, just the numeric columns that were easy to use.
Furthermore, we didn't do a lot of analysis on these numeric features.

## Feature standardization

Many models that we employ make inherent assumptions about the distribution or scale of the input data. One of the most common assumptions is that the features are normally distributed. Let's take a deeper look at the distribution of our features.

To do this, we can represent the feature vectors as a distributed matrix in MLlib, using the `RowMatrix` class. `RowMatrix` is an RDD made up of vectors, where each vector is a row of our matrix.

The `RowMatrix` class comes with some useful methods to operate on the matrix, one of which is a utility to compute statistics on the columns of the matrix:

    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    val vectors = data.map(lp => lp.features)
    val matrix = new RowMatrix(vectors)
    val matrixSummary = matrix.computeColumnSummaryStatistics()

The following code statement will print the mean of each column:

    println(matrixSummary.mean)

Here is the output:

    **[0.41225805299526636,2.761823191986623,0.46823047328614004, ...**

The following code statement will print the minimum value of each column:

    println(matrixSummary.min)

Here is the output:

    **[0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.045564223,-1.0, ...**

The following code statement will print the maximum value of each column:

    println(matrixSummary.max)

The output is as follows:

    **[0.999426,363.0,1.0,1.0,0.980392157,0.980392157,21.0,0.25,0.0,0.444444444, ...**

The following code statement will print the variance of each column:

    println(matrixSummary.variance)

The output of the variance is:

    **[0.1097424416755897,74.30082476809638,0.04126316989120246, ...**

The following code statement will print the number of nonzero entries in each column:

    println(matrixSummary.numNonzeros)

Here is the output:

    **[5053.0,7354.0,7172.0,6821.0,6160.0,5128.0,7350.0,1257.0,0.0, ...**

The `computeColumnSummaryStatistics` method computes a number of statistics over each column of features, including the mean and variance, storing each of these in a `Vector` with one entry per column (that is, one entry per feature in our case).

Looking at the preceding output for mean and variance, we can see quite clearly that the second feature has a much higher mean and variance than some of the other features (you will find a few other features that are similar and a few others that are more extreme). So, our data definitely does not conform to a standard Gaussian distribution in its raw form. To get the data in a more suitable form for our models, we can standardize each feature such that it has zero mean and unit standard deviation. We can do this by subtracting the column mean from each feature value and then scaling it by dividing it by the column standard deviation for the feature:

    (x - μ) / sqrt(variance)

Practically, for each feature vector in our input dataset, we can simply perform an element-wise subtraction of the preceding mean vector from the feature vector and then perform an element-wise division of the feature vector by the vector of feature standard deviations. The standard deviation vector itself can be obtained by performing an element-wise square root operation on the variance vector.

As we mentioned in Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_, we fortunately have access to a convenience method from Spark's `StandardScaler` to accomplish this.
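Before turning to the built-in scaler, it may help to see this arithmetic spelled out. The following is a small sketch of our own (not from the book's code bundle) that standardizes the first feature vector by hand, using the `matrixSummary` statistics computed above; the zero-variance guard is our own addition for features that happen to be constant:

    // Hedged sketch: element-wise (x - mean) / sqrt(variance) for one vector
    val means = matrixSummary.mean.toArray
    val stdevs = matrixSummary.variance.toArray.map(math.sqrt)
    val firstScaled = data.first.features.toArray.zip(means.zip(stdevs)).map {
      case (v, (m, s)) => if (s == 0.0) 0.0 else (v - m) / s
    }
    println(firstScaled.take(4).mkString("[", ",", ", ...]"))

The first element should agree, up to small numerical differences, with the output of the `StandardScaler` transformation that we compute next.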
`StandardScaler` works in much the same way as the `Normalizer` feature we used in that chapter. We will instantiate it by passing in two arguments that tell it whether to subtract the mean from the data and whether to apply standard deviation scaling. We will then fit `StandardScaler` on our input `vectors`. Finally, we will pass an input vector to the `transform` function, which will return a normalized vector. We will do this within the following `map` function to preserve the `label` from our dataset:

    import org.apache.spark.mllib.feature.StandardScaler
    val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
    val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))

Our data should now be standardized. Let's inspect the first row of the original and standardized features:

    println(data.first.features)

The output of the preceding line of code is as follows:

    **[0.789131,2.055555556,0.676470588,0.205882353,**

The following code will print the first row of the standardized features:

    println(scaledData.first.features)

The output is as follows:

    **[1.1376439023494747,-0.08193556218743517,1.025134766284205,-0.0558631837375738,**

As we can see, the first feature has been transformed by applying the standardization formula. We can check this by subtracting the mean (which we computed earlier) from the first feature value and dividing the result by the square root of the variance (which we also computed earlier):

    println((0.789131 - 0.41225805299526636) / math.sqrt(0.1097424416755897))

The result should be approximately equal to the first element of our scaled vector:

    **1.137647336497682**

We can now retrain our model using the standardized data. We will use only the logistic regression model to illustrate the impact of feature standardization (the decision tree is insensitive to feature scaling, while standardized features, which can be negative, are not valid input for the multinomial naive Bayes model):

    val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)
    val lrTotalCorrectScaled = scaledData.map { point =>
      if (lrModelScaled.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaled = lrTotalCorrectScaled / numData
    val lrPredictionsVsTrue = scaledData.map { point =>
      (lrModelScaled.predict(point.features), point.label)
    }
    val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
    val lrPr = lrMetricsScaled.areaUnderPR
    val lrRoc = lrMetricsScaled.areaUnderROC
    println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100.0}%2.4f%%\nArea under ROC: ${lrRoc * 100.0}%2.4f%%")

The result should look similar to this:

    **LogisticRegressionModel**
    **Accuracy: 62.0419%**
    **Area under PR: 72.7254%**
    **Area under ROC: 61.9663%**

Simply by standardizing our features, we have improved the logistic regression performance for accuracy and AUC from 50 percent, no better than random, to 62 percent.

## Additional features

We have seen that we need to be careful about standardizing and potentially normalizing our features, and that the impact on model performance can be serious. In this case, we used only a portion of the features available. For example, we completely ignored the category variable and the textual content in the boilerplate variable column.

This was done for ease of illustration, but let's assess the impact of adding an additional feature such as the category feature.
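Before we do, it may help to see the _1-of-k_ encoding in isolation: a categorical variable with k possible values becomes a binary vector of length k with a single nonzero entry. The `oneOfK` helper below is a hypothetical sketch of our own; the pipeline that follows builds the same vectors inline rather than through a helper:

    // Hedged sketch: 1-of-k encode one value, given a category-to-index mapping
    def oneOfK(value: String, mapping: Map[String, Int]): Array[Double] = {
      val vec = Array.ofDim[Double](mapping.size)
      vec(mapping(value)) = 1.0
      vec
    }
    // For example, with Map("weather" -> 0, "sports" -> 1, "business" -> 2),
    // oneOfK("sports", mapping) yields Array(0.0, 1.0, 0.0)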
First, we will inspect the categories and form a mapping of index to category, which you might recognize as the basis for a _1-of-k_ encoding of this categorical feature:

    val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
    val numCategories = categories.size
    println(categories)

The output of the different categories is as follows:

    **Map("weather" -> 0, "sports" -> 6, "unknown" -> 4, "computer_internet" -> 12, "?" -> 11, "culture_politics" -> 3, "religion" -> 8, "recreation" -> 2, "arts_entertainment" -> 9, "health" -> 5, "law_crime" -> 10, "gaming" -> 13, "business" -> 1, "science_technology" -> 7)**

The following code will print the number of categories:

    println(numCategories)

Here is the output:

    **14**

So, we will need to create a vector of length 14 to represent this feature and assign a value of 1 at the index of the relevant category for each data point. We can then prepend this new feature vector to the vector of other numerical features:

    val dataCategories = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val categoryIdx = categories(r(3))
      val categoryFeatures = Array.ofDim[Double](numCategories)
      categoryFeatures(categoryIdx) = 1.0
      val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      val features = categoryFeatures ++ otherFeatures
      LabeledPoint(label, Vectors.dense(features))
    }
    println(dataCategories.first)

You should see output similar to what is shown here. You can see that the first part of our feature vector is now a vector of length 14 with one nonzero entry at the relevant category index:

    **LabeledPoint(0.0, [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])**

Again, since our raw features are not standardized, we should perform this transformation using the same `StandardScaler` approach that we used earlier before training a new model on this expanded dataset:

    val scalerCats = new StandardScaler(withMean = true, withStd = true).fit(dataCategories.map(lp => lp.features))
    val scaledDataCats = dataCategories.map(lp => LabeledPoint(lp.label, scalerCats.transform(lp.features)))

We can inspect the features before and after scaling as we did earlier:

    println(dataCategories.first.features)

The output is as follows:

    **0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556 ...**

The following code will print the features after scaling:

    println(scaledDataCats.first.features)

You will see the following on the screen:

    **[-0.023261105535492967,2.720728254208072,-0.4464200056407091,-0.2205258360869135, ...**

### Tip

Note that while the original raw features were sparse (that is, many entries were zero), subtracting the mean from each entry leaves us with a non-sparse (dense) representation, as can be seen in the preceding example.

This is not a problem in this case, as our dataset is small, but large-scale real-world problems often have extremely sparse input data with many features (online advertising and text classification are good examples).
In such cases, it is not advisable to lose this sparsity, as the memory and processing requirements for the equivalent dense representation can quickly explode with many millions of features. We can use `StandardScaler` with `withMean` set to `false` to avoid this.

We're now ready to train a new logistic regression model with our expanded feature set, and then we will evaluate the performance:

    val lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIterations)
    val lrTotalCorrectScaledCats = scaledDataCats.map { point =>
      if (lrModelScaledCats.predict(point.features) == point.label) 1 else 0
    }.sum
    val lrAccuracyScaledCats = lrTotalCorrectScaledCats / numData
    val lrPredictionsVsTrueCats = scaledDataCats.map { point =>
      (lrModelScaledCats.predict(point.features), point.label)
    }
    val lrMetricsScaledCats = new BinaryClassificationMetrics(lrPredictionsVsTrueCats)
    val lrPrCats = lrMetricsScaledCats.areaUnderPR
    val lrRocCats = lrMetricsScaledCats.areaUnderROC
    println(f"${lrModelScaledCats.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaledCats * 100}%2.4f%%\nArea under PR: ${lrPrCats * 100.0}%2.4f%%\nArea under ROC: ${lrRocCats * 100.0}%2.4f%%")

You should see output similar to this one:

    **LogisticRegressionModel**
    **Accuracy: 66.5720%**
    **Area under PR: 75.7964%**
    **Area under ROC: 66.5483%**

By applying feature standardization, we improved both the accuracy and AUC measures from 50 percent to 62 percent, and by then adding the category feature (standardized along with the rest of the new feature set), we achieved a further boost to 66 percent.

### Note

Note that the best model performance in the competition was an AUC of 0.88906 (see the competition's leaderboard).

One approach to achieving performance almost as high is outlined in the references.

Notice that there are still features that we have not yet used; most notably, the text features in the boilerplate variable. The leading competition submissions predominantly use the boilerplate features and features based on the raw textual content to achieve their performance. As we saw earlier, while adding the category feature improved performance, it appears that most of the other variables are not very useful as predictors, whereas the textual content turned out to be highly predictive.

Going through some of the best performing approaches for these competitions can give you a good idea as to how feature extraction and engineering play a critical role in model performance.

## Using the correct form of data

Another critical aspect of model performance is using the correct form of data for each model. Previously, we saw that applying a naive Bayes model to our numerical features resulted in very poor performance. Is this because the model itself is deficient?

In this case, recall that MLlib implements a multinomial naive Bayes model. This model works on input in the form of non-negative count data. This can include a binary representation of categorical features (such as the _1-of-k_ encoding covered previously) or frequency data (such as the frequency of occurrences of words in a document). The numerical features we used initially do not conform to this assumed input distribution, so it is probably unsurprising that the model did so poorly.

To illustrate this, we'll use only the category feature, which, when _1-of-k_ encoded, is of the correct form for the model.
We will create a new dataset as follows:

    val dataNB = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      val categoryIdx = categories(r(3))
      val categoryFeatures = Array.ofDim[Double](numCategories)
      categoryFeatures(categoryIdx) = 1.0
      LabeledPoint(label, Vectors.dense(categoryFeatures))
    }

Next, we will train a new naive Bayes model and evaluate its performance:

    val nbModelCats = NaiveBayes.train(dataNB)
    val nbTotalCorrectCats = dataNB.map { point =>
      if (nbModelCats.predict(point.features) == point.label) 1 else 0
    }.sum
    val nbAccuracyCats = nbTotalCorrectCats / numData
    val nbPredictionsVsTrueCats = dataNB.map { point =>
      (nbModelCats.predict(point.features), point.label)
    }
    val nbMetricsCats = new BinaryClassificationMetrics(nbPredictionsVsTrueCats)
    val nbPrCats = nbMetricsCats.areaUnderPR
    val nbRocCats = nbMetricsCats.areaUnderROC
    println(f"${nbModelCats.getClass.getSimpleName}\nAccuracy: ${nbAccuracyCats * 100}%2.4f%%\nArea under PR: ${nbPrCats * 100.0}%2.4f%%\nArea under ROC: ${nbRocCats * 100.0}%2.4f%%")

You should see the following output:

    **NaiveBayesModel**
    **Accuracy: 60.9601%**
    **Area under PR: 74.0522%**
    **Area under ROC: 60.5138%**

So, by ensuring that we use the correct form of input, we have improved the performance of the naive Bayes model slightly, from 58 percent to roughly 61 percent accuracy.

## Tuning model parameters

The previous sections showed the impact on model performance of feature extraction and selection, as well as the form of the input data and a model's assumptions around the data distribution. So far, we have discussed model parameters only in passing, but they also play a significant role in model performance.

MLlib's default `train` methods use default values for the parameters of each model. Let's take a deeper look at them.

### Linear models

Both logistic regression and SVM share the same parameters, because they use the same underlying optimization technique of **stochastic gradient descent** (**SGD**). They differ only in the loss function applied. If we take a look at the class definition for logistic regression in MLlib, we will see the following definition:

    class LogisticRegressionWithSGD private (
      private var stepSize: Double,
      private var numIterations: Int,
      private var regParam: Double,
      private var miniBatchFraction: Double)
      extends GeneralizedLinearAlgorithm[LogisticRegressionModel] ...

We can see that the arguments that can be passed to the constructor are `stepSize`, `numIterations`, `regParam`, and `miniBatchFraction`. Of these, all except `regParam` relate to the underlying optimization technique.

The instantiation code for logistic regression initializes the `Gradient`, `Updater`, and `Optimizer` and sets the relevant arguments for the `Optimizer` (`GradientDescent` in this case):

    private val gradient = new LogisticGradient()
    private val updater = new SimpleUpdater()
    override val optimizer = new GradientDescent(gradient, updater)
      .setStepSize(stepSize)
      .setNumIterations(numIterations)
      .setRegParam(regParam)
      .setMiniBatchFraction(miniBatchFraction)

`LogisticGradient` sets up the logistic loss function that defines our logistic regression model.

### Tip

While a detailed treatment of optimization techniques is beyond the scope of this book, MLlib provides two optimizers for linear models: SGD and L-BFGS. L-BFGS is often more accurate and has fewer parameters to tune.
SGD is the default, while L-BFGS can currently only be used directly for logistic regression via `LogisticRegressionWithLBFGS`. Try it out yourself and compare the results to those found with SGD.

See the MLlib documentation for further details.

To investigate the impact of the remaining parameter settings, we will create a helper function that will train a logistic regression model, given a set of parameter inputs. First, we will import the required classes:

    import org.apache.spark.rdd.RDD
    import org.apache.spark.mllib.optimization.Updater
    import org.apache.spark.mllib.optimization.SimpleUpdater
    import org.apache.spark.mllib.optimization.L1Updater
    import org.apache.spark.mllib.optimization.SquaredL2Updater
    import org.apache.spark.mllib.classification.ClassificationModel

Next, we will define our helper function to train a model given a set of inputs:

    def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int, updater: Updater, stepSize: Double) = {
      val lr = new LogisticRegressionWithSGD
      lr.optimizer.setNumIterations(numIterations).setUpdater(updater).setRegParam(regParam).setStepSize(stepSize)
      lr.run(input)
    }

Finally, we will create a second helper function that takes the input data and a classification model and generates the relevant AUC metric:

    def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
      val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (label, metrics.areaUnderROC)
    }

We will also cache our scaled dataset, including categories, to speed up the multiple model training runs that we will be using to explore these different parameter settings:

    scaledDataCats.cache

#### Iterations

Many machine learning methods are iterative in nature, converging to a solution (the optimal weight vector that minimizes the chosen loss function) over a number of iteration steps. SGD typically requires relatively few iterations to converge to a reasonable solution, but can be run for more iterations to improve the solution. We can see this by trying a few different settings for the `numIterations` parameter and comparing the AUC results:

    val iterResults = Seq(1, 5, 10, 50).map { param =>
      val model = trainWithParams(scaledDataCats, 0.0, param, new SimpleUpdater, 1.0)
      createMetrics(s"$param iterations", scaledDataCats, model)
    }
    iterResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

Your output should look like this:

    **1 iterations, AUC = 64.97%**
    **5 iterations, AUC = 66.62%**
    **10 iterations, AUC = 66.55%**
    **50 iterations, AUC = 66.81%**

So, we can see that the number of iterations has only a minor impact on the results once a certain minimum number of iterations has been completed.

#### Step size

In SGD, the step size parameter controls how far in the direction of the steepest gradient the algorithm steps when updating the model weight vector after each training example. A larger step size might speed up convergence, but a step size that is too large might cause problems with convergence, as good solutions are overshot.
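One implementation detail worth noting (this is our reading of Spark's `GradientDescent` source, so treat it as an assumption to verify against your Spark version): the configured step size is decayed over iterations, with the effective step at iteration t being:

    stepSize_t = stepSize / sqrt(t)

This is worth keeping in mind when comparing runs with different iteration counts, since the same configured step size behaves differently early and late in training.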
We can see the impact of changing the step size here:

    val stepResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, 0.0, numIterations, new SimpleUpdater, param)
      createMetrics(s"$param step size", scaledDataCats, model)
    }
    stepResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

This will give us the following results, which show that increasing the step size too much can begin to negatively impact performance:

    **0.001 step size, AUC = 64.95%**
    **0.01 step size, AUC = 65.00%**
    **0.1 step size, AUC = 65.52%**
    **1.0 step size, AUC = 66.55%**
    **10.0 step size, AUC = 61.92%**

#### Regularization

We briefly touched on the `Updater` class in the preceding logistic regression code. An `Updater` class in MLlib implements regularization. Regularization can help avoid over-fitting of a model to training data by effectively penalizing model complexity. This is done by adding a penalty term to the loss function that grows with the size of the model weight vector.

Regularization is almost always required in real use cases, but it is of particular importance when the feature dimension is very high (that is, the effective number of variable weights that can be learned is high) relative to the number of training examples.

When regularization is absent or low, models tend to over-fit the training dataset. This is a key reason behind the use of cross-validation techniques for model fitting (which we will cover later in this chapter).

Conversely, since applying regularization encourages simpler models, model performance can suffer when regularization is high, through under-fitting the data.

The forms of regularization available in MLlib are:

  * `SimpleUpdater`: This equates to no regularization and is the default for logistic regression
  * `SquaredL2Updater`: This implements a regularizer based on the squared L2-norm of the weight vector; this is the default for SVM models
  * `L1Updater`: This applies a regularizer based on the L1-norm of the weight vector; this can lead to sparse solutions in the weight vector (as less important weights are pulled towards zero)

### Note

Regularization and its relation to optimization is a broad and heavily researched area. Some more information is available from the following references:

  * A general overview of regularization
  * L2 regularization
  * Over-fitting and under-fitting
  * A detailed overview of over-fitting and L1 versus L2 regularization: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.92.9860&rep=rep1&type=pdf

Let's explore the impact of a range of regularization parameters using `SquaredL2Updater`:

    val regResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainWithParams(scaledDataCats, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", scaledDataCats, model)
    }
    regResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

Your output should look like this:

    **0.001 L2 regularization parameter, AUC = 66.55%**
    **0.01 L2 regularization parameter, AUC = 66.55%**
    **0.1 L2 regularization parameter, AUC = 66.63%**
    **1.0 L2 regularization parameter, AUC = 66.04%**
    **10.0 L2 regularization parameter, AUC = 35.33%**

As we can see, at low levels of regularization, there is not much impact on model performance.
However, as we increase regularization, we can see the impact of under-fitting on our model evaluation.

### Tip

You will find similar results when using L1 regularization. Give it a try by performing the same evaluation of the regularization parameter against the AUC measure for `L1Updater`.

### Decision trees

The decision tree model we trained earlier was the best performer on the raw data that we first used. We set a parameter called `maxDepth`, which controls the maximum depth of the tree and, thus, the complexity of the model. Deeper trees result in more complex models that are able to fit the data better.

For classification problems, we can also select between two measures of impurity: `Gini` and `Entropy`.

#### Tuning tree depth and impurity

We will illustrate the impact of tree depth in a similar manner as we did for our logistic regression model.

First, we will need to create another helper function in the Spark shell:

    import org.apache.spark.mllib.tree.impurity.Impurity
    import org.apache.spark.mllib.tree.impurity.Entropy
    import org.apache.spark.mllib.tree.impurity.Gini

    def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int, impurity: Impurity) = {
      DecisionTree.train(input, Algo.Classification, impurity, maxDepth)
    }

Now, we're ready to compute our AUC metric for different settings of tree depth. We will simply use our original dataset in this example, since we do not need the data to be standardized.

### Tip

Note that decision tree models generally do not require features to be standardized or normalized, nor do they require categorical features to be binary-encoded.

First, train the model using the `Entropy` impurity measure and varying tree depths:

    val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
      val model = trainDTWithParams(data, param, Entropy)
      val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param tree depth", metrics.areaUnderROC)
    }
    dtResultsEntropy.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

This should output the results shown here:

    **1 tree depth, AUC = 59.33%**
    **2 tree depth, AUC = 61.68%**
    **3 tree depth, AUC = 62.61%**
    **4 tree depth, AUC = 63.63%**
    **5 tree depth, AUC = 64.88%**
    **10 tree depth, AUC = 76.26%**
    **20 tree depth, AUC = 98.45%**

Next, we will perform the same computation using the `Gini` impurity measure (we omitted the code as it is very similar, but it can be found in the code bundle). Your results should look something like this:

    **1 tree depth, AUC = 59.33%**
    **2 tree depth, AUC = 61.68%**
    **3 tree depth, AUC = 62.61%**
    **4 tree depth, AUC = 63.63%**
    **5 tree depth, AUC = 64.89%**
    **10 tree depth, AUC = 78.37%**
    **20 tree depth, AUC = 98.87%**

As you can see from the preceding results, increasing the tree depth parameter results in a more accurate model (as expected, since the model is allowed to get more complex with greater tree depth). However, it is very likely that at higher tree depths, the model will over-fit the dataset significantly.

There is very little difference in performance between the two impurity measures.

### The naive Bayes model

Finally, let's see the impact of changing the `lambda` parameter for naive Bayes.
This parameter controls additive smoothing, which handles the case when a class and feature value do not occur together in the dataset.

### Tip

See the references for more details on additive smoothing.

We will take the same approach as we did earlier, first creating a convenience training function and then training the model with varying levels of `lambda`:

    def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) = {
      val nb = new NaiveBayes
      nb.setLambda(lambda)
      nb.run(input)
    }
    val nbResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
      val model = trainNBWithParams(dataNB, param)
      val scoreAndLabels = dataNB.map { point =>
        (model.predict(point.features), point.label)
      }
      val metrics = new BinaryClassificationMetrics(scoreAndLabels)
      (s"$param lambda", metrics.areaUnderROC)
    }
    nbResults.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%") }

The results of the training are as follows:

    **0.001 lambda, AUC = 60.51%**
    **0.01 lambda, AUC = 60.51%**
    **0.1 lambda, AUC = 60.51%**
    **1.0 lambda, AUC = 60.51%**
    **10.0 lambda, AUC = 60.51%**

We can see that `lambda` has no impact in this case, since smoothing only matters when some combination of feature value and class label never occurs together in the dataset, which does not happen here.

## Cross-validation

So far in this book, we have only briefly mentioned the idea of cross-validation and out-of-sample testing. Cross-validation is a critical part of real-world machine learning and is central to many model selection and parameter tuning pipelines.

The general idea behind cross-validation is that we want to know how our model will perform on unseen data. Evaluating this on real, live data (for example, in a production system) is risky, because we don't really know, ahead of time, whether the trained model will make accurate predictions on new data. As we saw previously with regard to regularization, our model might have over-fit the training data and be poor at making predictions on data it has not been trained on.

Cross-validation provides a mechanism where we use part of our available dataset to train our model and another part to evaluate the performance of this model. As the model is tested on data that it has not seen during the training phase, its performance, when evaluated on this part of the dataset, gives us an estimate of how well our model generalizes to new data points.

Here, we will implement a simple cross-validation evaluation approach using a train-test split. We will divide our dataset into two non-overlapping parts. The first dataset is used to train our model and is called the training set. The second dataset, called the test set or hold-out set, is used to evaluate the performance of our model using our chosen evaluation measure. Common splits used in practice include 50/50, 60/40, and 80/20 splits, but you can use any split as long as the training set is not too small for the model to learn from (generally, at least 50 percent is a practical minimum).

In many cases, three sets are created: a training set, an evaluation set (which is used like the preceding test set to tune the model parameters, such as lambda and step size), and a test set (which is never used to train a model or tune any parameters, but is only used to generate an estimate of the true performance on completely unseen data).

### Note

Here, we will explore a simple train-test split approach. There are many cross-validation techniques that are more exhaustive and complex.
One popular example is K-fold cross-validation, where the dataset is split into K non-overlapping folds. The model is trained on K-1 folds of data and tested on the remaining, held-out fold. This is repeated K times, and the results are averaged to give the cross-validation score. The train-test split is effectively like two-fold cross-validation.

Other approaches include leave-one-out cross-validation and random sampling. See the referenced article for further details.

First, we will split our dataset into a 60 percent training set and a 40 percent test set (we will use a constant random seed of 123 here to ensure that we get the same results for ease of illustration):

    val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), 123)
    val train = trainTestSplit(0)
    val test = trainTestSplit(1)

Next, we will compute the evaluation metric of interest (again, we will use AUC) for a range of regularization parameter settings. Note that here we will use a finer-grained grid of regularization parameters to better illustrate the differences in AUC, which are very small in this case:

    val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
      val model = trainWithParams(**train**, param, numIterations, new SquaredL2Updater, 1.0)
      createMetrics(s"$param L2 regularization parameter", test, model)
    }
    regResultsTest.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }

This will compute the results of training on the training set and evaluating on the test set, as shown here:

    **0.0 L2 regularization parameter, AUC = 66.480874%**
    **0.001 L2 regularization parameter, AUC = 66.480874%**
    **0.0025 L2 regularization parameter, AUC = 66.515027%**
    **0.005 L2 regularization parameter, AUC = 66.515027%**
    **0.01 L2 regularization parameter, AUC = 66.549180%**

Now, let's compare this to the results of training and testing on the training set (this is what we were doing previously by training and testing on all the data). Again, we will omit the code as it is very similar (but it is available in the code bundle):

    **0.0 L2 regularization parameter, AUC = 66.260311%**
    **0.001 L2 regularization parameter, AUC = 66.260311%**
    **0.0025 L2 regularization parameter, AUC = 66.260311%**
    **0.005 L2 regularization parameter, AUC = 66.238294%**
    **0.01 L2 regularization parameter, AUC = 66.238294%**

So, we can see that when we train and evaluate our model on the same dataset, we generally achieve the highest performance when regularization is lower. This is because our model has seen all the data points, and with low levels of regularization, it can over-fit the dataset and achieve higher performance.

In contrast, when we train on one dataset and test on another, we see that generally a slightly higher level of regularization results in better test set performance.

In cross-validation, we would typically find the parameter settings (including regularization as well as the various other parameters, such as step size and so on) that result in the best test set performance. We would then use these parameter settings to retrain the model on all of our data in order to use it to make predictions on new data.

### Tip

Recall from Chapter 4, _Building a Recommendation Engine with Spark_, that we did not cover cross-validation. You can apply the same techniques we used earlier to split the ratings dataset from that chapter into a training and test dataset.
You can then try out different parameter settings on the training set while evaluating the MSE and MAP performance metrics on the test set in a manner similar to what we did earlier. Give it a try!

# Summary

In this chapter, we covered the various classification models available in Spark MLlib, and we saw how to train models on input data and how to evaluate their performance using standard metrics and measures. We also explored how to apply some of the techniques previously introduced to transform our features. Finally, we investigated the impact of using the correct input data format or distribution on model performance, and we also saw the impact of adding more features to our model, tuning model parameters, and implementing cross-validation.

In the next chapter, we will take a similar approach to delve into MLlib's regression models.

# Chapter 6. Building a Regression Model with Spark

In this chapter, we will build on what we covered in Chapter 5, _Building a Classification Model with Spark_. While classification models deal with outcomes that represent discrete classes, regression models are concerned with target variables that can take any real value. The underlying principle is very similar: we wish to find a model that maps input features to predicted target variables. Like classification, regression is also a form of supervised learning.

Regression models can be used to predict just about any variable of interest. A few examples include the following:

  * Predicting stock returns and other economic variables
  * Predicting loss amounts for loan defaults (this can be combined with a classification model that predicts the probability of default, while the regression model predicts the amount in the case of a default)
  * Recommendations (the Alternating Least Squares factorization model from Chapter 4, _Building a Recommendation Engine with Spark_, solves a least squares regression problem in each iteration)
  * Predicting **customer lifetime value** (**CLTV**) in a retail, mobile, or other business, based on user behavior and spending patterns

In the following sections, we will:

  * Introduce the various types of regression models available in MLlib
  * Explore feature extraction and target variable transformation for regression models
  * Train a number of regression models using MLlib
  * See how to make predictions using the trained models
  * Investigate the impact on performance of various parameter settings for regression using cross-validation

# Types of regression models

Spark's MLlib library offers two broad classes of regression models: linear models and decision tree regression models.

Linear models are essentially the same as their classification counterparts; the only difference is that linear regression models use a different loss function, related link function, and decision function. MLlib provides a standard least squares regression model (although other types of generalized linear models for regression are planned).

Decision trees can also be used for regression by changing the impurity measure.

## Least squares regression

You might recall from Chapter 5, _Building a Classification Model with Spark_, that there are a variety of loss functions that can be applied to generalized linear models.
The loss function used for least squares is the squared loss, which is defined as follows:

    ½ (w^T x - y)^2

Here, as for the classification setting, _y_ is the target variable (this time, real valued), _w_ is the weight vector, and _x_ is the feature vector.

The related link function is the identity link, and the decision function is also the identity function, as generally, no thresholding is applied in regression. So, the model's prediction is simply _y = w^T x_.

The standard least squares regression in MLlib does not use regularization. Looking at the squared loss function, we can see that the loss applied to incorrectly predicted points is magnified, since the loss is squared. This means that least squares regression is susceptible to outliers in the dataset and also to over-fitting. Generally, as for classification, we should apply some level of regularization in practice.

Linear regression with L2 regularization is commonly referred to as ridge regression, while applying L1 regularization is called the **lasso**.

### Tip

See the section on linear least squares in the Spark MLlib documentation for further information.

## Decision trees for regression

Just like using linear models for regression tasks involves changing the loss function used, using decision trees for regression involves changing the measure of node impurity used. The impurity metric is called **variance** and is defined in the same way as the squared loss for least squares linear regression.

### Note

See the _MLlib - Decision Tree_ section in the Spark documentation for further details on the decision tree algorithm and the impurity measure for regression.

Now, we will plot a simple example of a regression problem with only one input variable shown on the _x_ axis and the target variable on the _y_ axis. The linear model prediction function is shown by a red dashed line, while the decision tree prediction function is shown by a green dashed line. We can see that the decision tree allows a more complex, nonlinear model to be fitted to the data.

Linear model and decision tree prediction functions for regression

# Extracting the right features from your data

As the underlying models for regression are the same as those for the classification case, we can use the same approach to create input features. The only practical difference is that the target is now a real-valued variable, as opposed to a categorical one. The `LabeledPoint` class in MLlib already takes this into account, as the `label` field is of the `Double` type, so it can handle both cases.

## Extracting features from the bike sharing dataset

To illustrate the concepts in this chapter, we will be using the bike sharing dataset. This dataset contains hourly records of the number of bicycle rentals in the Capital Bikeshare system. It also contains variables related to date and time, weather, and seasonal and holiday information.

### Note

The dataset is available from the UCI Machine Learning Repository.

Click on the **Data Folder** link and then download the `Bike-Sharing-Dataset.zip` file.

The bike sharing data was enriched with weather and seasonal data by Hadi Fanaee-T at the University of Porto and used in the following paper:

Fanaee-T, Hadi and Gama, Joao, Event labeling combining ensemble detectors and background knowledge, _Progress in Artificial Intelligence_, pp. 1-15, Springer Berlin Heidelberg, 2013.

The paper is available online.

Once you have downloaded the `Bike-Sharing-Dataset.zip` file, unzip it.
This will create a directory called `Bike-Sharing-Dataset`, which contains the `day.csv`, `hour.csv`, and `Readme.txt` files.

The `Readme.txt` file contains information on the dataset, including the variable names and descriptions. Take a look at the file, and you will see that we have the following variables available:

  * `instant`: This is the record ID
  * `dteday`: This is the raw date
  * `season`: This is the season (spring, summer, fall, or winter)
  * `yr`: This is the year (2011 or 2012)
  * `mnth`: This is the month of the year
  * `hr`: This is the hour of the day
  * `holiday`: This is whether the day was a holiday or not
  * `weekday`: This is the day of the week
  * `workingday`: This is whether the day was a working day or not
  * `weathersit`: This is a categorical variable that describes the weather at a particular time
  * `temp`: This is the normalized temperature
  * `atemp`: This is the normalized apparent temperature
  * `hum`: This is the normalized humidity
  * `windspeed`: This is the normalized wind speed
  * `cnt`: This is the target variable, that is, the count of bike rentals for that hour

We will work with the hourly data contained in `hour.csv`. If you look at the first line of the dataset, you will see that it contains the column names as a header. You can view it by running the following command:

    **> head -1 hour.csv**

This should output the following result:

    **instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt**

Before we work with the data in Spark, we will again remove the header from the first line of the file using the same `sed` command that we used previously, creating a new file called `hour_noheader.csv`:

    **> sed 1d hour.csv > hour_noheader.csv**

Since we will be doing some plotting of our dataset later on, we will use the Python shell for this chapter. This also serves to illustrate how to use MLlib's linear model and decision tree functionality from PySpark.

Start up your PySpark shell from your Spark installation directory. If you want to use IPython, which we highly recommend, remember to include the `IPYTHON=1` environment variable together with the `pylab` functionality:

    **> IPYTHON=1 IPYTHON_OPTS="--pylab" ./bin/pyspark**

If you prefer to use IPython Notebook, you can start it with the following command:

    **> IPYTHON=1 IPYTHON_OPTS=notebook ./bin/pyspark**

You can type all the code that follows for the remainder of this chapter directly into your PySpark shell (or into IPython Notebook if you wish to use it).

### Tip

Recall that we used the IPython shell in Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_. Take a look at that chapter and the code bundle for instructions to install IPython.

We'll start as usual by loading the dataset and inspecting it:

    path = "/**PATH**/hour_noheader.csv"
    raw_data = sc.textFile(path)
    num_data = raw_data.count()
    records = raw_data.map(lambda x: x.split(","))
    first = records.first()
    print first
    print num_data

You should see the following output:

    **[u'1', u'2011-01-01', u'1', u'0', u'1', u'0', u'0', u'6', u'0', u'1', u'0.24', u'0.2879', u'0.81', u'0', u'3', u'13', u'16']**
    **17379**

So, we have `17,379` hourly records in our dataset. We have inspected the column names already. We will ignore the record ID and raw date columns.
We will also ignore the `casual` and `registered` count target variables and focus on the overall count variable, `cnt` (which is the sum of the other two counts). We are left with 12 variables. The first eight are categorical, while the last four are normalized real-valued variables.

To deal with the eight categorical variables, we will use the binary encoding approach with which you should be quite familiar by now. The four real-valued variables will be left as is.

We will first cache our dataset, since we will be reading from it many times:

    records.cache()

In order to extract each categorical feature into a binary vector form, we will need to know the feature mapping of each feature value to the index of the nonzero value in our binary vector. Let's define a function that will extract this mapping from our dataset for a given column:

    def get_mapping(rdd, idx):
        return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

Our function first maps the field to its set of distinct values and then uses the `zipWithIndex` transformation to zip each value with a unique index, such that a key-value RDD is formed, where the key is the variable value and the value is the index. This index will be the index of the nonzero entry in the binary vector representation of the feature. We will finally collect this RDD back to the driver as a Python dictionary.

We can test our function on the third variable column (index 2):

    print "Mapping of first categorical feature column: %s" % get_mapping(records, 2)

The preceding line of code will give us the following output:

    **Mapping of first categorical feature column: {u'1': 0, u'3': 2, u'2': 1, u'4': 3}**

Now, we can apply this function to each categorical column (that is, for variable indices 2 to 9):

    mappings = [get_mapping(records, i) for i in range(2,10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[10:14])
    total_len = num_len + cat_len

We now have the mappings for each variable, and we can see how many values in total we need for our binary vector representation:

    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len

The output of the preceding code is as follows:

    **Feature vector length for categorical features: 57**
    **Feature vector length for numerical features: 4**
    **Total feature vector length: 61**

### Creating feature vectors for the linear model

The next step is to use our extracted mappings to convert the categorical features to binary-encoded features. Again, it will be helpful to create a function that we can apply to each record in our dataset for this purpose. We will also create a function to extract the target variable from each record.
We will need to import `numpy` for linear algebra utilities and MLlib's `LabeledPoint` class to wrap our feature vectors and target variables:

    from pyspark.mllib.regression import LabeledPoint
    import numpy as np

    def extract_features(record):
        cat_vec = np.zeros(cat_len)
        i = 0
        step = 0
        # iterate over all eight categorical columns (indices 2 to 9)
        for field in record[2:10]:
            m = mappings[i]
            idx = m[field]
            cat_vec[idx + step] = 1
            i = i + 1
            step = step + len(m)
        num_vec = np.array([float(field) for field in record[10:14]])
        return np.concatenate((cat_vec, num_vec))

    def extract_label(record):
        return float(record[-1])

In the preceding `extract_features` function, we ran through each categorical column in the row of data. We extracted the binary encoding for each variable in turn from the mappings we created previously. The `step` variable ensures that the nonzero feature index in the full feature vector is correct (and is somewhat more efficient than, say, creating many smaller binary vectors and concatenating them). The numeric vector is created directly by first converting the data to floating point numbers and wrapping these in a `numpy` array. The resulting two vectors are then concatenated. The `extract_label` function simply converts the last column variable (the count) into a float.

With our utility functions defined, we can proceed with extracting feature vectors and labels from our data records:

    data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))

Let's inspect the first record in the extracted feature RDD:

    first_point = data.first()
    print "Raw data: " + str(first[2:])
    print "Label: " + str(first_point.label)
    print "Linear Model feature vector:\n" + str(first_point.features)
    print "Linear Model feature vector length: " + str(len(first_point.features))

You should see output similar to the following:

    **Raw data: [u'1', u'0', u'1', u'0', u'0', u'6', u'0', u'1', u'0.24', u'0.2879', u'0.81', u'0', u'3', u'13', u'16']**
    **Label: 16.0**
    **Linear Model feature vector: [1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.24,0.2879,0.81,0.0]**
    **Linear Model feature vector length: 61**

As we can see, we converted the raw data into a feature vector made up of the binary categorical and real numeric features, and we indeed have a total vector length of `61`.

### Creating feature vectors for the decision tree

As we have seen, decision tree models typically work on raw features (that is, it is not required to convert categorical features into a binary vector encoding; they can, instead, be used directly).
Therefore, we will create a separate function to extract the decision tree feature vector, which simply converts all the values to floats and wraps them in a `numpy` array:

    def extract_features_dt(record):
        return np.array(map(float, record[2:14]))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
    first_point_dt = data_dt.first()
    print "Decision Tree feature vector: " + str(first_point_dt.features)
    print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

The following output shows the extracted feature vector, and we can see that we have a vector length of `12`, which matches the number of raw variables we are using:

    **Decision Tree feature vector: [1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0]**
    **Decision Tree feature vector length: 12**

# Training and using regression models

Training regression models using decision trees and linear models follows the same procedure as for classification models. We simply pass the training data contained in an RDD of `LabeledPoint` instances to the relevant `train` method. Note that in Scala, if we wanted to customize the various model parameters (such as regularization and step size for the SGD optimizer), we would be required to instantiate a new model instance and use the `optimizer` field to access the available parameter setters.

In Python, we are provided with a convenience method that gives us access to all the available model arguments, so we only have to use this one entry point for training. We can see the details of these convenience functions by importing the relevant modules and then calling the `help` function on the `train` methods:

    from pyspark.mllib.regression import LinearRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree
    help(LinearRegressionWithSGD.train)

Doing this for the linear model outputs the following documentation:

Linear regression help documentation

We can see from the linear regression documentation that we need to pass in the training data at a minimum, but we can set any of the other model parameters using this `train` method.

Similarly, the decision tree model has a `trainRegressor` method (in addition to a `trainClassifier` method for classification models):

    help(DecisionTree.trainRegressor)

The preceding code will display the following documentation:

Decision tree regression help documentation

## Training a regression model on the bike sharing dataset

We're ready to use the features we have extracted to train our models on the bike sharing data. First, we'll train the linear regression model and take a look at the first few predictions that the model makes on the data:

    linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
    true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
    print "Linear Model predictions: " + str(true_vs_predicted.take(5))

Note that we have not used the default settings for `iterations` and `step` here. We've changed the number of iterations so that the model does not take too long to train. As for the step size, you will see why this has been changed from the default a little later.
You will see the following output:

    **Linear Model predictions: [(16.0, 119.30920003093595), (40.0, 116.95463511937379), (32.0, 116.57294610647752), (13.0, 116.43535423855654), (1.0, 116.221247828503)]**

Next, we will train the decision tree model simply using the default arguments to the `trainRegressor` method (which equates to using a tree depth of 5). Note that we need to pass in the other form of the dataset, `data_dt`, that we created from the raw feature values (as opposed to the binary encoded features that we used for the preceding linear model).

We also need to pass in an argument for `categoricalFeaturesInfo`. This is a dictionary that maps the categorical feature index to the number of categories for the feature. If a feature is not in this mapping, it will be treated as continuous. For our purposes, we will leave this as is, passing in an empty mapping:

    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())

This should output these predictions:

    **Decision Tree predictions: [(16.0, 54.913223140495866), (40.0, 54.913223140495866), (32.0, 53.171052631578945), (13.0, 14.284023668639053), (1.0, 14.284023668639053)]**
    **Decision Tree depth: 5**
    **Decision Tree number of nodes: 63**

### Note

Treating the categorical features as continuous variables is not as bad as it sounds. While we do not cover it here, the Python code included with this chapter's code bundle includes an example of using `categoricalFeaturesInfo`. It does not make a large difference to performance in this case.

From a quick glance at these predictions, it appears that the decision tree might do better, as the linear model is quite a way off in its predictions. However, we will apply more stringent evaluation methods to find out.

# Evaluating the performance of regression models

We saw in Chapter 5, _Building a Classification Model with Spark_ , that evaluation methods for classification models typically focus on measurements related to predicted class memberships relative to the actual class memberships. These are binary outcomes (either the predicted class is correct or incorrect), and it is less important whether the model just barely predicted correctly or not; what we care most about is the number of correct and incorrect predictions.

When dealing with regression models, it is very unlikely that our model will precisely predict the target variable, because the target variable can take on any real value. However, we would naturally like to understand how far away our predicted values are from the true values, so we will utilize a metric that takes into account the overall deviation.

Some of the standard evaluation metrics used to measure the performance of regression models include the **Mean Squared Error** ( **MSE** ) and **Root Mean Squared Error** ( **RMSE** ), the **Mean Absolute Error** ( **MAE** ), the R-squared coefficient, and many others.

## Mean Squared Error and Root Mean Squared Error

MSE is the average of the squared error that is used as the loss function for least squares regression. It is the sum, over all the data points, of the square of the difference between the predicted and actual target variables, divided by the number of data points.
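In standard notation, where _n_ is the number of data points, _yi_ is the actual target value of the _i_ th record, and _ŷi_ is the corresponding predicted value, this is:

    \mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} \left( \hat{y}_i - y_i \right)^2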
RMSE is the square root of MSE. MSE is measured in units that are the square of the target variable, while RMSE is measured in the same units as the target variable. Due to its formulation, MSE, just like the squared loss function that it derives from, effectively penalizes larger errors more severely.

In order to evaluate our model based on the mean of an error metric, we will first make predictions for each input feature vector in an RDD of `LabeledPoint` instances; we will then compute the error for each record using a function that takes the prediction and the true target value as inputs. This will return an RDD of `Double` values that contains the error values. We can then find the average using the `mean` method of RDDs that contain `Double` values.

Let's define our squared error function as follows:

    def squared_error(actual, pred):
        return (pred - actual)**2

## Mean Absolute Error

MAE is the average of the absolute differences between the predicted and actual targets. It is similar in principle to MSE, but it does not punish large deviations as much.

Our function to compute MAE is as follows:

    def abs_error(actual, pred):
        return np.abs(pred - actual)

## Root Mean Squared Log Error

This measurement is not as widely used as MSE and MAE, but it is used as the metric for the Kaggle competition that uses the bike sharing dataset. It is effectively the RMSE of the log-transformed predicted and target values. This measurement is useful when there is a wide range in the target variable, and you do not necessarily want to penalize large errors when the predicted and target values are themselves high. It is also effective when you care about percentage errors rather than the absolute value of errors.

### Note

The RMSLE metric is described in detail on the Kaggle competition's evaluation page.

The function to compute RMSLE is shown here:

    def squared_log_error(pred, actual):
        return (np.log(pred + 1) - np.log(actual + 1))**2

## The R-squared coefficient

The R-squared coefficient, also known as the coefficient of determination, is a measure of how well a model fits a dataset. It is commonly used in statistics. It measures the degree of variation in the target variable that is explained by the variation in the input features. An R-squared coefficient generally takes a value between 0 and 1, where 1 equates to a perfect fit of the model.

## Computing performance metrics on the bike sharing dataset

Given the functions we defined earlier, we can now compute the various evaluation metrics on our bike sharing data.
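For reference, the metrics implemented by the preceding functions can be written in standard notation as follows (these are the textbook definitions, matching the Python functions we just defined; _ȳ_ denotes the mean of the actual target values):

    \mathrm{MAE} = \frac{1}{n} \sum_{i=1}^{n} \left| \hat{y}_i - y_i \right|

    \mathrm{RMSLE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( \log(\hat{y}_i + 1) - \log(y_i + 1) \right)^2}

    R^2 = 1 - \frac{\sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2}{\sum_{i=1}^{n} \left( y_i - \bar{y} \right)^2}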
### Linear model

Our approach will be to apply the relevant error function to each record in the `RDD` we computed earlier, which is `true_vs_predicted` for our linear model:

    mse = true_vs_predicted.map(lambda (t, p): squared_error(t, p)).mean()
    mae = true_vs_predicted.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle = np.sqrt(true_vs_predicted.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Linear Model - Mean Squared Error: %2.4f" % mse
    print "Linear Model - Mean Absolute Error: %2.4f" % mae
    print "Linear Model - Root Mean Squared Log Error: %2.4f" % rmsle

This outputs the following metrics:

    **Linear Model - Mean Squared Error: 28166.3824**
    **Linear Model - Mean Absolute Error: 129.4506**
    **Linear Model - Root Mean Squared Log Error: 1.4974**

### Decision tree

We will use the same approach for the decision tree model, using the `true_vs_predicted_dt` RDD:

    mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
    mae_dt = true_vs_predicted_dt.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_dt = np.sqrt(true_vs_predicted_dt.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Decision Tree - Mean Squared Error: %2.4f" % mse_dt
    print "Decision Tree - Mean Absolute Error: %2.4f" % mae_dt
    print "Decision Tree - Root Mean Squared Log Error: %2.4f" % rmsle_dt

You should see output similar to this:

    **Decision Tree - Mean Squared Error: 11560.7978**
    **Decision Tree - Mean Absolute Error: 71.0969**
    **Decision Tree - Root Mean Squared Log Error: 0.6259**

Looking at the results, we can see that our initial guess about the decision tree model being the better performer is indeed true.

### Note

The Kaggle competition leaderboard lists the Mean Value Benchmark score on the test set at about 1.58. So, we see that our linear model performance is not much better. However, the decision tree with default settings achieves a performance of 0.63.

The winning score at the time of writing this book is listed as 0.29504.

# Improving model performance and tuning parameters

In Chapter 5, _Building a Classification Model with Spark_ , we showed how feature transformation and selection can make a large difference to the performance of a model. In this chapter, we will focus on another type of transformation that can be applied to a dataset: transforming the target variable itself.

## Transforming the target variable

Recall that many machine learning models, including linear models, make assumptions regarding the distribution of the input data as well as target variables. In particular, standard linear regression assumes that the target variable (more precisely, the model's error term) follows a normal distribution.

In many real-world cases, the distributional assumptions of linear regression do not hold. In this case, for example, we know that the number of bike rentals can never be negative. This alone should indicate that the assumption of normality might be problematic. To get a better idea of the target distribution, it is often a good idea to plot a histogram of the target values.

In this section, if you are using IPython Notebook, enter the magic function, `%pylab inline`, to import `pylab` (that is, the `numpy` and `matplotlib` plotting functions) into the workspace. This will also create any figures and plots inline within the notebook cell.

If you are using the standard IPython console, you can use `%pylab` to import the necessary functionality (your plots will appear in a separate window).
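If you would rather not rely on the `%pylab` magic at all, a minimal explicit setup is sketched here; it assumes `numpy` and `matplotlib` are installed and simply imports the plotting names (`hist`, `plot`, `bar`) that the following code uses from the `pylab` namespace:

    # A sketch of the imports that %pylab would otherwise provide.
    # Assumes numpy and matplotlib are installed.
    import numpy as np
    import matplotlib
    import matplotlib.pyplot
    from matplotlib.pyplot import hist, plot, bar, show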
We will now create a plot of the target variable distribution in the following piece of code:

    targets = records.map(lambda r: float(r[-1])).collect()
    hist(targets, bins=40, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

Looking at the histogram plot, we can see that the distribution is highly skewed and certainly does not follow a normal distribution:

Distribution of raw target variable values

One way in which we might deal with this situation is by applying a transformation to the target variable, such that we take the logarithm of the target value instead of the raw value. This is often referred to as log-transforming the target variable (this transformation can also be applied to feature values).

We will now apply a log transformation to the target variable and plot a histogram of the log-transformed values:

    log_targets = records.map(lambda r: np.log(float(r[-1]))).collect()
    hist(log_targets, bins=40, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

Distribution of log-transformed target variable values

A second type of transformation that is useful for target values that do not take on negative values and, in addition, might take on a very wide range of values, is to take the square root of the variable.

We will apply the square root transform in the following code, once more plotting the resulting target variable distribution:

    sqrt_targets = records.map(lambda r: np.sqrt(float(r[-1]))).collect()
    hist(sqrt_targets, bins=40, color='lightblue', normed=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)

From the plots of the log and square root transformations, we can see that both result in a more even distribution relative to the raw values. While they are still not normally distributed, they are a lot closer to a normal distribution when compared to the original target variable.

Distribution of square-root-transformed target variable values

### Impact of training on log-transformed targets

So, does applying these transformations have any impact on model performance? Let's evaluate the various metrics we used previously on log-transformed data as an example.

We will do this first for the linear model by applying the `numpy log` function to the `label` field of each `LabeledPoint` RDD. Here, we will only transform the target variable, and we will not apply any transformations to the features:

    data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))

We will then train a model on this transformed data and form the RDD of predicted versus true values:

    model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)

Note that now that we have transformed the target variable, the predictions of the model will be on the log scale, as will the target values of the transformed dataset. Therefore, in order to use our model and evaluate its performance, we must first transform the log data back into the original scale by taking the exponent of both the predicted and true values using the `numpy exp` function.
We will show you how to do this in the code here:

    true_vs_predicted_log = data_log.map(lambda p: (np.exp(p.label), np.exp(model_log.predict(p.features))))

Finally, we will compute the MSE, MAE, and RMSLE metrics for the model:

    mse_log = true_vs_predicted_log.map(lambda (t, p): squared_error(t, p)).mean()
    mae_log = true_vs_predicted_log.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_log = np.sqrt(true_vs_predicted_log.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Mean Squared Error: %2.4f" % mse_log
    print "Mean Absolute Error: %2.4f" % mae_log
    print "Root Mean Squared Log Error: %2.4f" % rmsle_log
    print "Non log-transformed predictions:\n" + str(true_vs_predicted.take(3))
    print "Log-transformed predictions:\n" + str(true_vs_predicted_log.take(3))

You should see output similar to the following:

    **Mean Squared Error: 38606.0875**
    **Mean Absolute Error: 135.2726**
    **Root Mean Squared Log Error: 1.3516**
    **Non log-transformed predictions:**
    **[(16.0, 119.30920003093594), (40.0, 116.95463511937378), (32.0, 116.57294610647752)]**
    **Log-transformed predictions:**
    **[(15.999999999999998, 45.860944832110015), (40.0, 43.255903592233274), (32.0, 42.311306147884252)]**

If we compare these results to the results on the raw target variable, we see that while we did not improve the MSE or MAE, we improved the RMSLE.

We will perform the same analysis for the decision tree model:

    data_dt_log = data_dt.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    dt_model_log = DecisionTree.trainRegressor(data_dt_log, {})

    preds_log = dt_model_log.predict(data_dt_log.map(lambda p: p.features))
    actual_log = data_dt_log.map(lambda p: p.label)
    true_vs_predicted_dt_log = actual_log.zip(preds_log).map(lambda (t, p): (np.exp(t), np.exp(p)))

    mse_log_dt = true_vs_predicted_dt_log.map(lambda (t, p): squared_error(t, p)).mean()
    mae_log_dt = true_vs_predicted_dt_log.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_log_dt = np.sqrt(true_vs_predicted_dt_log.map(lambda (t, p): squared_log_error(t, p)).mean())
    print "Mean Squared Error: %2.4f" % mse_log_dt
    print "Mean Absolute Error: %2.4f" % mae_log_dt
    print "Root Mean Squared Log Error: %2.4f" % rmsle_log_dt
    print "Non log-transformed predictions:\n" + str(true_vs_predicted_dt.take(3))
    print "Log-transformed predictions:\n" + str(true_vs_predicted_dt_log.take(3))

From the results here, we can see that we actually made our metrics slightly worse for the decision tree:

    **Mean Squared Error: 14781.5760**
    **Mean Absolute Error: 76.4131**
    **Root Mean Squared Log Error: 0.6406**
    **Non log-transformed predictions:**
    **[(16.0, 54.913223140495866), (40.0, 54.913223140495866), (32.0, 53.171052631578945)]**
    **Log-transformed predictions:**
    **[(15.999999999999998, 37.530779787154508), (40.0, 37.530779787154508), (32.0, 7.2797070993907287)]**

### Tip

It is probably not surprising that the log transformation results in a better RMSLE performance for the linear model. As we are minimizing the squared error, once we have transformed the target variable to log values, we are effectively minimizing a loss function that is very similar to the RMSLE.

This is good for Kaggle competition purposes, since we can more directly optimize against the competition-scoring metric.

It might or might not be as useful in a real-world situation.
This depends on how important larger absolute errors are (recall that RMSLE essentially penalizes relative errors rather than the absolute magnitude of errors).

## Tuning model parameters

So far in this chapter, we have illustrated the concepts of model training and evaluation for MLlib's regression models by training and testing on the same dataset. We will now use a cross-validation approach similar to the one we used previously to evaluate the effect of different parameter settings on the performance of our models.

### Creating training and testing sets to evaluate parameters

The first step is to create a test and training set for cross-validation purposes. Spark's Python API does not yet provide the `randomSplit` convenience method that is available in Scala. Hence, we will need to create a training and test dataset manually.

One relatively easy way to do this is by first taking a random sample of, say, 20 percent of our data as our test set. We will then define our training set as the elements of the original RDD that are not in the test set RDD.

We can achieve this using the `sample` method to take a random sample for our test set, followed by using the `subtractByKey` method, which takes care of returning the elements in one RDD where the keys do not overlap with those of the other RDD.

Note that `subtractByKey`, as the name suggests, works on the keys of the RDD elements that consist of key-value pairs. Therefore, here we will use `zipWithIndex` on our RDD of extracted training examples. This creates an RDD of `(LabeledPoint, index)` pairs.

We will then reverse the keys and values so that we can operate on the index keys:

    data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))
    test = data_with_idx.sample(False, 0.2, 42)
    train = data_with_idx.subtractByKey(test)

Once we have the two RDDs, we will recover just the `LabeledPoint` instances we need for training and test data, using `map` to extract the value from the key-value pairs:

    train_data = train.map(lambda (idx, p): p)
    test_data = test.map(lambda (idx, p): p)
    train_size = train_data.count()
    test_size = test_data.count()
    print "Training data size: %d" % train_size
    print "Test data size: %d" % test_size
    print "Total data size: %d" % num_data
    print "Train + Test size : %d" % (train_size + test_size)

We can confirm that we now have two distinct datasets that add up to the original dataset in total:

    **Training data size: 13934**
    **Test data size: 3445**
    **Total data size: 17379**
    **Train + Test size : 17379**

The final step is to apply the same approach to the features extracted for the decision tree model:

    data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
    test_dt = data_with_idx_dt.sample(False, 0.2, 42)
    train_dt = data_with_idx_dt.subtractByKey(test_dt)
    train_data_dt = train_dt.map(lambda (idx, p): p)
    test_data_dt = test_dt.map(lambda (idx, p): p)

### The impact of parameter settings for linear models

Now that we have prepared our training and test sets, we are ready to investigate the impact of different parameter settings on model performance. We will first carry out this evaluation for the linear model. We will create a convenience function to evaluate the relevant performance metric by training the model on the training set and evaluating it on the test set for different parameter settings.
We will use the RMSLE evaluation metric, as it is the one used in the Kaggle competition with this dataset, and this allows us to compare our model results against the competition leaderboard to see how we perform.

The evaluation function is defined here:

    def evaluate(train, test, iterations, step, regParam, regType, intercept):
        model = LinearRegressionWithSGD.train(train, iterations, step, regParam=regParam, regType=regType, intercept=intercept)
        tp = test.map(lambda p: (p.label, model.predict(p.features)))
        rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
        return rmsle

### Tip

Note that in the following sections, you might get slightly different results due to some random initialization for SGD. However, your results will be comparable.

#### Iterations

As we saw when evaluating our classification models, we generally expect that a model trained with SGD will achieve better performance as the number of iterations increases, although the increase in performance will slow down as the number of iterations goes above some minimum number. Note that here, we will set the step size to 0.01 to better illustrate the impact at higher iteration numbers:

    params = [1, 5, 10, 20, 50, 100]
    metrics = [evaluate(train_data, test_data, param, 0.01, 0.0, 'l2', False) for param in params]
    print params
    print metrics

The output shows that the error metric indeed decreases as the number of iterations increases. It also does so at a decreasing rate, again as expected. What is interesting is that eventually, the SGD optimization tends to overshoot the optimal solution, and the RMSLE eventually starts to increase slightly:

    **[1, 5, 10, 20, 50, 100]**
    **[2.3532904530306888, 1.6438528499254723, 1.4869656275309227, 1.4149741941240344, 1.4159641262731959, 1.4539667094611679]**

Here, we will use the `matplotlib` library to plot a graph of the RMSLE metric against the number of iterations. We will use a log scale for the _x_ axis to make the output easier to visualize:

    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()
    pyplot.xscale('log')

Metrics for varying number of iterations

#### Step size

We will perform a similar analysis for step size in the following code:

    params = [0.01, 0.025, 0.05, 0.1, 1.0]
    metrics = [evaluate(train_data, test_data, 10, param, 0.0, 'l2', False) for param in params]
    print params
    print metrics

The output of the preceding code is as follows:

    **[0.01, 0.025, 0.05, 0.1, 1.0]**
    **[1.4869656275309227, 1.4189071944747715, 1.5027293911925559, 1.5384660954019973, nan]**

Now, we can see why we avoided using the default step size when training the linear model originally. The default is set to _1.0_ , which, in this case, results in a `nan` output for the RMSLE metric. This typically means that the SGD model has converged to a very poor local minimum in the error function that it is optimizing. This can happen when the step size is relatively large, as it is easier for the optimization algorithm to overshoot good solutions.

We can also see that for low step sizes and a relatively low number of iterations (we used 10 here), the model performance is slightly poorer. However, in the preceding _Iterations_ section, we saw that for the lower step-size setting, a higher number of iterations will generally converge to a better solution.

Generally speaking, setting step size and number of iterations involves a trade-off. A lower step size means that convergence is slower but slightly more assured.
However, it requires a higher number of iterations, which is more costly in terms of computation and time, in particular at a very large scale.

### Tip

Selecting the best parameter settings can be an intensive process that involves training a model on many combinations of parameter settings and selecting the best outcome. Each instance of model training involves a number of iterations, so this process can be very expensive and time consuming when performed on very large datasets.

The output is plotted here, again using a log scale for the step-size axis:

Metrics for varying values of step size

#### L2 regularization

In Chapter 5, _Building a Classification Model with Spark_ , we saw that regularization has the effect of penalizing model complexity in the form of an additional loss term that is a function of the model weight vector. L2 regularization penalizes the L2-norm of the weight vector, while L1 regularization penalizes the L1-norm.

We expect training set performance to deteriorate with increasing regularization, as the model cannot fit the dataset as well. However, we would also expect some amount of regularization to result in optimal generalization performance, as evidenced by the best performance on the test set.

We will evaluate the impact of different levels of L2 regularization in this code:

    params = [0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]
    metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l2', False) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()
    pyplot.xscale('log')

As expected, there is an optimal setting of the regularization parameter with respect to the test set RMSLE:

    **[0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]**
    **[1.5384660954019971, 1.5379108106882864, 1.5329809395123755, 1.4900275345312988, 1.4016676336981468, 1.40998359211149, 1.5381771283158705]**

This is easiest to see in the following plot (where we once more use the log scale for the regularization parameter axis):

Metrics for varying levels of L2 regularization

#### L1 regularization

We can apply the same approach for differing levels of L1 regularization:

    params = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l1', False) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()
    pyplot.xscale('log')

Again, the results are more clearly seen when plotted in the following graph. We see that there is a much more subtle decline in RMSLE, and it takes a very high value to cause a jump back up. Here, the level of L1 regularization required is much higher than that for the L2 form; however, the overall performance is poorer:

    **[0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]**
    **[1.5384660954019971, 1.5384518080419873, 1.5383237472930684, 1.5372017600929164, 1.5303809928601677, 1.4352494587433793, 4.7551250073268614]**

Metrics for varying levels of L1 regularization

Using L1 regularization can encourage sparse weight vectors. Does this hold true in this case?
We can find out by examining the number of entries in the weight vector that are zero, with increasing levels of regularization:

    model_l1 = LinearRegressionWithSGD.train(train_data, 10, 0.1, regParam=1.0, regType='l1', intercept=False)
    model_l1_10 = LinearRegressionWithSGD.train(train_data, 10, 0.1, regParam=10.0, regType='l1', intercept=False)
    model_l1_100 = LinearRegressionWithSGD.train(train_data, 10, 0.1, regParam=100.0, regType='l1', intercept=False)
    print "L1 (1.0) number of zero weights: " + str(sum(model_l1.weights.array == 0))
    print "L1 (10.0) number of zero weights: " + str(sum(model_l1_10.weights.array == 0))
    print "L1 (100.0) number of zero weights: " + str(sum(model_l1_100.weights.array == 0))

We can see from the results that, as we might expect, the number of zero feature weights in the model weight vector increases as greater levels of L1 regularization are applied:

    **L1 (1.0) number of zero weights: 4**
    **L1 (10.0) number of zero weights: 20**
    **L1 (100.0) number of zero weights: 55**

#### Intercept

The final parameter option for the linear model is whether to use an intercept or not. An intercept is a constant term that is added to the model and effectively accounts for the mean value of the target variable. If the data is already centered or normalized, an intercept is not necessary; however, it often does not hurt to use one in any case.

We will evaluate the effect of adding an intercept term to the model here:

    params = [False, True]
    metrics = [evaluate(train_data, test_data, 10, 0.1, 1.0, 'l2', param) for param in params]
    print params
    print metrics
    bar(params, metrics, color='lightblue')
    fig = matplotlib.pyplot.gcf()

We can see from the result and plot that adding the intercept term results in a very slight increase in RMSLE:

    **[False, True]**
    **[1.4900275345312988, 1.506469812020645]**

Metrics without and with an intercept

### The impact of parameter settings for the decision tree

Decision trees provide two main parameters: maximum tree depth and the maximum number of bins. We will now perform the same evaluation of the effect of parameter settings for the decision tree model. Our starting point is to create an evaluation function for the model, similar to the one used for the linear regression earlier. This function is provided here:

    def evaluate_dt(train, test, maxDepth, maxBins):
        model = DecisionTree.trainRegressor(train, {}, impurity='variance', maxDepth=maxDepth, maxBins=maxBins)
        preds = model.predict(test.map(lambda p: p.features))
        actual = test.map(lambda p: p.label)
        tp = actual.zip(preds)
        rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
        return rmsle

#### Tree depth

We would generally expect performance to increase with more complex trees (that is, trees of greater depth). Having a lower tree depth acts as a form of regularization, and it might be the case that, as with L2 or L1 regularization in linear models, there is a tree depth that is optimal with respect to the test set performance.
Here, we will try increasing the tree depth to see what impact it has on test set RMSLE, keeping the number of bins at the default level of `32`:

    params = [1, 2, 3, 4, 5, 10, 20]
    metrics = [evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()

In this case, it appears that the decision tree starts over-fitting at deeper tree levels. An optimal tree depth appears to be around 10 on this dataset.

### Note

Notice that our best RMSLE of 0.42 is now quite close to the Kaggle winner of around 0.29!

The output of the tree depth evaluation is as follows:

    **[1, 2, 3, 4, 5, 10, 20]**
    **[1.0280339660196287, 0.92686672078778276, 0.81807794023407532, 0.74060228537329209, 0.63583503599563096, 0.42851360418692447, 0.45500008049779139]**

Metrics for different tree depths

#### Maximum bins

Finally, we will perform our evaluation on the impact of setting the number of bins for the decision tree. As with the tree depth, a larger number of bins should allow the model to become more complex and might help performance with larger feature dimensions. After a certain point, it is unlikely that it will help any more and might, in fact, hinder performance on the test set due to over-fitting:

    params = [2, 4, 8, 16, 32, 64, 100]
    metrics = [evaluate_dt(train_data_dt, test_data_dt, 5, param) for param in params]
    print params
    print metrics
    plot(params, metrics)
    fig = matplotlib.pyplot.gcf()

Here, we show the output and plot for varying the number of bins (while keeping the tree depth at the default level of 5). In this case, using a small number of bins hurts performance, while there is no impact when we use around 32 bins (the default setting) or more. There seems to be an optimal setting for test set performance at around 16-20 bins:

    **[2, 4, 8, 16, 32, 64, 100]**
    **[1.3069788763726049, 0.81923394899750324, 0.75745322513058744, 0.62328384445223795, 0.63583503599563096, 0.63583503599563096, 0.63583503599563096]**

Metrics for different maximum bins

# Summary

In this chapter, you saw how to use MLlib's linear model and decision tree functionality in Python within the context of regression models. We explored categorical feature extraction and the impact of applying transformations to the target variable in a regression problem. Finally, we implemented various performance-evaluation metrics and used them to implement a cross-validation exercise that explores the impact of the various parameter settings available in both linear models and decision trees on test set model performance.

In the next chapter, we will cover a different approach to machine learning, namely unsupervised learning, specifically clustering models.

# Chapter 7. Building a Clustering Model with Spark

In the last few chapters, we covered supervised learning methods, where the training data is labeled with the true outcome that we would like to predict (for example, a rating for recommendations, a class assignment for classification, or a real target variable in the case of regression).

Next, we will consider the case when we do not have labeled data available. This is called unsupervised learning, as the model is not supervised with the true target label.
The unsupervised case is very common in practice, since obtaining labeled training data can be very difficult or expensive in many real-world scenarios (for example, having humans label training data with class labels for classification). However, we would still like to learn some underlying structure in the data and use this structure to make predictions.

This is where unsupervised learning approaches can be useful. Unsupervised learning models are also often combined with supervised models, for example, applying unsupervised techniques to create new input features for supervised models.

Clustering models are, in many ways, the unsupervised equivalent of classification models. With classification, we tried to learn a model that would predict which class a given training example belonged to. The model was essentially a mapping from a set of features to the class.

In clustering, we would like to segment the data such that each training example is assigned to a segment called a **cluster**. The clusters act much like classes, except that the true class assignments are unknown.

Clustering models have many use cases that are the same as classification; these include the following:

  * Segmenting users or customers into different groups based on behavior characteristics and metadata
  * Grouping content on a website or products in a retail business
  * Finding clusters of similar genes
  * Segmenting communities in ecology
  * Creating image segments for use in image analysis applications such as object detection

In this chapter, we will:

  * Briefly explore a few types of clustering models
  * Extract features from data specifically using the output of one model as input features for our clustering model
  * Train a clustering model and use it to make predictions
  * Apply performance-evaluation and parameter-selection techniques to select the optimal number of clusters to use

# Types of clustering models

There are many different forms of clustering models available, ranging from simple to extremely complex ones. The MLlib library currently provides K-means clustering, which is among the simplest approaches available. However, it is often very effective, and its simplicity means it is relatively easy to understand and is scalable.

## K-means clustering

K-means attempts to partition a set of data points into K distinct clusters (where K is an input parameter for the model).

More formally, K-means tries to find clusters so as to minimize the sum of squared errors (or distances) within each cluster. This objective function is known as the **within cluster sum of squared errors** ( **WCSS** ). It is the sum, over each cluster, of the squared errors between each point and the cluster center.

Starting with a set of K initial cluster centers, the standard method for K-means iterates between the following two steps (a cluster center is simply the mean vector of all the data points assigned to that cluster):

 1. Assign each data point to the cluster that minimizes the WCSS. The sum of squares is equivalent to the squared Euclidean distance; therefore, this equates to assigning each point to the **closest** cluster center as measured by the Euclidean distance metric.
 2. Compute the new cluster centers based on the cluster assignments from the first step.

The algorithm proceeds until either a maximum number of iterations has been reached or **convergence** has been achieved.
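In standard notation, with clusters _C1, ..., CK_ and corresponding cluster centers _μ1, ..., μK_, the WCSS objective that these two steps minimize can be written as:

    \mathrm{WCSS} = \sum_{k=1}^{K} \sum_{x_i \in C_k} \left\lVert x_i - \mu_k \right\rVert^2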
Convergence means that the cluster assignments no longer change during the first step; therefore, the value of the WCSS objective function does not change either.

### Tip

For more details, refer to Spark's documentation on clustering.

To illustrate the basics of K-means, we will use the simple dataset we showed in our multiclass classification example in Chapter 5, _Building a Classification Model with Spark_. Recall that we have five classes, which are shown in the following figure:

Multiclass dataset

However, assume that we don't actually know the true classes. If we use K-means with five clusters, then after the first step, the model's cluster assignments might look like this:

Cluster assignments after the first K-means iteration

We can see that K-means has already picked out the centers of each cluster fairly well. After the next iteration, the assignments might look like those shown in the following figure:

Cluster assignments after the second K-means iteration

Things are starting to stabilize, but the overall cluster assignments are broadly the same as they were after the first iteration. Once the model has converged, the final assignments could look like this:

Final cluster assignments for K-means

As we can see, the model has done a decent job of separating the five clusters. The leftmost three are fairly accurate (with a few incorrect points). However, the two clusters in the bottom-right corner are less accurate.

This illustrates:

  * The iterative nature of K-means
  * The model's dependency on the method of initially selecting cluster centers (here, we will use a random approach)
  * That the final cluster assignments can be very good for well-separated data but can be poor for data that is more difficult

### Initialization methods

The standard initialization method for K-means, usually simply referred to as the random method, starts by randomly assigning each data point to a cluster before proceeding with the first update step.

An alternative initialization method, **K-means++**, selects each new initial cluster center with probability proportional to its squared distance from the centers already chosen. MLlib provides a parallel variant of this method, called **K-means ||**; this is the default initialization method used.

### Note

See the original K-means++ and scalable K-means (K-means ||) papers for more information.

The results of using K-means++ are shown here. Note that this time, the difficult lower-right points have been mostly correctly clustered.

Final cluster assignments for K-means++

### Variants

There are many other variants of K-means; they focus on initialization methods or the core model. One of the more common variants is fuzzy K-means. This model does not assign each point to one cluster as K-means does (a so-called hard assignment). Instead, it is a soft version of K-means, where each point can belong to many clusters, and is represented by the relative membership to each cluster. So, for K clusters, each point is represented as a K-dimensional membership vector, with each entry in this vector indicating the membership proportion in each cluster.

## Mixture models

A **mixture model** is essentially an extension of the idea behind fuzzy K-means; however, it makes an assumption that there is an underlying probability distribution that generates the data. For example, we might assume that the data points are drawn from a set of K independent Gaussian (normal) probability distributions.
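In standard notation, a Gaussian mixture models the density of a data point _x_ as a weighted sum of _K_ Gaussian components, where the mixing weights _πk_ are non-negative and sum to one:

    p(x) = \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x \mid \mu_k, \Sigma_k)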
The cluster assignments are also soft, so each point is represented by K membership weights in each of the K underlying probability distributions.

### Note

See the statistics literature on mixture models for further details and a mathematical treatment.

## Hierarchical clustering

 **Hierarchical clustering** is a structured clustering approach that results in a multilevel hierarchy of clusters, where each cluster might contain many subclusters (or child clusters). Each child cluster is, thus, linked to the parent cluster. This form of clustering is often also called tree clustering.

Agglomerative clustering is a bottom-up approach where:

  * Each data point begins in its own cluster
  * The similarity (or distance) between each pair of clusters is evaluated
  * The pair of clusters that are most similar are found; this pair is then merged to form a new cluster
  * The process is repeated until only one top-level cluster remains

 **Divisive** clustering is a top-down approach that works in reverse, starting with one cluster and at each stage, splitting a cluster into two, until all data points are allocated to their own bottom-level cluster.

### Note

The Wikipedia article on hierarchical clustering has more information.

# Extracting the right features from your data

Like most of the machine learning models we have encountered so far, K-means clustering requires numerical vectors as input. The same feature extraction and transformation approaches that we have seen for classification and regression are applicable for clustering.

As K-means, like least squares regression, uses a squared error function as the optimization objective, it tends to be impacted by outliers and features with large variance.

As for regression and classification cases, input data can be normalized and standardized to overcome this, which might improve accuracy. In some cases, however, it might be desirable not to standardize data, if, for example, the objective is to find segmentations according to certain specific features.

## Extracting features from the MovieLens dataset

For this example, we will return to the movie rating dataset we used in Chapter 4, _Building a Recommendation Engine with Spark_. Recall that we have three main datasets: one that contains the movie ratings (in the `u.data` file), a second one with user data (`u.user`), and a third one with movie data (`u.item`). We will also be using the genre data file to extract the genres for each movie (`u.genre`).

We will start by looking at the movie data:

    val movies = sc.textFile("/PATH/ml-100k/u.item")
    println(movies.first)

This should output the first line of the dataset:

    **1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0**

So, we have access to the movie title, and we already have the movies categorized into genres. Why do we need to apply a clustering model to the movies? Clustering the movies is a useful exercise for two reasons:

  * First, because we have access to the true genre labels, we can use these to evaluate the quality of the clusters that the model finds
  * Second, we might wish to segment the movies based on some other attributes or features, apart from their genres

For example, in this case, it seems that we don't have a lot of data to use for clustering, apart from the genres and title. However, this is not true--we also have the ratings data. Previously, we created a matrix factorization model from the ratings data.
The model is made up of a set of user and movie factor vectors.

We can think of the movie factors as representing each movie in a new latent feature space, where each latent feature, in turn, represents some form of structure in the ratings matrix. While it is not possible to directly interpret each latent feature, they might represent some hidden structure that influences the ratings behavior between users and movies. One factor could represent genre preference, another could refer to actors or directors, while yet another could represent the theme of the movie, and so on.

So, if we use these factor vector representations of each movie as inputs to our clustering model, we will end up with a clustering that is based on the actual rating behavior of users rather than manual genre assignments.

The same logic applies to the user factors--they represent users in the latent feature space of rating behavior, so clustering the user vectors should result in a clustering based on user rating behavior.

### Extracting movie genre labels

Before proceeding further, let's extract the genre mappings from the `u.genre` file. As you can see from the first line of the preceding dataset, we will need to map from the numerical genre assignments to the textual version so that they are readable.

Take a look at the first few lines of `u.genre`:

    val genres = sc.textFile("/PATH/ml-100k/u.genre")
    genres.take(5).foreach(println)

You should see the following output displayed:

    **unknown|0**
    **Action|1**
    **Adventure|2**
    **Animation|3**
    **Children's|4**

Here, `0` is the index of the relevant genre, while `unknown` is the genre assigned for this index. The indices correspond to the indices of the binary subvector that will represent the genres for each movie (that is, the 0s and 1s in the preceding movie data).

To extract the genre mappings, we will split each line and extract a key-value pair, where the key is the text genre and the value is the index. Note that we have to filter out an empty line at the end; this will, otherwise, throw an error when we try to split the line (see the `filter` call in the following code):

    val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")).map(array => (array(1), array(0))).collectAsMap
    println(genreMap)

The preceding code will provide the following output:

    **Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, 15 -> Sci-Fi, 8 -> Drama, 18 -> Western, ...**

Next, we'll create a new RDD from the movie data and our genre mapping; this RDD contains the movie ID, title, and genres. We will use this later to create a more readable output when we evaluate the clusters assigned to each movie by our clustering model.

In the following code section, we will map over each movie and extract the genres subvector (which will still contain `Strings` rather than `Int` indexes). We will then apply the `zipWithIndex` method to create a new collection that contains the indices of the genre subvector, and we will filter this collection so that we are left only with the positive assignments (that is, the 1s that denote a genre assignment for the relevant index). We can then use our extracted genre mapping to map these indices to the textual genres.
Finally, we will inspect the first record of the new `RDD` to see the result of these operations:

    val titlesAndGenres = movies.map(_.split("\\|")).map { array =>
      val genres = array.toSeq.slice(5, array.size)
      val genresAssigned = genres.zipWithIndex.filter { case (g, idx) =>
        g == "1"
      }.map { case (g, idx) =>
        genreMap(idx.toString)
      }
      (array(0).toInt, (array(1), genresAssigned))
    }
    println(titlesAndGenres.first)

This should output the following result:

    **(1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))**

### Training the recommendation model

To get the user and movie factor vectors, we first need to train another recommendation model. Fortunately, we have already done this in Chapter 4, _Building a Recommendation Engine with Spark_ , so we will follow the same procedure:

    import org.apache.spark.mllib.recommendation.ALS
    import org.apache.spark.mllib.recommendation.Rating
    val rawData = sc.textFile("/PATH/ml-100k/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map{ case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) }
    ratings.cache
    val alsModel = ALS.train(ratings, 50, 10, 0.1)

Recall from Chapter 4, _Building a Recommendation Engine with Spark_ , that the ALS model returned contains the factors in two RDDs of key-value pairs (called `userFeatures` and `productFeatures`) with the user or movie ID as the key and the factor as the value. We will need to extract just the factors and transform each one of them into an MLlib `Vector` to use as training input for our clustering model.

We will do this for both users and movies as follows:

    import org.apache.spark.mllib.linalg.Vectors
    val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val movieVectors = movieFactors.map(_._2)
    val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val userVectors = userFactors.map(_._2)

### Normalization

Before we train our clustering model, it might be useful to look into the distribution of the input data in the form of the factor vectors. This will tell us whether we need to normalize the training data.
We will follow the same approach as we did in Chapter 5, _Building a Classification Model with Spark_ , using MLlib's summary statistics available in the distributed `RowMatrix` class:

    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    val movieMatrix = new RowMatrix(movieVectors)
    val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
    val userMatrix = new RowMatrix(userVectors)
    val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
    println("Movie factors mean: " + movieMatrixSummary.mean)
    println("Movie factors variance: " + movieMatrixSummary.variance)
    println("User factors mean: " + userMatrixSummary.mean)
    println("User factors variance: " + userMatrixSummary.variance)

You should see output similar to the one here:

    **Movie factors mean: [0.28047737659519767,0.26886479057520024,0.2935579964446398,0.27821738264113755, ...**
    **Movie factors variance: [0.038242041794064895,0.03742229118854288,0.044116961097355877,0.057116244055791986, ...**
    **User factors mean: [0.2043520841572601,0.22135773814655782,0.2149706318418221,0.23647602029329481, ...**
    **User factors variance: [0.037749421148850396,0.02831191551960241,0.032831876953314174,0.036775110657850954, ...**

If we look at the output, we will see that there do not appear to be any important outliers that might skew the clustering results, so normalization should not be required in this case.

# Training a clustering model

Training for K-means in MLlib takes an approach similar to the other models--we pass an RDD that contains our training data to the `train` method of the `KMeans` object. Note that here we do not use `LabeledPoint` instances, as clustering does not make use of labels; we work only with the feature vectors. Thus, we use an `RDD[Vector]` as input to the `train` method.

## Training a clustering model on the MovieLens dataset

We will train a model for both the movie and user factors that we generated by running our recommendation model. We need to pass in the number of clusters K and the maximum number of iterations for the algorithm to run. Model training might run for less than the maximum number of iterations if the change in the objective function from one iteration to the next is less than the tolerance level (the default for this tolerance is 0.0001).

MLlib's K-means provides random and K-means || initialization, with the default being K-means ||. As both of these initialization methods are based on random selection to some extent, each model training run will return a different result.

K-means does not generally converge to a global optimum model, so performing multiple training runs and selecting the most optimal model from these runs is a common practice. MLlib's training methods expose an option to complete multiple model training runs. The best training run, as measured by the evaluation of the loss function, is selected as the final model.
We will first set up the required imports, as well as model parameters: K, maximum iterations, and number of runs:

    import org.apache.spark.mllib.clustering.KMeans
    val numClusters = 5
    val numIterations = 10
    val numRuns = 3

We will then run K-means on the movie factor vectors:

    val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)

Once the model has completed training, we should see output that looks something like this:

    **...**
    **14/09/02 21:53:58 INFO SparkContext: Job finished: collectAsMap at KMeans.scala:193, took 0.02043 s**
    **14/09/02 21:53:58 INFO KMeans: Iterations took 0.331 seconds.**
    **14/09/02 21:53:58 INFO KMeans: KMeans reached the max number of iterations: 10.**
    **14/09/02 21:53:58 INFO KMeans: The cost for the best run is 2586.298785925147.**
    **...**
    **movieClusterModel: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@71c6f512**

As can be seen from the log output, the model training tells us that the maximum number of iterations was reached, so the training process did not stop early based on the convergence criterion. It also shows the training set error (that is, the value of the K-means objective function) for the best run.

We can try a much larger setting for the maximum iterations and use only one training run to see an example where the K-means model converges:

    val movieClusterModelConverged = KMeans.train(movieVectors, numClusters, 100)

You should be able to see the `KMeans converged in ... iterations` text in the model output; this text indicates that after so many iterations, the K-means objective function did not decrease more than the tolerance level:

    **...**
    **14/09/02 22:04:38 INFO SparkContext: Job finished: collectAsMap at KMeans.scala:193, took 0.040685 s**
    **14/09/02 22:04:38 INFO KMeans: Run 0 finished in 34 iterations**
    **14/09/02 22:04:38 INFO KMeans: Iterations took 0.812 seconds.**
    **14/09/02 22:04:38 INFO KMeans: KMeans converged in 34 iterations.**
    **14/09/02 22:04:38 INFO KMeans: The cost for the best run is 2584.9354332904104.**
    **...**
    **movieClusterModelConverged: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@6bb28fb5**

### Tip

Notice that when we use a lower number of iterations but use multiple training runs, we typically get a training error (reported as the cost in the preceding logs) that is very similar to the one we obtain by running the model to convergence. Using the multiple runs option can, therefore, be a very effective method to find the best possible model.

Finally, we will also train a K-means model on the user factor vectors:

    val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

# Making predictions using a clustering model

Using the trained K-means model is straightforward and similar to the other models we have encountered so far, such as classification and regression.
We can make a prediction for a single `Vector` instance as follows:

    val movie1 = movieVectors.first
    val movieCluster = movieClusterModel.predict(movie1)
    println(movieCluster)

We can also make predictions for multiple inputs by passing an `RDD[Vector]` to the `predict` method of the model:

    val predictions = movieClusterModel.predict(movieVectors)
    println(predictions.take(10).mkString(","))

The resulting output is a cluster assignment for each data point:

    **0,0,1,1,2,1,0,1,1,1**

### Tip

Note that due to random initialization, the cluster assignments might change from one run of the model to another, so your results might differ from those shown earlier. The cluster IDs themselves have no inherent meaning; they are simply labeled arbitrarily, starting from 0.

## Interpreting cluster predictions on the MovieLens dataset

We have covered how to make predictions for a set of input vectors, but how do we evaluate how good the predictions are? We will cover performance metrics a little later; however, here, we will see how to manually inspect and interpret the cluster assignments made by our K-means model.

While unsupervised techniques have the advantage that they do not require us to provide labeled data for training, the disadvantage is that often, the results need to be manually interpreted. Often, we would like to further examine the clusters that are found and possibly try to interpret them and assign some sort of labeling or categorization to them.

For example, we can examine the clustering of movies we have found to try to see whether there is some meaningful interpretation of each cluster, such as a common genre or theme among the movies in the cluster. There are many approaches we can use, but we will start by taking a few movies in each cluster that are closest to the center of the cluster. These movies, we assume, would be the ones that are least likely to be marginal in terms of their cluster assignment, and so, they should be among the most representative of the movies in the cluster. By examining these sets of movies, we can see what attributes are shared by the movies in each cluster.

### Interpreting the movie clusters

To begin, we need to decide what we mean by "closest to the center of each cluster". The objective function that is minimized by K-means is the sum of Euclidean distances between each point and the cluster center, summed over all clusters. Therefore, it is natural to use the Euclidean distance as our measure. We will define this function here (note that it computes the squared Euclidean distance, which ranks points in the same order as the distance itself). Note that we will need access to certain imports from the **Breeze** library (a dependency of MLlib) for linear algebra and vector-based numerical functions:

    import breeze.linalg._
    import breeze.numerics.pow
    def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]) = pow(v1 - v2, 2).sum

### Tip

The preceding `pow` function is a Breeze universal function. This function is the same as the `pow` function from `scala.math`, except that it operates element-wise on the vector that is returned from the minus operation between the two input vectors.

Now, we will use this function to compute, for each movie, the distance of the relevant movie factor vector from the center vector of the assigned cluster.
Now, we will use this function to compute, for each movie, the distance of the relevant movie factor vector from the center vector of the assigned cluster. We will also join our cluster assignments and distances data with the movie titles and genres so that we can output the results in a more readable way:

    val titlesWithFactors = titlesAndGenres.join(movieFactors)
    val moviesAssigned = titlesWithFactors.map { case (id, ((title, genres), vector)) =>
      val pred = movieClusterModel.predict(vector)
      val clusterCentre = movieClusterModel.clusterCenters(pred)
      val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray))
      (id, title, genres.mkString(" "), pred, dist)
    }
    val clusterAssignments = moviesAssigned.groupBy { case (id, title, genres, cluster, dist) => cluster }.collectAsMap

After running the preceding code snippet, we have a local map (collected to the driver by `collectAsMap`) that contains a key-value pair for each cluster; the key is the numeric cluster identifier, and the value is a set of movies and related information. The movie information we have is the movie ID, title, genres, cluster index, and distance of the movie's factor vector from the cluster center.

Finally, we will iterate through each cluster and output the top 20 movies, ranked by distance from the cluster center, closest first:

    for ( (k, v) <- clusterAssignments.toSeq.sortBy(_._1)) {
      println(s"Cluster $k:")
      val m = v.toSeq.sortBy(_._5)
      println(m.take(20).map { case (_, title, genres, _, d) => (title, genres, d) }.mkString("\n"))
      println("=====\n")
    }

The following screenshot is an example output. Note that your output might differ due to the random initialization of both the recommendation and clustering models.

The first cluster

The first cluster, labeled 0, seems to contain a lot of old movies from the 1940s, 1950s, and 1960s, as well as a scattering of recent dramas.

The second cluster

The second cluster has a few horror movies in prominent positions, while the rest of the movies are less clear-cut, though dramas are common here too.

The third cluster

The third cluster is not clear-cut but has a fair number of comedy and drama movies.

The fourth cluster

The next cluster is more clearly associated with dramas and contains some foreign language films in particular.

The last cluster

The final cluster seems to be related predominantly to action movies, thrillers, and romances, and contains a number of relatively popular movies.

As you can see, it is not always straightforward to determine exactly what each cluster represents. However, there is some evidence here that the clustering is picking out attributes or commonalities between groups of movies, which might not be immediately obvious based only on the movie titles and genres (such as a foreign language segment, a classic movie segment, and so on). If we had more metadata available, such as directors, actors, and so on, we might be able to find out more details about the defining features of each cluster.

### Tip

We leave it as an exercise for you to perform a similar investigation into the clustering of the user factors. We have already created the input vectors in the `userVectors` variable, so you can train a K-means model on these vectors. After that, in order to evaluate the clusters, you would need to investigate the closest users for each cluster center (as we did for movies) and see if some common characteristics can be identified from the movies they have rated or the user metadata available. A minimal sketch of a possible starting point follows this tip.
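As a starting point, the assignment-and-distance step might look like the following sketch (this reuses the `userClusterModel` trained earlier; the `userAssignments` name is ours, and joining in user metadata is left as part of the exercise):

    // For each user factor vector: predict the cluster and compute the
    // squared distance to the assigned cluster center.
    val userAssignments = userVectors.map { v =>
      val cluster = userClusterModel.predict(v)
      val centre = userClusterModel.clusterCenters(cluster)
      (cluster, computeDistance(DenseVector(centre.toArray), DenseVector(v.toArray)))
    }
    // As a first look, count how many users fall into each cluster
    userAssignments.countByKey().toSeq.sortBy(_._1).foreach(println)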
# Evaluating the performance of clustering models

As with regression, classification, and recommendation models, there are many evaluation metrics that can be applied to clustering models to analyze their performance and the goodness of the clustering of the data points. Clustering evaluation is generally divided into internal and external evaluation. Internal evaluation refers to the case where the data used to train the model is also used for evaluation. External evaluation refers to using data external to the training data for evaluation purposes.

## Internal evaluation metrics

Common internal evaluation metrics include the WCSS we covered earlier (which is exactly the K-means objective function), the Davies-Bouldin index, the Dunn index, and the silhouette coefficient. All these measures tend to reward clusters where elements within a cluster are relatively close together, while elements in different clusters are relatively far away from each other.

### Note

The Wikipedia page on clustering evaluation has more details.

## External evaluation metrics

Since clustering can be thought of as unsupervised classification, if we have some form of labeled (or partially labeled) data available, we can use these labels to evaluate a clustering model. We can make predictions of clusters (that is, the class labels) using the model and evaluate the predictions against the true labels using metrics similar to some of those we saw for classification evaluation (that is, based on true positive and negative and false positive and negative rates).

These include the Rand measure, F-measure, Jaccard index, and others.

### Note

The Wikipedia page referenced earlier also covers external evaluation for clustering.

## Computing performance metrics on the MovieLens dataset

MLlib provides a convenient `computeCost` function to compute the WCSS objective function given an `RDD[Vector]`. We will compute this metric for our movie and user training data as follows:

    val movieCost = movieClusterModel.computeCost(movieVectors)
    val userCost = userClusterModel.computeCost(userVectors)
    println("WCSS for movies: " + movieCost)
    println("WCSS for users: " + userCost)

This should output a result similar to the following:

    **WCSS for movies: 2586.0777166339426**
    **WCSS for users: 1403.4137493396831**
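As a cross-check (a sketch of our own, not required by the pipeline), we can recompute the movie WCSS manually with the `computeDistance` function defined earlier; the result should match `computeCost` up to floating point precision:

    // Sum of squared distances from each point to its assigned cluster center
    val manualMovieCost = movieVectors.map { v =>
      val cluster = movieClusterModel.predict(v)
      val centre = movieClusterModel.clusterCenters(cluster)
      computeDistance(DenseVector(centre.toArray), DenseVector(v.toArray))
    }.sum
    println("Manually computed WCSS for movies: " + manualMovieCost)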
# Tuning parameters for clustering models

In contrast to many of the other models we have come across so far, K-means has only one parameter that can be tuned: K, the number of cluster centers.

## Selecting K through cross-validation

As we have done with classification and regression models, we can apply cross-validation techniques to select the optimal number of clusters for our model. This works in much the same way as it does for supervised learning methods. We will split the dataset into a training set and a test set. We will then train a model on the training set and compute the evaluation metric of interest on the test set.

We will do this for the movie clustering using the built-in WCSS evaluation metric provided by MLlib in the following code, using a 60 percent / 40 percent split between the training set and test set:

    val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainMovies = trainTestSplitMovies(0)
    val testMovies = trainTestSplitMovies(1)
    // note the argument order: KMeans.train(data, k, maxIterations, runs)
    val costsMovies = Seq(2, 3, 4, 5, 10, 20).map { k => (k, KMeans.train(trainMovies, k, numIterations, numRuns).computeCost(testMovies)) }
    println("Movie clustering cross-validation:")
    costsMovies.foreach { case (k, cost) => println(f"WCSS for K=$k is $cost%2.2f") }

This should give results that look something like the ones shown here.

The output of movie clustering cross-validation is:

    **Movie clustering cross-validation**
    **WCSS for K=2 is 942.06**
    **WCSS for K=3 is 942.67**
    **WCSS for K=4 is 950.35**
    **WCSS for K=5 is 948.20**
    **WCSS for K=10 is 943.26**
    **WCSS for K=20 is 947.10**

We can observe that, in this case, the test set WCSS fluctuates within a fairly narrow band as the number of clusters increases. A common pattern observed in the cross-validation WCSS for K-means is that the metric continues to decrease as K increases, but at a certain point, the rate of decrease flattens out substantially. The value of K at which this occurs is generally selected as the optimal K parameter (this is sometimes called the elbow point, as this is where the line kinks when drawn as a graph).

In our case, based on the preceding results, we might select a value of 10 for K. Also, note that the clusters computed by the model are often used for purposes that require some human interpretation (such as the cases of movie and customer segmentation we mentioned earlier). Therefore, this consideration also impacts the choice of K: although a higher value of K might be more optimal from a mathematical point of view, a large number of clusters can be more difficult to understand and interpret.

For completeness, we will also compute the cross-validation metrics for user clustering:

    val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainUsers = trainTestSplitUsers(0)
    val testUsers = trainTestSplitUsers(1)
    val costsUsers = Seq(2, 3, 4, 5, 10, 20).map { k => (k, KMeans.train(trainUsers, k, numIterations, numRuns).computeCost(testUsers)) }
    println("User clustering cross-validation:")
    costsUsers.foreach { case (k, cost) => println(f"WCSS for K=$k is $cost%2.2f") }

We will see a pattern that is similar to the movie case:

    **User clustering cross-validation:**
    **WCSS for K=2 is 544.02**
    **WCSS for K=3 is 542.18**
    **WCSS for K=4 is 542.38**
    **WCSS for K=5 is 542.33**
    **WCSS for K=10 is 539.68**
    **WCSS for K=20 is 541.21**

### Tip

Note that your results may differ slightly due to the random initialization of the clustering models.

# Summary

In this chapter, we explored a new class of models that learn structure from unlabeled data: unsupervised learning. We worked through the required input data and feature extraction, and saw how to use the output of one model (a recommendation model, in our example) as the input to another model (our K-means clustering model). Finally, we evaluated the performance of the clustering model using both manual interpretation of the cluster assignments and mathematical performance metrics.
In the next chapter, we will cover another type of unsupervised learning used to reduce our data down to its most important features or components: dimensionality reduction models.

# Chapter 8. Dimensionality Reduction with Spark

Over the course of this chapter, we will continue our exploration of unsupervised learning models in the form of **dimensionality reduction**.

Unlike the models we have covered so far, such as regression, classification, and clustering, dimensionality reduction does not focus on making predictions. Instead, it tries to take a set of input data with a feature dimension _D_ (that is, the length of our feature vector) and extract a representation of the data of dimension _k_ , where _k_ is usually significantly smaller than _D_. It is, therefore, a form of preprocessing or feature transformation rather than a predictive model in its own right.

It is important that the extracted representation still captures a large proportion of the variability or structure of the original data. The idea behind this is that most data sources will contain some form of underlying structure. This structure is typically unknown (often called latent features or latent factors), but if we can uncover some of this structure, our models could learn this structure and make predictions from it rather than from the data in its raw form, which might be noisy or contain many irrelevant features. In other words, dimensionality reduction throws away some of the noise in the data and keeps the hidden structure that is present.

In some cases, the dimensionality of the raw data is far higher than the number of data points we have, so without dimensionality reduction, it would be difficult for other machine learning models, such as classification and regression, to learn anything, as they need to fit a number of parameters that is far larger than the number of training examples (in this sense, these methods bear some similarity to the regularization approaches that we have seen used in classification and regression).

A few use cases of dimensionality reduction techniques include:

 * Exploratory data analysis
 * Extracting features to train other machine learning models
 * Reducing storage and computation requirements for very large models in the prediction phase (for example, a production system that makes predictions)
 * Reducing a large group of text documents down to a set of hidden topics or concepts
 * Making learning and generalization of models easier when our data has a very large number of features (for example, when working with text, sound, images, or video data, which tends to be very high-dimensional)

In this chapter, we will:

 * Introduce the types of dimensionality reduction models available in MLlib
 * Work with images of faces to extract features suitable for dimensionality reduction
 * Train a dimensionality reduction model using MLlib
 * Visualize and evaluate the results
 * Perform parameter selection for our dimensionality reduction model

# Types of dimensionality reduction

MLlib provides two models for dimensionality reduction; these models are closely related to each other. They are **Principal Components Analysis** ( **PCA** ) and **Singular Value Decomposition** ( **SVD** ).

## Principal Components Analysis

PCA operates on a data matrix _X_ and seeks to extract a set of _k_ principal components from _X_. The principal components are each uncorrelated with one another and are computed such that the first principal component accounts for the largest variation in the input data. Each subsequent principal component is, in turn, computed such that it accounts for the largest remaining variation, provided that it is uncorrelated with the principal components computed so far.

In this way, the _k_ principal components returned are guaranteed to account for the highest amount of variation in the input data possible. Each principal component, in fact, has the same feature dimensionality as the original data matrix. Hence, a projection step is required in order to actually perform dimensionality reduction, where the original data is projected into the _k_ -dimensional space represented by the principal components.
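Concretely (using our own notation here, not MLlib's), if _W_ is the _D x k_ matrix whose columns are the _k_ principal components, and the rows of _X_ are our data points, the projection is a simple matrix multiplication:

    Xproj = X * W

Each row of _Xproj_ has dimension _k_ rather than the original feature dimension _D_. We will see this projection performed on real data later in this chapter.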
## Singular Value Decomposition

SVD seeks to decompose a matrix _X_ of dimension _m x n_ into three component matrices:

 * _U_ of dimension _m x m_
 * _S_ , a diagonal matrix of size _m x n_ ; the entries of _S_ are referred to as the **singular values**
 * _V^T_ (the transpose of _V_ ) of dimension _n x n_

    X = U * S * V^T

Looking at the preceding formula, it appears that we have not reduced the dimensionality of the problem at all, as by multiplying _U_ , _S_ , and _V^T_ , we reconstruct the original matrix. In practice, the truncated SVD is usually computed. That is, only the top _k_ singular values, which represent the most variation in the data, are kept, while the rest are discarded. The formula to reconstruct _X_ based on the component matrices is then approximate:

    X ~ Uk * Sk * Vk^T

An illustration of the truncated SVD is shown here:

The truncated Singular Value Decomposition

Keeping the top _k_ singular values is similar to keeping the top _k_ principal components in PCA. In fact, SVD and PCA are directly related, as we will see a little later in this chapter.

### Note

A detailed mathematical treatment of both PCA and SVD is beyond the scope of this book. An overview of dimensionality reduction can be found in the Spark documentation, and more in-depth mathematical treatments of PCA and SVD are widely available online.

## Relationship with matrix factorization

PCA and SVD are both matrix factorization techniques, in the sense that they decompose a data matrix into subcomponent matrices, each of which has a lower dimension (or rank) than the original matrix. Many other dimensionality reduction techniques are based on matrix factorization.

You might remember another example of matrix factorization: the collaborative filtering models that we have already seen in Chapter 4, _Building a Recommendation Engine with Spark_. Matrix factorization approaches to collaborative filtering work by factorizing the ratings matrix into two components: the user factor matrix and the item factor matrix. Each of these has a lower dimension than the original data, so these methods also act as dimensionality reduction models.

### Note

Many of the best performing approaches to collaborative filtering include models based on SVD. Simon Funk's approach to the Netflix Prize is a famous example.

## Clustering as dimensionality reduction

The clustering models we covered in the previous chapter can also be used for a form of dimensionality reduction. This works in the following way:

 * Assume that we cluster our high-dimensional feature vectors using a K-means clustering model, with _k_ clusters. The result is a set of _k_ cluster centers.
 * We can represent each of our original data points in terms of how far it is from each of these cluster centers. That is, we can compute the distance of a data point to each cluster center. The result is a set of _k_ distances for each data point.
 * These _k_ distances can form a new vector of dimension _k_. We can now represent our original data as a new vector of lower dimension, relative to the original feature dimension.

Depending on the distance metric used, this can result in both dimensionality reduction and a form of nonlinear transformation of the data, allowing us to learn a more complex model while still benefiting from the speed and scalability of a linear model. For example, using a Gaussian or exponential distance function can approximate a very complex nonlinear feature transformation.

# Extracting the right features from your data

As with all the machine learning models we have explored so far, dimensionality reduction models also operate on a feature vector representation of our data.

For this chapter, we will dive into the world of image processing, using the **Labeled Faces in the Wild** ( **LFW** ) dataset of facial images. This dataset contains over 13,000 images of faces, generally taken from the Internet and belonging to well-known public figures. The faces are labeled with the person's name.

## Extracting features from the LFW dataset

In order to avoid having to download and process a very large dataset, we will work with a subset of the images, using people whose names start with an "A". This subset can be downloaded from the LFW project page.

### Note

For more details and other variants of the data, visit the LFW project page.

The original research paper reference is:

 _Gary B. Huang_ , _Manu Ramesh_ , _Tamara Berg_ , and _Erik Learned-Miller_. _Labeled Faces in the Wild: A Database for Studying Face Recognition in Unconstrained Environments_. University of Massachusetts, Amherst, Technical Report 07-49, October, 2007.

Unzip the data using the following command:

    **> tar xfvz lfw-a.tgz**

This will create a folder called `lfw`, which contains a number of subfolders, one for each person.

### Exploring the face data

Start up your Spark Scala console, making sure to allocate sufficient memory, as dimensionality reduction methods can be quite computationally expensive:

    **> $SPARK_HOME/bin/spark-shell --driver-memory 2g**

Now that we've unzipped the data, we face a small challenge. Spark provides us with ways to read text files and custom Hadoop input data sources. However, there is no built-in functionality to allow us to read images.

Spark provides a method called `wholeTextFiles`, which allows us to operate on entire files at once, compared to the `textFile` method that we have been using so far, which operates on the individual lines within a text file (or multiple files).

We will use the `wholeTextFiles` method to access the location of each file. Using these file paths, we will write custom code to load and process the images. In the following example code, we will use `PATH` to refer to the directory in which you extracted the `lfw` subdirectory.
We can use a wildcard path specification (using the `*` character in the following code snippet) to tell Spark to look in each directory under the `lfw` directory for files:

    val path = "/PATH/lfw/*"
    val rdd = sc.wholeTextFiles(path)
    val first = rdd.first
    println(first)

Running the `first` command might take a little time, as Spark first scans the specified directory structure for all available files. Once completed, you should see output similar to the one shown here:

    **first: (String, String) = (file:/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg, ����??JFIF????? ...**

You will see that `wholeTextFiles` returns an RDD that contains key-value pairs, where the key is the file location and the value is the content of the entire text file. For our purposes, we only care about the file path, as we cannot work directly with the image data as a string (notice that it is displayed as "binary nonsense" in the shell output).

Let's extract the file paths from the RDD. Note that, as seen earlier, each file path starts with the `file:` prefix. This is used by Spark when reading files in order to differentiate between different filesystems (for example, `file://` for the local filesystem, `hdfs://` for HDFS, `s3n://` for Amazon S3, and so on).

In our case, we will be using custom code to read the images, so we don't need this part of the path. Thus, we will remove it with the following `map` function:

    val files = rdd.map { case (fileName, content) => fileName.replace("file:", "") }
    println(files.first)

This should display the file location with the `file:` prefix removed:

    **/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg**

Next, we will see how many files we are dealing with:

    println(files.count)

Running this command creates a lot of noisy output in the Spark shell, as it prints all the file paths that are read to the console. Ignore this part; after the command has completed, the output should look something like this:

    **..., /PATH/lfw/Azra_Akin/Azra_Akin_0003.jpg:0+19927, /PATH/lfw/Azra_Akin/Azra_Akin_0004.jpg:0+16030**
    **...**
    **14/09/18 20:36:25 INFO SparkContext: Job finished: count at :19, took 1.151955 s**
    **1055**

So, we can see that we have 1055 images to work with.

### Visualizing the face data

Although there are a few tools available in Scala or Java to display images, this is one area where Python and the matplotlib library shine. We will use Scala to process and extract the images and run our models, and IPython to display the actual images.

You can run a separate IPython Notebook by opening a new terminal window and launching a new notebook:

    **> ipython notebook**

### Note

Note that if you are using the IPython Notebook, you should first execute the following code snippet to ensure that the images are displayed inline after each notebook cell (including the `%` character): `%pylab inline`.

Alternatively, you can launch a plain IPython console without the web notebook, enabling the `pylab` plotting functionality, using the following command:

    **> ipython --pylab**

The dimensionality reduction techniques in MLlib are only available in Scala or Java at the time of writing this book, so we will continue to use the Scala Spark shell to run the models. Therefore, you won't need to run a PySpark console.

### Tip

We have provided the full Python code with this chapter as a Python script as well as in the IPython Notebook format. For instructions on installing IPython, see the code bundle.
Let's display the image given by the first path we extracted earlier using matplotlib's `imread` and `imshow` functions:

    path = "/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
    ae = imread(path)
    imshow(ae)

### Note

You should see the image displayed in your Notebook (or in a pop-up window if you are using the standard IPython shell). Note that we have not shown the image here.

### Extracting facial images as vectors

While a full treatment of image processing is beyond the scope of this book, we need to know a few basics to proceed. Each color image can be represented as a three-dimensional array, or matrix, of pixels. The first two dimensions, that is, the _x_ and _y_ axes, represent the position of each pixel, while the third dimension represents the **red, green, and blue** ( **RGB** ) color values of each pixel.

A grayscale image only requires one value per pixel (there are no RGB values), so it can be represented as a plain two-dimensional matrix. For many image-processing and machine learning tasks related to images, it is common to operate on grayscale images. We will do this here by converting the color images to grayscale first.

It is also a common practice in machine learning tasks to represent an image as a vector, instead of a matrix. We do this by concatenating each row (or alternatively, each column) of the matrix together to form a long vector (this is known as **reshaping** ). In this way, each raw, grayscale image matrix is transformed into a feature vector that is usable as input to a machine learning model.

Fortunately for us, the built-in Java **Abstract Window Toolkit** ( **AWT** ) contains various basic image-processing functions. We will define a few utility functions to perform this processing using the `java.awt` classes.

#### Loading images

The first of these is a function to read an image from a file:

    import java.awt.image.BufferedImage
    def loadImageFromFile(path: String): BufferedImage = {
      import javax.imageio.ImageIO
      import java.io.File
      ImageIO.read(new File(path))
    }

This returns an instance of the `java.awt.image.BufferedImage` class, which stores the image data and provides a number of useful methods. Let's test it out by loading the first image into our Spark shell:

    val aePath = "/PATH/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
    val aeImage = loadImageFromFile(aePath)

You should see the image details displayed in the shell:

    **aeImage: java.awt.image.BufferedImage = BufferedImage@f41266e: type = 5 ColorModel: #pixelBits = 24 numComponents = 3 color space = java.awt.color.ICC_ColorSpace@7e420794 transparency = 1 has alpha = false isAlphaPre = false ByteInterleavedRaster: width = 250 height = 250 #numDataElements 3 dataOff[0] = 2**

There is quite a lot of information here. Of particular interest to us is that the image width and height are 250 pixels each and, as highlighted in the preceding output, there are three components (that is, the RGB values).

#### Converting to grayscale and resizing the images

The next function we will define will take the image that we loaded with our preceding function, convert it from color to grayscale, and resize its width and height.

Neither of these steps is strictly necessary, but both are done in many cases for efficiency purposes. Using RGB color images instead of grayscale increases the amount of data to be processed by a factor of three.
Similarly, larger images increase the processing and storage overhead significantly. Our raw 250 x 250 images represent 187,500 data points per image, using three color components. For a set of 1055 images, this is 197,812,500 data points. Even if stored as integer values, each value takes 4 bytes of memory, so just 1055 images represent around 800 MB of memory! As you can see, image-processing tasks can quickly become extremely memory intensive.

If we convert to grayscale and resize the images to, say, 50 x 50 pixels, we only require 2500 data points per image. For our 1055 images, this equates to around 10 MB of memory, which is far more manageable for illustrative purposes.

### Tip

Another reason to resize is that MLlib's PCA model works best on _tall and skinny_ matrices with fewer than 10,000 columns. We will have 2500 columns (that is, each pixel becomes an entry in our feature vector), so we will come in well below this restriction.

Let's define our processing function. We will do the grayscale conversion and resizing in one step, using the `java.awt.image` package:

    def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
      val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
      val g = bwImage.getGraphics()
      g.drawImage(image, 0, 0, width, height, null)
      g.dispose()
      bwImage
    }

The first line of the function creates a new image of the desired width and height and specifies a grayscale color model. The third line draws the original image onto this newly created image. The `drawImage` method takes care of the color conversion and resizing for us! Finally, we return the new, processed image.

Let's test this out on our sample image. We will convert it to grayscale and resize it to 100 x 100 pixels:

    val grayImage = processImage(aeImage, 100, 100)

You should see the following output in the console:

    **grayImage: java.awt.image.BufferedImage = BufferedImage@21f8ea3b: type = 10 ColorModel: #pixelBits = 8 numComponents = 1 color space = java.awt.color.ICC_ColorSpace@5cd9d8e9 transparency = 1 has alpha = false isAlphaPre = false ByteInterleavedRaster: width = 100 height = 100 #numDataElements 1 dataOff[0] = 0**

As you can see from the highlighted output, the image's width and height are indeed 100, and the number of color components is 1.

Next, we will save the processed image to a temporary location so that we can read it back and display it in our IPython console:

    import javax.imageio.ImageIO
    import java.io.File
    ImageIO.write(grayImage, "jpg", new File("/tmp/aeGray.jpg"))

You should see a result of `true` displayed in your console, indicating that we successfully saved the image to the `aeGray.jpg` file in our `/tmp` directory.

Finally, we will read the image in Python and use matplotlib to display it. Type the following code into your IPython Notebook or shell (remember that this should be open in a new terminal window):

    tmpPath = "/tmp/aeGray.jpg"
    aeGray = imread(tmpPath)
    imshow(aeGray, cmap=plt.cm.gray)

This should display the image (note again, we haven't shown the image here). You should see that it is grayscale and of slightly worse quality compared to the original image. Furthermore, you will notice that the scales of the axes are different, representing the new 100 x 100 dimensions instead of the original 250 x 250 size.
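As a quick sanity check of the memory arithmetic discussed at the start of this section (a toy calculation of our own, assuming 4 bytes per stored value), we can verify the savings in the Scala shell:

    val numImages = 1055
    val rawPerImage = 250 * 250 * 3 // 187,500 values per raw color image
    val grayPerImage = 50 * 50      // 2,500 values per resized grayscale image
    println(f"raw: ${numImages.toLong * rawPerImage * 4 / 1e6}%.0f MB")   // ~791 MB
    println(f"gray: ${numImages.toLong * grayPerImage * 4 / 1e6}%.1f MB") // ~10.6 MB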
#### Extracting feature vectors

The final step in the processing pipeline is to extract the actual feature vectors that will be the input to our dimensionality reduction model. As we mentioned earlier, the raw grayscale pixel data will be our features. We will form the vectors by flattening out the two-dimensional pixel matrix. The `BufferedImage` class provides a utility method to do just this, which we will use in our function:

    def getPixelsFromImage(image: BufferedImage): Array[Double] = {
      val width = image.getWidth
      val height = image.getHeight
      val pixels = Array.ofDim[Double](width * height)
      // fills (and returns) the pixels array with the grayscale pixel values
      image.getData.getPixels(0, 0, width, height, pixels)
    }

We can then combine these three functions into one utility function that takes a file location together with the desired image width and height, and returns the raw `Array[Double]` value that contains the pixel data:

    def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
      val raw = loadImageFromFile(path)
      val processed = processImage(raw, width, height)
      getPixelsFromImage(processed)
    }

Applying this function to each element of the RDD that contains all the image file paths will give us a new RDD that contains the pixel data for each image. Let's do this and inspect the first few elements:

    val pixels = files.map(f => extractPixels(f, 50, 50))
    println(pixels.take(10).map(_.take(10).mkString("", ",", ", ...")).mkString("\n"))

You should see output similar to this:

    **0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0, ...**
    **241.0,243.0,245.0,244.0,231.0,205.0,177.0,160.0,150.0,147.0, ...**
    **253.0,253.0,253.0,253.0,253.0,253.0,254.0,254.0,253.0,253.0, ...**
    **244.0,244.0,243.0,242.0,241.0,240.0,239.0,239.0,237.0,236.0, ...**
    **44.0,47.0,47.0,49.0,62.0,116.0,173.0,223.0,232.0,233.0, ...**
    **0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, ...**
    **1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0, ...**
    **26.0,26.0,27.0,26.0,24.0,24.0,25.0,26.0,27.0,27.0, ...**
    **240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0, ...**
    **0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, ...**

The final step is to create an MLlib `Vector` instance for each image. We will cache the RDD to speed up our later computations:

    import org.apache.spark.mllib.linalg.Vectors
    val vectors = pixels.map(p => Vectors.dense(p))
    vectors.setName("image-vectors")
    vectors.cache

### Tip

We used the `setName` function in the preceding code to assign the RDD a name. In this case, we called it `image-vectors`. This is so that we can identify it more easily when looking at the Spark web interface later.

### Normalization

It is a common practice to standardize input data prior to running dimensionality reduction models, in particular for PCA. As we did in Chapter 5, _Building a Classification Model with Spark_ , we will do this using the built-in `StandardScaler` provided by MLlib's `feature` package. We will only subtract the mean from the data in this case:

    import org.apache.spark.mllib.linalg.Matrix
    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    import org.apache.spark.mllib.feature.StandardScaler
    val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)

Calling `fit` triggers a computation on our `RDD[Vector]`.
You should see output similar to the one shown here:

    **...**
    **14/09/21 11:46:58 INFO SparkContext: Job finished: reduce at RDDFunctions.scala:111, took 0.495859 s**
    **scaler: org.apache.spark.mllib.feature.StandardScalerModel = org.apache.spark.mllib.feature.StandardScalerModel@6bb1a1a1**

### Tip

Note that subtracting the mean works for dense input data. However, for sparse vectors, subtracting the mean vector from each input will transform the sparse data into dense data. For very high-dimensional input, this will likely exhaust the available memory resources, so it is not advisable.

Finally, we will use the returned `scaler` to transform the raw image vectors into vectors with the column means subtracted:

    val scaledVectors = vectors.map(v => scaler.transform(v))

We mentioned earlier that the resized grayscale images would take up around 10 MB of memory. Indeed, you can take a look at the memory usage in the Spark application monitor storage page by going to `http://localhost:4040/storage/` in your web browser.

Since we gave our RDD of image vectors a friendly name of `image-vectors`, you should see something like the following screenshot (note that, as we are using `Vector[Double]`, each element takes up 8 bytes instead of 4; hence, we actually use around 20 MB of memory):

Size of image vectors in memory

# Training a dimensionality reduction model

Dimensionality reduction models in MLlib require vectors as inputs. However, unlike clustering, which operated on an `RDD[Vector]`, PCA and SVD computations are provided as methods on a distributed `RowMatrix` (this difference is largely down to syntax, as a `RowMatrix` is simply a wrapper around an `RDD[Vector]`).

## Running PCA on the LFW dataset

Now that we have extracted our image pixel data into vectors, we can instantiate a new `RowMatrix` and call the `computePrincipalComponents` method to compute the top `K` principal components of our distributed matrix:

    import org.apache.spark.mllib.linalg.Matrix
    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    val matrix = new RowMatrix(scaledVectors)
    val K = 10
    val pc = matrix.computePrincipalComponents(K)

You will likely see quite a lot of output in your console while the model runs.

### Tip

If you see warnings such as **WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK** or **WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK** , you can safely ignore these.

This means that the underlying linear algebra libraries used by MLlib could not load native routines. In this case, a Java-based fallback will be used, which is slower, but there is nothing to worry about for the purposes of this example.

Once the model training is complete, you should see a result displayed in the console that looks similar to the following one:

    **pc: org.apache.spark.mllib.linalg.Matrix =**
    **-0.023183157256614906 -0.010622723054037303 ...
(10 total)**
    **-0.023960537953442107 -0.011495966728461177 ...**
    **-0.024397470862198022 -0.013512219690177352 ...**
    **-0.02463158818330343 -0.014758658113862178 ...**
    **-0.024941633606137027 -0.014878858729655142 ...**
    **-0.02525998879466241 -0.014602750644394844 ...**
    **-0.025494722450369593 -0.014678013626511024 ...**
    **-0.02604194423255582 -0.01439561589951032 ...**
    **-0.025942214214865228 -0.013907665261197633 ...**
    **-0.026151551334429365 -0.014707035797934148 ...**
    **-0.026106572186134578 -0.016701471378568943 ...**
    **-0.026242986173995755 -0.016254664123732318 ...**
    **-0.02573628754284022 -0.017185663918352894 ...**
    **-0.02545319635905169 -0.01653357295561698 ...**
    **-0.025325893980995124 -0.0157082218373399 ...**

### Visualizing the Eigenfaces

Now that we have trained our PCA model, what is the result? Let's inspect the dimensions of the resulting matrix:

    val rows = pc.numRows
    val cols = pc.numCols
    println(rows, cols)

As you can see from the console output, the matrix of principal components has 2500 rows and 10 columns:

    **(2500,10)**

Recall that the dimension of each image is 50 x 50, so here, we have the top 10 principal components, each with a dimension identical to that of the input images. These principal components can be thought of as the set of latent (or hidden) features that capture the greatest variation in the original data.

### Note

In facial recognition and image processing, these principal components are often referred to as **Eigenfaces** , as PCA is closely related to the eigenvalue decomposition of the covariance matrix of the original data.

Since each principal component is of the same dimension as the original images, each component can itself be thought of, and represented, as an image, making it possible to visualize the Eigenfaces as we would the input images.

As we have often done in this book, we will use functionality from the Breeze linear algebra library as well as Python's numpy and matplotlib to visualize the Eigenfaces.

First, we will extract the `pc` variable (an MLlib matrix) into a Breeze `DenseMatrix`:

    import breeze.linalg.DenseMatrix
    val pcBreeze = new DenseMatrix(rows, cols, pc.toArray)

Breeze provides a useful function within the `linalg` package to write the matrix out as a CSV file. We will use this to save the principal components to a temporary CSV file:

    import breeze.linalg.csvwrite
    csvwrite(new File("/tmp/pc.csv"), pcBreeze)

Next, we will load the matrix in IPython and visualize the principal components as images.
Fortunately, numpy provides a utility function to read the matrix from the CSV file we created:

    pcs = np.loadtxt("/tmp/pc.csv", delimiter=",")
    print(pcs.shape)

You should see the following output, confirming that the matrix we read has the same dimensions as the one we saved:

    **(2500, 10)**

We will need a utility function to display the images, which we define here:

    def plot_gallery(images, h, w, n_row=2, n_col=5):
        """Helper function to plot a gallery of portraits"""
        plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
        plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
        for i in range(n_row * n_col):
            plt.subplot(n_row, n_col, i + 1)
            plt.imshow(images[:, i].reshape((h, w)), cmap=plt.cm.gray)
            plt.title("Eigenface %d" % (i + 1), size=12)
            plt.xticks(())
            plt.yticks(())

### Note

This function is adapted from the LFW dataset example code in the scikit-learn documentation.

We will now use this function to plot the top 10 Eigenfaces:

    plot_gallery(pcs, 50, 50)

This should display the following plot:

Top 10 Eigenfaces

### Interpreting the Eigenfaces

Looking at the preceding images, we can see that the PCA model has effectively extracted recurring patterns of variation, which represent various features of the facial images. Each principal component can, as with clustering models, be interpreted. Again, like clustering, it is not always straightforward to interpret precisely what each principal component represents.

We can see from these images that some appear to pick up directional factors (for example, images 6 and 9), some home in on hair patterns (such as images 4, 5, 7, and 10), while others seem to be somewhat more related to facial features such as eyes, nose, and mouth (such as images 1, 7, and 9).

# Using a dimensionality reduction model

It is interesting to be able to visualize the outcome of a model in this way; however, the overall purpose of using dimensionality reduction is to create a more compact representation of the data that still captures the important features and variability in the raw dataset. To do this, we need to use a trained model to transform our raw data by projecting it into the new, lower-dimensional space represented by the principal components.

## Projecting data using PCA on the LFW dataset

We will illustrate this concept by projecting each LFW image into a ten-dimensional vector. This is done through a matrix multiplication of the image matrix with the matrix of principal components. As the image matrix is a distributed MLlib `RowMatrix`, Spark takes care of distributing this computation for us through the `multiply` function:

    val projected = matrix.multiply(pc)
    println(projected.numRows, projected.numCols)

This will give you the following output:

    **(1055,10)**

Observe that each image, which was of dimension 2500, has been transformed into a vector of size 10.
Let's take a look at the first few vectors: + + println(projected.rows.take(5).mkString("\n")) + +Here is the output: + + **[2648.9455749636277,1340.3713412351376,443.67380716760965,-353.0021423043161,52.53102289832631,423.39861446944354,413.8429065865399,-484.18122999722294,87.98862070273545,-104.62720604921965]** + **[172.67735747311974,663.9154866829355,261.0575622447282,-711.4857925259682,462.7663154755333,167.3082231097332,-71.44832640530836,624.4911488194524,892.3209964031695,-528.0056327351435]** + **[-1063.4562028554978,388.3510869550539,1508.2535609357597,361.2485590837186,282.08588829583596,-554.3804376922453,604.6680021092125,-224.16600191143075,-228.0771984153961,-110.21539201855907]** + **[-4690.549692385103,241.83448841252638,-153.58903325799685,-28.26215061165965,521.8908276360171,-442.0430200747375,-490.1602309367725,-456.78026845649435,-78.79837478503592,70.62925170688868]** + **[-2766.7960144161225,612.8408888724891,-405.76374113178616,-468.56458995613974,863.1136863614743,-925.0935452709143,69.24586949009642,-777.3348492244131,504.54033662376435,257.0263568009851]** + +As the projected data is in the form of vectors, we can use the projection as input to another machine learning model. For example, we could use these projected inputs together with a set of input data generated from various images without faces to train a facial recognition model. Alternatively, we could train a multiclass classifier where each person is a class, thus creating a model that learns to identify the particular person that a face belongs to. + +## The relationship between PCA and SVD + +We mentioned earlier that there is a close relationship between PCA and SVD. In fact, we can recover the same principal components and also apply the same projection into the space of principal components using SVD. + +In our example, the right singular vectors derived from computing the SVD will be equivalent to the principal components we have calculated. We can see that this is the case by first computing the SVD on our image matrix and comparing the right singular vectors to the result of PCA. As was the case with PCA, SVD computation is provided as a function on a distributed `RowMatrix`: + + val svd = matrix.computeSVD(10, computeU = true) + println(s"U dimension: (${svd.U.numRows}, ${svd.U.numCols})") + println(s"S dimension: (${svd.s.size}, )") + println(s"V dimension: (${svd.V.numRows}, ${svd.V.numCols})") + +We can see that SVD returns a matrix `U` of dimension 1055 x 10, a vector `S` of the singular values of length `10`, and a matrix `V` of the right singular vectors of dimension 2500 x 10: + + **U dimension: (1055, 10)** + **S dimension: (10, )** + **V dimension: (2500, 10)** + +The matrix `V` is exactly equivalent to the result of PCA (ignoring the sign of the values and floating point tolerance). 
We can verify this with a utility function that approximately compares the data arrays of the two matrices:

    def approxEqual(array1: Array[Double], array2: Array[Double], tolerance: Double = 1e-6): Boolean = {
      // note that we ignore the sign of the principal component / singular vector elements
      val bools = array1.zip(array2).map { case (v1, v2) => if (math.abs(math.abs(v1) - math.abs(v2)) > tolerance) false else true }
      bools.fold(true)(_ & _)
    }

We will test the function on some test data:

    println(approxEqual(Array(1.0, 2.0, 3.0), Array(1.0, 2.0, 3.0)))

This will give you the following output:

    **true**

Let's try another set of test data:

    println(approxEqual(Array(1.0, 2.0, 3.0), Array(3.0, 2.0, 1.0)))

This will give you the following output:

    **false**

Finally, we can apply our equality function as follows:

    println(approxEqual(svd.V.toArray, pc.toArray))

Here is the output:

    **true**

The other relationship that holds is that the multiplication of the matrix `U` and vector `S` (or, strictly speaking, the diagonal matrix `S`) is equivalent to the PCA projection of our original image data into the space of the top 10 principal components.

We will now show that this is indeed the case. We will first use Breeze to multiply each vector in `U` by `S`, element-wise. We will then compare each vector in our PCA projected vectors with the equivalent vector in our SVD projection, and count the number of equal cases:

    val breezeS = breeze.linalg.DenseVector(svd.s.toArray)
    val projectedSVD = svd.U.rows.map { v =>
      val breezeV = breeze.linalg.DenseVector(v.toArray)
      val multV = breezeV :* breezeS
      Vectors.dense(multV.data)
    }
    projected.rows.zip(projectedSVD).map { case (v1, v2) => approxEqual(v1.toArray, v2.toArray) }.filter(b => b).count

This should display a result of 1055, as we would expect, confirming that each row of `projected` is equal to each row of `projectedSVD`.

### Note

Note that the `:*` operator in the preceding code represents element-wise multiplication of the vectors.

# Evaluating dimensionality reduction models

Both PCA and SVD are deterministic models. That is, given a certain input dataset, they will always produce the same result. This is in contrast to many of the models we have seen so far, which depend on some random element (most often for the initialization of model weight vectors and so on).

Both models are also guaranteed to return the top principal components or singular values, and hence, the only parameter is _k_. Like clustering models, increasing _k_ always improves the model performance (for clustering, this is measured by the relevant error function, while for PCA and SVD, it is the total amount of variability explained by the _k_ components). Therefore, selecting a value for _k_ is a trade-off between capturing as much structure of the data as possible while keeping the dimensionality of the projected data low.
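As a small aid for this trade-off (a sketch of our own, not an MLlib API), we can estimate the fraction of variance captured by the top _k_ singular values from their squares. Note that this is relative to however many singular values we have actually computed, not the full decomposition; we will compute a larger set of singular values next, to which this helper could be applied:

    // Fraction of (computed) total variance captured by the top k singular values
    def varianceExplained(s: Array[Double], k: Int): Double = {
      val squared = s.map(x => x * x)
      squared.take(k).sum / squared.sum
    }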
## Evaluating k for SVD on the LFW dataset

We will examine the singular values obtained from computing the SVD on our image data. We can verify that the singular values are the same for each run and that they are returned in decreasing order, as follows:

    val sValues = (1 to 5).map { i => matrix.computeSVD(i, computeU = false).s }
    sValues.foreach(println)

This should show us output similar to the following:

    **[54091.00997110354]**
    **[54091.00997110358,33757.702867982436]**
    **[54091.00997110357,33757.70286798241,24541.193694775946]**
    **[54091.00997110358,33757.70286798242,24541.19369477593,23309.58418888302]**
    **[54091.00997110358,33757.70286798242,24541.19369477593,23309.584188882982,21803.09841158358]**

As with evaluating values of _k_ for clustering, in the case of SVD (and PCA), it is often useful to plot the singular values for a larger range of _k_ and identify the point on the graph where the amount of additional variance accounted for by each additional singular value starts to flatten out considerably.

We will do this by first computing the top 300 singular values:

    val svd300 = matrix.computeSVD(300, computeU = false)
    val sMatrix = new DenseMatrix(1, 300, svd300.s.toArray)
    csvwrite(new File("/tmp/s.csv"), sMatrix)

We will write out the vector `S` of singular values to a temporary CSV file (as we did for our matrix of Eigenfaces previously) and then read it back in our IPython console, plotting the singular values for each _k_ :

    s = np.loadtxt("/tmp/s.csv", delimiter=",")
    print(s.shape)
    plot(s)

You should see an image displayed similar to the one shown here:

Top 300 singular values

A similar pattern is seen in the cumulative variation accounted for by the top 300 singular values (which we will plot on a log scale for the _y_ axis):

    plot(cumsum(s))
    plt.yscale('log')

Cumulative sum of top 300 singular values

We can see that after a certain value range for _k_ (around 100 in this case), the graph flattens considerably. This indicates that a number of singular values (or principal components) equivalent to this value of _k_ probably explains enough of the variation of the original data.

### Tip

Of course, if we are using dimensionality reduction to help improve the performance of another model, we could use the same evaluation methods used for that model to help us choose a value for _k_.

For example, we could use the AUC metric, together with cross-validation, to choose both the model parameters for a classification model as well as the value of _k_ for our dimensionality reduction model. This does come at the expense of higher computation cost, however, as we would have to recompute the full model training and testing pipeline.

# Summary

In this chapter, we explored two new unsupervised learning methods, PCA and SVD, for dimensionality reduction. We saw how to extract features for and train these models using facial image data. We visualized the results of the model in the form of Eigenfaces, saw how to apply the models to transform our original data into a reduced dimensionality representation, and investigated the close link between PCA and SVD.

In the next chapter, we will delve more deeply into techniques for text processing and analysis with Spark.

# Chapter 9. Advanced Text Processing with Spark

In Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , we covered various topics related to feature extraction and data processing, including the basics of extracting features from text data.
In this chapter, we will introduce more advanced text processing techniques available in MLlib to work with large-scale text datasets.

In this chapter, we will:

 * Work through detailed examples that illustrate data processing, feature extraction, and the modeling pipeline, as they relate to text data
 * Evaluate the similarity between two documents based on the words in the documents
 * Use the extracted text features as inputs for a classification model
 * Cover a recent development in natural language processing to model words themselves as vectors and illustrate the use of Spark's **Word2Vec** model to evaluate the similarity between two words, based on their meaning

# What's so special about text data?

Text data can be complex to work with for two main reasons. First, text and language have an inherent structure that is not easily captured using the raw words as is (for example, meaning, context, different types of words, sentence structure, and different languages, to highlight a few). Therefore, naive feature extraction is usually relatively ineffective.

Second, the effective dimensionality of text data is extremely large and potentially limitless. Think about the number of words in the English language alone and add all kinds of special words, characters, slang, and so on to this. Then, throw in other languages and all the types of text one might find across the Internet. The dimension of text data can easily exceed tens or even hundreds of millions of words, even in relatively small datasets. For example, the Common Crawl dataset of billions of websites contains over 840 billion individual words.

To deal with these issues, we need ways of extracting more structured features and methods to handle the huge dimensionality of text data.

# Extracting the right features from your data

The field of **natural language processing** ( **NLP** ) covers a wide range of techniques to work with text, from text processing and feature extraction through to modeling and machine learning. In this chapter, we will focus on two feature extraction techniques available within MLlib: the TF-IDF term weighting scheme and feature hashing.

Working through an example of TF-IDF, we will also explore the ways in which processing, tokenization, and filtering during feature extraction can help reduce the dimensionality of our input data as well as improve the information content and usefulness of the features we extract.

## Term weighting schemes

In Chapter 3, _Obtaining, Processing, and Preparing Data with Spark_ , we looked at a vector representation where text features are mapped to a simple binary vector, called the **bag-of-words** model. Another representation used commonly in practice is called **term frequency-inverse document frequency** ( **TF-IDF** ).

TF-IDF weights each term in a piece of text (referred to as a **document** ) based on its frequency in the document (the **term frequency** ). A global normalization, called the **inverse document frequency** , is then applied based on the frequency of this term among all documents (the set of documents in a dataset is commonly referred to as a **corpus** ).

The standard definition of TF-IDF is shown here:

    tf-idf(t,d) = tf(t,d) x idf(t)

Here, _tf(t,d)_ is the frequency (number of occurrences) of term _t_ in document _d_ and _idf(t)_ is the inverse document frequency of term _t_ in the corpus; this is defined as follows:

    idf(t) = log(N / d)

Here, _N_ is the total number of documents, and _d_ is the number of documents in which the term _t_ occurs.

The TF-IDF formulation means that terms occurring many times in a document receive a higher weighting in the vector representation relative to those that occur only a few times in the document. However, the IDF normalization has the effect of reducing the weight of terms that are very common across all documents. The end result is that truly rare or important terms should be assigned higher weighting, while more common terms (which are assumed to have less importance) should have less impact in terms of weighting.
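As a quick worked example (with made-up numbers of our own): suppose a corpus contains _N_ = 10 documents, the term _spark_ occurs 3 times in document _d_ , and 2 of the 10 documents contain the term. Using the natural logarithm, we get:

    tf-idf(spark,d) = 3 x log(10 / 2) = 3 x 1.609 ~ 4.83

By contrast, a term that occurs in every document has _idf(t) = log(10 / 10) = 0_ and, therefore, receives zero weight, no matter how frequently it occurs within a given document.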
### Note

A good resource to learn more about the bag-of-words model (or **vector space model** ) is the book _Introduction to Information Retrieval_ , _Christopher D. Manning_ , _Prabhakar Raghavan_ , and _Hinrich Schütze_ , _Cambridge University Press_ (available in HTML form online).

It contains sections on text processing techniques, including tokenization, stop word removal, stemming, and the vector space model, as well as weighting schemes such as TF-IDF.

## Feature hashing

 **Feature hashing** is a technique to deal with high-dimensional data and is often used with text and categorical datasets where the features can take on many unique values (often many millions of values). In the previous chapters, we often used the _1-of-K_ encoding approach for categorical features, including text. While this approach is simple and effective, it can break down in the face of extremely high-dimensional data.

Building and using _1-of-K_ feature encoding requires us to keep a mapping of each possible feature value to an index in a vector. Furthermore, the process of creating the mapping itself requires at least one additional pass through the dataset and can be tricky to do in parallel scenarios. Up until now, we have often used a simple approach of collecting the distinct feature values and zipping this collection with a set of indices to create a map of feature value to index. This mapping is then broadcast (either explicitly in our code or implicitly by Spark) to each worker.

However, when dealing with the huge feature dimensions in the tens of millions or more that are common when working with text, this approach can be slow, and it can require significant memory and network resources, both on the Spark master (to collect the unique values) and on the workers (to broadcast the resulting mapping to each worker, which keeps it in memory to allow it to apply the feature encoding to its local piece of the input data).

Feature hashing works by assigning the vector index for a feature based on the value obtained by hashing this feature to a number (usually, an integer value) using a hash function. For example, let's say the hash value of a categorical feature for the geolocation of `United States` is `342`. We will use the hashed value as the vector index, and the value at this index will be `1.0` to indicate the presence of the `United States` feature. The hash function used must be consistent (that is, for a given input, it returns the same output each time).

This encoding works the same way as mapping-based encoding, except that we choose a size for our feature vector upfront. As the most common hash functions return values in the entire range of integers, we will use a _modulo_ operation to restrict the index values to the size of our vector, which is typically much smaller (a few tens of thousands to a few million, depending on our requirements).
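The following is a toy sketch of the hash-then-modulo idea in Scala (our own illustration; MLlib's `HashingTF` implements this properly with its own hash function):

    // Map a raw feature (for example, a term) to an index in a fixed-size vector
    val numFeatures = 1 << 18 // choose the vector size upfront, here 262,144
    def featureIndex(feature: String): Int = {
      val hashed = feature.## // Scala's hashCode, used here purely for illustration
      ((hashed % numFeatures) + numFeatures) % numFeatures // keep the index non-negative
    }
    println(featureIndex("United States"))

Any two inputs that map to the same index produce a hash collision, one of the drawbacks discussed next.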
+ +This encoding works the same way as mapping-based encoding, except that we choose a size for our feature vector upfront. As the most common hash functions return values in the entire range of integers, we will use a _modulo_ operation to restrict the index values to the size of our vector, which is typically much smaller (a few tens of thousands to a few million, depending on our requirements). + +Feature hashing has the advantage that we do not need to build a mapping and keep it in memory. It is also easy to implement, very fast, and can be done online and in real time, thus not requiring a pass through our dataset first. Finally, because we selected a feature vector dimension that is significantly smaller than the raw dimensionality of our dataset, we bound the memory usage of our model both in training and production; hence, memory usage does not scale with the size and dimensionality of our data. + +However, there are two important drawbacks, which are as follows: + + * As we don't create a mapping of features to index values, we also cannot do the reverse mapping of feature index to value. This makes it harder to, for example, determine which features are most informative in our models. + * As we are restricting the size of our feature vectors, we might experience **hash collisions**. This happens when two different features are hashed into the same index in our feature vector. Surprisingly, this doesn't seem to have a severe impact on model performance as long as we choose a reasonable feature vector dimension relative to the dimension of the input data. + +### Note + +Further information on hashing can be found at . + +A key paper that introduced the use of hashing for feature extraction and machine learning is: + + _Kilian Weinberger_ , _Anirban Dasgupta_ , _John Langford_ , _Alex Smola_ , and _Josh Attenberg_. _Feature Hashing for Large Scale Multitask Learning_. _Proc. ICML 2009_ , which is available at . + +## Extracting the TF-IDF features from the 20 Newsgroups dataset + +To illustrate the concepts in this chapter, we will use a well-known text dataset called **20 Newsgroups** ; this dataset is commonly used for text-classification tasks. This is a collection of newsgroup messages posted across 20 different topics. There are various forms of data available. For our purposes, we will use the `bydate` version of the dataset, which is available at . + +This dataset splits up the available data into training and test sets that comprise 60 percent and 40 percent of the original data, respectively. Here, the messages in the test set occur after those in the training set. This dataset also excludes some of the message headers that identify the actual newsgroup; hence, it is an appropriate dataset to test the real-world performance of classification models. + +### Note + +Further information on the original dataset can be found in the _UCI Machine Learning Repository_ page at . + +To get started, download the data and unzip the file using the following command: + + **> tar xfvz 20news-bydate.tar.gz** + +This will create two folders: one called `20news-bydate-train` and another one called `20news-bydate-test`. 
Let's take a look at the directory structure under the training dataset folder:
+
+    **> cd 20news-bydate-train/**
+    **> ls**
+
+You will see that it contains a number of subfolders, one for each newsgroup:
+
+    **alt.atheism comp.windows.x rec.sport.hockey soc.religion.christian**
+    **comp.graphics misc.forsale sci.crypt talk.politics.guns**
+    **comp.os.ms-windows.misc rec.autos sci.electronics talk.politics.mideast**
+    **comp.sys.ibm.pc.hardware rec.motorcycles sci.med talk.politics.misc**
+    **comp.sys.mac.hardware rec.sport.baseball sci.space talk.religion.misc**
+
+There are a number of files under each newsgroup folder; each file contains an individual message posting:
+
+    **> ls rec.sport.hockey**
+    **52550 52580 52610 52640 53468 53550 53580 53610 53640 53670 53700 53731 53761 53791**
+    **...**
+
+We can take a look at a part of one of these messages to see the format:
+
+    **> head -20 rec.sport.hockey/52550**
+    **From: dchhabra@stpl.ists.ca (Deepak Chhabra)**
+    **Subject: Superstars and attendance (was Teemu Selanne, was +/- leaders)**
+    **Nntp-Posting-Host: stpl.ists.ca**
+    **Organization: Solar Terresterial Physics Laboratory, ISTS**
+    **Distribution: na**
+    **Lines: 115**
+
+    **Dean J. Falcione (posting from jrmst+8@pitt.edu) writes:**
+    **[I wrote:]**
+
+    **> >When the Pens got Mario, granted there was big publicity, etc, etc,**
+    **> >and interest was immediately generated. Gretzky did the same thing for LA.**
+    **> >However, imnsho, neither team would have seen a marked improvement in**
+    **> >attendance if the team record did not improve. In the year before Lemieux**
+    **> >came, Pittsburgh finished with 38 points. Following his arrival, the Pens**
+    **> >finished with 53, 76, 72, 81, 87, 72, 88, and 87 points, with a couple of**
+    **^^**
+    **> >Stanley Cups thrown in.**
+    **...**
+
+As we can see, each message contains some header fields that contain the sender, subject, and other metadata, followed by the raw content of the message.
+
+### Exploring the 20 Newsgroups data
+
+Now, we will start up our Spark Scala console, ensuring that we make enough memory available:
+
+    **>$SPARK_HOME/bin/spark-shell --driver-memory 4g**
+
+Looking at the directory structure, you might recognize that once again, we have data contained in individual text files (one text file per message). Therefore, we will again use Spark's `wholeTextFiles` method to read the content of each file into a record in our RDD.
+
+In the code that follows, `PATH` refers to the directory in which you extracted the `20news-bydate` archive:
+
+    val path = "/PATH/20news-bydate-train/*"
+    val rdd = sc.wholeTextFiles(path)
+    val text = rdd.map { case (file, text) => text }
+    println(text.count)
+
+The first time you run this command, it might take quite a bit of time, as Spark needs to scan the directory structure. You will also see quite a lot of console output, as Spark logs all the file paths that are being processed.
During the processing, you will see the following line displayed, indicating the total number of files that Spark has detected:
+
+    **...**
+    **14/10/12 14:27:54 INFO FileInputFormat: Total input paths to process : 11314**
+    **...**
+
+After the command has finished running, you will see the total record count, which should be the same as the preceding **Total input paths to process** screen output:
+
+    **11314**
+
+Next, we will take a look at the newsgroup topics available:
+
+    val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head }
+    val countByGroup = newsgroups.map(n => (n, 1)).reduceByKey(_ + _).collect.sortBy(-_._2).mkString("\n")
+    println(countByGroup)
+
+This will display the following result:
+
+    **(rec.sport.hockey,600)**
+    **(soc.religion.christian,599)**
+    **(rec.motorcycles,598)**
+    **(rec.sport.baseball,597)**
+    **(sci.crypt,595)**
+    **(rec.autos,594)**
+    **(sci.med,594)**
+    **(comp.windows.x,593)**
+    **(sci.space,593)**
+    **(sci.electronics,591)**
+    **(comp.os.ms-windows.misc,591)**
+    **(comp.sys.ibm.pc.hardware,590)**
+    **(misc.forsale,585)**
+    **(comp.graphics,584)**
+    **(comp.sys.mac.hardware,578)**
+    **(talk.politics.mideast,564)**
+    **(talk.politics.guns,546)**
+    **(alt.atheism,480)**
+    **(talk.politics.misc,465)**
+    **(talk.religion.misc,377)**
+
+We can see that the number of messages is roughly even between the topics.
+
+### Applying basic tokenization
+
+The first step in our text processing pipeline is to split up the raw text content in each document into a collection of terms (also referred to as **tokens** ). This is known as **tokenization**. We will start by applying a simple **whitespace** tokenization, together with converting each token to lowercase for each document:
+
+    val text = rdd.map { case (file, text) => text }
+    val whiteSpaceSplit = text.flatMap(t => t.split(" ").map(_.toLowerCase))
+    println(whiteSpaceSplit.distinct.count)
+
+### Tip
+
+In the preceding code, we used the `flatMap` function instead of `map`, as for now, we want to inspect all the tokens together for exploratory analysis. Later in this chapter, we will apply our tokenization scheme on a per-document basis, so we will use the `map` function.
+
+After running this code snippet, you will see the total number of unique tokens after applying our tokenization:
+
+    **402978**
+
+As you can see, for even a relatively small set of text, the number of raw tokens (and, therefore, the dimensionality of our feature vectors) can be very high.
+
+Let's take a look at a randomly selected document:
+
+    println(whiteSpaceSplit.sample(true, 0.3, 42).take(100).mkString(","))
+
+### Tip
+
+Note that the third parameter to the `sample` function is the random seed. We set it to `42` so that each call to `sample` returns the same results and your output matches that shown in this chapter.
+ +This will display the following result: + + **atheist,resources** + **summary:,addresses,,to,atheism** + **keywords:,music,,thu,,11:57:19,11:57:19,gmt** + **distribution:,cambridge.,290** + + **archive-name:,atheism/resources** + **alt-atheism-archive-name:,december,,,,,,,,,,,,,,,,,,,,,,addresses,addresses,,,,,,,religion,to:,to:,,p.o.,53701.** + **telephone:,sell,the,,fish,on,their,cars,,with,and,written** + **inside.,3d,plastic,plastic,,evolution,evolution,7119,,,,,san,san,san,mailing,net,who,to,atheist,press** + + **aap,various,bible,,and,on.,,,one,book,is:** + + **"the,w.p.,american,pp.,,1986.,bible,contains,ball,,based,based,james,of** + +### Improving our tokenization + +The preceding simple approach results in a lot of tokens and does not filter out many nonword characters (such as punctuation). Most tokenization schemes will remove these characters. We can do this by splitting each raw document on **nonword characters** using a regular expression pattern: + + val nonWordSplit = text.flatMap(t => t.split("""\W+""").map(_.toLowerCase)) + println(nonWordSplit.distinct.count) + +This reduces the number of unique tokens significantly: + + **130126** + +If we inspect the first few tokens, we will see that we have eliminated most of the less useful characters in the text: + + println(nonWordSplit.distinct.sample(true, 0.3, 42).take(100).mkString(",")) + +You will see the following result displayed: + + **bone,k29p,w1w3s1,odwyer,dnj33n,bruns,_congressional,mmejv5,mmejv5,artur,125215,entitlements,beleive,1pqd9hinnbmi,** + **jxicaijp,b0vp,underscored,believiing,qsins,1472,urtfi,nauseam,tohc4,kielbasa,ao,wargame,seetex,museum,typeset,pgva4,** + **dcbq,ja_jp,ww4ewa4g,animating,animating,10011100b,10011100b,413,wp3d,wp3d,cannibal,searflame,ets,1qjfnv,6jx,6jx,** + **detergent,yan,aanp,unaskable,9mf,bowdoin,chov,16mb,createwindow,kjznkh,df,classifieds,hour,cfsmo,santiago,santiago,** + **1r1d62,almanac_,almanac_,chq,nowadays,formac,formac,bacteriophage,barking,barking,barking,ipmgocj7b,monger,projector,** + **hama,65e90h8y,homewriter,cl5,1496,zysec,homerific,00ecgillespie,00ecgillespie,mqh0,suspects,steve_mullins,io21087,** + **funded,liberated,canonical,throng,0hnz,exxon,xtappcontext,mcdcup,mcdcup,5seg,biscuits** + +While our nonword pattern to split text works fairly well, we are still left with numbers and tokens that contain numeric characters. In some cases, numbers can be an important part of a corpus. For our purposes, the next step in our pipeline will be to filter out numbers and tokens that are words mixed with numbers. 
+
+We can do this by applying another regular expression pattern and using this to filter out tokens that _do not match_ the pattern:
+
+    val regex = """[^0-9]*""".r
+    val filterNumbers = nonWordSplit.filter(token => regex.pattern.matcher(token).matches)
+    println(filterNumbers.distinct.count)
+
+This further reduces the size of the token set:
+
+    **84912**
+
+Let's take a look at another random sample of the filtered tokens:
+
+    println(filterNumbers.distinct.sample(true, 0.3, 42).take(100).mkString(","))
+
+You will see output like the following:
+
+    **reunion,wuair,schwabam,eer,silikian,fuller,sloppiness,crying,crying,beckmans,leymarie,fowl,husky,rlhzrlhz,ignore,**
+    **loyalists,goofed,arius,isgal,dfuller,neurologists,robin,jxicaijp,majorly,nondiscriminatory,akl,sively,adultery,**
+    **urtfi,kielbasa,ao,instantaneous,subscriptions,collins,collins,za_,za_,jmckinney,nonmeasurable,nonmeasurable,**
+    **seetex,kjvar,dcbq,randall_clark,theoreticians,theoreticians,congresswoman,sparcstaton,diccon,nonnemacher,**
+    **arresed,ets,sganet,internship,bombay,keysym,newsserver,connecters,igpp,aichi,impute,impute,raffle,nixdorf,**
+    **nixdorf,amazement,butterfield,geosync,geosync,scoliosis,eng,eng,eng,kjznkh,explorers,antisemites,bombardments,**
+    **abba,caramate,tully,mishandles,wgtn,springer,nkm,nkm,alchoholic,chq,shutdown,bruncati,nowadays,mtearle,eastre,**
+    **discernible,bacteriophage,paradijs,systematically,rluap,rluap,blown,moderates**
+
+We can see that we have removed all the numeric characters. This still leaves us with a few strange _words_ , but we will not worry about these too much here.
+
+### Removing stop words
+
+ **Stop words** refer to common words that occur many times across almost all documents in a corpus (and across most corpuses). Examples of typical English stop words include and, but, the, of, and so on. It is a standard practice in text feature extraction to exclude stop words from the extracted tokens.
+
+When using TF-IDF weighting, the weighting scheme actually takes care of this for us. As stop words have a very low IDF score, they will tend to have very low TF-IDF weightings and thus less importance. In some cases, for information retrieval and search tasks, it might be desirable to include stop words. However, it can still be beneficial to exclude stop words during feature extraction, as it reduces the dimensionality of the final feature vectors as well as the size of the training data.
+
+We can take a look at some of the tokens in our corpus that have the highest occurrence across all documents to get an idea about some other stop words to exclude:
+
+    val tokenCounts = filterNumbers.map(t => (t, 1)).reduceByKey(_ + _)
+    val orderingDesc = Ordering.by[(String, Int), Int](_._2)
+    println(tokenCounts.top(20)(orderingDesc).mkString("\n"))
+
+In the preceding code, we took the tokens after filtering out numeric characters and generated a count of the occurrence of each token across the corpus. We can now use Spark's `top` function to retrieve the top 20 tokens by count. Notice that we need to provide the `top` function with an ordering that tells Spark how to order the elements of our RDD. In this case, we want to order by the count, so we will specify the second element of our key-value pair.
+
+Running the preceding code snippet will result in the following top tokens:
+
+    **(the,146532)**
+    **(to,75064)**
+    **(of,69034)**
+    **(a,64195)**
+    **(ax,62406)**
+    **(and,57957)**
+    **(i,53036)**
+    **(in,49402)**
+    **(is,43480)**
+    **(that,39264)**
+    **(it,33638)**
+    **(for,28600)**
+    **(you,26682)**
+    **(from,22670)**
+    **(s,22337)**
+    **(edu,21321)**
+    **(on,20493)**
+    **(this,20121)**
+    **(be,19285)**
+    **(t,18728)**
+
+As we might expect, there are a lot of common words in this list that we could potentially label as stop words. Let's create a set of stop words with some of these as well as other common words. We will then look at the tokens after filtering out these stop words:
+
+    val stopwords = Set(
+      "the","a","an","of","or","in","for","by","on","but", "is", "not", "with", "as", "was", "if",
+      "they", "are", "this", "and", "it", "have", "from", "at", "my", "be", "that", "to"
+    )
+    val tokenCountsFilteredStopwords = tokenCounts.filter { case (k, v) => !stopwords.contains(k) }
+    println(tokenCountsFilteredStopwords.top(20)(orderingDesc).mkString("\n"))
+
+You will see the following output:
+
+    **(ax,62406)**
+    **(i,53036)**
+    **(you,26682)**
+    **(s,22337)**
+    **(edu,21321)**
+    **(t,18728)**
+    **(m,12756)**
+    **(subject,12264)**
+    **(com,12133)**
+    **(lines,11835)**
+    **(can,11355)**
+    **(organization,11233)**
+    **(re,10534)**
+    **(what,9861)**
+    **(there,9689)**
+    **(x,9332)**
+    **(all,9310)**
+    **(will,9279)**
+    **(we,9227)**
+    **(one,9008)**
+
+You might notice that there are still quite a few common words in this top list. In practice, we might have a much larger set of stop words. However, we will keep a few (partly to illustrate the impact of common words when using TF-IDF weighting a little later).
+
+One other filtering step that we will use is removing any tokens that are only one character in length. The reasoning behind this is similar to removing stop words--these single-character tokens are unlikely to be informative in our text model and can further reduce the feature dimension and model size. We will do this with another filtering step:
+
+    val tokenCountsFilteredSize = tokenCountsFilteredStopwords.filter { case (k, v) => k.size >= 2 }
+    println(tokenCountsFilteredSize.top(20)(orderingDesc).mkString("\n"))
+
+Again, we will examine the tokens remaining after this filtering step:
+
+    **(ax,62406)**
+    **(you,26682)**
+    **(edu,21321)**
+    **(subject,12264)**
+    **(com,12133)**
+    **(lines,11835)**
+    **(can,11355)**
+    **(organization,11233)**
+    **(re,10534)**
+    **(what,9861)**
+    **(there,9689)**
+    **(all,9310)**
+    **(will,9279)**
+    **(we,9227)**
+    **(one,9008)**
+    **(would,8905)**
+    **(do,8674)**
+    **(he,8441)**
+    **(about,8336)**
+    **(writes,7844)**
+
+Apart from some of the common words that we have not excluded, we see that a few potentially more informative words are starting to appear.
+
+### Excluding terms based on frequency
+
+It is also a common practice to exclude terms during tokenization when their overall occurrence in the corpus is very low.
For example, let's examine the least occurring terms in the corpus (notice the different ordering we use here to return the results sorted in ascending order):
+
+    val orderingAsc = Ordering.by[(String, Int), Int](-_._2)
+    println(tokenCountsFilteredSize.top(20)(orderingAsc).mkString("\n"))
+
+You will get the following results:
+
+    **(lennips,1)**
+    **(bluffing,1)**
+    **(preload,1)**
+    **(altina,1)**
+    **(dan_jacobson,1)**
+    **(vno,1)**
+    **(actu,1)**
+    **(donnalyn,1)**
+    **(ydag,1)**
+    **(mirosoft,1)**
+    **(xiconfiywindow,1)**
+    **(harger,1)**
+    **(feh,1)**
+    **(bankruptcies,1)**
+    **(uncompression,1)**
+    **(d_nibby,1)**
+    **(bunuel,1)**
+    **(odf,1)**
+    **(swith,1)**
+    **(lantastic,1)**
+
+As we can see, there are many terms that only occur once in the entire corpus. Since typically we want to use our extracted features for other tasks such as document similarity or machine learning models, tokens that only occur once are not useful to learn from, as we will not have enough training data relative to these tokens. We can apply another filter to exclude these rare tokens:
+
+    val rareTokens = tokenCounts.filter{ case (k, v) => v < 2 }.map { case (k, v) => k }.collect.toSet
+    val tokenCountsFilteredAll = tokenCountsFilteredSize.filter { case (k, v) => !rareTokens.contains(k) }
+    println(tokenCountsFilteredAll.top(20)(orderingAsc).mkString("\n"))
+
+We can see that we are left with tokens that occur at least twice in the corpus:
+
+    **(sina,2)**
+    **(akachhy,2)**
+    **(mvd,2)**
+    **(hizbolah,2)**
+    **(wendel_clark,2)**
+    **(sarkis,2)**
+    **(purposeful,2)**
+    **(feagans,2)**
+    **(wout,2)**
+    **(uneven,2)**
+    **(senna,2)**
+    **(multimeters,2)**
+    **(bushy,2)**
+    **(subdivided,2)**
+    **(coretest,2)**
+    **(oww,2)**
+    **(historicity,2)**
+    **(mmg,2)**
+    **(margitan,2)**
+    **(defiance,2)**
+
+Now, let's count the number of unique tokens:
+
+    println(tokenCountsFilteredAll.count)
+
+You will see the following output:
+
+    **51801**
+
+As we can see, by applying all the filtering steps in our tokenization pipeline, we have reduced the feature dimension from 402,978 to 51,801.
+
+We can now combine all our filtering logic into one function, which we can apply to each document in our RDD:
+
+    def tokenize(line: String): Seq[String] = {
+      line.split("""\W+""")
+        .map(_.toLowerCase)
+        .filter(token => regex.pattern.matcher(token).matches)
+        .filterNot(token => stopwords.contains(token))
+        .filterNot(token => rareTokens.contains(token))
+        .filter(token => token.size >= 2)
+        .toSeq
+    }
+
+We can check whether this function gives us the same result with the following code snippet:
+
+    println(text.flatMap(doc => tokenize(doc)).distinct.count)
+
+This will output `51801`, giving us the same unique token count as our step-by-step pipeline.
+
+We can tokenize each document in our RDD as follows:
+
+    val tokens = text.map(doc => tokenize(doc))
+    println(tokens.first.take(20))
+
+You will see output similar to the following, showing the first part of the tokenized version of our first document:
+
+    **WrappedArray(mathew, mathew, mantis, co, uk, subject, alt, atheism, faq, atheist, resources, summary, books, addresses, music, anything, related, atheism, keywords, faq)**
+
+### A note about stemming
+
+A common step in text processing and tokenization is **stemming**. This is the conversion of whole words to a **base form** (called a **word stem** ). For example, plurals might be converted to singular ( _dogs_ becomes _dog_ ), and forms such as _walking_ and _walker_ might become _walk_.
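+
+For intuition only, a toy suffix-stripping rule might look like the following sketch (this is not a real stemming algorithm and would mangle many English words):
+
+    // a deliberately naive "stemmer" that strips a few common suffixes
+    def naiveStem(word: String): String = {
+      val suffixes = Seq("ing", "er", "s")
+      suffixes.find(s => word.endsWith(s)) match {
+        // only strip when enough of the word remains to form a stem
+        case Some(s) if word.length > s.length + 2 => word.dropRight(s.length)
+        case _ => word
+      }
+    }
+    println(Seq("dogs", "walking", "walker").map(naiveStem))
+    // List(dog, walk, walk)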
Stemming can become quite complex and is typically handled with specialized NLP or search engine software (such as NLTK, OpenNLP, and Lucene, for example). We will ignore stemming for the purpose of our example here.
+
+### Note
+
+A full treatment of stemming is beyond the scope of this book. You can find more details at .
+
+### Training a TF-IDF model
+
+We will now use MLlib to transform each document, in the form of processed tokens, into a vector representation. The first step will be to use the `HashingTF` implementation, which makes use of feature hashing to map each token in the input text to an index in the vector of term frequencies. Then, we will compute the global IDF and use it to transform the term frequency vectors into TF-IDF vectors.
+
+For each token, the index will thus be the hash of the token (mapped in turn onto the dimension of the feature vector). The value for each token will be the TF-IDF weighting for that token (that is, the term frequency multiplied by the inverse document frequency).
+
+First, we will import the classes we need and create our `HashingTF` instance, passing in a `dim` parameter that sets the feature dimension. While the default feature dimension is 2^20 (or around 1 million), we will choose 2^18 (or around 260,000), since with about 50,000 tokens, we should not experience a significant number of hash collisions, and a smaller dimension will be more memory and processing friendly for illustrative purposes:
+
+    import org.apache.spark.mllib.linalg.{ SparseVector => SV }
+    import org.apache.spark.mllib.feature.HashingTF
+    import org.apache.spark.mllib.feature.IDF
+    val dim = math.pow(2, 18).toInt
+    val hashingTF = new HashingTF(dim)
+    val tf = hashingTF.transform(tokens)
+    tf.cache
+
+### Tip
+
+Note that we imported MLlib's `SparseVector` using an alias of `SV`. This is because later, we will use Breeze's `linalg` module, which itself also imports `SparseVector`. This way, we will avoid namespace collisions.
+
+The `transform` function of `HashingTF` maps each input document (that is, a sequence of tokens) to an MLlib `Vector`. We will also call `cache` to pin the data in memory to speed up subsequent operations.
+
+Let's inspect the first element of our transformed dataset:
+
+### Tip
+
+Note that `HashingTF.transform` returns an `RDD[Vector]`, so we will cast the result returned to an instance of an MLlib `SparseVector`.
+
+The `transform` method can also work on an individual document by taking an `Iterable` argument (for example, a document as a `Seq[String]`). This returns a single vector.
+
+    val v = tf.first.asInstanceOf[SV]
+    println(v.size)
+    println(v.values.size)
+    println(v.values.take(10).toSeq)
+    println(v.indices.take(10).toSeq)
+
+You will see the following output displayed:
+
+    **262144**
+    **706**
+    **WrappedArray(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0)**
+    **WrappedArray(313, 713, 871, 1202, 1203, 1209, 1795, 1862, 3115, 3166)**
+
+We can see that the dimension of each sparse vector of term frequencies is 262,144 (or 2^18 as we specified). However, the number of non-zero entries in the vector is only 706. The last two lines of the output show the frequency counts and vector indexes for the first few entries in the vector.
+
+We will now compute the inverse document frequency for each term in the corpus by creating a new `IDF` instance and calling `fit` with our RDD of term frequency vectors as the input.
We will then transform our term frequency vectors to TF-IDF vectors through the `transform` function of `IDF`: + + val idf = new IDF().fit(tf) + val tfidf = idf.transform(tf) + val v2 = tfidf.first.asInstanceOf[SV] + println(v2.values.size) + println(v2.values.take(10).toSeq) + println(v2.indices.take(10).toSeq) + +When you examine the first element in the RDD of TF-IDF transformed vectors, you will see output similar to the one shown here: + + **706** + **WrappedArray(2.3869085659322193, 4.670445463955571, 6.561295835827856, 4.597686109673142, ...** + **WrappedArray(313, 713, 871, 1202, 1203, 1209, 1795, 1862, 3115, 3166)** + +We can see that the number of non-zero entries hasn't changed (at 706), nor have the vector indices for the terms. What has changed are the values for each term. Earlier, these represented the frequency of each term in the document, but now, the new values represent the frequencies weighted by the `IDF`. + +### Analyzing the TF-IDF weightings + +Next, let's investigate the TF-IDF weighting for a few terms to illustrate the impact of the commonality or rarity of a term. + +First, we can compute the minimum and maximum TF-IDF weights across the entire corpus: + + val minMaxVals = tfidf.map { v => + val sv = v.asInstanceOf[SV] + (sv.values.min, sv.values.max) + } + val globalMinMax = minMaxVals.reduce { case ((min1, max1), (min2, max2)) => + (math.min(min1, min2), math.max(max1, max2)) + } + println(globalMinMax) + +As we can see, the minimum TF-IDF is zero, while the maximum is significantly larger: + + **(0.0,66155.39470409753)** + +We will now explore the TF-IDF weight attached to various terms. In the previous section on stop words, we filtered out many common terms that occur frequently. Recall that we did not remove all such potential stop words. Instead, we kept a few in the corpus so that we could illustrate the impact of applying the TF-IDF weighting scheme on these terms. + +TF-IDF weighting will tend to assign a lower weighting to common terms. To see this, we can compute the TF-IDF representation for a few of the terms that appear in the list of top occurrences that we previously computed, such as `you`, `do`, and `we`: + + val common = sc.parallelize(Seq(Seq("you", "do", "we"))) + val tfCommon = hashingTF.transform(common) + val tfidfCommon = idf.transform(tfCommon) + val commonVector = tfidfCommon.first.asInstanceOf[SV] + println(commonVector.values.toSeq) + +If we form a TF-IDF vector representation of this document, we would see the following values assigned to each term. Note that because of feature hashing, we are not sure exactly which term represents what. 
However, the values illustrate that the weighting applied to these terms is relatively low: + + **WrappedArray(0.9965359935704624, 1.3348773448236835, 0.5457486182039175)** + +Now, let's apply the same transformation to a few less common terms that we might intuitively associate with being more linked to specific topics or concepts: + + val uncommon = sc.parallelize(Seq(Seq("telescope", "legislation", "investment"))) + val tfUncommon = hashingTF.transform(uncommon) + val tfidfUncommon = idf.transform(tfUncommon) + val uncommonVector = tfidfUncommon.first.asInstanceOf[SV] + println(uncommonVector.values.toSeq) + +We can see from the following results that the TF-IDF weightings are indeed significantly higher than for the more common terms: + + **WrappedArray(5.3265513728351666, 5.308532867332488, 5.483736956357579)** + +# Using a TF-IDF model + +While we often refer to training a TF-IDF model, it is actually a feature extraction process or transformation rather than a machine learning model. TF-IDF weighting is often used as a preprocessing step for other models, such as dimensionality reduction, classification, or regression. + +To illustrate the potential uses of TF-IDF weighting, we will explore two examples. The first is using the TF-IDF vectors to compute document similarity, while the second involves training a multilabel classification model with the TF-IDF vectors as input features. + +## Document similarity with the 20 Newsgroups dataset and TF-IDF features + +You might recall from Chapter 4, _Building a Recommendation Engine with Spark_ , that the similarity between two vectors can be computed using a distance metric. The closer two vectors are (that is, the lower the distance metric), the more similar they are. One such metric that we used to compute similarity between movies is cosine similarity. + +Just like we did for movies, we can also compute the similarity between two documents. Using TF-IDF, we have transformed each document into a vector representation. Hence, we can use the same techniques as we used for movie vectors to compare two documents. + +Intuitively, we might expect two documents to be more similar to each other if they share many terms. Conversely, we might expect two documents to be less similar if they each contain many terms that are different from each other. As we compute cosine similarity by computing a dot product of the two vectors and each vector is made up of the terms in each document, we can see that documents with a high overlap of terms will tend to have a higher cosine similarity. + +Now, we can see TF-IDF at work. We might reasonably expect that even very different documents might contain many overlapping terms that are relatively common (for example, our stop words). However, due to a low TF-IDF weighting, these terms will not have a significant impact on the dot product and, therefore, will not have much impact on the similarity computed. + +For example, we might expect two randomly chosen messages from the `hockey` newsgroup to be relatively similar to each other. Let's see if this is the case: + + val hockeyText = rdd.filter { case (file, text) => file.contains("hockey") } + val hockeyTF = hockeyText.mapValues(doc => hashingTF.transform(tokenize(doc))) + val hockeyTfIdf = idf.transform(hockeyTF.map(_._2)) + +In the preceding code, we first filtered our raw input RDD to keep only the messages within the hockey topic. We then applied our tokenization and term frequency transformation functions. 
Note that the `transform` method used is the version that works on a single document (in the form of a `Seq[String]`) rather than the version that works on an RDD of documents. + +Finally, we applied the `IDF` transform (note that we use the same IDF that we have already computed on the whole corpus). + +Once we have our `hockey` document vectors, we can select two of these vectors at random and compute the cosine similarity between them (as we did earlier, we will use Breeze for the linear algebra functionality, in particular converting our MLlib vectors to Breeze `SparseVector` instances first): + + import breeze.linalg._ + val hockey1 = hockeyTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV] + val breeze1 = new SparseVector(hockey1.indices, hockey1.values, hockey1.size) + val hockey2 = hockeyTfIdf.sample(true, 0.1, 43).first.asInstanceOf[SV] + val breeze2 = new SparseVector(hockey2.indices, hockey2.values, hockey2.size) + val cosineSim = breeze1.dot(breeze2) / (norm(breeze1) * norm(breeze2)) + println(cosineSim) + +We can see that the cosine similarity between the documents is around 0.06: + + **0.060250114361164626** + +While this might seem quite low, recall that the effective dimensionality of our features is high due to the large number of unique terms that is typical when dealing with text data. Hence, we can expect that any two documents might have a relatively low overlap of terms even if they are about the same topic, and therefore would have a lower absolute similarity score. + +By contrast, we can compare this similarity score to the one computed between one of our `hockey` documents and another document chosen randomly from the `comp.graphics` newsgroup, using the same methodology: + + val graphicsText = rdd.filter { case (file, text) => file.contains("comp.graphics") } + val graphicsTF = graphicsText.mapValues(doc => hashingTF.transform(tokenize(doc))) + val graphicsTfIdf = idf.transform(graphicsTF.map(_._2)) + val graphics = graphicsTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV] + val breezeGraphics = new SparseVector(graphics.indices, graphics.values, graphics.size) + val cosineSim2 = breeze1.dot(breezeGraphics) / (norm(breeze1) * norm(breezeGraphics)) + println(cosineSim2) + +The cosine similarity is significantly lower at 0.0047: + + **0.004664850323792852** + +Finally, it is likely that a document from another sports-related topic might be more similar to our `hockey` document than one from a computer-related topic. However, we would probably expect a `baseball` document to not be as similar as our `hockey` document. 
Let's see whether this is the case by computing the similarity between a random message from the `baseball` newsgroup and our `hockey` document: + + val baseballText = rdd.filter { case (file, text) => file.contains("baseball") } + val baseballTF = baseballText.mapValues(doc => hashingTF.transform(tokenize(doc))) + val baseballTfIdf = idf.transform(baseballTF.map(_._2)) + val baseball = baseballTfIdf.sample(true, 0.1, 42).first.asInstanceOf[SV] + val breezeBaseball = new SparseVector(baseball.indices, baseball.values, baseball.size) + val cosineSim3 = breeze1.dot(breezeBaseball) / (norm(breeze1) * norm(breezeBaseball)) + println(cosineSim3) + +Indeed, as we expected, we found that the `baseball` and `hockey` documents have a cosine similarity of 0.05, which is significantly higher than the `comp.graphics` document, but also somewhat lower than the other `hockey` document: + + **0.05047395039466008** + +## Training a text classifier on the 20 Newsgroups dataset using TF-IDF + +When using TF-IDF vectors, we expected that the cosine similarity measure would capture the similarity between documents, based on the overlap of terms between them. In a similar way, we would expect that a machine learning model, such as a classifier, would be able to learn weightings for individual terms; this would allow it to distinguish between documents from different classes. That is, it should be possible to learn a mapping between the presence (and weighting) of certain terms and a specific topic. + +In the 20 Newsgroups example, each newsgroup topic is a class, and we can train a classifier using our TF-IDF transformed vectors as input. + +Since we are dealing with a multiclass classification problem, we will use the naive Bayes model in MLlib, which supports multiple classes. As the first step, we will import the Spark classes that we will be using: + + import org.apache.spark.mllib.regression.LabeledPoint + import org.apache.spark.mllib.classification.NaiveBayes + import org.apache.spark.mllib.evaluation.MulticlassMetrics + +Next, we will need to extract the 20 topics and convert them to class mappings. We can do this in exactly the same way as we might for 1-of-K feature encoding, by assigning a numeric index to each class: + + val newsgroupsMap = newsgroups.distinct.collect().zipWithIndex.toMap + val zipped = newsgroups.zip(tfidf) + val train = zipped.map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) } + train.cache + +In the preceding code snippet, we took the `newsgroups` RDD, where each element is the topic, and used the `zip` function to combine it with each element in our `tfidf` RDD of TF-IDF vectors. We then mapped over each key-value element in our new `zipped` RDD and created a `LabeledPoint` instance, where `label` is the class index and `features` is the TF-IDF vector. + +### Tip + +Note that the `zip` operator assumes that each RDD has the same number of partitions as well as the same number of elements in each partition. It will fail if this is not the case. We can make this assumption here because we have effectively created both our `tfidf` RDD and `newsgroups` RDD from a series of `map` transformations on the same original RDD that preserved the partitioning structure. + +Now that we have an input RDD in the correct form, we can simply pass it to the naive Bayes `train` function: + + val model = NaiveBayes.train(train, lambda = 0.1) + +Let's evaluate the performance of the model on the test dataset. 
We will load the raw test data from the `20news-bydate-test` directory, again using `wholeTextFiles` to read each message into an RDD element. We will then extract the class labels from the file paths in the same way as we did for the `newsgroups` RDD:
+
+    val testPath = "/PATH/20news-bydate-test/*"
+    val testRDD = sc.wholeTextFiles(testPath)
+    val testLabels = testRDD.map { case (file, text) =>
+      val topic = file.split("/").takeRight(2).head
+      newsgroupsMap(topic)
+    }
+
+Transforming the text in the test dataset follows the same procedure as for the training data--we will apply our `tokenize` function followed by the term frequency transformation, and we will again use the same IDF computed from the training data to transform the TF vectors into TF-IDF vectors. Finally, we will zip the test class labels with the TF-IDF vectors and create our test `RDD[LabeledPoint]`:
+
+    val testTf = testRDD.map { case (file, text) => hashingTF.transform(tokenize(text)) }
+    val testTfIdf = idf.transform(testTf)
+    val zippedTest = testLabels.zip(testTfIdf)
+    val test = zippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
+
+### Tip
+
+Note that it is important that we use the training set IDF to transform the test data, as this creates a more realistic estimation of model performance on new data, which might potentially contain terms that the model has not yet been trained on. It would be "cheating" to recompute the IDF vector based on the test dataset and, more importantly, would potentially lead to incorrect estimates of optimal model parameters selected through cross-validation.
+
+Now, we're ready to compute the predictions and true class labels for our model. We will use this RDD to compute accuracy and the multiclass weighted F-measure for our model:
+
+    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
+    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
+    val metrics = new MulticlassMetrics(predictionAndLabel)
+    println(accuracy)
+    println(metrics.weightedFMeasure)
+
+### Tip
+
+The weighted F-measure is an overall measure of precision and recall performance (where, like area under an ROC curve, values closer to 1.0 indicate better performance), which is then combined through a weighted average across the classes.
+
+We can see that our simple multiclass naive Bayes model has achieved close to 80 percent for both accuracy and F-measure:
+
+    **0.7915560276155071**
+    **0.7810675969031116**
+
+# Evaluating the impact of text processing
+
+Text processing and TF-IDF weighting are examples of feature extraction techniques designed to both reduce the dimensionality of and extract some structure from raw text data. We can see the impact of applying these processing techniques by comparing the performance of a model trained on raw text data with one trained on processed and TF-IDF weighted text data.
+
+## Comparing raw features with processed TF-IDF features on the 20 Newsgroups dataset
+
+In this example, we will simply apply the hashing term frequency transformation to the raw text tokens obtained using a simple whitespace splitting of the document text.
We will train a model on this data and evaluate the performance on the test set as we did for the model trained with TF-IDF features:
+
+    val rawTokens = rdd.map { case (file, text) => text.split(" ") }
+    val rawTF = rawTokens.map(doc => hashingTF.transform(doc))
+    val rawTrain = newsgroups.zip(rawTF).map { case (topic, vector) => LabeledPoint(newsgroupsMap(topic), vector) }
+    val rawModel = NaiveBayes.train(rawTrain, lambda = 0.1)
+    val rawTestTF = testRDD.map { case (file, text) => hashingTF.transform(text.split(" ")) }
+    val rawZippedTest = testLabels.zip(rawTestTF)
+    val rawTest = rawZippedTest.map { case (topic, vector) => LabeledPoint(topic, vector) }
+    val rawPredictionAndLabel = rawTest.map(p => (rawModel.predict(p.features), p.label))
+    val rawAccuracy = 1.0 * rawPredictionAndLabel.filter(x => x._1 == x._2).count() / rawTest.count()
+    println(rawAccuracy)
+    val rawMetrics = new MulticlassMetrics(rawPredictionAndLabel)
+    println(rawMetrics.weightedFMeasure)
+
+Perhaps surprisingly, the raw model does quite well, although both accuracy and F-measure are a few percentage points lower than those of the TF-IDF model. This is also partly a reflection of the fact that the naive Bayes model is well suited to data in the form of raw frequency counts:
+
+    **0.7661975570897503**
+    **0.7628947184990661**
+
+# Word2Vec models
+
+Until now, we have used a bag-of-words vector, optionally with some weighting scheme such as TF-IDF to represent the text in a document. Another recent class of models that has become popular is related to representing individual words as vectors.
+
+These are generally based in some way on the co-occurrence statistics between the words in a corpus. Once the vector representation is computed, we can use these vectors in ways similar to how we might use TF-IDF vectors (such as using them as features for other machine learning models). One such common use case is computing the similarity between two words with respect to their meanings, based on their vector representations.
+
+Word2Vec refers to a specific implementation of one of these models, often referred to as **distributed vector representations**. The MLlib model uses a **skip-gram** model, which seeks to learn vector representations that take into account the contexts in which words occur.
+
+### Note
+
+While a detailed treatment of Word2Vec is beyond the scope of this book, Spark's documentation at contains some further details on the algorithm as well as links to the reference implementation.
+
+One of the main academic papers underlying Word2Vec is _Tomas Mikolov_ , _Kai Chen_ , _Greg Corrado_ , and _Jeffrey Dean_. _Efficient Estimation of Word Representations in Vector Space_. _In Proceedings of Workshop at ICLR_ , _2013_.
+
+It is available at .
+
+Another recent model in the area of word vector representations is GloVe at .
+
+## Word2Vec on the 20 Newsgroups dataset
+
+Training a Word2Vec model in Spark is relatively simple. We will pass in an RDD where each element is a sequence of terms. We can use the RDD of tokenized documents we have already created as input to the model:
+
+    import org.apache.spark.mllib.feature.Word2Vec
+    val word2vec = new Word2Vec()
+    word2vec.setSeed(42)
+    val word2vecModel = word2vec.fit(tokens)
+
+### Tip
+
+Note that we used `setSeed` to set the random seed for model training so that you can see the same results each time the model is trained.
+
+You will see some output similar to the following while the model is being trained:
+
+    **...**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2133172, alpha = 0.0011868763094487506**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2144172, alpha = 0.0010640806039941193**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2155172, alpha = 9.412848985394907E-4**
+    **14/10/25 14:21:59 INFO Word2Vec: wordCount = 2166172, alpha = 8.184891930848592E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2177172, alpha = 6.956934876302307E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2188172, alpha = 5.728977821755993E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2199172, alpha = 4.501020767209707E-4**
+    **14/10/25 14:22:00 INFO Word2Vec: wordCount = 2210172, alpha = 3.2730637126634213E-4**
+    **14/10/25 14:22:01 INFO Word2Vec: wordCount = 2221172, alpha = 2.0451066581171076E-4**
+    **14/10/25 14:22:01 INFO Word2Vec: wordCount = 2232172, alpha = 8.171496035708214E-5**
+    **...**
+    **14/10/25 14:22:02 INFO SparkContext: Job finished: collect at Word2Vec.scala:368, took 56.585983 s**
+    **14/10/25 14:22:02 INFO MappedRDD: Removing RDD 200 from persistence list**
+    **14/10/25 14:22:02 INFO BlockManager: Removing RDD 200**
+    **14/10/25 14:22:02 INFO BlockManager: Removing block rdd_200_0**
+    **14/10/25 14:22:02 INFO MemoryStore: Block rdd_200_0 of size 9008840 dropped from memory (free 1755596828)**
+    **word2vecModel: org.apache.spark.mllib.feature.Word2VecModel = org.apache.spark.mllib.feature.Word2VecModel@2b94e480**
+
+Once trained, we can easily find the top 20 synonyms for a given term (that is, the most similar terms to the input term, computed by cosine similarity between the word vectors). For example, to find the 20 most similar terms to _hockey_ , use the following lines of code:
+
+    word2vecModel.findSynonyms("hockey", 20).foreach(println)
+
+As we can see from the following output, most of the terms relate to hockey or other sports topics:
+
+    **(sport,0.6828256249427795)**
+    **(ecac,0.6718048453330994)**
+    **(hispanic,0.6519884467124939)**
+    **(glens,0.6447514891624451)**
+    **(woofers,0.6351765394210815)**
+    **(boxscores,0.6009076237678528)**
+    **(tournament,0.6006366014480591)**
+    **(champs,0.5957855582237244)**
+    **(aargh,0.584071934223175)**
+    **(playoff,0.5834275484085083)**
+    **(ahl,0.5784651637077332)**
+    **(ncaa,0.5680188536643982)**
+    **(pool,0.5612311959266663)**
+    **(olympic,0.5552600026130676)**
+    **(champion,0.5549421310424805)**
+    **(filinuk,0.5528956651687622)**
+    **(yankees,0.5502706170082092)**
+    **(motorcycles,0.5484763979911804)**
+    **(calder,0.5481109023094177)**
+    **(rec,0.5432182550430298)**
+
+As another example, we can find 20 synonyms for the term _legislation_ as follows:
+
+    word2vecModel.findSynonyms("legislation", 20).foreach(println)
+
+In this case, we observe that terms related to _regulation_ , _politics_ , and _business_ feature prominently:
+
+    **(accommodates,0.8149217963218689)**
+    **(briefed,0.7582570314407349)**
+    **(amended,0.7310371994972229)**
+    **(telephony,0.7139414548873901)**
+    **(aclu,0.7080780863761902)**
+    **(pitted,0.7062571048736572)**
+    **(licensee,0.6981208324432373)**
+    **(agency,0.6880651712417603)**
+    **(policies,0.6828961372375488)**
+    **(senate,0.6821110844612122)**
+    **(businesses,0.6814320087432861)**
+    **(permit,0.6797110438346863)**
+    **(cpsr,0.6764014959335327)**
+    **(cooperation,0.6733141541481018)**
+    **(surveillance,0.6670728325843811)**
+    **(restricted,0.6666574478149414)**
+    **(congress,0.6661365628242493)**
+
+    **(procure,0.6655452251434326)**
+    **(industry,0.6650314927101135)**
+    **(inquiry,0.6644254922866821)**
+
+# Summary
+
+In this chapter, we took a deeper look into more complex text processing and explored MLlib's text feature extraction capabilities, in particular the TF-IDF term weighting schemes. We covered examples of using the resulting TF-IDF feature vectors to compute document similarity and train a newsgroup topic classification model. Finally, you learned how to use MLlib's cutting-edge Word2Vec model to compute a vector representation of words in a corpus of text and to find words whose contextual meaning is similar to that of a given word.
+
+In the next chapter, we will take a look at online learning, and you will learn how Spark Streaming relates to online learning models.
+
+# Chapter 10. Real-time Machine Learning with Spark Streaming
+
+So far in this book, we have focused on **batch** data processing. That is, all our analysis, feature extraction, and model training has been applied to a fixed set of data that does not change. This fits neatly into Spark's core abstraction of RDDs, which are immutable distributed datasets. Once created, the data underlying the RDD does not change, although we might create new RDDs from the original RDD through Spark's transformation and action operators.
+
+Our attention has also been on batch machine learning models where we train a model on a fixed batch of training data that is usually represented as an RDD of feature vectors (and labels, in the case of supervised learning models).
+
+In this chapter, we will:
+
+ * Introduce the concept of online learning, where models are trained and updated on new data as it becomes available
+ * Explore stream processing using Spark Streaming
+ * See how Spark Streaming fits together with the online learning approach
+
+# Online learning
+
+The batch machine learning methods that we have applied in this book focus on processing an existing fixed set of training data. Typically, these techniques are also iterative, and we have performed multiple passes over our training data in order to converge to an optimal model.
+
+By contrast, online learning is based on performing only one sequential pass through the training data in a fully incremental fashion (that is, one training example at a time). After seeing each training example, the model makes a prediction for this example and then receives the true outcome (for example, the label for classification or real target for regression). The idea behind online learning is that the model continually updates as new information is received instead of being retrained periodically in batch training.
+
+In some settings, when data volume is very large or the process that generates the data is changing rapidly, online learning methods can adapt more quickly and in near real time, without needing to be retrained in an expensive batch process.
+
+However, online learning methods do not have to be used in a purely online manner. In fact, we have already seen an example of using an online learning model in the batch setting when we used **stochastic gradient descent** optimization to train our classification and regression models. SGD updates the model after each training example. However, we still made use of multiple passes over the training data in order to converge to a better result.
+
+In the pure online setting, we do not (or perhaps cannot) make multiple passes over the training data; hence, we need to process each input as it arrives.
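+
+To make the pure online setting concrete, the following sketch performs a single sequential pass of stochastic gradient descent for a linear model with a squared-error loss (plain Scala; the step size and the two toy examples are hypothetical, and this is not MLlib's implementation):
+
+    val stepSize = 0.01
+    var weights = Array(0.0, 0.0)
+    // a "stream" of (features, target) examples, seen one at a time
+    val examples = Seq((Array(1.0, 2.0), 3.0), (Array(2.0, 1.0), 3.0))
+    for ((features, target) <- examples) {
+      val prediction = (weights, features).zipped.map(_ * _).sum
+      val error = prediction - target
+      // the model is updated immediately after each example is seen
+      weights = (weights, features).zipped.map((w, x) => w - stepSize * error * x)
+    }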
Online methods also include mini-batch methods where, instead of processing one input at a time, we process a small batch of training data.
+
+Online and batch methods can also be combined in real-world situations. For example, we can periodically retrain our models offline (say, every day) using batch methods. We can then deploy the trained model to production and update it using online methods in real time (that is, during the day, in between batch retraining) to adapt to any changes in the environment.
+
+As we will see in this chapter, the online learning setting can fit neatly into stream processing and the Spark Streaming framework.
+
+### Note
+
+See for more details on online machine learning.
+
+# Stream processing
+
+Before covering online learning with Spark, we will first explore the basics of stream processing and introduce the Spark Streaming library.
+
+In addition to the core Spark API and functionality, the Spark project contains another major library (in the same way as MLlib is a major project library) called **Spark Streaming** , which focuses on processing data streams in real time.
+
+A data stream is a continuous sequence of records. Common examples include activity stream data from a web or mobile application, time-stamped log data, transactional data, and event streams from sensor or device networks.
+
+The batch processing approach typically involves saving the data stream to an intermediate storage system (for example, HDFS or a database) and running a batch process on the saved data. In order to generate up-to-date results, the batch process must be run periodically (for example, daily, hourly, or even every few minutes) on the latest data available.
+
+By contrast, the stream-based approach applies processing to the data stream as it is generated. This allows near real-time processing (with sub-second time frames, on the order of a few tenths of a second, rather than minutes, hours, days, or even weeks with typical batch processing).
+
+## An introduction to Spark Streaming
+
+There are a few different general techniques to deal with stream processing. Two of the most common ones are as follows:
+
+ * Treat each record individually and process it as soon as it is seen.
+ * Combine multiple records into **mini-batches**. These mini-batches can be delineated either by time or by the number of records in a batch.
+
+Spark Streaming takes the second approach. The core primitive in Spark Streaming is the **discretized stream** , or **DStream**. A DStream is a sequence of mini-batches, where each mini-batch is represented as a Spark RDD:
+
+The discretized stream abstraction
+
+A DStream is defined by its input source and a time window called the **batch interval**. The stream is broken up into time periods equal to the batch interval (beginning from the starting time of the application). Each RDD in the stream will contain the records that are received by the Spark Streaming application during a given batch interval. If no data is present in a given interval, the RDD will simply be empty.
+
+### Input sources
+
+Spark Streaming **receivers** are responsible for receiving data from an **input source** and converting the raw data into a DStream made up of Spark RDDs.
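+
+For example, assuming the `sc` instance available in the Spark shell, a socket-based receiver can be attached as follows (the host and port are hypothetical; we will build on this stream in a later sketch):
+
+    import org.apache.spark.streaming.{Seconds, StreamingContext}
+    // a streaming context with a 10-second batch interval
+    val ssc = new StreamingContext(sc, Seconds(10))
+    // a receiver-based input DStream of text lines read from a socket
+    val lines = ssc.socketTextStream("localhost", 9999)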
+
+Spark Streaming supports various input sources, including file-based sources (where the receiver watches for new files arriving at the input location and creates the DStream from the contents read from each new file) and network-based sources (such as receivers that communicate with socket-based sources, the Twitter API stream, Akka actors, or message queues and distributed stream and log transfer frameworks, such as Flume, Kafka, and Amazon Kinesis).
+
+### Note
+
+See the documentation on input sources at for more details and for links to various advanced sources.
+
+### Transformations
+
+As we saw in Chapter 1, _Getting Up and Running with Spark_ , and throughout this book, Spark allows us to apply powerful transformations to RDDs. As DStreams are made up of RDDs, Spark Streaming provides a set of transformations available on DStreams; these transformations are similar to those available on RDDs. These include `map`, `flatMap`, `filter`, `join`, and `reduceByKey`.
+
+Spark Streaming transformations, such as those applicable to RDDs, operate on each element of a DStream's underlying data. That is, the transformations are effectively applied to each RDD in the DStream, which, in turn, applies the transformation to the elements of the RDD.
+
+Spark Streaming also provides operators such as `reduce` and `count`. These operators return a DStream made up of a single element (for example, the count value for each batch). Unlike the equivalent operators on RDDs, these do not trigger computation on DStreams directly. That is, they are not **actions** , but they are still transformations, as they return another DStream.
+
+#### Keeping track of state
+
+When we were dealing with batch processing of RDDs, keeping and updating a state variable was relatively straightforward. We could start with a certain state (for example, a count or sum of values) and then use broadcast variables or accumulators to update this state in parallel. Usually, we would then use an RDD action to collect the updated state to the driver and, in turn, update the global state.
+
+With DStreams, this is a little more complex, as we need to keep track of states across batches in a fault-tolerant manner. Conveniently, Spark Streaming provides the `updateStateByKey` function on a DStream of key-value pairs, which takes care of this for us, allowing us to create a stream of arbitrary state information and update it with each batch of data seen. For example, the state could be a global count of the number of times each key has been seen. The state could, thus, represent the number of visits per web page, clicks per advert, tweets per user, or purchases per product, for example.
+
+#### General transformations
+
+The Spark Streaming API also exposes a general `transform` function that gives us access to the underlying RDD for each batch in the stream. That is, where the higher level functions such as `map` transform a DStream to another DStream, `transform` allows us to apply functions from an RDD to another RDD. For example, we can use the RDD `join` operator to join each batch of the stream to an existing RDD that we computed separately from our streaming application (perhaps, in Spark or some other system).
+
+### Note
+
+The full list of transformations and further information on each of them is provided in the Spark documentation at .
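+
+Continuing the socket stream sketch from earlier, the following hypothetical word-count example illustrates these operators: per-batch transformations, running state with `updateStateByKey`, and a `transform`-based join against a static RDD (the checkpoint path and data are placeholders):
+
+    // per-batch word counts using RDD-like transformations
+    val wordCounts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
+    // stateful operators require a checkpoint directory
+    ssc.checkpoint("/tmp/streaming-checkpoint")
+    // a running total per word across all batches seen so far
+    val totals = wordCounts.updateStateByKey[Int] { (counts: Seq[Int], state: Option[Int]) =>
+      Some(counts.sum + state.getOrElse(0))
+    }
+    // join each batch against an RDD computed outside the streaming application
+    val whitelist = sc.parallelize(Seq(("spark", true), ("streaming", true)))
+    val joined = wordCounts.transform(rdd => rdd.join(whitelist))
+
+Note that none of these declarations triggers any computation by itself; that requires one of the output operators covered in the next section.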
### Actions

While some of the operators we have seen in Spark Streaming, such as `count`, are not actions as in the batch RDD case, Spark Streaming has the concept of **actions** on DStreams. Actions are **output** operators that, when invoked, trigger computation on the DStream. They are as follows:

  * `print`: This prints the first 10 elements of each batch to the console and is typically used for debugging and testing.
  * `saveAsObjectFiles`, `saveAsTextFiles`, and `saveAsHadoopFiles`: These functions output each batch to a Hadoop-compatible filesystem with a filename (if applicable) derived from the batch start timestamp.
  * `foreachRDD`: This operator is the most generic and allows us to apply any arbitrary processing to the RDDs within each batch of a DStream. It is used to apply _side effects_, such as saving data to an external system, printing it for testing, exporting it to a dashboard, and so on.

### Tip

Note that, like batch processing with Spark, DStream operators are **lazy**. In the same way in which we need to call an action, such as `count`, on an RDD to ensure that processing takes place, we need to call one of the preceding output operators in order to trigger computation on a DStream. Otherwise, our streaming application will not actually perform any computation.

### Window operators

As Spark Streaming operates on time-ordered batched streams of data, it introduces a new concept, which is that of **windowing**. A `window` function computes a transformation over a sliding window applied to the stream.

A window is defined by the length of the window and the sliding interval. For example, with a 10-second window and a 5-second sliding interval, we will compute results every 5 seconds, based on the latest 10 seconds of data in the DStream. For example, we might wish to calculate the top websites by page view numbers over the last 10 seconds and recompute this metric every 5 seconds using a sliding window.

The following figure illustrates a windowed DStream:

A windowed DStream
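To make the windowing example concrete, a sketch along the following lines (assuming a `DStream[String]` of page URLs named `pageViews`; the names are illustrative, and `Seconds` comes from `org.apache.spark.streaming`) computes the top pages over the last 10 seconds, recomputed every 5 seconds:

    // count views per page over a 10-second window, sliding every 5 seconds;
    // window and slide durations must be multiples of the batch interval
    val viewCounts = pageViews
      .map(url => (url, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))

    // sort each windowed batch by count, descending, using transform
    val topPages = viewCounts.transform(rdd => rdd.sortBy(-_._2))
    topPages.print()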
## Caching and fault tolerance with Spark Streaming

Like Spark RDDs, DStreams can be cached in memory. The use cases for caching are similar to those for RDDs--if we expect to access the data in a DStream multiple times (perhaps performing multiple types of analysis or aggregation, or outputting the data to multiple external systems), we will benefit from caching the data. Stateful operators, which include the `window` functions and `updateStateByKey`, do this automatically for efficiency.

Recall that RDDs are immutable datasets and are defined by their input data source and **lineage**--that is, the set of transformations and actions that are applied to the RDD. Fault tolerance in RDDs works by recreating the RDD (or the partition of an RDD) that is lost due to the failure of a worker node.

As DStreams are themselves batches of RDDs, they can also be recomputed as required to deal with worker node failure. However, this depends on the input data still being available. If the data source itself is fault-tolerant and persistent (such as HDFS or some other fault-tolerant data store), then the DStream can be recomputed.

If data stream sources are delivered over a network (which is a common case with stream processing), Spark Streaming's default persistence behavior is to replicate the data to two worker nodes. This allows network DStreams to be recomputed in the case of failure. Note, however, that any data received by a node but _not yet replicated_ might be lost when a node fails.

Spark Streaming also supports recovery of the driver node in the event of failure. However, currently, for network-based sources, data in the memory of the worker nodes will be lost in this case. Hence, Spark Streaming is not fully fault-tolerant in the face of failure of the driver node or application.

### Note

See http://spark.apache.org/docs/latest/streaming-programming-guide.html#caching--persistence for more details.

# Creating a Spark Streaming application

We will now work through creating our first Spark Streaming application to illustrate some of the basic concepts around Spark Streaming that we introduced earlier.

We will expand on the example applications used in Chapter 1, _Getting Up and Running with Spark_, where we used a small example dataset of product purchase events. For this example, instead of using a static set of data, we will create a simple producer application that will randomly generate events and send them over a network connection. We will then create a few Spark Streaming consumer applications that will process this event stream.

The sample project for this chapter contains the code you will need. It is called `scala-spark-streaming-app`. It consists of a Scala SBT project definition file, the example application source code, and a `src/main/resources` directory that contains a file called `names.csv`.

The `build.sbt` file for the project contains the following project definition:

    name := "scala-spark-streaming-app"

    version := "1.0"

    scalaVersion := "2.10.4"

    libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.1.0"

    libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.1.0"

Note that we added a dependency on Spark MLlib and Spark Streaming, both of which include the dependency on the Spark core.

The `names.csv` file contains a set of 20 randomly generated user names. We will use these names as part of our data generation function in our producer application:

    **Miguel,Eric,James,Juan,Shawn,James,Doug,Gary,Frank,Janet,Michael,James,Malinda,Mike,Elaine,Kevin,Janet,Richard,Saul,Manuela**

## The producer application

Our producer needs to create a network connection and generate some random purchase event data to send over this connection. First, we will define our object and the main method definition.
We will then read the random names from the `names.csv` resource and create a set of products with prices, from which we will generate our random product events:

    /**
     * A producer application that generates random "product events", up to 5 per second, and sends them over a
     * network connection
     */
    object StreamingProducer {

      def main(args: Array[String]) {

        val random = new Random()

        // Upper bound (exclusive) for the number of events per second,
        // so nextInt(MaxEvents) yields 0 to 5 events
        val MaxEvents = 6

        // Read the list of possible names
        val namesResource = this.getClass.getResourceAsStream("/names.csv")
        val names = scala.io.Source.fromInputStream(namesResource)
          .getLines()
          .toList
          .head
          .split(",")
          .toSeq

        // Generate a sequence of possible products
        val products = Seq(
          "iPhone Cover" -> 9.99,
          "Headphones" -> 5.49,
          "Samsung Galaxy Cover" -> 8.95,
          "iPad Cover" -> 7.49
        )

Using the list of names and the map of product names to prices, we will create a function that will randomly pick a product and name from these sources, generating a specified number of product events:

        /** Generate a number of random product events */
        def generateProductEvents(n: Int) = {
          (1 to n).map { i =>
            val (product, price) = products(random.nextInt(products.size))
            val user = random.shuffle(names).head
            (user, product, price)
          }
        }

Finally, we will create a network socket and set our producer to listen on this socket. As soon as a connection is made (which will come from our consumer streaming application), the producer will start generating random events at a random rate between 0 and 5 per second:

        // create a network producer
        val listener = new ServerSocket(9999)
        println("Listening on port: 9999")

        while (true) {
          val socket = listener.accept()
          new Thread() {
            override def run = {
              println("Got client connected from: " + socket.getInetAddress)
              val out = new PrintWriter(socket.getOutputStream(), true)

              while (true) {
                Thread.sleep(1000)
                val num = random.nextInt(MaxEvents)
                val productEvents = generateProductEvents(num)
                productEvents.foreach { event =>
                  out.write(event.productIterator.mkString(","))
                  out.write("\n")
                }
                out.flush()
                println(s"Created $num events...")
              }
              socket.close()
            }
          }.start()
        }
      }
    }

### Note

This producer example is based on the `PageViewGenerator` example in the Spark Streaming examples.

The producer can be run by changing into the base directory of `scala-spark-streaming-app` and using SBT to run the application, as we did in Chapter 1, _Getting Up and Running with Spark_:

    **> cd scala-spark-streaming-app**
    **> sbt**
    **[info] ...**
    **>**

Use the `run` command to execute the application:

    **> run**

You should see output similar to the following:

    **...**
    **Multiple main classes detected, select one to run:**

    **[1] StreamingProducer**
    **[2] SimpleStreamingApp**
    **[3] StreamingAnalyticsApp**
    **[4] StreamingStateApp**
    **[5] StreamingModelProducer**
    **[6] SimpleStreamingModel**
    **[7] MonitoringStreamingModel**

    **Enter number:**

Select the `StreamingProducer` option. The application will start running, and you should see the following output:

    **[info] Running StreamingProducer**
    **Listening on port: 9999**

We can see that the producer is listening on port `9999`, waiting for our consumer application to connect.

## Creating a basic streaming application

Next, we will create our first streaming program. We will simply connect to the producer and print out the contents of each batch.
Our streaming code looks like this:

    /**
     * A simple Spark Streaming app in Scala
     */
    object SimpleStreamingApp {

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

        // here we simply print out the first few elements of each batch
        stream.print()
        ssc.start()
        ssc.awaitTermination()

      }
    }

This looks fairly simple, mostly because Spark Streaming takes care of all the complexity for us. First, we initialized a `StreamingContext` (which is the streaming equivalent of the `SparkContext` we have used so far), specifying configuration options similar to those used to create a `SparkContext`. Notice, however, that here we are required to provide the batch interval, which we set to 10 seconds.

We then created our data stream using a predefined streaming source, `socketTextStream`, which reads text from a socket host and port and creates a `DStream[String]`. We then called the `print` function on the DStream; this function prints out the first few elements of each batch.

### Tip

Calling `print` on a DStream is similar to calling `take` on an RDD. It displays only the first few elements.

We can run this program using SBT. Open a second terminal window, leaving the producer program running, and run `sbt`:

    **> sbt**
    **[info] ...**
    **> run**
    **....**

Again, you should see a few options to select:

    **Multiple main classes detected, select one to run:**

    **[1] StreamingProducer**
    **[2] SimpleStreamingApp**
    **[3] StreamingAnalyticsApp**
    **[4] StreamingStateApp**
    **[5] StreamingModelProducer**
    **[6] SimpleStreamingModel**
    **[7] MonitoringStreamingModel**

Run the `SimpleStreamingApp` main class.
You should see the streaming program start up, displaying output similar to that shown here:

    **...**
    **14/11/15 21:02:23 INFO scheduler.ReceiverTracker: ReceiverTracker started**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: metadataCleanupDelay = -1**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: metadataCleanupDelay = -1**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Slide time = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Storage level = StorageLevel(false, false, false, false, 1)**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Checkpoint interval = null**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Remember duration = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.SocketInputDStream: Initialized and validated org.apache.spark.streaming.dstream.SocketInputDStream@ff3436d**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Slide time = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Storage level = StorageLevel(false, false, false, false, 1)**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Checkpoint interval = null**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Remember duration = 10000 ms**
    **14/11/15 21:02:23 INFO dstream.ForEachDStream: Initialized and validated org.apache.spark.streaming.dstream.ForEachDStream@5a10b6e8**
    **14/11/15 21:02:23 INFO scheduler.ReceiverTracker: Starting 1 receivers**
    **14/11/15 21:02:23 INFO spark.SparkContext: Starting job: runJob at ReceiverTracker.scala:275**
    **...**

At the same time, you should see that the terminal window running the producer displays something like the following:

    **...**
    **Got client connected from: /127.0.0.1**
    **Created 2 events...**
    **Created 2 events...**
    **Created 3 events...**
    **Created 1 events...**
    **Created 5 events...**
    **...**

After about 10 seconds, which is the length of our streaming batch interval, Spark Streaming will trigger a computation on the stream due to our use of the `print` operator. This should display the first few events in the batch, which will look something like the following output:

    **...**
    **14/11/15 21:02:30 INFO spark.SparkContext: Job finished: take at DStream.scala:608, took 0.05596 s**
    **-------------------------------------------**
    **Time: 1416078150000 ms**
    **-------------------------------------------**
    **Michael,Headphones,5.49**
    **Frank,Samsung Galaxy Cover,8.95**
    **Eric,Headphones,5.49**
    **Malinda,iPad Cover,7.49**
    **James,iPhone Cover,9.99**
    **James,Headphones,5.49**
    **Doug,iPhone Cover,9.99**
    **Juan,Headphones,5.49**
    **James,iPhone Cover,9.99**
    **Richard,iPad Cover,7.49**
    **...**

### Tip

Note that you might see different results, as the producer generates a random number of random events each second.

You can terminate the streaming app by pressing _Ctrl_ + _C_. If you want to, you can also terminate the producer (if you do, you will need to restart it before starting the next streaming programs that we will create).

## Streaming analytics

Next, we will create a slightly more complex streaming program. In Chapter 1, _Getting Up and Running with Spark_, we calculated a few metrics on our dataset of product purchases. These included the total number of purchases, the number of unique users, the total revenue, and the most popular product (together with its number of purchases and total revenue).

In this example, we will compute the same metrics on our stream of purchase events.
The key difference is that these metrics will be computed per batch and printed out.

We will define our streaming application code here:

    /**
     * A more complex streaming app that computes statistics and prints the results for each batch in a DStream
     */
    object StreamingAnalyticsApp {

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

        // create a stream of events from the raw text elements
        val events = stream.map { record =>
          val event = record.split(",")
          (event(0), event(1), event(2))
        }

First, we created exactly the same `StreamingContext` and socket stream as we did earlier. Our next step is to apply a `map` transformation to the raw text, where each record is a comma-separated string representing the purchase event. The `map` function splits the text and creates a tuple of `(user, product, price)`. This illustrates the use of `map` on a DStream and how it works exactly as if we had been operating on an RDD.

Next, we will use `foreachRDD` to apply arbitrary processing on each RDD in the stream to compute our desired metrics and print them to the console:

        /*
          We compute and print out stats for each batch.
          Since each batch is an RDD, we call foreachRDD on the DStream and apply the usual RDD functions
          we used in Chapter 1.
        */
        events.foreachRDD { (rdd, time) =>
          val numPurchases = rdd.count()
          val uniqueUsers = rdd.map { case (user, _, _) => user }.distinct().count()
          val totalRevenue = rdd.map { case (_, _, price) => price.toDouble }.sum()
          val productsByPopularity = rdd
            .map { case (user, product, price) => (product, 1) }
            .reduceByKey(_ + _)
            .collect()
            .sortBy(-_._2)
          val mostPopular = productsByPopularity(0)

          // SimpleDateFormat and Date come from java.text and java.util, respectively
          val formatter = new SimpleDateFormat
          val dateStr = formatter.format(new Date(time.milliseconds))
          println(s"== Batch start time: $dateStr ==")
          println("Total purchases: " + numPurchases)
          println("Unique users: " + uniqueUsers)
          println("Total revenue: " + totalRevenue)
          println("Most popular product: %s with %d purchases".format(mostPopular._1, mostPopular._2))
        }

        // start the context
        ssc.start()
        ssc.awaitTermination()

      }

    }

If you compare the code operating on the RDDs inside the preceding `foreachRDD` block with that used in Chapter 1, _Getting Up and Running with Spark_, you will notice that it is virtually the same code. This shows that we can apply any RDD-related processing we wish within the streaming setting by operating on the underlying RDDs, as well as using the built-in higher-level streaming operations.

Let's run the streaming program again by calling `sbt run` and selecting `StreamingAnalyticsApp`.

### Tip

Remember that you might also need to restart the producer if you previously terminated the program. This should be done before starting the streaming application.

After about 10 seconds, you should see output from the streaming program similar to the following:

    **...**
    **14/11/15 21:27:30 INFO spark.SparkContext: Job finished: collect at Streaming.scala:125, took 0.071145 s**
    **== Batch start time: 2014/11/15 9:27 PM ==**
    **Total purchases: 16**
    **Unique users: 10**
    **Total revenue: 123.72**
    **Most popular product: iPad Cover with 6 purchases**
    **...**

You can again terminate the streaming program using _Ctrl_ + _C_.
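As an aside, some of these per-batch statistics can also be computed with the built-in DStream operators introduced earlier, rather than dropping down to `foreachRDD`. For example, a small sketch reusing the `events` DStream defined in the preceding listing:

    // number of purchases per batch, as a single-element DStream
    val purchasesPerBatch = events.count()

    // total revenue per batch via reduce
    val revenuePerBatch = events
      .map { case (_, _, price) => price.toDouble }
      .reduce(_ + _)

    purchasesPerBatch.print()
    revenuePerBatch.print()

Remember that these operators are still transformations; nothing is computed until an output operator such as `print` is called.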
## Stateful streaming

As a final example, we will apply the concept of **stateful** streaming using the `updateStateByKey` function to compute a global state of revenue and number of purchases per user, which will be updated with new data from each 10-second batch. Our `StreamingStateApp` app is shown here:

    object StreamingStateApp {
      import org.apache.spark.streaming.StreamingContext._

We will first define an `updateState` function that will compute the new state from the running state value and the new data in the current batch. Our state, in this case, is a `(number of purchases, revenue)` pair, which we will keep for each user. We will compute the new state given the set of `(product, price)` pairs for the current batch and the accumulated state at the current time.

Notice that we will deal with an `Option` value for the current state, as it might be empty (which will be the case for the first batch), and we need to define a default value, which we will do using `getOrElse` as shown here:

      def updateState(prices: Seq[(String, Double)], currentTotal: Option[(Int, Double)]) = {
        val currentRevenue = prices.map(_._2).sum
        val currentNumberPurchases = prices.size
        val state = currentTotal.getOrElse((0, 0.0))
        Some((currentNumberPurchases + state._1, currentRevenue + state._2))
      }

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        // for stateful operations, we need to set a checkpoint location
        ssc.checkpoint("/tmp/sparkstreaming/")
        val stream = ssc.socketTextStream("localhost", 9999)

        // create a stream of events from the raw text elements
        val events = stream.map { record =>
          val event = record.split(",")
          (event(0), event(1), event(2).toDouble)
        }

        val users = events.map { case (user, product, price) => (user, (product, price)) }
        val revenuePerUser = users.updateStateByKey(updateState)
        revenuePerUser.print()

        // start the context
        ssc.start()
        ssc.awaitTermination()

      }
    }

After applying the same string split transformation we used in our previous example, we called `updateStateByKey` on our DStream, passing in our defined `updateState` function. We then printed the results to the console.

Start the streaming example using `sbt run` and selecting `StreamingStateApp` (also restart the producer program if necessary).

After around 10 seconds, you will start to see the first set of state output. If we wait another 10 seconds, we will see the next set of output, with the overall global state being updated:

    **...**
    **-------------------------------------------**
    **Time: 1416080440000 ms**
    **-------------------------------------------**
    **(Janet,(2,10.98))**
    **(Frank,(1,5.49))**
    **(James,(2,12.98))**
    **(Malinda,(1,9.99))**
    **(Elaine,(3,29.97))**
    **(Gary,(2,12.98))**
    **(Miguel,(3,20.47))**
    **(Saul,(1,5.49))**
    **(Manuela,(2,18.939999999999998))**
    **(Eric,(2,18.939999999999998))**
    **...**
    **-------------------------------------------**
    **Time: 1416080450000 ms**
    **-------------------------------------------**
    **(Janet,(6,34.94))**
    **(Juan,(4,33.92))**
    **(Frank,(2,14.44))**
    **(James,(7,48.93000000000001))**
    **(Malinda,(1,9.99))**
    **(Elaine,(7,61.89))**
    **(Gary,(4,28.46))**
    **(Michael,(1,8.95))**
    **(Richard,(2,16.439999999999998))**
    **(Miguel,(5,35.95))**
    **...**

We can see that the number of purchases and revenue totals for each user are updated with each batch of data.
### Tip

Now, see if you can adapt this example to use Spark Streaming's `window` functions. For example, you can compute similar statistics per user over the past minute, sliding every 30 seconds.

# Online learning with Spark Streaming

As we have seen, Spark Streaming makes it easy to work with data streams in a way that should be familiar to us from working with RDDs. Using Spark's stream processing primitives combined with the online learning capabilities of MLlib's SGD-based methods, we can create real-time machine learning models that we can update on new data in the stream as it arrives.

## Streaming regression

Spark provides a built-in streaming machine learning model in the `StreamingLinearAlgorithm` class. Currently, only a linear regression implementation is available--`StreamingLinearRegressionWithSGD`--but future versions will include classification.

The streaming regression model provides two methods for usage:

  * `trainOn`: This takes `DStream[LabeledPoint]` as its argument. This tells the model to train on every batch in the input DStream. It can be called multiple times to train on different streams.
  * `predictOn`: This also takes `DStream[LabeledPoint]`. This tells the model to make predictions on the input DStream, returning a new `DStream[Double]` that contains the model predictions.

Under the hood, the streaming regression model uses `foreachRDD` and `map` to accomplish this. It also updates the model variable after each batch and exposes the latest trained model, which allows us to use this model in other applications or save it to an external location.

The streaming regression model can be configured with parameters for the step size and number of iterations in the same way as standard batch regression--the model class used is the same. We can also set the initial model weight vector.

When we first start training a model, we can set the initial weights to a zero vector or a random vector, or perhaps load the latest model from the result of an offline batch process. We can also decide to save the model periodically to an external system and use the latest model state as the starting point (for example, in the case of a restart after a node or application failure).

## A simple streaming regression program

To illustrate the use of streaming regression, we will create a simple example similar to the preceding one, which uses simulated data. We will write a producer program that generates random feature vectors and target variables, given a fixed, known weight vector, and writes each training example to a network stream. Our consumer application will run a streaming regression model, training and then testing on our simulated data stream. Our first example consumer will simply print its predictions to the console.

### Creating a streaming data producer

The data producer operates in a manner similar to our product event producer example. Recall from Chapter 5, _Building a Classification Model with Spark_, that a linear model is a linear combination (or vector dot product) of a weight vector, _w_, and a feature vector, _x_ (that is, _w_ᵀ _x_). Our producer will generate synthetic data using a fixed, known weight vector and randomly generated feature vectors. This data fits the linear model formulation exactly, so we will expect our regression model to learn the true weight vector fairly easily.
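Concretely, each training example emitted by the producer is generated as:

$$y = \mathbf{w}^\top \mathbf{x} + b$$

where $\mathbf{w}$ is the fixed, randomly generated weight vector, $\mathbf{x}$ is a random feature vector, and $b$ is the fixed intercept. This is exactly the computation performed in the producer code that follows.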
First, we will set up a maximum number of events per second (say, 100) and the number of features in our feature vector (also 100 in this example):

    /**
     * A producer application that generates random linear regression data.
     */
    object StreamingModelProducer {
      import breeze.linalg._

      def main(args: Array[String]) {

        // Maximum number of events per second
        val MaxEvents = 100
        val NumFeatures = 100

        val random = new Random()

The `generateRandomArray` function creates an array of the specified size, where the entries are randomly generated from a standard normal distribution. We will use this function initially to generate our known weight vector, `w`, which will be fixed throughout the life of the producer. We will also create a random `intercept` value, which will likewise be fixed. The weight vector and `intercept` will be used to generate each data point in our stream:

        /** Function to generate a normally distributed dense vector */
        def generateRandomArray(n: Int) = Array.tabulate(n)(_ => random.nextGaussian())

        // Generate a fixed random model weight vector
        val w = new DenseVector(generateRandomArray(NumFeatures))
        val intercept = random.nextGaussian() * 10

We will also need a function to generate a specified number of random data points. Each event is made up of a random feature vector and the target, which we get by computing the dot product of our known weight vector with the random feature vector and adding the `intercept` value:

        /** Generate a number of random data events */
        def generateNoisyData(n: Int) = {
          (1 to n).map { i =>
            val x = new DenseVector(generateRandomArray(NumFeatures))
            val y: Double = w.dot(x)
            // despite the name, no random noise is added here; the target is a
            // deterministic function of the feature vector
            val noisy = y + intercept
            (noisy, x)
          }
        }

Finally, we will use code similar to our previous producer to instantiate a network connection and send a random number of data points (between 0 and 99) in text format over the network each second:

        // create a network producer
        val listener = new ServerSocket(9999)
        println("Listening on port: 9999")

        while (true) {
          val socket = listener.accept()
          new Thread() {
            override def run = {
              println("Got client connected from: " + socket.getInetAddress)
              val out = new PrintWriter(socket.getOutputStream(), true)

              while (true) {
                Thread.sleep(1000)
                val num = random.nextInt(MaxEvents)
                val data = generateNoisyData(num)
                data.foreach { case (y, x) =>
                  val xStr = x.data.mkString(",")
                  val eventStr = s"$y\t$xStr"
                  out.write(eventStr)
                  out.write("\n")
                }
                out.flush()
                println(s"Created $num events...")
              }
              socket.close()
            }
          }.start()
        }
      }
    }

You can start the producer using `sbt run`, followed by choosing to execute the `StreamingModelProducer` main method. This should result in the following output, thus indicating that the producer program is waiting for connections from our streaming regression application:

    **[info] Running StreamingModelProducer**
    **Listening on port: 9999**

### Creating a streaming regression model

In the next step in our example, we will create a streaming regression program.
The basic layout and setup are the same as in our previous streaming analytics examples:

    /**
     * A simple streaming linear regression that prints out the predicted value for each batch
     */
    object SimpleStreamingModel {

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

Here, we will set the number of features to match that of the records in our input data stream. We will then create a zero vector to use as the initial weight vector of our streaming regression model. Finally, we will select the number of iterations and the step size:

        val NumFeatures = 100
        val zeroVector = DenseVector.zeros[Double](NumFeatures)
        val model = new StreamingLinearRegressionWithSGD()
          .setInitialWeights(Vectors.dense(zeroVector.data))
          .setNumIterations(1)
          .setStepSize(0.01)

Next, we will again use the `map` function to transform the input DStream, where each record is a string representation of our input data, into a `LabeledPoint` instance that contains the target value and feature vector:

        // create a stream of labeled points
        val labeledStream = stream.map { event =>
          val split = event.split("\t")
          val y = split(0).toDouble
          val features = split(1).split(",").map(_.toDouble)
          LabeledPoint(label = y, features = Vectors.dense(features))
        }

The final step is to tell the model to train and test on our transformed DStream and also to print out the first few elements of each batch in the DStream of predicted values:

        // train and test the model on the stream, and print predictions
        // for illustrative purposes
        model.trainOn(labeledStream)
        model.predictOn(labeledStream).print()

        ssc.start()
        ssc.awaitTermination()

      }
    }

### Tip

Note that because we are using the same MLlib model classes for streaming as we did for batch processing, we can, if we choose, perform multiple iterations over the training data in each batch (which is just an RDD of `LabeledPoint` instances).

Here, we will set the number of iterations to `1` to simulate purely online learning. In practice, you can set the number of iterations higher, but note that the training time per batch will go up. If the training time per batch is much higher than the batch interval, the streaming model will start to lag behind the velocity of the data stream.

This can be handled by decreasing the number of iterations, increasing the batch interval, or increasing the parallelism of our streaming program by adding more Spark workers.

Now, we're ready to run `SimpleStreamingModel` in our second terminal window using `sbt run` in the same way as we did for the producer (remember to select the correct main method for SBT to execute).
Once the streaming program starts running, you should see the following output in the producer console:

    **Got client connected from: /127.0.0.1**
    **...**
    **Created 10 events...**
    **Created 83 events...**
    **Created 75 events...**
    **...**

After about 10 seconds, you should start seeing the model predictions being printed to the streaming application console, similar to those shown here:

    **14/11/16 14:54:00 INFO StreamingLinearRegressionWithSGD: Model updated at time 1416142440000 ms**
    **14/11/16 14:54:00 INFO StreamingLinearRegressionWithSGD: Current model: weights, [0.05160959387864821,0.05122747155689144,-0.17224086785756998,0.05822993392274008,0.07848094246845688,-0.1298315806501979,0.006059323642394124, ...**
    **...**
    **14/11/16 14:54:00 INFO JobScheduler: Finished job streaming job 1416142440000 ms.0 from job set of time 1416142440000 ms**
    **14/11/16 14:54:00 INFO JobScheduler: Starting job streaming job 1416142440000 ms.1 from job set of time 1416142440000 ms**
    **14/11/16 14:54:00 INFO SparkContext: Starting job: take at DStream.scala:608**
    **14/11/16 14:54:00 INFO DAGScheduler: Got job 3 (take at DStream.scala:608) with 1 output partitions (allowLocal=true)**
    **14/11/16 14:54:00 INFO DAGScheduler: Final stage: Stage 3(take at DStream.scala:608)**
    **14/11/16 14:54:00 INFO DAGScheduler: Parents of final stage: List()**
    **14/11/16 14:54:00 INFO DAGScheduler: Missing parents: List()**
    **14/11/16 14:54:00 INFO DAGScheduler: Computing the requested partition locally**
    **14/11/16 14:54:00 INFO SparkContext: Job finished: take at DStream.scala:608, took 0.014064 s**
    **-------------------------------------------**
    **Time: 1416142440000 ms**
    **-------------------------------------------**
    **-2.0851430248312526**
    **4.609405228401022**
    **2.817934589675725**
    **3.3526557917118813**
    **4.624236379848475**
    **-2.3509098272485156**
    **-0.7228551577759544**
    **2.914231548990703**
    **0.896926579927631**
    **1.1968162940541283**
    **...**

Congratulations! You've created your first streaming online learning model!

You can shut down the streaming application (and, optionally, the producer) by pressing _Ctrl_ + _C_ in each terminal window.

## Streaming K-means

MLlib also includes a streaming version of K-means clustering; this is called `StreamingKMeans`. This model is an extension of the mini-batch K-means algorithm, where the model is updated with each batch based on a combination of the cluster centers computed from previous batches and the cluster centers computed for the current batch.

`StreamingKMeans` supports a _forgetfulness_ parameter _alpha_ (set using the `setDecayFactor` method); this controls how aggressively the model gives weight to newer data. An alpha value of `0` means the model will only use new data, while with an alpha value of `1`, all data since the beginning of the streaming application will be used.

We will not cover streaming K-means further here (the Spark documentation contains further detail and an example). However, perhaps you could try to adapt the preceding streaming regression data producer to generate input data for a `StreamingKMeans` model. You could also adapt the streaming regression application to use `StreamingKMeans`.

You can create the clustering data producer by first selecting a number of clusters, _K_, and then generating each data point by:

  * Randomly selecting a cluster index.
  * Generating a random vector using specific normal distribution parameters for each cluster. That is, each of the _K_ clusters will have a mean and variance parameter, from which the random vectors will be generated using an approach similar to our preceding `generateRandomArray` function.

In this way, each data point that belongs to the same cluster will be drawn from the same distribution, so our streaming clustering model should be able to learn the correct cluster centers over time (a sketch of such a generator is shown below).
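As a starting point for this exercise, the data-generation function might look something like the following sketch (reusing the `random`, `generateRandomArray`, and Breeze `DenseVector` definitions from our `StreamingModelProducer`; the cluster count, dimensionality, and spread are illustrative):

    // fix K cluster centres up front; the means are spread out by a factor of 10
    val NumClusters = 3
    val NumFeatures = 2
    val centers = Array.fill(NumClusters)(generateRandomArray(NumFeatures).map(_ * 10.0))

    /** Generate a point by picking a random cluster index and adding
      * unit-variance Gaussian noise around that cluster's centre */
    def generateClusterPoint = {
      val k = random.nextInt(NumClusters)
      val point = centers(k).map(_ + random.nextGaussian())
      (k, new DenseVector(point))
    }

Each generated point could then be written over the socket in the same comma-separated format we used earlier and parsed into MLlib vectors on the consumer side for the model's `trainOn` method.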
# Online model evaluation

Combining machine learning with Spark Streaming has many potential applications and use cases, including keeping a model or set of models up to date on new training data as it arrives, thus enabling them to adapt quickly to changing situations or contexts.

Another useful application is to track and compare the performance of multiple models in an online manner and, possibly, also perform model selection in real time so that the best-performing model is always used to generate predictions on live data.

This can be used to do real-time "A/B testing" of models, or it can be combined with more advanced online selection and learning techniques, such as Bayesian update approaches and bandit algorithms. It can also be used simply to monitor model performance in real time, making it possible to respond or adapt if performance degrades for some reason.

In this section, we will walk through a simple extension to our streaming regression example. In this example, we will compare the evolving error rate of two models with different parameters as they see more and more data in our input stream.

## Comparing model performance with Spark Streaming

As we used a known weight vector and intercept to generate the training data in our producer application, we expect our model to eventually learn this underlying weight vector (in the absence of random noise, which we do not add for this example).

Therefore, we should see the model's error rate decrease over time as it sees more and more data. We can also use standard regression error metrics to compare the performance of multiple models.

In this example, we will create two models with different learning rates, training them both on the same data stream. We will then make predictions for each model and measure the **mean-squared error** (**MSE**) and **root mean-squared error** (**RMSE**) metrics for each batch.
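For reference, over a batch of $n$ examples with predictions $\hat{y}_i$ and true targets $y_i$, these metrics are defined as:

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right)^2, \qquad \mathrm{RMSE} = \sqrt{\mathrm{MSE}}$$

This is exactly what we will compute per batch in the `foreachRDD` block that follows: the mean of the squared prediction errors, followed by its square root.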
Our new monitored streaming model code is shown here:

    /**
     * A streaming regression app that compares the performance of two models, printing out metrics for
     * each batch
     */
    object MonitoringStreamingModel {
      import org.apache.spark.SparkContext._

      def main(args: Array[String]) {

        val ssc = new StreamingContext("local[2]", "First Streaming App", Seconds(10))
        val stream = ssc.socketTextStream("localhost", 9999)

        val NumFeatures = 100
        val zeroVector = DenseVector.zeros[Double](NumFeatures)
        val model1 = new StreamingLinearRegressionWithSGD()
          .setInitialWeights(Vectors.dense(zeroVector.data))
          .setNumIterations(1)
          .setStepSize(0.01)

        val model2 = new StreamingLinearRegressionWithSGD()
          .setInitialWeights(Vectors.dense(zeroVector.data))
          .setNumIterations(1)
          .setStepSize(1.0)

        // create a stream of labeled points
        val labeledStream = stream.map { event =>
          val split = event.split("\t")
          val y = split(0).toDouble
          val features = split(1).split(",").map(_.toDouble)
          LabeledPoint(label = y, features = Vectors.dense(features))
        }

Note that most of the preceding setup code is the same as in our simple streaming model example. However, we created two instances of `StreamingLinearRegressionWithSGD`: one with a learning rate of `0.01` and one with the learning rate set to `1.0`.

Next, we will train each model on our input stream, and using Spark Streaming's `transform` function, we will create a new DStream that contains the prediction errors for each model:

        // train both models on the same stream
        model1.trainOn(labeledStream)
        model2.trainOn(labeledStream)

        // use transform to create a stream of prediction errors for each model
        val predsAndTrue = labeledStream.transform { rdd =>
          val latest1 = model1.latestModel()
          val latest2 = model2.latestModel()
          rdd.map { point =>
            val pred1 = latest1.predict(point.features)
            val pred2 = latest2.predict(point.features)
            (pred1 - point.label, pred2 - point.label)
          }
        }

Finally, we will use `foreachRDD` to compute the MSE and RMSE metrics for each model and print them to the console:

        // print out the MSE and RMSE metrics for each model per batch
        predsAndTrue.foreachRDD { (rdd, time) =>
          val mse1 = rdd.map { case (err1, err2) => err1 * err1 }.mean()
          val rmse1 = math.sqrt(mse1)
          val mse2 = rdd.map { case (err1, err2) => err2 * err2 }.mean()
          val rmse2 = math.sqrt(mse2)
          println(
            s"""
               |-------------------------------------------
               |Time: $time
               |-------------------------------------------
             """.stripMargin)
          println(s"MSE current batch: Model 1: $mse1; Model 2: $mse2")
          println(s"RMSE current batch: Model 1: $rmse1; Model 2: $rmse2")
          println("...\n")
        }

        ssc.start()
        ssc.awaitTermination()

      }
    }

If you terminated the producer earlier, start it again by executing `sbt run` and selecting `StreamingModelProducer`. Once the producer is running again, in your second terminal window, execute `sbt run` and choose the main class for `MonitoringStreamingModel`.
You should see the streaming program start up, and after about 10 seconds, the first batch will be processed, printing output similar to the following:

    **...**
    **14/11/16 14:56:11 INFO SparkContext: Job finished: mean at StreamingModel.scala:159, took 0.09122 s**

    **-------------------------------------------**
    **Time: 1416142570000 ms**
    **-------------------------------------------**

    **MSE current batch: Model 1: 97.9475827857361; Model 2: 97.9475827857361**
    **RMSE current batch: Model 1: 9.896847113385965; Model 2: 9.896847113385965**
    **...**

Since both models start with the same initial weight vector, we see that they both make the same predictions on this first batch and, therefore, have the same error.

If we leave the streaming program running for a few minutes, we should eventually see that one of the models has started converging, leading to a lower and lower error, while the other model has begun to diverge due to its overly high learning rate:

    **...**
    **14/11/16 14:57:30 INFO SparkContext: Job finished: mean at StreamingModel.scala:159, took 0.069175 s**

    **-------------------------------------------**
    **Time: 1416142650000 ms**
    **-------------------------------------------**

    **MSE current batch: Model 1: 75.54543031658632; Model 2: 10318.213926882852**
    **RMSE current batch: Model 1: 8.691687426304878; Model 2: 101.57860959317593**
    **...**

If you leave the program running for a number of minutes, you should eventually see the first model's error rate getting quite small:

    **...**
    **14/11/16 17:27:00 INFO SparkContext: Job finished: mean at StreamingModel.scala:159, took 0.037856 s**

    **-------------------------------------------**
    **Time: 1416151620000 ms**
    **-------------------------------------------**

    **MSE current batch: Model 1: 6.551475362521364; Model 2: 1.057088005456417E26**
    **RMSE current batch: Model 1: 2.559584998104451; Model 2: 1.0281478519436867E13**
    **...**

### Tip

Note again that due to the random data generation, you might see different results, but the overall outcome should be the same--in the first batch, the models will have the same error, and subsequently, the first model's error should become smaller and smaller.

# Summary

In this chapter, we connected some of the dots between online machine learning and streaming data analysis. We introduced the Spark Streaming library and API for continuous processing of data streams based on familiar RDD functionality, and we worked through examples of streaming analytics applications that illustrate this functionality.

Finally, we used MLlib's streaming regression model in a streaming application that involves computing and comparing model performance on a stream of input feature vectors.
the 20 Newsgroups dataset + * document similarity, used with / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * text classifier, training on / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * Word2Vec models, used on / Word2Vec on the 20 Newsgroups dataset + * natural language processing (NLP) + * about / Extracting the right features from your data + * naïve Bayes model / The naïve Bayes model, The naïve Bayes model + * nominal variables + * about / Categorical features + * nonword characters / Improving our tokenization + * normalization + * normalize a feature / Normalizing features + * normalize a feature vector / Normalizing features + * normalization, LFW dataset / Normalization + * normalization, MovieLens dataset / Normalization + * normalizing features + * about / Normalizing features + * MLlib, used for / Using MLlib for feature normalization + * numerical features / Numerical features + +## O + + * 1-of-k encoding + * about / Categorical features + * online learning / Batch versus real time + * about / Online learning + * online learning, with Spark Streaming + * about / Online learning with Spark Streaming + * streaming regression model / Streaming regression + * streaming regression program / A simple streaming regression program + * K-means, streaming / Streaming K-means + * online machine learning + * URL / Online learning + * online model evaluation + * about / Online model evaluation + * model performance, comparing with Spark Streaming / Comparing model performance with Spark Streaming + * optimization + * about / Linear models + * options, data transformation + * about / Processing and transforming your data + * ordinal variables + * about / Categorical features + * Oryx + * URL / Explicit matrix factorization + * over-fitting and under-fitting + * URL / Regularization + +## P + + * packages + * used, for feature extraction / Using packages for feature extraction + * parameters + * tuning / Improving model performance and tuning parameters, Improving model performance and tuning parameters + * tuning, for clustering models / Tuning parameters for clustering models + * parameter settings impact, for decision tree + * about / The impact of parameter settings for the decision tree + * tree depth / Tree depth + * maximum bins / Maximum bins + * parameter settings impact, for linear models + * about / The impact of parameter settings for linear models + * iterations / Iterations + * step size / Step size + * L2 regularization / L2 regularization + * L1 regularization / L1 regularization + * intercept, using / Intercept + * PCA / Principal Components Analysis + * running, on LFW dataset / Running PCA on the LFW dataset + * and SVD, relationship between / The relationship between PCA and SVD + * performance, classification models + * evaluating / Evaluating the performance of classification models + * accuracy, calculating / Accuracy and prediction error + * prediction error / Accuracy and prediction error + * precision / Precision and recall + * recall / Precision and recall + * ROC curve / ROC curve and AUC + * AUC / ROC curve and AUC + * performance, clustering models + * evaluating / Evaluating the performance of clustering models + * internal evaluation metrics / Internal evaluation metrics + * external evaluation metrics / External evaluation metrics + * performance metrics, computing on MovieLens dataset / Computing performance metrics on the MovieLens dataset + * performance, recommendation models + * evaluating / 
Evaluating the performance of recommendation models + * Mean Squared Error (MSE) / Mean Squared Error + * Mean average precision at K (MAPK) / Mean average precision at K + * built-in evaluation functions, using / Using MLlib's built-in evaluation functions + * performance, regression models + * evaluating / Evaluating the performance of regression models + * MSE / Mean Squared Error and Root Mean Squared Error + * RMSE / Mean Squared Error and Root Mean Squared Error + * MAE / Mean Absolute Error + * Root Mean Squared Log Error / Root Mean Squared Log Error + * R-squared coefficient / The R-squared coefficient + * performance metrics, computing on bike sharing dataset / Computing performance metrics on the bike sharing dataset + * performance metrics + * computing, on bike sharing dataset / Computing performance metrics on the bike sharing dataset + * linear model / Linear model + * decision tree / Decision tree + * computing, on MovieLens dataset / Computing performance metrics on the MovieLens dataset + * personalization / Personalization + * precision, classification models / Precision and recall + * precision-recall (PR) curve / Precision and recall + * Prediction.io + * URL / Explicit matrix factorization + * prediction error, classification models / Accuracy and prediction error + * predictions + * generating, for Kaggle/StumbleUpon evergreen / Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset + * generating, for Kaggle/StumbleUpon evergreen classification dataset / Generating predictions for the Kaggle/StumbleUpon evergreen classification dataset + * making, clustering model used / Making predictions using a clustering model + * predictive modeling + * about / Predictive modeling and analytics + * producer application / The producer application + * pylab + * about / Exploring and visualizing your data + * Python + * Spark program, writing in / The first step to a Spark program in Python + +## R + + * R-squared coefficient / The R-squared coefficient + * rating dataset + * exploring / Exploring the rating dataset + * RDD caching + * URL / Caching RDDs + * RDDs + * about / Resilient Distributed Datasets + * creating / Creating RDDs + * Spark operations / Spark operations + * caching / Caching RDDs + * Readme.txt file + * about / Extracting features from the bike sharing dataset + * variables / Extracting features from the bike sharing dataset + * recall, classification models / Precision and recall + * receiver operating characteristic (ROC) / Evaluating the performance of classification models + * recommendation model + * training / Training the recommendation model, Training the recommendation model + * model, training on MovieLens 100k dataset / Training a model on the MovieLens 100k dataset + * using / Using the recommendation model + * user recommendations / User recommendations + * item recommendations / Item recommendations + * recommendation models + * about / Types of recommendation models + * types / Types of recommendation models + * content-based filtering / Content-based filtering + * collaborative filtering / Collaborative filtering + * recommendations / Personalization + * inspecting / Inspecting the recommendations + * red, blue, and green (RGB) / Extracting facial images as vectors + * regression model + * about / Predictive modeling and analytics + * regression models + * types / Types of regression models + * Least Squares Regression / Least squares regression + * decision trees, for regression / Decision trees for regression + * 
training / Training and using regression models + * using / Training and using regression models + * training, on bike sharing dataset / Training a regression model on the bike sharing dataset + * regularization forms + * SimpleUpdater / Regularization + * SquaredL2Updater / Regularization + * L1Updater / Regularization + * REPL (Read-Eval-Print-Loop) + * about / The Spark shell + * reshaping / Extracting facial images as vectors + * RMSE + * about / Mean Squared Error +/ RMSE and MSE, Mean Squared Error and Root Mean Squared Error + * ROC curve + * URL / ROC curve and AUC + * ROC curve, classification models / ROC curve and AUC + * root mean-squared error (RMSE) / Comparing model performance with Spark Streaming + * Root Mean Squared Log Error / Root Mean Squared Log Error + +## S + + * Scala + * Spark program, writing in / The first step to a Spark program in Scala + * Scala Build Tool (sbt) / The first step to a Spark program in Scala + * similar items + * inspecting / Inspecting the similar items + * singular values + * about / Singular Value Decomposition + * skip-gram model + * about / Word2Vec models + * Spark + * installing / Installing and setting up Spark locally + * setting up / Installing and setting up Spark locally + * running, on Amazon EC2 / Getting Spark running on Amazon EC2 + * Spark clusters + * about / Spark clusters + * URL / Spark clusters + * SparkConf / SparkContext and SparkConf + * SparkContext / SparkContext and SparkConf + * Spark documentation + * URL / Linear models, Decision trees for regression, General transformations + * Spark documentation, for EC2 + * URL / Getting Spark running on Amazon EC2 + * Spark operations / Spark operations + * Spark program + * in Scala / The first step to a Spark program in Scala + * in Java / The first step to a Spark program in Java + * in Python / The first step to a Spark program in Python + * Spark programming guide + * URL / The first step to a Spark program in Python + * Spark Programming Guide + * URL / Broadcast variables and accumulators + * Spark programming model + * about / The Spark programming model + * SparkContext / SparkContext and SparkConf + * SparkConf / SparkContext and SparkConf + * Spark shell / The Spark shell + * RDDs / Resilient Distributed Datasets + * broadcast variable / Broadcast variables and accumulators + * accumulators / Broadcast variables and accumulators + * Spark project documentation website + * URL / Installing and setting up Spark locally + * Spark project website + * URL / Installing and setting up Spark locally + * Spark Quick Start + * URL / The Spark programming model + * Spark shell / The Spark shell + * Spark Streaming + * about / Batch versus real time, An introduction to Spark Streaming + * input sources / Input sources + * transformations / Transformations + * actions / Actions + * window operators / Window operators + * model performance, comparing with / Comparing model performance with Spark Streaming + * Spark Streaming application + * creating / Creating a Spark Streaming application + * producer application / The producer application + * basic streaming application, creating / Creating a basic streaming application + * analytics, streaming / Streaming analytics + * stateful streaming / Stateful streaming + * stateful streaming / Stateful streaming + * stemming + * about / A note about stemming + * URL / A note about stemming + * stochastic gradient descent + * about / Online learning + * Stochastic Gradient Descent (SGD) / Linear models + * stop words + * removing / 
Removing stop words + * streaming data producer + * creating / Creating a streaming data producer + * streaming regression model / Streaming regression + * trainOn method / Streaming regression + * predictOn method / Streaming regression + * creating / Creating a streaming regression model + * streaming regression program + * about / A simple streaming regression program + * streaming data producer, creating / Creating a streaming data producer + * streaming regression model, creating / Creating a streaming regression model + * Stream processing + * about / Stream processing + * Spark Streaming / An introduction to Spark Streaming + * caching, with Spark Streaming / Caching and fault tolerance with Spark Streaming + * fault tolerance, with Spark Streaming / Caching and fault tolerance with Spark Streaming + * supervised learning + * about / Types of machine learning models + * Support Vector Machine (SVM) + * about / Linear models + * SVD + * about / Singular Value Decomposition + * and PCA, relationship between / The relationship between PCA and SVD + +## T + + * targeted marketing + * about / Targeted marketing and customer segmentation + * target variable + * transforming / Transforming the target variable + * training on log-transformed targets, impact / Impact of training on log-transformed targets + * term frequency + * about / Term weighting schemes + * term frequency-inverse document frequency (TF-IDF) + * about / Term weighting schemes + * terms based on frequency + * excluding / Excluding terms based on frequency + * term weighting schemes / Term weighting schemes + * testing loop / Model training and testing loop + * testing set + * creating, to evaluate parameters / Creating training and testing sets to evaluate parameters + * text classifier + * training, on 20 Newsgroups dataset / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * text data + * about / What's so special about text data? 
+ * text features + * about / Text features + * extraction / Simple text feature extraction + * text processing impact + * evaluating / Evaluating the impact of text processing + * raw features, comparing / Comparing raw features with processed TF-IDF features on the 20 Newsgroups dataset + * TF-IDF + * used, for training text classifier / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * TF-IDF features + * extracting, from 20 Newsgroups dataset / Extracting the TF-IDF features from the 20 Newsgroups dataset + * document similarity, used with / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * TF-IDF model + * training / Training a TF-IDF model + * using / Using a TF-IDF model + * document similarity, with 20 Newsgroups dataset / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * document similarity, with TF-IDF features / Document similarity with the 20 Newsgroups dataset and TF-IDF features + * text classifier, training on 20 Newsgroups dataset / Training a text classifier on the 20 Newsgroups dataset using TF-IDF + * TF-IDF weightings + * analyzing / Analyzing the TF-IDF weightings + * timestamps + * transforming, into categorical features / Transforming timestamps into categorical features + * tokenization + * applying / Applying basic tokenization + * improving / Improving our tokenization + * training + * about / Linear models + * training set + * creating, to evaluate parameters / Creating training and testing sets to evaluate parameters + * transformations + * about / Transformations + * state, tracking / Keeping track of state + * general transformations / General transformations + * true positive rate (TPR) / ROC curve and AUC + +## U + + * UCI Machine Learning Repository + * about / Accessing publicly available datasets + * URL / Accessing publicly available datasets + * unsupervised learning + * about / Types of machine learning models + * user dataset + * exploring / Exploring the user dataset + * user recommendations + * about / User recommendations + * movie recommendations, generating / Generating movie recommendations from the MovieLens 100k dataset + +## V + + * variance + * about / Decision trees for regression + * variants, K-means clustering / Variants + * vector + * about / Extracting useful features from your data + * vector space model + * about / Term weighting schemes + +## W + + * whitespace tokenization + * URL / Applying basic tokenization + * window + * about / Window operators + * windowing + * about / Window operators + * window operators / Window operators + * within cluster sum of squared errors (WCSS) / K-means clustering + * Word2Vec models + * about / Word2Vec models + * on 20 Newsgroups dataset / Word2Vec on the 20 Newsgroups dataset + * word stem / A note about stemming + + diff --git a/kag/examples/csqa/builder/data/mastering_vba_for_microsoft_office.txt b/kag/examples/csqa/builder/data/mastering_vba_for_microsoft_office.txt new file mode 100644 index 00000000..088069a6 --- /dev/null +++ b/kag/examples/csqa/builder/data/mastering_vba_for_microsoft_office.txt @@ -0,0 +1,27264 @@ +Mastering VBA for Microsoft Office 2013 + +Table of Contents + +Acknowledgments + +About the Author + +Introduction + +Part 1: Recording Macros and Getting Started with VBA + +Chapter 1: Recording and Running Macros in the Office Applications + +What Is VBA and What Can You Do with It? 
+ +Understanding Macro Basics + +Recording a Macro + +Running a Macro + +Recording a Sample Word Macro + +Recording a Sample Excel Macro + +Specifying How to Trigger an Existing Macro + +Deleting a Macro + +The Bottom Line + +Chapter 2: Getting Started with the Visual Basic Editor + +Opening the Visual Basic Editor + +Using the Visual Basic Editor's Main Windows + +Setting Properties for a Project + +Customizing the Visual Basic Editor + +The Bottom Line + +Chapter 3: Editing Recorded Macros + +Testing a Macro in the Visual Basic Editor + +Editing the Word Macro + +Editing the Excel Macro + +Editing a PowerPoint Macro + +The Bottom Line + +Chapter 4: Creating Code from Scratch in the Visual Basic Editor + +Setting Up the Visual Basic Editor for Creating the Procedures + +Creating a Procedure for Word + +Creating a Procedure for Excel + +Creating a Procedure for PowerPoint + +Creating a Procedure for Access + +The Bottom Line + +Part 2: Learning How to Work with VBA + +Chapter 5: Understanding the Essentials of VBA Syntax + +Getting Ready + +Procedures + +Statements + +Keywords + +Expressions + +Operators + +Variables + +Constants + +Arguments + +Objects + +Collections + +Properties + +Methods + +Events + +The Bottom Line + +Chapter 6: Working with Variables, Constants, and Enumerations + +Working with Variables + +Working with Constants + +Working with Enumerations + +The Bottom Line + +Chapter 7: Using Array Variables + +What Is an Array? + +Declaring an Array + +Storing Values in an Array + +Multidimensional Arrays + +Declaring a Dynamic Array + +Redimensioning an Array + +Returning Information from an Array + +Erasing an Array + +Finding Out Whether a Variable Is an Array + +Finding the Bounds of an Array + +Sorting an Array + +Searching through an Array + +The Bottom Line + +Chapter 8: Finding the Objects, Methods, and Properties You Need + +What Is an Object? + +Working with Collections + +Finding the Objects You Need + +Using Object Variables to Represent Objects + +Team Programming and OOP + +The Bottom Line + +Part 3: Making Decisions and Using Loops and Functions + +Chapter 9: Using Built-in Functions + +What Is a Function? + +Using Functions + +Using Functions to Convert Data + +Using Functions to Manipulate Strings + +Using VBA's Mathematical Functions + +Using VBA's Date and Time Functions + +Using File-Management Functions + +The Bottom Line + +Chapter 10: Creating Your Own Functions + +Components of a Function + +Creating a Function + +Examples of Functions for Any VBA-Enabled Office Application + +Creating a Function for Word + +Creating a Function for Excel + +Creating a Function for PowerPoint + +Creating a Function for Access + +The Bottom Line + +Chapter 11: Making Decisions in Your Code + +How Do You Compare Things in VBA? + +Testing Multiple Conditions by Using Logical Operators + +_Select Case_ Blocks + +The Bottom Line + +Chapter 12: Using Loops to Repeat Actions + +When Should You Use a Loop? + +Understanding the Basics of Loops + +Using _For..._ Loops for Fixed Repetitions + +Using _Do..._ Loops for Variable Numbers of Repetitions + +_While... 
Wend_ Loops + +Nesting Loops + +Avoiding Infinite Loops + +The Bottom Line + +Part 4: Using Message Boxes, Input Boxes, and Dialog Boxes + +Chapter 13: Getting User Input with Message Boxes and Input Boxes + +Opening a Procedure to Work On + +Displaying Status-Bar Messages in Word and Excel + +Message Boxes + +Input Boxes + +Forms: When Message Boxes and Input Boxes Won't Suffice + +The Bottom Line + +Chapter 14: Creating Simple Custom Dialog Boxes + +When Should You Use a Custom Dialog Box? + +Creating a Custom Dialog Box + +Linking a Form to a Procedure + +Retrieving the User's Choices from a Dialog Box + +Examples of Connecting Forms to Procedures + +Using an Application's Built-in Dialog Boxes from VBA + +The Bottom Line + +Chapter 15: Creating Complex Forms + +Creating and Working with Complex Dialog Boxes + +Using Events to Control Forms + +The Bottom Line + +Part 5: Creating Effective Code + +Chapter 16: Building Modular Code and Using Classes + +Creating Modular Code + +Creating and Using Classes + +The Bottom Line + +Chapter 17: Debugging Your Code and Handling Errors + +Principles of Debugging + +The Different Types of Errors + +VBA's Debugging Tools + +Dealing with Infinite Loops + +Dealing with Runtime Errors + +Suppressing Alerts + +Handling User Interrupts in Word, Excel, and Project + +Documenting Your Code + +The Bottom Line + +Chapter 18: Building Well-Behaved Code + +What Is a Well-Behaved Procedure? + +Retaining or Restoring the User Environment + +Leaving the User in the Best Position to Continue Working + +Keeping the User Informed during the Procedure + +Making Sure a Procedure Is Running under Suitable Conditions + +Cleaning Up after a Procedure + +The Bottom Line + +Chapter 19: Securing Your Code with VBA's Security Features + +Understanding How VBA Implements Security + +Signing Your Macro Projects with Digital Signatures + +Choosing a Suitable Level of Security + +Locking Your Code + +The Bottom Line + +Part 6: Programming the Office Applications + +Chapter 20: Understanding the Word Object Model and Key Objects + +Examining the Word Object Model + +Working with the Documents Collection and the Document Object + +Working with the Selection Object + +Creating and Using Ranges + +Manipulating Options + +The Bottom Line + +Chapter 21: Working with Widely Used Objects in Word + +Using Find and Replace via VBA + +Working with Headers, Footers, and Page Numbers + +Working with Sections, Page Setup, Windows, and Views + +Working with Tables + +The Bottom Line + +Chapter 22: Understanding the Excel Object Model and Key Objects + +Getting an Overview of the Excel Object Model + +Understanding Excel's Creatable Objects + +Managing Workbooks + +Working with Worksheets + +Working with the Active Cell or Selection + +Working with Ranges + +Setting Options + +The Bottom Line + +Chapter 23: Working with Widely Used Objects in Excel + +Working with Charts + +Working with Windows + +Working with Find and Replace + +Adding Shapes + +The Bottom Line + +Chapter 24: Understanding the PowerPoint Object Model and Key Objects + +Getting an Overview of the PowerPoint Object Model + +Understanding PowerPoint's Creatable Objects + +Working with Presentations + +Working with Windows and Views + +Working with Slides + +Working with Masters + +The Bottom Line + +Chapter 25: Working with Shapes and Running Slide Shows + +Working with Shapes + +Working with Headers and Footers + +Setting Up and Running a Slide Show + +The Bottom Line + +Chapter 26: Understanding the Outlook 
Object Model and Key Objects + +Getting an Overview of the Outlook Object Model + +Working with the Application Object + +Understanding General Methods for Working with Outlook Objects + +Working with Messages + +Working with Calendar Items + +Working with Tasks and Task Requests + +Searching for Items + +The Bottom Line + +Chapter 27: Working with Events in Outlook + +Working with Application-Level Events + +Working with Item-Level Events + +Understanding Quick Steps + +The Bottom Line + +Chapter 28: Understanding the Access Object Model and Key Objects + +Getting Started with VBA in Access + +Getting an Overview of the Access Object Model + +Understanding Creatable Objects in Access + +Opening and Closing Databases + +Working with the _Screen_ Object + +Using the _DoCmd_ Object to Run Access Commands + +The Bottom Line + +Chapter 29: Manipulating the Data in an Access Database via VBA + +Understanding How to Proceed + +Preparing to Manage the Data in a Database + +Opening a Recordset + +Accessing a Particular Record in a Recordset + +Searching for a Record + +Returning the Fields in a Record + +Editing a Record + +Inserting and Deleting Records + +Closing a Recordset + +Saving a Recordset to the Cloud + +The Bottom Line + +Chapter 30: Accessing One Application from Another Application + +Understanding the Tools Used to Communicate between Applications + +Using Automation to Transfer Information + +Using the _Shell_ Function to Run an Application + +Using Data Objects to Store and Retrieve Information + +Communicating via DDE + +Communicating via _SendKeys_ + +Going beyond VBA + +The Bottom Line + +Chapter 31: Programming the Office 2013 Ribbon + +What Is XML? + +Hiding the Editing Group on the Word Ribbon + +Working with Excel and PowerPoint + +Undoing Ribbon Modifications + +Selecting the Scope of Your Ribbon Customization + +Adding a New Group + +Adding Callbacks + +Adding Attributes + +Using Menus and Lists + +Toggling with a Toggle-Button Control + +Modifying the Ribbon in Access + +Adding a Callback in Access + +What to Look For If Things Go Wrong + +Where to Go from Here + +The Bottom Line + +Appendix: The Bottom Line + +Chapter 1: Recording and Running Macros in the Office Applications + +Chapter 2: Getting Started with the Visual Basic Editor + +Chapter 3: Editing Recorded Macros + +Chapter 4: Creating Code from Scratch in the Visual Basic Editor + +Chapter 5: Understanding the Essentials of VBA Syntax + +Chapter 6: Working with Variables, Constants, and Enumerations + +Chapter 7: Using Array Variables + +Chapter 8: Finding the Objects, Methods, and Properties You Need + +Chapter 9: Using Built-in Functions + +Chapter 10: Creating Your Own Functions + +Chapter 11: Making Decisions in Your Code + +Chapter 12: Using Loops to Repeat Actions + +Chapter 13: Getting User Input with Message Boxes and Input Boxes + +Chapter 14: Creating Simple Custom Dialog Boxes + +Chapter 15: Creating Complex Forms + +Chapter 16: Building Modular Code and Using Classes + +Chapter 17: Debugging Your Code and Handling Errors + +Chapter 18: Building Well-Behaved Code + +Chapter 19: Securing Your Code with VBA's Security Features + +Chapter 20: Understanding the Word Object Model and Key Objects + +Chapter 21: Working with Widely Used Objects in Word + +Chapter 22: Understanding the Excel Object Model and Key Objects + +Chapter 23: Working with Widely Used Objects in Excel + +Chapter 24: Understanding the PowerPoint Object Model and Key Objects + +Chapter 25: Working with Shapes and Running Slide Shows + 
+Chapter 26: Understanding the Outlook Object Model and Key Objects + +Chapter 27: Working with Events in Outlook + +Chapter 28: Understanding the Access Object Model and Key Objects + +Chapter 29: Manipulating the Data in an Access Database via VBA + +Chapter 30: Accessing One Application from Another Application + +Chapter 31: Programming the Office 2013 Ribbon + +Acquisitions Editor: Mariann Barsolo + +Development Editor: David Clark + +Technical Editor: Russ Mullen + +Production Editor: Eric Charbonneau + +Copy Editor: Judy Flynn + +Editorial Manager: Pete Gaughan + +Production Manager: Tim Tate + +Vice President and Executive Group Publisher: Richard Swadley + +Vice President and Publisher: Neil Edde + +Book Designers: Maureen Forys and Judy Fung + +Proofreader: Candace Cunningham + +Indexer: Ted Laux + +Project Coordinator, Cover: Katherine Crocker + +Cover Designer: Ryan Sneed + +Cover Image: ©iStockphoto.com/pic4you + +Copyright © 2013 by John Wiley & Sons, Inc., Indianapolis, Indiana +Published simultaneously in Canada + +ISBN: 978-1-118-69512-8 +ISBN: 978-1-118-75022-3 (ebk.) +ISBN: 978-1-118-78630-7 (ebk.) + +No part of this publication may be reproduced, stored in a retrieval system or transmitted in any form or by any means, electronic, mechanical, photocopying, recording, scanning or otherwise, except as permitted under Sections 107 or 108 of the 1976 United States Copyright Act, without either the prior written permission of the Publisher, or authorization through payment of the appropriate per-copy fee to the Copyright Clearance Center, 222 Rosewood Drive, Danvers, MA 01923, (978) 750-8400, fax (978) 646-8600. Requests to the Publisher for permission should be addressed to the Permissions Department, John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030, (201) 748-6011, fax (201) 748-6008, or online at www.wiley.com/go/permissions. + +Limit of Liability/Disclaimer of Warranty: The publisher and the author make no representations or warranties with respect to the accuracy or completeness of the contents of this work and specifically disclaim all warranties, including without limitation warranties of fitness for a particular purpose. No warranty may be created or extended by sales or promotional materials. The advice and strategies contained herein may not be suitable for every situation. This work is sold with the understanding that the publisher is not engaged in rendering legal, accounting, or other professional services. If professional assistance is required, the services of a competent professional person should be sought. Neither the publisher nor the author shall be liable for damages arising herefrom. The fact that an organization or Web site is referred to in this work as a citation and/or a potential source of further information does not mean that the author or the publisher endorses the information the organization or Web site may provide or recommendations it may make. Further, readers should be aware that Internet Web sites listed in this work may have changed or disappeared between when this work was written and when it is read. + +For general information on our other products and services or to obtain technical support, please contact our Customer Care Department within the U.S. at (877) 762-2974, outside the U.S. at (317) 572-3993 or fax (317) 572-4002. + +Wiley publishes in a variety of print and electronic formats and by print-on-demand. 
Some material included with standard print versions of this book may not be included in e-books or in print-on-demand. If this book refers to media such as a CD or DVD that is not included in the version you purchased, you may download this material at . For more information about Wiley products, visit www.wiley.com. + +**Library of Congress Control Number:** 2013945361 + +TRADEMARKS: Wiley, the Wiley logo, and the Sybex logo are trademarks or registered trademarks of John Wiley & Sons, Inc. and/or its affiliates, in the United States and other countries, and may not be used without written permission. Microsoft is a registered trademark of Microsoft Corporation. All other trademarks are the property of their respective owners. John Wiley & Sons, Inc. is not associated with any product or vendor mentioned in this book. +Dear Reader, + +Thank you for choosing _Mastering VBA for Microsoft Office 2013_. This book is part of a family of premium-quality Sybex books, all of which are written by outstanding authors who combine practical experience with a gift for teaching. + +Sybex was founded in 1976. More than 30 years later, we're still committed to producing consistently exceptional books. With each of our titles, we're working hard to set a new standard for the industry. From the paper we print on to the authors we work with, our goal is to bring you the best books available. + +I hope you see all that reflected in these pages. I'd be very interested to hear your comments and get your feedback on how we're doing. Feel free to let me know what you think about this or any other Sybex book by sending me an email at nedde@wiley.com. If you think you've found a technical error in this book, please visit . Customer feedback is critical to our efforts at Sybex. + +I dedicate this book to my good friend + +Leroy Fincham. +Acknowledgments + +I'd like to thank all the good people at Sybex who contributed to this book. Mariann Barsolo's encouragement made this book possible in the first place, and Pete Gaughan provided thoughtful guidance while launching the project. I am also indebted to development editor David Clark, whose valuable suggestions contributed to this book's tone and organization. Technical editor Russ Mullen carefully checked the book for accuracy and ensured that all the code examples work without any errors. Finally, thanks to Eric Charbonneau, production editor, the book went smoothly through its final stages—author review, design, and assembly. My gratitude also goes to copyeditor Judy Flynn, who, via a very close read, polished this book in many ways; she is truly an exceptional copy editor. Candace Cunningham is also great at her job, and she flagged important issues during her proofreading. +About the Author + +_Mastering VBA for Microsoft Office 2013_ is **Richard Mansfield's** 45th book. His recent titles include _CSS Web Design for Dummies_ (Wiley), _Office Application Development All-in-One Desk Reference for Dummies_ (Wiley), _How to Do Everything with Second Life_ (McGraw-Hill), and _Programming: A Beginner's Guide_ (McGraw-Hill). Overall, his books have sold more than 500,000 copies worldwide and have been translated into 12 languages. +Introduction + +Visual Basic for Applications (VBA) is a powerful tool that enables you to automate tasks in Microsoft Office applications. + +Automating can save you and your colleagues considerable time and effort. 
Getting more work done in less time is usually good for your self-esteem, and it can do wonderful things for your job security and your career. + +# Where to Get This Book's Example Code + +Throughout this book you'll find many code (programming) examples. Rather than type in the code, you'll save yourself time (and typo-debugging headaches) if you just copy the code from this book's web page, then paste it into the Visual Basic Editor. You can find all the code from this book—accurate, fully tested, and bug-free—at this book's web page: + +www.sybex.com/go/masteringvbaoffice2013 + +# If You Have Questions + +I'm happy to hear from readers, so if you have any difficulty while using this book, write me at earth@triad.rr.com. + +I'll try to respond the same day. We've all been beginners at some point, so don't feel your question is silly. If you're embarrassed, sign your email _Connie_ and I'll think you're Connie. + +# What Can I Do with VBA? + +You can use VBA to automate almost any action that you can perform interactively (manually) with an Office 2013 application. For example, in Word, VBA can create a document, add text to it, format it, edit it, and save it. + +In Excel, you can automatically integrate data from multiple workbooks into a single workbook. PowerPoint's VBA can create a custom presentation, including the latest data drawn from a variety of sources with no human intervention. And in Access you can create new tables, populate them with data, and send the table up to the cloud. + +VBA performs actions faster, more accurately, more reliably, and far more cheaply than any human. You can specify conditions for making a decision, then let VBA make those decisions for you in the future. By adding decision-making structures and loops (repetitions) to your code, you can go far beyond the range of actions that any human user can perform and finish the job in less than a second. + +Beyond automating actions you would otherwise perform manually, VBA gives you the tools to create user interfaces for your code—message boxes, input boxes, and _user forms_ —windows containing graphical objects that you can use to create forms and custom dialog boxes to display to the user. + +Using VBA, you can also create custom applications that run within the host application. For example, you could build within PowerPoint a custom application that automatically creates presentations for you. + +VBA can also communicate between applications. For example, Word can't do much in the way of mathematical calculations on sets of data: that's Excel's specialty. So, you can make Word start Excel running, perform some calculations, and then put the results into a Word document. Similarly, you could send graphs from Excel to PowerPoint or Outlook. You get the picture. + +Because VBA provides a standard set of tools that differ only in the specializations of the host applications, once you've learned to use VBA in one application, you'll be able to apply that knowledge quickly to using VBA in another application. For example, you might start by learning VBA in order to manipulate Excel and then move on to using your VBA skills with Outlook. You'll need to learn the components particular to Outlook, because they're different from Excel's features, but you'll be up to speed rapidly. It's like shopping. Once you understand the basics, going to a hardware store differs from going to a bookstore only in the particulars. 
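+
+To make that concrete, here's a minimal sketch of the Word scenario just described—create a document, add text to it, format it, and save it. (This listing is only illustrative, not one of the book's examples; the text and the file path are placeholders to replace with your own.)
+
+ Sub CreateAndSaveDocument()
+     'Create a new document, add text, format it, and save it
+     Dim doc As Document
+     Set doc = Documents.Add
+     doc.Range.Text = "Quarterly summary" 'placeholder text
+     doc.Range.Font.Bold = True
+     doc.SaveAs2 FileName:="C:\Temp\Summary.docx" 'placeholder path
+ End Sub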
+ +As with any programming language, getting started with VBA involves a learning curve—but you'll be surprised how many tools VBA provides to help you quickly learn the fundamentals. + +The VBA Editor is among the best programming environments available. It includes help features that list programming options while you're typing, that instantly point out problems (and suggest solutions), that prevent you from making some kinds of mistakes, that offer context-sensitive help (with example programming), that even automatically complete your lines (sentences) of programming code. + +What's more, you can create some kinds of VBA programs without even writing a single line of code! You use the Macro Recorder tool built into Word and Excel—a great way to learn VBA more quickly. You turn on the Recorder and do what you want with Word or Excel manually via keyboard and mouse while the Recorder translates all your actions into programming code for you. Can't remember the programming code for saving a document? Just turn on the Recorder (click the icon on the lower left of Word's or Excel's status bar), save a document, then you've got the code it recorded: + + ActiveDocument.Save + +Another truly cool thing about VBA: Its words—most of the programming commands that make the language do what you want—are English words. Unlike less efficient programming languages, Basic strives to be human-friendly, understandable, readable. The programming code that saves Word's current document is ActiveDocument.Save. For Excel, you use ActiveWorkbook.Save. + +For fun, search "save a document in c++" in Google, and you'll find lots of puzzling explanations attempting to accomplish this straightforward task in unfortunately unstraightforward ways, using often-puzzling diction. If you've tried programming in other languages, you'll find the simplicity and plain English of VBA a great relief. It's easy to learn, easy to use, yet no less powerful than any other programming language. + +This book uses the Macro Recorder as the jumping-off point for you to start creating code. You first explore how to record macros (small programs) and then learn to edit this recorded code to make it do other things. After that easy introduction, you go on to explore the essentials of VBA diction and syntax. The book concludes with ambitious topics. + +Word, because it's the most popular Office application and because it has the most sophisticated and efficient programming tools, is used for many of the examples in this book. But there are plenty of examples showing how to program Excel, PowerPoint, Outlook, and even Access. And code that works in one Office 2013 application will generally work with other applications in the suite—with little or sometimes no modification. + +# What's in This Book? + +This book teaches you how to use VBA to automate your work in Office 2013 applications. For its general examples, the book focuses on Word, Excel, Outlook, and PowerPoint, because those are the Microsoft Office applications that you're most likely to have, and because they have less eccentric programming tools and strategies than Access. The last part of the book continues the discussion of how to program these four applications, but also increases coverage of Access. + +Part 1 of the book, "Recording Macros and Getting Started with VBA," comprises the following chapters: + + * Chapter 1 shows you how to record a macro using the Macro Recorder in Word and Excel. You also learn several ways to run macros and how to delete them. 
+ * Chapter 2 introduces you to the powerful VBA Editor, the application in which you create VBA code (either by editing recorded code or by writing code from scratch) and user forms. The second half of this chapter discusses how you can customize the Visual Basic Editor so that you can work in it more efficiently. + * Chapter 3 shows you how to edit recorded macros, using the macros you recorded in Chapter 1. You learn how to step through and test a macro in the Visual Basic Editor. + * Chapter 4 teaches you how to start writing code from scratch in the Visual Basic Editor. You create a procedure (a small program called a macro) for Word, one for Excel, and a third for PowerPoint. + +Part 2, "Learning How to Work with VBA," contains the following chapters: + + * Chapter 5 explains the essentials of VBA syntax, giving you a brief overview of the concepts you need to know. You also practice creating statements in the Visual Basic Editor. + * Chapter 6 shows you how to work with variables and constants, which are used to store information for your procedures to work on. + * Chapter 7 discusses how to use arrays. Arrays are like super-variables that can store multiple pieces of information at the same time. + * Chapter 8 teaches you how to find the objects you need to create your procedures. You learn how to correctly write code involving objects by employing the Macro Recorder, the Object Browser, and the Help system. And you see how to use object variables to represent objects. Finally, you explore the uses of object models. + +Part 3, "Making Decisions and Using Loops and Functions," consists of the following chapters: + + * Chapter 9 describes how to use VBA's built-in functions—everything from string-conversion functions through mathematical and date functions to file-management functions. + * Chapter 10 shows you how to create functions of your own to supplement the built-in libraries of functions. You create functions that work in any VBA-enabled application, together with application-specific functions for Word, Excel, and PowerPoint. + * Chapter 11 shows you how to use conditional statements (such as If statements) to make decisions in your code. Conditional statements are key to making your code flexible and intelligent. + * Chapter 12 covers how you can use loops to repeat actions in your procedures: fixed-iteration loops for fixed numbers of repetitions, and indefinite loops that repeat until they satisfy a condition you specify. You also learn how to avoid creating infinite loops, which can cause your code to run either forever or until your computer crashes. + +Part 4, "Using Message Boxes, Input Boxes, and Dialog Boxes," has the following chapters: + + * Chapter 13 shows you how to use message boxes to communicate with the users of your procedures and let them make simple decisions about how the procedures run. You also explore input boxes, which are dialog boxes that give the users a way to supply information the procedures need. + * Chapter 14 discusses how to employ VBA's user forms to create custom dialog boxes that enable the users to supply information, make choices, and otherwise interact with your macros. + * Chapter 15 discusses how to build more-complex dialog boxes. These include dynamic dialog boxes that update themselves when the user clicks a button, dialog boxes with hidden zones that the user can reveal to access infrequently used options, dialog boxes with multiple pages of information, and dialog boxes with controls that respond to actions the user takes. 
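+
+As a small taste of what Part 4 covers: Chapter 13's message boxes and input boxes revolve around just two built-in functions, MsgBox and InputBox. Here is a minimal sketch (the prompt and greeting are arbitrary placeholders, not code from the chapter):
+
+ Sub GreetUser()
+     'Ask the user for input, then report back in a message box
+     Dim userName As String
+     userName = InputBox("What's your name?")
+     If userName <> "" Then 'InputBox returns "" if the user cancels
+         MsgBox "Hello, " & userName & "!", vbInformation
+     End If
+ End Sub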
+ +Part 5, "Creating Effective Code," contains the following chapters: + + * Chapter 16 illustrates the benefits of reusable modular code rather than single-purpose procedures and then shows you how to write this reusable code. + * Chapter 17 explains the principles of debugging VBA code, examines the different kinds of errors that occur, and discusses how to deal with them. + * Chapter 18 explores how to build well-behaved code that's stable enough to withstand being run under the wrong circumstances and civilized enough to leave the user in the best possible state to continue their work after it finishes running. + * Chapter 19 discusses the security mechanisms that Windows and VBA provide for safeguarding VBA code and ensuring that you or your users do not run malevolent code (viruses, trojans, worms, and so on). The chapter discusses digital certificates and digital signatures, how to choose an appropriate security setting for the application you're using, and how to manage passwords. + +Part 6, "Programming the Office Applications," consists of these 12 chapters: + + * Chapter 20 explains the Word object model and shows you how to work with key objects in Word, including the Document object, the Selection object, and Range objects. You also learn how to set options in Word and manage cloud storage via such systems as Dropbox or Microsoft's SkyDrive. + * Chapter 21 discusses how to work with widely used objects in Word, including the objects for Find and Replace; headers, footers, and page numbers; sections, page setup, windows, and views; and tables. + * Chapter 22 introduces you to the Excel object model and shows you how to work with key objects in Excel, including the Workbook object, the Worksheet object, the ActiveCell object, and Range objects. You also learn how to set options in Excel. + * Chapter 23 shows you how to work with charts, windows, and Find and Replace in Excel via VBA. + * Chapter 24 gets you started working with the PowerPoint object model and the key objects that it contains. You work with Presentation objects, Window objects, Slide objects, and Master objects. + * Chapter 25 teaches you how to go further with VBA in PowerPoint by working with shapes, headers and footers, and the VBA objects that enable you to set up and run a slide show automatically. + * Chapter 26 introduces you to Outlook's object model and the key objects that it contains. You meet Outlook's creatable objects and main interface items; learn general methods for working with Outlook objects; and work with messages, calendar items, tasks and task requests, and searches. + * Chapter 27 shows you how to work with events in Outlook. There are two types of events, application-level events and item-level events, which you can program to respond to both Outlook actions (such as new mail arriving) and user actions (such as creating a new contact). + * Chapter 28 familiarizes you with the Access object model and demonstrates how to perform key tasks with some of its main objects. + * Chapter 29 shows you how to manipulate the data in an Access database via VBA. + * Chapter 30 shows you how to communicate between applications via VBA. You learn which tools are available, how to use Automation, how to work with the Shell function, and how to use data objects, DDE, and SendKeys. + * Chapter 31 explores the various ways you can customize the Ribbon programmatically. It's not possible to customize it by VBA code alone. 
Instead, you must write XML code to modify what the user sees on the Ribbon and write _callbacks_ (event-handler procedures in VBA) to respond when the user clicks one of the buttons or other controls you've added to the Ribbon. You see how to modify tabs, groups, and individual controls—in Word, PowerPoint, Excel, and, using different techniques, in Access. + +# How Should I Use This Book? + +This book tries to present material in a sensible and logical way. To avoid repeating information unnecessarily, the chapters build on each other, so the later chapters generally assume that you've read the earlier chapters. + +The first five parts of the book offer a variety of code samples using Word, Excel, PowerPoint, and, to a lesser extent, Access. If you have these applications (or some of them), work through these examples as far as possible to get the most benefit from them. While you may be able to apply some of the examples directly to your work, mostly you'll find them illustrative of general VBA techniques and principles, and you'll need to customize them to suit your own needs. + +The sixth and last part of this book shows you some more-advanced techniques that are useful when using VBA to program Word, Excel, PowerPoint, Outlook, and Access. Work through the chapters that cover the application or applications that you want to program with VBA. + +Chapters 30 and 31 are specialized, but quite useful. Chapter 30 shows you how to use one application to control another application; for example, you might use Word to contact Excel and exploit its special mathematical or graphing capabilities. And Chapter 31 shows you many different ways to program the Ribbon—the primary user interface in Office 2013 applications. + +# Is This Book Suitable for Me? + +Yes. + +This book is for anyone who wants to learn to use VBA to automate their work in a VBA-enabled application. Automating your work could involve anything from creating a few simple procedures that would enable you to perform some complex and tedious operations via a single keystroke to building a custom application with a complete interface that looks nothing like the host application's regular interface. + +This book attempts to present theoretical material in as practical a context as possible by including lots of examples of the theory in action. For example, when you learn about loops, you execute short procedures that illustrate the use of each kind of loop so that you can see how and why they work and when to use them. And you'll also find many step-throughs—numbered lists that take you through a task, one step at a time. Above all, I've tried to make this book clear and understandable, even to readers who've never written any programming in their life. + +# Conventions Used in This Book + +This book uses several conventions to convey information succinctly: + + * ⇒ designates choosing a command from a menu. For example, "choose File ⇒ Open" means that you should pull down the File menu and choose the Open command from it. + * + signs indicate key combinations. For example, "press Ctrl+Shift+F9" means that you should simultaneously hold down the Ctrl, Shift, and F9 keys. Also, you'll sometimes see this: Press Ctrl+F, I. That means simultaneously press Ctrl and F, then release them and press I. + * Some of these key combinations can be confusing at first (for example, "Ctrl++" means that you hold down Ctrl and press the + key—in other words, hold down Ctrl and Shift together and press the = key, because the + key is the shifted =.). 
 * Likewise, "Shift+click" means that you should hold down the Shift key as you click with the mouse, and "Ctrl+click" means that you should hold down the Ctrl key as you click.
 * ↑→↓← represent the arrow keys on your keyboard. These arrows are also represented in the text as "up-arrow," "down-arrow," etc. The important thing to note is that ← does not mean the Backspace key (which on many keyboards bears a similar arrow). The Backspace key is indicated simply by the words "Backspace" or "the Backspace key."
 * **Boldface** indicates that you are to type something.
 * Program font indicates program items, or text derived from program lines. Complete program lines appear offset in separate paragraphs like the example below, while shorter expressions appear as part of the main text.

    Sub Sample_Listing()
    'lines of program code look like this.
    End Sub

 * _Italics_ usually indicate either new terms being introduced or variable information (such as a drive letter that will vary from computer to computer and for which you'll need to substitute your own).
 * _ (a continuation underline character) indicates that a single line of code has been broken onto a second or subsequent line in the book (because of the limitations of page size). In the VBA Editor, you should enter these "broken" lines of code as a single line. For example, in this code sample, a single line of VBA Editor code has been broken into three lines when printed in this book:

    MsgBox System.PrivateProfileString("", _
    "HKEY_CURRENT_USER\Software\Microsoft\ _
    Office\15.0\Common\AutoCorrect", "Path")

 * You'll also see sidebars throughout the book. These include asides, notes, tips, and warnings. They're a bit like footnotes, though less tedious. Each sidebar, no matter how small, has a headline—so you can quickly see whether you want to read it.
 * Finally, each chapter includes one longer _Real World Scenario_ sidebar: a case study, an important practical technique, or some other useful advice.

# The Mastering Series

The Mastering series from Sybex provides outstanding instruction for readers with intermediate and advanced skills, in the form of top-notch training and development for those already working in their field and clear, serious education for those aspiring to become pros. Every Mastering book includes the following:

 * Real World Scenarios, ranging from case studies to interviews, that show how the tool, technique, or knowledge presented is applied in actual practice
 * Skill-based instruction with chapters organized around real tasks rather than abstract concepts or subjects
 * Self-review test questions so you can be certain you're equipped to do the job right

# For More Information

Sybex strives to keep you supplied with the latest tools and information you need for your work. Please check the website at www.sybex.com/go/masteringvbaoffice2013, where we'll post additional content and updates that supplement this book if the need arises.
Part 1

Recording Macros and Getting Started with VBA

 * **Chapter 1: Recording and Running Macros in the Office Applications**
 * **Chapter 2: Getting Started with the Visual Basic Editor**
 * **Chapter 3: Editing Recorded Macros**
 * **Chapter 4: Creating Code from Scratch in the Visual Basic Editor**

Chapter 1

Recording and Running Macros in the Office Applications

In this chapter, you'll learn the easiest way to get started with Visual Basic for Applications (VBA): recording simple _macros_ using the Macro Recorder that is built into the Office applications. Then you'll see how to run your macros to perform useful tasks.

I'll define the term _macro_ in a moment. For now, just note that by recording macros, you can automate straightforward but tediously repetitive tasks and speed up your regular work. You can also use the Macro Recorder to create VBA code that performs the actions you need and then edit the code to customize it—adding flexibility and power. In fact, VBA is a real powerhouse if you know how to use it. This book shows you how to tap into that power.

In this chapter you will learn to do the following:

 * Record a macro
 * Assign a macro to a button or keyboard shortcut
 * Run a macro
 * Delete a macro

# What Is VBA and What Can You Do with It?

Visual Basic for Applications is a programming language created by Microsoft that can be built into applications. You use VBA to automate operations in applications that support it. All the main Office applications—Word, Excel, Outlook, Access, and PowerPoint—include VBA, so you can automate operations in most Office applications.

And please don't be put off by the notion that you'll be _programming_: As you'll see shortly, working with VBA is nearly always quite easy. In fact, quite often you need not actually write any VBA yourself; you can merely _record_ it—letting the Office application write all the VBA "code." The phrase _automate operations in applications_ is perhaps a bit abstract. VBA allows you to streamline many tasks, avoid burdensome repetition, and improve your efficiency. Here are some examples:

 * You can record a macro that automatically carries out a series of actions that you frequently perform. Let's say that you often edit Word documents written by a co-worker, but she sets the zoom level to 100. You prefer a zoom level of 150. All you need to fix this automatically is this line of VBA code:

    ActiveWindow.ActivePane.View.Zoom.Percentage = 150

And don't worry: you need not even know programming terms like ActiveWindow or View.Zoom. When you turn on the Macro Recorder and then perform these actions (clicking View, then clicking Zoom, then setting the percentage), all your actions are translated into the necessary VBA code. You write no code at all.

 * You can write code that performs actions a certain number of times and that makes decisions depending on the situation in which it is running. For example, you could write code that takes a series of actions on every presentation that's open in PowerPoint.
 * You can have your macros interact with the user by displaying _forms_, or custom dialog boxes, that enable the user to make choices and specify settings while the macro is running. For example, you might display a set of formatting options—showing captioned controls such as check boxes and option buttons—that the user can select. Then when the user closes the dialog box, your macro takes appropriate actions based on the user's input.
 * You can take actions via VBA that you can't take (or take easily) by directly manipulating the user interface. For example, when you're working interactively in most applications, you're limited to working with the active file—the active document in Word, the active workbook in Excel, and so on. By using VBA, you can manipulate files that aren't active.
 * You can make one application manipulate another application. For example, you can make Word place a table from a Word document into an Excel worksheet.

## The Difference between Visual Basic and Visual Basic for Applications

VBA is based on Visual Basic, a programming language derived from BASIC. _BASIC_ stands for Beginner's All-Purpose Symbolic Instruction Code. BASIC is designed to be user-friendly because it employs recognizable English words (or variations on them) rather than the abstruse and incomprehensible programming terms found in languages like COBOL. In addition to its English-like diction, BASIC's designers endeavored to keep its punctuation and syntax as simple and familiar as possible.

Visual Basic is _visual_ in that it offers efficient shortcuts such as drag-and-drop programming techniques and many graphical elements.

Visual Basic for Applications is a version of Visual Basic tailored to the Microsoft Office applications. The set of _objects_ (features and behaviors) available in each application differs because no two applications share the same features and commands.

For example, some VBA objects available in Word are not available in Excel (and vice versa) because some of Word's features, like the Table of Contents generator, are not appropriate in Excel.

However, the large set of primary commands, the fundamental structure, and the core programming techniques of VBA in Word and VBA in Excel are the same. So you'll find that it's often quite easy to translate your knowledge of VBA in Word to VBA in Excel (or indeed in any VBA-enabled application).

For example, you'd use the Save method (a _method_ is essentially an action that can be carried out) to save a file in Excel VBA, Word VBA, or PowerPoint VBA. What differs is the _object_ involved. In Excel VBA, the command would be ActiveWorkbook.Save, whereas in Word VBA it would be ActiveDocument.Save and in PowerPoint it would be ActivePresentation.Save.

VBA always works with a host application (such as Access or Word). With the exception of some stand-alone programs that are usually best created with Visual Studio Tools for Office, a host application always needs to be open for VBA to run. This means that you can't build stand-alone applications with VBA the way you can with Visual Basic .NET or Visual Studio Tools for Office (VSTO). If you wish, you can _hide_ the host application from the user so that all they see is the interface (typically user forms) that you give to your VBA procedures. By doing this, you can create the illusion of a stand-alone application. Whether you need to employ this technique will depend on the type of programming you do.

* * *

What Are Visual Basic .NET and Visual Basic Express?

Visual Basic .NET (VB .NET) is just one version in Microsoft's long history of BASIC language implementations. VB .NET contains a vast set of libraries of prewritten code that allow you to do pretty much anything that Windows is capable of. Although VB .NET is generally employed to write stand-alone applications, you can tap into its libraries from within a VBA macro.
Just remember, each Office application has its own object library, but the .NET libraries themselves contain many additional capabilities (often to manipulate the Windows operating system). So, if you need a capability that you can't find within VBA or an Office application's object library, the resources of the entire .NET library are also available to you. Visual Basic Express is a free version of VB .NET. After you've worked with VBA in this book, you might want to explore VB .NET at

www.microsoft.com/visualstudio/eng/products/visual-studio-express-products

You'll find versions for both traditional desktop Windows as well as Windows 8.

* * *

# Understanding Macro Basics

A _macro_ is a sequence of commands you or a user can repeat at will. That's exactly the definition of a _computer program_. Macros, however, are generally short programs—dedicated to a single task. Think of it like this: A normal computer program, such as Photoshop or Internet Explorer (IE), has many capabilities. IE can prevent pop-up ads, block websites, display full-screen when you press F11, and so on. A macro is smaller, dedicated to accomplishing just one of these tasks, such as displaying full-screen.

In some applications, you can set a macro to run itself automatically. For instance, you might create a macro in Word to automate basic formatting tasks on a type of document you regularly receive incorrectly formatted. As you'll see in Chapter 6, "Working with Variables, Constants, and Enumerations," in a discussion of the AutoExec feature, you can specify that a macro run automatically upon opening a document of that type.

A macro is a type of _subroutine_ (sometimes also called a _subprocedure_). Generally, people tend to use the shorter, more informal terms _sub_, _procedure_, and _routine_. In the Visual Basic Editor, each of your macros starts with the word _Sub_. Note that a macro is a single procedure, whereas a computer program like IE is a collection of many procedures.

A macro used to be defined as recorded code rather than written code, but most people today use the word in its wider sense, so it can include written code as well. For example, if you record a macro and then edit it to make it more efficient, or to add commands to make it take further actions, most people still consider it a macro.

In an Office application that supports the VBA Macro Recorder (such as Word or Excel), you can create macros in two ways:

 * Turn on the Macro Recorder and just perform the sequence of actions you want the macro to perform. Clicks, typing, dragging, dropping—whatever you do is recorded.
 * Open the Visual Basic Editor and type the VBA commands into it.

There's also a useful hybrid approach that combines recording with editing. First record the sequence of actions; then later, in the Visual Basic Editor, you can view and edit your macro. You could delete any unneeded commands. Or type in new commands. Or use the editor's Toolbox feature to drag and drop user-interface elements (such as message boxes and dialog boxes) into your macro so users can make decisions and choose options for how to run it. Macros are marvelously flexible, and the VBA Editor is famously powerful yet easy to use.

Once you've created a macro, you specify how you want the user to trigger it. In most applications, you can assign a macro to the Ribbon, to the Quick Access Toolbar, or to a shortcut key combination.
This makes it very easy to run the macro by merely clicking an icon or pressing a shortcut key (such as Alt+R). You can also optionally assign your macro to a Quick Access Toolbar button or keyboard shortcut when you first record the macro, via a dialog box that automatically appears when you begin a recording. You'll see how all this works shortly. It's simple. (To assign a macro to the Ribbon, first record it, then right-click the Ribbon and choose Customize The Ribbon. Locate and click the Choose Commands From drop-down box, then click the Macros entry to display all your macros.)

# Recording a Macro

The easiest way to create VBA code is to record a macro using the Macro Recorder. Only Word and Excel include a Macro Recorder.

You switch on the Macro Recorder, optionally assign a trigger that will later run the macro (a toolbar button or a shortcut key combination), perform the actions you want in the macro, and then switch off the Macro Recorder. As you perform the actions, the Macro Recorder translates them into commands—_code_—in the VBA programming language.

Once you finish recording the macro, you can view the code in the Visual Basic Editor and change it if you wish. If the code works perfectly as you recorded it, you never have to look at it—you can just run the macro at any time by clicking the toolbar button or pressing the key combination you assigned to the macro.

## Displaying the Developer Tab on the Ribbon

Before going any further, ensure that the Developer (programmer) tab is visible in your Ribbon. This tab is your gateway to macros, VBA, and the VBA Editor. By default, the Office applications do not display this tab. (Access doesn't even _have_ this tab. Word, Excel, PowerPoint, and Outlook do.) To add this tab to your Ribbon, click the File tab, and then click Options. Click Customize Ribbon. In the list box on the right, click Developer to select it. Click the OK button to close the Options dialog box.

In the following sections, you'll look at the stages involved in recording a macro. The process is easy, but you need to be familiar with some background if you haven't recorded macros before. After the general explanations, you'll record example macros in Word and Excel. (Later in the book you'll examine and modify those macros, after you learn how to use the Visual Basic Editor. So don't delete them.)

## Planning the Macro

Before you even start the Macro Recorder, it's sometimes a good idea to do a little planning. Think about what you will do in the macro. In most cases, you can just record a macro and not worry about the context. You can just record it with a document open and some text visible. But in some situations you need to ensure that a special context is set up before you start the recording. For example, you might want to create a macro in Word that does some kind of editing, such as italicizing and underlining a word. To do this, you'll want to first have the blinking "insertion" cursor on a word _that's not italicized or underlined_. You don't want to record the actions of moving the insertion cursor to a particular word. That would make your macro specific to this document and to that particular word. You usually want a macro to work well with more than just one particular document.

Your macro is intended simply to italicize and underline whatever word is currently under the blinking cursor, in any document. Nevertheless, most simple macros can be recorded without any special planning. Just record whatever you want the macro to do.
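To make this concrete, here's a sketch of the kind of procedure such a recording produces in Word (the macro name is illustrative, and the exact code the Recorder generates depends on the actions you perform):

    Sub Italicize_Underline_Word()
    ' Select the word at the insertion point, then toggle italics
    ' and apply a single underline to it.
    Selection.Words(1).Select
    Selection.Font.Italic = wdToggle
    Selection.Font.Underline = wdUnderlineSingle
    End Sub

Because the code refers only to the current Selection, never to a particular document or a particular word, the macro works wherever the insertion point happens to be.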
* * *

Pausing a Macro

Word (but not Excel) lets you pause the Macro Recorder if you need to stop while recording to do something that you do not want to record. This capability allows you to deal with problems you hadn't anticipated when planning the macro—for example, having to open a document that should have been open before you started recording the macro.

* * *

Some macros should perform any necessary setup themselves; the setup then becomes part of the macro. In these cases, don't put the application into the state the macro expects before you start recording. For example, if, to do its job, a macro needs a blank active workbook in Excel, the macro itself should create that blank workbook rather than using whichever workbook happens to be active at the time. This saves a step when the macro runs. So start recording first, and then create the blank workbook, so that the setup action is captured as part of the macro.

* * *

A Warning about Security

Macros are computer programs, albeit usually small ones. You can even tap into all the features of the Windows operating system itself from within a macro. The result is that viruses and other harmful code can be contained within macros (and such code can execute automatically when the user merely opens an infected document, via the AutoExec feature discussed in Chapter 6 and via other techniques, such as employing the application's Startup folder). For example, a virus embedded in a macro could delete files on the hard drive if the user opened an infected Word document. This is obviously dangerous.

Office 2013 applications, not to mention the Windows operating systems, contain multiple layers of security to protect against such viruses and harmful code. Specific to macros is a macro "trust" technology that's built into Office applications. To see or modify these trust settings, open the Trust Center dialog box by clicking the Developer tab on the Ribbon and then clicking the Macro Security icon (in the Code section of the Ribbon) in Word, Excel, Outlook, or PowerPoint. (Access, as is often the case, does things a bit differently than the other Office applications. Access has no Developer tab. To manage macro security in Access, you click the File tab, click the Options link on the left side, click Trust Center, click the Trust Center Settings button, then click Macro Settings.)

The main point here is that you might have to make some adjustments if you can't run macros or if you get mysterious error messages such as "The Macro Could Not Be Created" or "Access is denied." If this happens, your first step should be to look at the Trust Center and choose Disable All Macros With Notification. This setting asks the user for permission to run macros. Or, while you're working with macros in this book, you might want to just select Enable All Macros in the Trust Center. Then deselect this option before closing a document that you worked on in this book. The idea is that you can trust your own macros, but you don't want to trust _all_ macros from _all_ documents you might get from outside sources.

If you are working on a document that you created and it contains macros that you wrote, you can trust that document and agree to activate the macros. However, if you open a document from someone else, you have to be careful.

Additional security issues can be solved by managing the various strata of security that now, out of necessity, are embedded within operating systems and applications.
One way to deal with security issues is to explore the security topics in the Help features of Windows 7 or 8 and the Office applications. You can also sometimes get good answers by posting questions in online user groups or searching expert websites such as _Wikipedia_. Also, you can find a good overview of Office 2013 security here:

Chapter 19, "Securing Your Code with VBA's Security Features," covers Office 2013 security issues in depth.

* * *

## Starting the Macro Recorder

Start the Macro Recorder by clicking the Developer tab on the Ribbon and then clicking the Record Macro button. You can also click the Macro Record button on the status bar at the bottom of the application. (With this approach, you don't have to open the Developer tab. Just click the button on the status bar.)

As soon as you start the Macro Recorder, the Record Macro dialog box opens. You see that this new macro has been given a default macro name (Macro1, Macro2, and so on). You can accept that default name or change it. There's also an optional description to fill in if you wish.

To stop the Macro Recorder, you can click the Stop Recording button in the Developer tab. You can alternatively stop the recording by clicking the square button that appears during recording on the status bar, down on the bottom left of the application's window. Once the Recorder is stopped, the square button is replaced with an icon that you can click to start recording a new macro. In Word for the Mac, click the REC indicator rather than double-clicking it.

The appearance of the Record Macro dialog box varies somewhat from one application to another because the dialog box must offer suitable options to accommodate the varying capabilities particular to each application. In each case, you get to name the macro and add a description of it. In most cases, you can also specify where to save the macro—for example, Word offers two options. For global use (making the macro available to all Word documents), store it in the file named Normal.dotm. Or, if it is merely to be used in the currently active document, choose to store it in that document, which must then be saved as a macro-enabled file with the .docm filename extension. Similarly, an ordinary Word template has a .dotx filename extension, but a template that stores macros has the filename extension .dotm.

Other applications differ somewhat in how the dialog works when you begin recording a macro. For example, Excel allows you three options: to store macros in the current workbook, in a new workbook, or, for use with _all_ Excel workbooks, in the Personal Macro Workbook. That's the equivalent of Word's Normal.dotm file, and Excel's Personal Macro Workbook is saved in a file named Personal.xlsb.

* * *

Where to Store Macros in PowerPoint

You can't record macros in the 2013 version of PowerPoint, but you can create them by writing programming code using the Visual Basic Editor. You can then store macros in the currently active presentation or in any other open presentation or template. PowerPoint also provides a global macro storage container (similar to Word's Normal.dotm file). In PowerPoint, choose the All Open Presentations option in the Macro list box, which is found by clicking the Macros icon in the Code section of the Ribbon's Developer tab.

* * *

The Record Macro dialog box also lets you specify how you want the macro triggered.
Word displays buttons you can click to either open a dialog for entering a shortcut key combination or open the Word Options dialog, where you can create a button for this macro that will appear on the Quick Access Toolbar. Excel limits you to Ctrl+ shortcut key combinations as a way of launching macros, so there is no button to display a full keyboard-shortcut dialog like the one in Word. Excel has only a small text box where you can enter the key that will be paired with Ctrl as the shortcut.

Most of the Microsoft applications that host VBA have the Developer tab, from which you control macro recording, launch the Visual Basic Editor, and otherwise manage macros. Access, however, groups several of its macro-related tools in a Database Tools tab (which is visible by default) and also has a Macro option on its Create tab.

Figure 1.1 shows the Record Macro dialog box for Word with a custom name and description entered. Figure 1.2 shows Word's version of the Developer tab on the Ribbon.

Figure 1.1 In the Record Macro dialog box, enter a name for the macro you're about to record. Type a concise but helpful description in the Description box. This is the Record Macro dialog box for Word.

Figure 1.2 You can use the Developer tab on the Ribbon to work with macros.

Here's what the primary Visual Basic features on the Ribbon's Developer tab (or Access's Database Tools tab) do:

**Run Macro button**

Only Access has this Ribbon button. It displays a Run Macro dialog box, in which you can choose the macro to run. Many aspects of VBA in Access are unique to Access, and Chapter 28, "Understanding the Access Object Model and Key Objects," covers this topic in depth.

**Record Macro button**

Displays the Record Macro dialog box in Word or Excel.

**Macro Security button**

Displays the Trust Center macro settings dialog. You'll examine this in detail in Chapter 19. This button allows you to specify whether and how you want macros enabled.

**Visual Basic button**

Starts or switches to the Visual Basic Editor. You'll begin working in the Visual Basic Editor in Chapter 2, "Getting Started with the Visual Basic Editor" (and you'll spend most of the rest of the book employing it).

**Macros button**

Opens the classic Macros dialog, from which you can run, step into (start the Visual Basic Editor in _Break mode_; more about this in Chapter 3, "Editing Recorded Macros"), edit, create, or delete macros, or open the macro project organizer dialog. (Not all of these options are available in all applications. For example, PowerPoint has no organizer.) Word and Excel have a similar Macros button in the Ribbon's View tab. This button can open the Macros dialog and can also start recording a macro. Note that Break mode is also referred to as Step mode.

**Add-Ins**

This is where you can access templates, styles, and specialized code libraries.

**Controls**

A set of control buttons that, when clicked, insert user-interface components—such as a drop-down list box—into an open document. Similar components can also be added to macros that you create in the VBA Editor. Chapters 14 and 15 explore this user-interface topic.

**Design Mode button**

Toggles between _Design mode_ and _Regular mode_. In Design mode, you can add or edit embedded controls in documents. In Regular mode, you can interact normally with controls (controls can accept information from the user via typing or mouse clicks).
**Properties button**

This button is enabled only if you're in Design mode. It allows you to edit the properties of the document (such as removing personal information).

* * *

The Emergence of XML

XML has become an industry standard for storing and transmitting data. With Office 2007, the Office applications' documents began to employ XML extensively. This switch to XML is the primary reason that documents created in Office 2007, 2010, and 2013 are not compatible with earlier versions of Office, such as Office 2003. Thus, you must _convert_ old Office documents to the newer Office formats. And people still using older versions of Office must install the Microsoft Office Compatibility Pack for Word, Excel, and PowerPoint File Formats. Note that Word 2010 and 2013 document files are saved with a .docx filename extension, the x reflecting the underlying XML format on which Office 2007, 2010, and 2013 rest.

* * *

## Naming the Macro

Next, enter a name for the new macro in the Macro Name text box in the Record Macro dialog box. The name must comply with the following conventions:

 * It must start with a letter; after that, it can contain both letters and numbers.
 * It can be up to 80 characters long.
 * It can contain underscores, which are useful for separating words, such as File_Save.
 * It cannot contain spaces, punctuation, or special characters, such as ! or *.

* * *

Name and Describe Your Macros

Some people insist that to properly manage your set of macros, you must follow some clerical procedures that involve giving your macros descriptive names and also typing in a narrative description of each macro's purpose. They claim that if you create many macros, you should organize them carefully. Recording macros is so easy, and you can create code so quickly, that you can end up with a _pile_ of macros—as Southerners say—making it easy to get confused about which macro does what.

You may be tempted not to assign a macro description when you're in a hurry or when you're playing with different ways to approach a problem and you're not sure which (if any) of your test macros you'll keep. And for simple, obvious code, perhaps using the Macro12, Macro13 default names and not typing in a description isn't a problem. If you find it easy to read VBA code, you can usually just look at a macro and read what it does.

Even so, for more complex macros, and for people who find code hard to read, go ahead and enter a few notes for each macro that you record. Otherwise, you can end up with that pile of recorded macros that have the cryptic default names and no descriptions. To figure out what each macro does and which ones you can safely delete, you'll have to plow through the code—and a recorded macro's code can be surprisingly long, even if the macro does nothing more than adjust a few options in a couple of dialog boxes.

You might also want to employ a macro-naming convention to indicate which are test macros that you can delete without remorse. Start the name with a word like _Temp_, then add numeric values sequentially to keep track of the versions—for example, Scratch (Scratch01, Scratch02, and so on) and Temp (Temp01, Temp02, and so on).

Each new macro you record is by default placed at the bottom of the set of macros in the VBA Editor. You can, however, always open the Visual Basic Editor and rename or add a description anytime you want because macros are fully editable.
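For example, a throwaway macro following this convention might look like the sketch below (the name and comment are illustrative, not required by VBA):

    Sub Temp01()
    ' Scratch macro: trying out an approach to formatting.
    ' Safe to delete once the experiment is done.
    End Sub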
Personally, I like to put a little descriptive note inside more complicated macros' code, right at the top, under the Sub line. It looks like this:

    Sub AltH()
    ' Applies Heading 1 style
    Selection.Style = ActiveDocument.Styles("Heading 1")
    End Sub

Any text following a single-quote symbol (') on a line of code is ignored by VBA. The single quote indicates that what follows is a _comment_ to assist the programmer in understanding the code rather than actual code that should be executed. (VBA would not know what to make of the words Applies Heading 1 style. They are not part of VBA's dictionary.)

Note that if you type a description in the Description field of the Record Macro dialog when you first start recording, that comment is automatically inserted into your code—complete with the single-quote symbol.

Also, my preferred way to name any macros that are triggered by keyboard shortcuts is to use the name of the keyboard shortcut itself. Thus, Sub AltH tells me that this macro is triggered by the Alt+H keyboard shortcut.

But whatever system you adopt, it's generally better to err on the side of more description and commenting within the code rather than too little. It takes only a moment to provide an expressive, meaningful name and a clear description of the purpose of the macro.

* * *

### Invalid Macro Names

Word and Excel, the two Office applications that permit macro recording, raise objections to invalid macro names when you click the OK button to start recording the macro. If you enter an invalid macro name in the Record Macro dialog box, these applications let you know—in their own way. Word displays a brief, rather cursory message, while Excel gives more helpful info. Figure 1.3 shows how these applications respond to an invalid macro name once it's entered.

Figure 1.3 The dialog boxes supplied by Word and Excel showing invalid macro names.

### Describing Your Macros

Type a description for the macro in the Description text box. Recall that this description is to help you (and anyone you share the macro with) identify the macro and understand when to use it. If the macro runs successfully only under particular conditions, you can note them briefly in the Description text box. For example, if the user must make a selection in the document before running the macro in Word, mention that.

You now need to choose where to store the macro. Your choices with Word and Excel are as follows:

**Word**

Recall that in Word, if you want to restrict availability of the macro to just the current template (.dotm file) or document (.docm file), choose that template or document from the Store Macro In drop-down list in the Record Macro dialog box shown in Figure 1.1. If you want the macro to be available no matter which template you're working in, make sure the default setting—All Documents (Normal.dotm)—appears in the Store Macro In combo box. (If you're not clear on what Word's templates are and what they do, see the sidebar "Understanding Word's Normal.dotm, Templates, and Documents" later in this chapter.)

**Excel**

In Excel, you can choose to store the macro in This Workbook (the active workbook), a new workbook, or the Personal Macro Workbook. The Personal Macro Workbook is a special workbook named Personal.xlsb. Excel creates this Personal Macro Workbook the first time you choose to store a macro in it.
By keeping your macros and other customizations in the Personal Macro Workbook, you can make them available to any of your procedures. Recall that the Personal Macro Workbook is similar to Word's global macro storage file, Normal.dotm. If you choose New Workbook, Excel creates a new workbook for you and creates the macro in it.

### Storing Your Macros

Word and Excel automatically store recorded macros in a default location in the specified document, template, workbook, or presentation:

**Word**

Word stores each recorded macro in a _module_ named NewMacros in the selected template or document, so you'll always know where to find a macro after you've recorded it. This can be a bit confusing because there can be multiple NewMacros folders visible in the Project Explorer pane in the Visual Basic Editor. (This happens because there can be more than one project open—such as several documents open simultaneously, each with its own NewMacros folder holding the macros embedded within each document.) Think of NewMacros as merely a holding area for macros—until you move them to another module with a more descriptive name. (Of course, if you create only a handful of macros, you don't need to go to the trouble of creating various special modules to subdivide them into categories. You can just leave everything in a NewMacros module. As always, how clerical you need to be depends on how organized your mind and memory are.)

If a NewMacros module doesn't yet exist, the Macro Recorder creates it. Because it receives each macro recorded into its document or template, a NewMacros module can soon grow large if you record many macros. The NewMacros module in the default global template, Normal.dotm, is especially likely to grow bloated, because it receives each macro you record unless you specify another document or template prior to recording. Some people like to clear out the NewMacros module from time to time, putting the recorded macros they want to keep into other modules and disposing of any useless or temporary recorded macros. I don't have _that_ many macros, so I find no problem simply leaving them within the NewMacros module.

**Excel**

Excel stores each recorded macro for any given session in a new module named Module _n_, where _n_ is the lowest unused number in ascending sequence (Module1, Module2, and so on). Any macros you create in the next session go into a new module with the next available number. If you record macros frequently with Excel, you'll most likely need to consolidate the macros you want to keep so that they're not scattered across many modules like this.

* * *

Understanding Word's Normal.dotm, Templates, and Documents

Word 2007, 2010, and 2013 store data differently than previous versions of Word. For one thing, in Word 2003 you could create custom menus and toolbars that you stored in templates. Later versions of Word do not permit menus, nor do they permit any toolbars other than the Quick Access Toolbar. What's more, customizing that toolbar has a _global_ impact. Custom toolbar buttons are not stored in templates. In other words, any modifications you make to the Quick Access Toolbar will be visible in any Word document, no matter which template(s) is currently active.

Word 2007, 2010, and 2013 feature three kinds of templates:

 * Legacy templates from Word 2003 and earlier versions. These have a .dot filename extension. If you are working with one of these templates, [Compatibility Mode] appears on the Word title bar.
 * Word 2010 and 2013 templates that contain no macros (.dotx filename extension). You can save macros in a document that employs a .dotx template, but they will not be saved within the template.
 * Templates with a .dotm filename extension can contain macros. Recall that because macros written by malicious people can do damage just like a virus, recent versions of Word segregate macros into this special kind of template with a .dotm filename extension. A .dotm template can do anything that a .dotx template can do, but the .dotm template features the additional capability of hosting macros.

Word has a four-layer architecture. Starting from the bottom, these layers are the application itself, the global template (Normal.dotm), the active document's template, and, finally, the active document itself (the text and formatting). Each of the four layers can affect how Word appears and how it behaves, but all four layers are not necessarily active at any given time.

The bottom layer, which is always active, is the Word application itself. This layer contains all the Word objects and built-in commands, such as _Open_. Also always active are objects such as Word's Quick Access Toolbar, the Ribbon, and so on. This layer is the most difficult to picture because usually you don't see it directly. Normal.dotm, the global template, forms the second layer and is also always active.

When you start Word, it loads Normal.dotm automatically, and Normal.dotm stays loaded until you exit Word. (There's a special switch you can use—winword /a—to prevent the macros in Normal.dotm from being active if you need to troubleshoot it. In Windows 8, press the Start key [the Windows key], type **Run** to open the Run dialog, and then enter the command to launch Word in this special way.)

Normal.dotm contains styles (such as the default paragraph style), AutoText entries, formatted AutoCorrect entries, and customizations. These customizations show up in the other layers unless specifically excluded.

Default blank documents (such as the document that Word normally creates when you start it and any document you create by pressing Ctrl+N or by clicking the Ribbon's File tab and then choosing New and Blank Document) are based on Normal.dotm. So when you're working in a default blank document, you see the Word interface as it is specified in Normal.dotm.

The currently active template sits on top of the Word application and Normal.dotm. This template can contain styles, macro modules (if it is a macro-enabled .dotm file type), and settings for the template, along with any boilerplate text needed for this particular type of document. This is the third layer, but it is used only if the current document (or _active document_) is attached to a template other than Normal.dotm.

On top of the current template sits the current document, which contains the text and graphics in the document, its formatting, and its layout. Documents can also contain macro modules and custom keyboard shortcuts, so the document itself can act as a fourth layer. This layer is always present when a document is open, but it has no effect on Word's interface or behavior unless the document contains customizations.

Because these layers might contain conflicting information (such as two different font styles with the same name), there has to be an order of precedence that defines which layer "wins" in any such conflict. Customized settings work from the top layer downward. So customized settings in the active document take precedence over those in the active template.
Likewise, any settings in the current template take precedence over those in any global templates (templates that automatically apply to all Word documents) or add-ins other than Normal.dotm. Customized settings in those global templates or add-ins take precedence over those in Normal.dotm.

As another example, say you have the key combination Ctrl+Shift+K assigned to different actions in Normal.dotm, in a loaded global template, in a document's template, and in the document itself. When you press that key combination, only the procedure assigned in the document runs because that is the topmost layer. If you remove the key-combination assignment from the document, the template then becomes the topmost layer containing a definition of this key combination, so the procedure assigned in the template runs. If you remove the key combination from the template as well, the procedure in the loaded global template runs. Finally, if you remove that template's key combination too, the procedure in Normal.dotm runs.

* * *

## Choosing How to Run a New Macro

Continuing our exploration of the Record Macro dialog box shown in Figure 1.1: at this point, after you've named the macro, typed a description, and chosen where to store it, it's time to choose how to trigger the macro. In other words, which way do you want to _run_ the macro: via a shortcut key or a Quick Access Toolbar button? Good typists generally prefer shortcut keys, but buttons provide at least a visual hint of the macro's purpose, and hovering your mouse over the button also displays the name of the macro.

Shortcut keys and buttons are handy for people who record a moderate number of macros and don't organize them in complex ways—moving them from one module to another. If you create a great number of macros and feel the need to move them into other modules, assigning a shortcut key or button prior to recording becomes less useful. This is because moving a macro from one module to another disconnects any trigger you've assigned for running the macro.

This limitation means that it makes sense to assign a way of running a macro—prior to recording—only if you're planning to use the macro in its recorded form (as opposed to, say, using part of it to create another macro) _and_ from its default location. If you plan to move the macro or rename it, don't assign a way of running it now. Instead, wait until the macro is in its final form and location, and then assign the means of running it. See "Specifying How to Trigger an Existing Macro," later in this chapter, for details.

Personally, I don't have more than a couple dozen macros that I use all the time, so I avoid the complications described in the previous paragraph and the sidebar on managing your macros. Instead, I just add shortcut keys when I first create the macros, and I leave them all in a single version of Normal.dotm. However, if you face more complicated situations—such as managing a big set of macros for a company—you might want to manage your macros with modules.

* * *

Manage Your Macros with Modules

By moving your recorded macros into different modules, you can group related macros so you can compare the code, adjust them, or distribute them easily.

* * *

To assign a way to run the macro, follow the instructions in the next sections.

You don't have to assign a button or keyboard shortcut prior to recording a macro. You can do it later, or at any time.
In Word, Access, Excel, and other Office 2013 applications, you use the Options dialog box to assign a button on the Quick Access Toolbar to a macro. PowerPoint and Access do not permit you to assign keyboard shortcuts to macros, but in applications that do permit this—such as Word and Excel—you use the Customize Keyboard dialog box to assign a shortcut key to a macro. Excel limits you to Ctrl+ or Ctrl+Shift key combinations.

### Running a Macro from the Ribbon

Although it's not available in the Record Macro dialog box, you can add a macro to the Ribbon, like this:

1. Right-click anywhere on the Ribbon.

2. Click Customize The Ribbon on the menu. The Word Options dialog box appears.

3. In the Choose Commands From drop-down list, select Macros.

4. Click a macro's name to select it in the list.

5. In the list of tabs on the right side of the dialog box, click the existing tab where you want to locate your macro.

6. Click the New Group button to create a custom group on that tab.

7. Click the Rename button to give your new group a name.

8. Click OK to close the Rename dialog box.

9. Click the Add button to add your macro.

10. Click the Rename button to give your macro an easily understood name and, optionally, an icon.

11. Click OK to close the Rename dialog box.

12. Click OK to close the Word Options dialog box.

### Running a Macro from the Quick Access Toolbar

Here's how to use the Word Options dialog box to assign a macro to a button on the Quick Access Toolbar:

1. Right-click anywhere on the Quick Access Toolbar (it's the set of icons in the upper-left corner, above the Ribbon), and a menu will appear. (This toolbar will be just below the Ribbon if you've previously selected the Show Quick Access Toolbar Below The Ribbon option from this menu.)

2. Click Customize Quick Access Toolbar on the menu. The Word Options dialog box appears.

3. In the Choose Commands From drop-down list, select Macros.

4. Click a macro's name to select it in the list, as shown in Figure 1.4.

5. Click the Add button to insert this macro's name in the Customize Quick Access Toolbar list, as shown in Figure 1.4.

Figure 1.4 Choose a way to run the macro in Word's Options dialog box.

6. Word adds a button to the toolbar for the macro, giving it the macro's fully qualified name (its location plus its name), such as Normal.NewMacros.CreateDailyReport. This name consists of the name of the template or document in which the macro is stored, the name of the module that contains the macro, and the macro's name, respectively. You don't need all this information displayed when you hover your mouse pointer over the button.

7. So rename the button or menu item: Click the Modify button at the bottom of the Customize Quick Access Toolbar list (see Figure 1.5). Whatever macro is highlighted (currently selected) in the list of toolbar items will be the one you're modifying.

Figure 1.5 Word gives the menu item or toolbar button the full name of the macro. Use this Modify Button dialog to change the name to something shorter and better.

* * *

Macro Button Labels Need Not Match Their Official Names

Notice that a macro's button name (displayed as its tooltip caption when you hover your mouse over it) doesn't have to bear any relation to the macro's actual name as it appears in the Visual Basic Editor or the Macro dialog.

* * *

8.
While you're modifying the macro's name, you might also want to choose a different button icon that visually cues you about the macro's purpose. To do that, just double-click whatever icon you want to use, then click OK.

### Running a Macro via a Shortcut Key Combination

To assign the macro to a key combination, follow these steps:

1. Right-click the Ribbon and choose Customize The Ribbon from the menu that appears. This opens the Word Options dialog.

2. Click the Customize button next to Keyboard Shortcuts in the bottom left of the Word Options dialog box.

3. Scroll down the Categories list box until you see Macros, then click Macros to select it.

4. Click to select the name of the macro you want to assign a shortcut key combination to.

5. Check the Current Keys list box to see if a key combination is already assigned. If it is, you can press the Backspace key to clear the key combination if you wish, or you can employ multiple key combinations to launch the macro.

6. In the Press New Shortcut Key field, type the key combination you want to use to trigger the macro (see Figure 1.6).

7. Check to see if this key combination is already used for another purpose. If so, you can reassign it, or you can choose a different combination by pressing the Backspace key in the Press New Shortcut Key field.

Figure 1.6 Set a shortcut key combination for the macro in the Customize Keyboard dialog box.

8. Be sure to click the Assign button when you're finished. Just closing this dialog does _not_ assign the key combination.

* * *

You Can Postpone Assigning a Shortcut Key Combination

Remember that, as with the other ways of running a macro, you can assign a key combination to run a macro either at the time you record the macro or at any point after you finish recording it. If you intend to move the macro from the NewMacros module to another module, remember that you need not assign the key combination until the macro has reached its ultimate destination.

* * *

A key combination in Word can be any of the following:

 * Alt plus either a function key or a regular key not used as a menu-access key.
 * Ctrl plus a function key or a regular key.
 * Shift plus a function key.
 * Ctrl+Alt, Ctrl+Shift, Alt+Shift, or even Ctrl+Alt+Shift plus a regular key or function key. (Pressing Ctrl+Alt+Shift and another key tends to be too awkward for practical use.)

* * *

Specify Two-Step Key Combinations

You can set up shortcut keys that have two steps—for example, Ctrl+Alt+F, 1 and Ctrl+Alt+F, 2—by pressing the second key (in this case, the 1 or the 2) after pressing the key combination. However, these shortcuts tend to be more trouble than they're worth unless you're assigning literally hundreds of extra shortcut keys.

* * *

### Running a Macro the Old-Fashioned Way

A clumsy, rarely used way to run a macro is via the Developer tab in the Ribbon. To see how this works, follow these steps:

1. Click the Macros icon.

2. Click the name of the macro in the displayed list.

3. Finally, click the Run button.

By the way, you can also run a macro from within the Visual Basic Editor by pressing F5. This is how you test macros while you're editing them.

### Assigning a Way to Run a Macro in Excel

When you're recording a macro, Excel allows you to assign only a Ctrl shortcut key, not a button, to run it.
If you want to assign a Quick Access Toolbar button to the macro, you need to do so _after_ recording the macro (using the Customize feature as described shortly).

To assign a Ctrl shortcut key to run the macro you're recording, follow these steps:

1. Start recording the macro, then click the Shortcut Key text box to display the blinking insertion cursor. Press the shortcut key you want to use. (Press the Shift key at the same time if you want to include Shift in the shortcut.)

2. In the Store Macro In drop-down list, specify where you want the Macro Recorder to store the macro. Your choices are as follows:

 * _This Workbook_ stores the macro in the active workbook. This option is useful for macros that belong to a particular workbook and do not need to be used elsewhere.
 * _New Workbook_ causes Excel to create a new workbook for you and store the macro in it. This option is useful for experimental macros that you'll need to edit before unleashing them on actual work.
 * _Personal Macro Workbook_ stores the macro in the Personal Macro Workbook, a special workbook named PERSONAL.XLSB. By keeping your macros and other customizations in the Personal Macro Workbook, you can make them available to any of your procedures—in that way, the Personal Macro Workbook is similar to Word's Normal.dotm. If the Personal Macro Workbook does not exist yet, the Macro Recorder creates it automatically.

3. Click the OK button to start recording the macro.

### Assigning a Way to Run a Macro in PowerPoint

PowerPoint does not let you record macros, but you can assign a way to run macros written in the Visual Basic Editor, as discussed in the section "Specifying How to Trigger an Existing Macro" later in this chapter.

### Assigning a Way to Run a Macro in Outlook

Outlook doesn't let you record macros, and by default macros are disabled. To enable macros in Outlook, click the Developer tab on the Ribbon, then click the Macro Security icon (it's on the left in the Code section of the Ribbon). The Trust Center dialog box opens. Click the Notification For All Macros option or the Enable All Macros option. To see how to assign a way to run macros, see the section "Specifying How to Trigger an Existing Macro" later in this chapter.

### Recording the Actions in a Macro

When you close the Record Macro dialog box, the Macro Recorder begins recording the macro. The Macro Recorder displays the Stop Recording icon (a white square) in the status bar at the bottom left of the screen (and a Stop Recording button in the Developer tab on the Ribbon). In addition, a small symbol of a cassette tape appears attached to the mouse pointer (these tapes were used in the old days, prior to the invention of the CD).

Now you should perform the sequence of actions you want to record. What exactly you can do varies from application to application, but in general, you can use the mouse to select items, make choices in dialog boxes, and select defined items in documents (such as cells in spreadsheets). You'll find a number of things that you can't do with the mouse, such as select items within a document window in Word. To select items in a Word document window, you have to use the keyboard (Shift+arrow keys, for example). You can, however, select cells with the mouse in Excel during recording.
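In Word, for instance, extending the selection one word to the right with Ctrl+Shift+right arrow while recording produces code along these lines (a sketch of typical Recorder output; the exact parameters depend on the keys you press):

    ' Extend the selection one word to the right, as recorded
    ' when you press Ctrl+Shift+right arrow:
    Selection.MoveRight Unit:=wdWord, Count:=1, Extend:=wdExtend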
* * *

The Macro Recorder Records Everything—The Complete Current Status

When you make choices in a dialog box and click the OK button, the Macro Recorder records the current settings for all the options on that page of the dialog box. So, for example, when you change the left indentation of a paragraph in the Paragraph dialog box in Word, the Macro Recorder records _all the other settings_ on the Indents And Spacing page as well (Alignment, Before and After spacing, and so forth).

* * *

In Word, if you need to perform any actions that you don't want recorded, pause the Macro Recorder by clicking the Pause Recording button on the Ribbon. The button changes to Resume Recording. Click the button again to start recording again.

To stop recording, click either the Stop Recording button on the Ribbon or the Stop Recording button on the status bar.

The Macro Recorder has now recorded your macro and assigned it to a key combination or button, if you made that choice.

# Running a Macro

You can use any of four methods to run a macro you've recorded within the application:

 * If you assigned a Quick Access Toolbar button, use that.
 * If you added your macro to the Ribbon, you can use that.
 * If you assigned a shortcut key combination, use it.
 * A less convenient approach is to press Alt+F8 to display the Macros dialog box, select the macro, and then click the Run button. (Alternatively, you could double-click the macro name in the list box.)

* * *

Running in the Editor

You can also run a macro from the Visual Basic Editor, which is useful when you're working in the Editor. Just press F5.

* * *

The macro runs, performing the actions in the sequence in which you recorded them. For example, suppose you create a macro in Excel that selects cell A2 in the current worksheet, boldfaces that cell, enters the text **Yearly Sales**, selects cell B2, and enters the number **100000** in it. The Macro Recorder recognizes and saves those five actions. VBA then performs all five actions, step by step, each time you run the macro—albeit quite rapidly.

* * *

How to Stop an Executing Macro

To stop a running macro, press Ctrl+Break (Break is usually the unshifted Pause key on the keyboard). VBA stops running the code and displays a dialog box telling you that code execution has been interrupted. Click the End button to dismiss this dialog box.

* * *

Some applications (such as Word) let you undo most actions executed via VBA after the macro stops running (by pressing Ctrl+Z or clicking the Undo button on the Quick Access Toolbar, undoing one command at a time); other applications do not.

* * *

Macro Errors Are Often Caused by Incorrect Contexts

If running the macro results in an error, this often means that the macro is trying to do something to a file or an object that isn't available. For example, if you record a macro in Excel that works on the active workbook, the macro causes an error if you run it when no workbook is open (because there is then no active workbook). Likewise, if you write a macro in PowerPoint that works with the third shape on the active slide, that macro fails if you run it on a slide that has no third shape. To get the macro to run properly, re-create the conditions it needs, and then try it again.

* * *

# Recording a Sample Word Macro

In this section, you'll record a sample macro in Word that you can work with later in the book.
# Recording a Sample Word Macro

In this section, you'll record a sample macro in Word that you can work with later in the book. This macro selects the current word, cuts it, moves the insertion point one word to the right, and pastes the word back in. This is a straightforward sequence of actions that you'll later view and edit in the Visual Basic Editor.

Follow these steps to record the macro:

1. Create a new document by pressing Ctrl+N.

2. Start the Macro Recorder by clicking the Developer tab on the Ribbon, then clicking the Record Macro button. Or click the Record Macro button on the status bar at the bottom of the application. (With this approach, you don't have to open the Developer tab; just click the button on the status bar.)

3. In the Macro Name text box, enter **Transpose_Word_Right**.

4. In the Store Macro In drop-down list, make sure All Documents (Normal.dotm) is selected, unless you want to assign the macro to a different template. (This and future examples in this book assume this macro is located in Normal.dotm, so do store it there.)

5. In the Description box, enter a description for the macro (see Figure 1.7). Be fairly explicit and enter a description such as **Transposes the current word with the word to its right. Created 5/5/13 by Nanci Selest-Gomes**.

Figure 1.7 Creating the sample macro in Word

6. Assign a method of running the macro, as described in the previous section, if you want to. Create a toolbar button or assign a keyboard shortcut. (The method or methods you choose are strictly a matter of personal preference.) If you'll need to move the macro to a different module (or a different template or document) later, don't assign a method of running the macro at this point.

7. Click the OK button to dismiss the Word Options dialog box or the Customize Keyboard dialog box (or just click the OK button to dismiss the Record Macro dialog box if you chose not to assign a way of running the macro). Now you're ready to record the macro. The Stop Recording option appears on the Ribbon and on the status bar, and the mouse pointer has a cassette-tape icon attached to it.

8. As a quick demonstration of how you can pause recording, click the Pause Recording button on the Ribbon. The cassette-tape icon disappears from the mouse pointer, and the Pause Recording button changes into a Resume Recording button. Enter a line of text in the document: **The quick brown fox jumped over the lazy dog.** Position the insertion point anywhere in the word _quick_, and then click the Resume Recording button on the Ribbon to reactivate the Macro Recorder.

9. Record the actions for the macro as follows:

a. Use Word's extend-selection feature to select the word _quick_ by pressing the F8 key twice.

b. Press the Esc key to cancel Extend mode.

c. Press Shift+Delete to cut the selected word to the Clipboard.

d. The insertion point is now at the beginning of the word _brown_. Press Ctrl+right arrow to move the insertion point right by one word so that it's at the beginning of the word _fox_.

e. Press Shift+Insert to paste in the cut word from the Clipboard.

f. Press Ctrl+left arrow to move the insertion point one word to the left. This restores the cursor to its original position.

10. Click the Stop Recording button on the Ribbon or status bar. Your sentence now reads, "The brown quick fox jumped over the lazy dog."
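When you later open this macro in the Visual Basic Editor (as you'll do in Chapter 2), the recorded code should look something like the following. The exact lines can vary slightly depending on how you performed the actions:

    Sub Transpose_Word_Right()
    '
    ' Transpose_Word_Right Macro
    ' Transposes the current word with the word to its right.
    '
        Selection.Extend                            ' first press of F8
        Selection.Extend                            ' second press of F8 selects the word
        Selection.EscapeKey                         ' Esc cancels Extend mode
        Selection.Cut                               ' Shift+Delete
        Selection.MoveRight Unit:=wdWord, Count:=1  ' Ctrl+right arrow
        Selection.Paste                             ' Shift+Insert
        Selection.MoveLeft Unit:=wdWord, Count:=1   ' Ctrl+left arrow
    End Sub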
* * *

Finding Built-In Keyboard Shortcuts

You can find a complete list of the built-in keyboard shortcuts (such as Ctrl+left arrow) by searching an application's Help system for "Keyboard Shortcuts." If available, click the Show All option to expand the complete list, then use Ctrl+F to search for whatever you're interested in.

* * *

You can now run this macro by using the toolbar button or keyboard shortcut that you assigned (if you chose to assign one). Alternatively, click the Macros button in the Developer tab and run the macro from the Macros dialog box. Try positioning the insertion point in the word _brown_ and running the macro to restore the words in the sentence to their original order.

At this point, Word has stored the macro in Normal.dotm. By default, Word doesn't prompt you to save new macros when you exit (or when an automated backup takes place); it simply saves them automatically. Even so, it's best to click the Save button in the File tab to store Normal now. That way, if Word or Windows crashes, you won't lose the macro.

* * *

You Can Force Word to Prompt You to Save the Normal Template

Word, by default, automatically saves new macros added to the Normal template. But if you prefer to have Word prompt you to save any changes to the Normal template, choose Options on the File tab, click the Advanced category, and scroll down to the Save section. Select the Prompt Before Saving Normal Template check box, and then click the OK button. This option was selected by default in early versions of Office, but ever since Office 2007 it has been turned off by default.

* * *

# Recording a Sample Excel Macro

In the following sections, you'll record a sample Excel macro. This macro creates a new workbook, enters a sequence of months into it, and then saves it. You'll work with this macro again in Chapter 3, so don't delete it.

## Create a Personal Macro Workbook If You Don't Have One Yet

If you don't already have a Personal Macro Workbook in Excel, you'll need to create one before you can create this procedure. (If you do have a Personal Macro Workbook, skip to the next section.) Follow these steps:

1. Click the Developer tab in the Ribbon, then click the Record Macro button on the Ribbon (or just click the Record Macro button on the status bar) to display the Record Macro dialog box.

2. Accept the default name for the macro because you'll be deleting it momentarily.

3. In the Store Macro In drop-down list, choose Personal Macro Workbook.

4. Click the OK button to close the Record Macro dialog box and start recording the macro.

5. Type a single character in whichever cell is active, and press the Enter key.

6. Click the Stop Recording button on the Ribbon or status bar to stop recording the macro.

7. Click the Unhide button on the View tab to display the Unhide dialog box. Select PERSONAL.XLSB and click the OK button.

8. Click the Developer tab in the Ribbon, then click the Macros button on the Ribbon to display the Macros dialog box.

9. Select the macro you recorded and click the Delete button to delete it. Click the Yes button in the confirmation message box.

Excel has now generated a Personal Macro Workbook that you can use from now on to hold your global macros.

## Record the Macro

To create this macro, start Excel and follow these steps:

1. Click the Developer tab in the Ribbon, then click the Record Macro button on the Ribbon (or just click the Record Macro button on the status bar). This displays the Record Macro dialog box, shown in Figure 1.8, with information entered.
Figure 1.8 Display the Record Macro dialog box for Excel and make your choices in it.

2. Enter the name for the macro in the Macro Name text box: **New_Workbook_with_Months**.

3. In the Shortcut Key text box, enter a shortcut key if you want to. (Remember that you can always change the shortcut key later, so you're not forced to enter one at this point.)

4. In the Store Macro In drop-down list, choose whether to store the macro in your Personal Macro Workbook, in a new workbook, or in the active workbook. As discussed a little earlier in this chapter, storing the macro in the Personal Macro Workbook gives you the most flexibility because it is Excel's global macro container. For this example, don't store the macro in the active workbook, because you're going to delete the active workbook almost immediately. Instead, store it in your Personal Macro Workbook. Remember, we'll use this macro in future examples.

5. Type a description for the macro in the Description text box.

6. Click the OK button to dismiss the Record Macro dialog box and start recording the macro.

7. Click the File tab on the Ribbon and click New to display the available templates for a new workbook.

8. Double-click the Blank Workbook icon. Excel creates a new workbook and selects the first sheet in it.

9. Click cell A1 to select it. (It may already be selected; click it anyway because you need to record this click instruction.)

10. Enter **January 2014** and press the right arrow key to select cell B1. Excel automatically changes the date to your default date format. That's fine.

11. Enter **February 2014** and press the left arrow key to select cell A1 again.

12. Drag from cell A1 to cell B1 so that the two cells are selected.

13. Drag the fill handle from cell B1 to cell L1 so that Excel's AutoFill feature enters the months March 2014 through December 2014 in the cells. (The fill handle is the small black dot in the lower-right corner of the selection frame. You'll know you're on it when the cursor changes from a white to a black cross.)

14. Click the File tab on the Ribbon, then click the Save As option to display the Save As dialog box. Save the workbook in a convenient folder (for example, the My Documents folder) under a name such as **Sample Workbook.xlsx**.

15. Click the Stop Recording button on the Ribbon or status bar to stop recording the macro.

Close the sample workbook, use Windows Explorer to navigate to the new .xlsx file you just saved, and delete the file. Then run the macro and watch what happens. (If you don't delete the existing workbook first, Excel prompts you to decide whether to overwrite it when the macro reaches the Save As operation from step 14 and tries to save the new workbook under the same name.)
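For comparison, the heart of the recorded macro should resemble the following sketch. The exact date formats, fill type, and SaveAs arguments (including the full file path, omitted here) will differ on your system, so treat this as an approximation rather than the exact code:

    Sub New_Workbook_with_Months()
    '
    ' New_Workbook_with_Months Macro
    ' Creates a workbook, fills in the months, and saves it.
    '
        Workbooks.Add                            ' steps 7-8
        Range("A1").Select                       ' step 9
        ActiveCell.FormulaR1C1 = "Jan-2014"      ' step 10
        Range("B1").Select
        ActiveCell.FormulaR1C1 = "Feb-2014"      ' step 11
        Range("A1").Select
        Range("A1:B1").Select                    ' step 12
        Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault
        ActiveWorkbook.SaveAs Filename:="Sample Workbook.xlsx", _
            FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    End Sub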
# Specifying How to Trigger an Existing Macro

If you didn't assign a way of running the macro when you recorded it, you can assign a way of running it as described here.

## Assigning a Macro to a Quick Access Toolbar Button in Word

To assign a macro to the Quick Access Toolbar, follow these steps:

1. Right-click anywhere on the Quick Access Toolbar (it's the set of icons in the upper-left corner, above the Ribbon). A menu appears.

2. Click Customize Quick Access Toolbar on the menu. The Word Options dialog box appears.

3. In the Choose Commands From drop-down list, select Macros.

4. Click the name of the macro you want to assign a button to.

5. Click the Add button to copy the macro name into the list of buttons on the right.

6. Click the Modify button if you want to assign a different icon or modify the button's name.

7. Click OK to close the dialog box.

## Assigning a Macro to a Shortcut Key Combination

The section "Running a Macro via a Shortcut Key Combination," earlier in this chapter, explained how to do this in Word. PowerPoint and Access do not let you assign a macro to a key combination. Excel uses a slightly different approach than Word, limiting you to Ctrl and Shift combinations, as described earlier in this chapter in the section "Assigning a Way to Run a Macro in Excel."

# Deleting a Macro

To delete a macro you no longer need, follow these steps:

1. Press Alt+F8 to display the Macros dialog box.

2. Choose the macro in the Macro Name list box.

3. Click the Delete button.

4. In the warning message box that appears, click the Yes button. Figure 1.9 shows Excel's variation of this warning message box.

Figure 1.9 When you delete a macro, the application checks to make sure you mean to do so.

5. Click the Close button or the Cancel button to close the Macros dialog box.

* * *

Organizing Macros in Word with the Organizer Dialog Box

Most VBA-enabled applications require you to use the Visual Basic Editor (which is discussed in the next chapter) to move code modules, user forms, and other code items from one file to another. (A _code module_ is a virtual container used for storing macros. A _user form_ is a custom dialog box displayed to the user for input.) But Word provides a useful tool called the Organizer dialog box that you can use to copy, move, rename, and delete code modules, user forms, and other code items directly in the Word interface without opening the Visual Basic Editor.

To use the Organizer dialog box, follow these steps:

1. In Word, press Alt+F8.

2. Click the Organizer button to display the Organizer dialog box, and click the Macro Project Items tab if the Macro Project Items page (shown here) isn't automatically displayed.

3. Look at the two documents or templates listed in the readouts above the two list boxes. Usually, the left list box shows the active document, and the right one shows Normal.dotm. Change these so that one list box shows the document or template that contains the code you want to copy or move and the other shows the destination document or template. (If you want only to delete or rename code items, you need only make the Organizer dialog box list the document or template that contains the items.) To change the document or template listed, click the Close File button underneath the list box on the corresponding side. The Close File button changes to an Open File button. Click this button to display the Open dialog box, navigate to and select the document or template you want, and then click the Open button. The Open dialog box defaults to displaying the Templates folder.

4. You can then delete, rename, copy, and move macro project items. The following list details how to do this:

 * To delete one or more macro project items from a template, choose the item or items in either panel of the Organizer dialog box and click the Delete button. Click the Yes button in the confirmation message box. Any copies of the items in other templates are unaffected.
 * To rename a macro project item, select it in either panel and click the Rename button to open the Rename dialog box. Enter the new name and click the OK button. Any copies of the same item in other templates are unaffected.
+ * To copy one or more macro project items from one template to another, open the templates in the Organizer dialog box. Select the item or items to copy in either panel of the dialog box (the arrows on the Copy button change direction to point to the other panel). Then click the Copy button. If the recipient template contains a macro project item of the same name as one you're copying, Word displays a warning message box telling you that it can't copy the item. If you still want to copy the item, rename either the item you're copying or the item with the same name in the destination template, and then perform the copy operation. + * To move a macro project item from one template to another, copy it as described in the previous paragraph, and then delete the macro project item from the source template. + +5. Once you've deleted, renamed, copied, or moved macro project items, click the Close button to close the Organizer dialog box. If Word prompts you to save any changes to affected documents or templates that aren't open in your Word session, click the Yes button. + +* * * + +# The Bottom Line + +**Record a macro.** + +The easiest way to create a macro is to simply record it. Whatever you type or click—all your behaviors—are translated into VBA automatically and saved as a macro. + +Master It + +Turn on the macro recorder in Word and create a macro that moves the insertion cursor up three lines. Then turn off the macro recorder and view the code in the Visual Basic Editor. + +**Assign a macro to a button or keyboard shortcut.** + +You can trigger a macro using three convenient methods: clicking an entry on the Ribbon, clicking a button in the Quick Access Toolbar, or using a keyboard shortcut. You are responsible for assigning a macro to any or all of these methods. + +Master It + +Assign an existing macro to a new Quick Access Toolbar button. + +**Run a macro.** + +Macros are most efficiently triggered via a Ribbon entry, by clicking a button on the Quick Access Toolbar, or by pressing a shortcut key combination such as Alt+N or Ctrl+Alt+F. When you begin recording a macro, the Record Macro dialog has buttons that allow you to assign the new macro to a shortcut key or toolbar button. However, if you are using the Visual Basic Editor, you can run a macro by simply pressing F5. + +Master It + +Execute a macro from within the Visual Basic Editor. + +**Delete a macro.** + +It's useful to keep your collection of macros current and manageable. If you no longer need a macro, remove it. Macros can be directly deleted from the Visual Basic Editor or by clicking the Delete button in the Macros dialog (opened by pressing Alt+F8). + +Master It + +Temporarily remove a macro, then restore it, using the Visual Basic Editor. +Chapter 2 + +Getting Started with the Visual Basic Editor + +In this chapter, you'll start learning how to use the Visual Basic Editor, a powerful tool bundled with Office 2013 for working with VBA. This programming editor is the culmination of more than 18 years of modifications and improvements. It is highly effective. + +All applications that host VBA use the Visual Basic Editor, so the environment looks much the same no matter which application you're using. + +This chapter covers the fundamentals of the Visual Basic Editor: its components, what they do, and how you use them. You'll learn more advanced maneuvers as you work with VBA later in this book. 
+ +This chapter also shows you how to customize the Visual Basic Editor to make it more comfortable, more in tune with your preferences. This customization doesn't take long, and you'll find the resulting ease of use more than worth the amount of time you invest. + +In this chapter you will learn to do the following: + + * Open the Visual Basic Editor + * Open a macro in the Visual Basic Editor + * Understand the Visual Basic Editor's main windows + * Set properties for a project + * Customize the Visual Basic Editor + +# Opening the Visual Basic Editor + +You open the Visual Basic Editor from the host application you're using. For example, if you're working in Word, you open the Visual Basic Editor from Word. The instance of the Visual Basic Editor that you open is then associated with Word. + +However, you can open two or more instances of the Visual Basic Editor. For example, if you've already opened an instance of the Visual Basic Editor in Word, you could open another instance in Excel, and then another in Access. + +You can open the Visual Basic Editor in two ways: + + * Select a macro that you want to edit. The host application then opens the Visual Basic Editor and displays that macro so that you're ready to work with it. + * Open the editor directly, and then locate the macro code you want to work with. + +The next two sections demonstrate the two ways of opening the Visual Basic Editor, and the third section shows you how to navigate to a macro. + +## Opening the Visual Basic Editor with a Macro Selected + +If you know the name of the macro you want to work with, use this method to open the Visual Basic Editor and the macro at the same time. This example uses Word to open the Transpose_Word_Right macro that you recorded in Chapter 1, "Recording and Running Macros in the Office Applications": + +1. Open Word if it's not already running. + +2. Press Alt+F8 to display the Macros dialog box. + +3. Select the Transpose_Word_Right macro and click the Edit button. Word opens the Visual Basic Editor with the macro displayed and ready for editing, as shown in Figure 2.1. + +Figure 2.1 The Visual Basic Editor with the Transpose_Word_Right macro open in the Code window + +4. Choose File ⇒ Close and return to Microsoft Word to close the Visual Basic Editor for the moment so that you can open it using the method described in the next section. + +## Opening the Visual Basic Editor Directly + +To open the Visual Basic Editor directly, follow these steps: + +1. Open or activate the host application. In this case, open or switch to Word. + +2. Press Alt+F11. The Visual Basic Editor opens. + +* * * + +The Visual Basic Editor Remembers Its Code Window + +Depending on the state of the Visual Basic Editor the last time it was closed, you may see one or more Code windows open. For example, if you left the Code window for the NewMacros module open in the previous section, the Visual Basic Editor will display this Code window again. + +* * * + +If you don't see the Properties window (see Figure 2.1), press F4. More on this important window shortly. + +## Navigating to a Macro + +After opening the Visual Basic Editor directly, use the Project Explorer pane (shown on the left side in Figure 2.1) to navigate to your macro. You also use the Project Explorer to navigate among open projects and modules when you're working in the Visual Basic Editor. 
+ +* * * + +The Project Explorer Resembles Windows Explorer Folder View + +The Project Explorer pane works like a standard Windows Explorer tree when you're viewing folders and subfolders. Depending on the application you're using, you'll see different projects displayed in the tree (more on this later in the chapter). + +* * * + +To navigate to the Transpose_Word_Right macro, follow these steps: + +1. In the Project Explorer pane in the upper-left corner of the Visual Basic Editor, expand the entry for Normal (which represents Normal.dotm, the Normal template) by clicking the + sign to the left of its name. (If the Normal entry is already expanded, skip this step.) + +2. Double-click the Modules entry to expand it. + +3. Double-click the NewMacros module. (This is the global module in which Word automatically stores the macros you record unless you specify a different location in the Record Macro dialog box.) The Visual Basic Editor displays the contents of the module in the Code window on the right side, as you can see in Figure 2.1. + +If the module contains more than one macro, you'll also need to select the macro you want to work with—in this case, the Transpose_Word_Right macro. (If you've recorded only the Transpose_Word_Right macro, only this macro appears in the Code window.) To select a macro, use one of these methods: + + * In the Code window, select the macro from the Procedure drop-down list, as shown in Figure 2.2. (If you hover the mouse pointer over the list before dropping it down, you'll see a tooltip that gives its name: Procedure.) + * Use the scroll bar to scroll to the macro you want to edit, which is identified by the word _Sub_ , the name you gave it, and a pair of parentheses—in this case, Sub Transpose_Word_Right(). + +Figure 2.2 If the module contains two or more macros, scroll to the macro you want to edit, or select it from this Procedure drop-down list. + +* * * + +Maximize Your Code Window + +Eagle-eyed readers will notice a difference between Figures 2.1 and 2.2. By default, the Code window is displayed in "normal" window size. In other words, there is a gray background around it, as you can see in Figure 2.1. This allows you to open other code windows in the same area. However, that's a bit too much micro-multitasking for me, so from now on, I'll display the Code window maximized, as shown in Figure 2.2. This makes it easier to see your code. To do this, click the Code window's Maximize button, just to the left of the red X button that closes the window. + +* * * + +# Using the Visual Basic Editor's Main Windows + +In the following sections, you'll learn how to use the main windows of the Visual Basic Editor to get your work done. + +## The Project Explorer + +The Project Explorer is the tool for navigating among the various objects in the Visual Basic Editor. Figure 2.3 shows the Project Explorer for a Visual Basic Editor session with Word as the host application. + +Depending on the host application and its capabilities, each project can contain some or all of the following elements. (But don't worry about such items as class modules, link libraries, and so on—we'll explore them in later chapters.) + +Figure 2.3 Use the Project Explorer to navigate to the module you want to work with. + + * User forms (windows that make up part of the macro's user interface, such as a custom dialog box that accepts user input). + * Modules containing macros, procedures, and functions. + * Class modules (modules that define objects, their properties, and their values). 
+ * References to other projects or to library files (such as DLLs—Dynamic Link Libraries). + * Objects related to the application. For example, each Word document and template contains a Microsoft Word Objects folder that holds a class object named ThisDocument. ThisDocument gives you access to the properties and _events_ (actions the object can react to, such as a click event) for the document or template. Each Excel workbook contains a class object named ThisWorkbook that gives you access to the properties and events for the workbook and a Sheet object (named Sheet1, Sheet2, and so on) for each worksheet. + +For most host applications, each open document and template is considered a separate project and is displayed as a root in the project tree. The project tree also contains any global macro storage container—such as the Normal.dotm template in Word or the Personal Macro Workbook in Excel—and any add-ins that are loaded. + +As an example, in Figure 2.3, Normal.dotm is identified as Normal, and the active document is identified as Project (C02): a document named C02. + +* * * + +Change a Project's Name at Any Time + +You can change the name of a project by using the Project Properties dialog box (discussed later in this chapter) or by selecting the project and entering a new name in the Properties pane, shown directly below the Project Explorer pane (as seen earlier in Figure 2.1). + +Once you change the name, the project is identified by that name in the Project Explorer, followed by the name of the document or template. For example, if you change the project name of document 2 to Testing, the document project is identified as Testing(2) in the Project Explorer rather than Project(2). + +* * * + +You navigate the Project Explorer in the same way that you navigate the Windows Explorer folder tree: Click the boxed plus sign to the left of a project item to expand the view and display the items contained in the project, and click the resulting boxed minus sign to collapse the view and hide the items again. Double-click a module to display its code in the Code window. Double-click a user form to display it in the Code window. + +The Visual Basic Editor displays the Project Explorer by default, and because the Project Explorer provides fast and efficient navigation among the various elements of your VBA projects, it's usually easiest to keep it displayed unless you're short on screen space or you're working for long periods in the Code window and don't need to switch to other elements. However, most people don't create document-specific macros or large, complicated programs spanning multiple projects. As a result, they just leave all their macros in the NewMacros module. + +To close the Project Explorer, click its close button (the x button in its title bar). To display the Project Explorer again, press Ctrl+R or choose View ⇒ Project Explorer. As you'll see later in this chapter, you can also undock the Project Explorer. This lets you push it aside when you need more room. But it doesn't take up much room, so, again, many people just leave it tucked up there in the upper left. + +In Figure 2.3, three buttons appear on a toolbar at the top of the Project Explorer: + +**View Code** + +Displays the Code window for the selected object. For example, if you select a user form in the Project Explorer and click the View Code button, the Visual Basic Editor displays a Code window containing any code attached to the user form. 
If you select a module or a class module in the Project Explorer and click the View Code button, the Visual Basic Editor displays a Code window containing the code in the module. You can also right-click an item in the Project Explorer and choose View Code from the context menu. + +_Code_ is merely a synonym for programming — the series of commands you type in (or record) to make the computer behave a certain way. Code is sometimes called _programming code_ or _source code._ + +Note that the words used in programming—the terms such as Selection or End Sub employed by a computer-programming language such as VBA—are referred to by a variety of synonyms: statements, keywords, commands, and so on. In this book, I'll frequently simply use the generic term _commands_. + +* * * + +Double-Click Modules to View Their Code + +For a module or a class module, you can also double-click the object to view its code. This is usually faster than selecting it and then clicking the View Code button. For a user form or a file, however, double-clicking displays the View Object option (discussed next) rather than the View Code option. + +* * * + +**View Object** + +Displays a window containing the selected object. The View Object button remains dimmed and unavailable until you select an object (such as a user form or a file or object within a file) that can be displayed. If the selected object is a user form, clicking the View Object button displays the user form; if the selected object is a file or an object within a file, clicking the View Object button displays that object in the host application's window. + +For example, selecting the ThisDocument object for a Word document and clicking the View Object button displays the actual Word document in the Word window. Selecting the Sheet1 object in an Excel workbook and clicking the View Object button displays that worksheet in the Excel workbook in the Excel window. + +* * * + +Viewing an Object + +You can also trigger the View Object mode by right-clicking an object and choosing View Object from the shortcut menu or by double-clicking an object that supports the View Object feature. (If the object doesn't support the View Object feature, double-clicking it triggers the View Code mode instead.) + +* * * + +**Toggle Folders** + +Toggles the view of the objects in the Project Explorer between _folder view_ (a view that shows the objects grouped within their projects and folders) and _contents view_ (which displays only the objects within their projects—no folders are shown). + +The left part of Figure 2.4 shows the Project Explorer for an application session sorted by folder view, and the right part shows the Project Explorer for the same situation in contents view. Whether you spend more time in folder view or contents view will depend on the size of your screen, the number of objects you put in any given project, and the way your mind works, not necessarily in that order. For many purposes, you'll want to toggle between folder view and contents view to locate objects most easily. + +Figure 2.4 Folder view (left) displays the objects separated into folders beneath the projects that contain them. Contents view (right) displays only the objects and the projects that contain them. + +The Project Explorer has several uses, which is another reason to keep it open all the time. Apart from navigating to the items you need to work with, you can perform the following additional tasks with the Project Explorer: + + * Add components to or remove them from a project. 
For example, you can use the Project Explorer to add a module or a user form to a project. + * Compare the components of one project to the components of another project. Such a comparison can be useful when you need to establish the differences between two or more projects quickly (for example, your reference copy of a company template and the copies users have been adding to). + * Move or copy items from one project to another. You can drag a code module, class module, or user form from one project to another in the Project Explorer to copy it or from the Project Explorer in one instance of the Visual Basic Editor to a project in the Project Explorer in another instance. For example, you could drag a user form from a Visual Basic Editor instance hosted by Excel to a Visual Basic Editor session hosted by PowerPoint to copy the user form. You can't, however, copy or move objects that are specific to a particular application's object model; for example, you can't drop an Excel sheet into Word's Project Explorer because Word doesn't support that type of object. + * Import or export a code module or a user form to or from a project. + +* * * + +The Project Explorer Is Your Best View + +Many actions that you can perform through the Project Explorer you can also perform through the Visual Basic Editor's menu items. In general, though, the Project Explorer provides the easiest way to navigate from module to module in the Visual Basic Editor, especially if you ever have several complex projects open at the same time. You can access the most commonly used features for an object by right-clicking it in the Project Explorer to display the shortcut menu. + +* * * + +## The Object Browser + +The Visual Basic Editor provides a full Object Browser for working with objects in VBA. You'll look at the Object Browser in detail in Chapter 8, "Finding the Objects, Methods, and Properties You Need," and when you examine the object models for the various Office applications in the final part of the book. But in the meantime take a quick look at Figure 2.5, which shows the Object Browser for a Word VBA session. The Document object is selected in the left-hand panel, and a list of its properties appears in the right-hand panel. (To see this in your VBA Editor, press F2.) + +Figure 2.5 The Object Browser provides a quick way to look up objects and their properties. Here, you can see the properties contained in the Document object. + +You'll find that a number of these properties immediately make sense from your general knowledge of Word documents. For example, as you would expect, the AttachedTemplate property tells you which template the document is currently attached to. Likewise, the Bookmarks property contains information on any bookmarks in the document. The property information is displayed at the bottom of the Object Browser. One of the great things about the BASIC language, of which VBA is a variant, and the libraries of objects underlying the Office applications is that they generally use ordinary English terminology. + +## The Code Window + +You'll do most of the actual work of testing and editing your macros in the Visual Basic Editor's Code window. (Since code is written in plain text, you could simply write it in Notepad, then paste it into the code editor for testing and debugging. But the Visual Basic Editor offers so many useful programming tools that only the brilliant few can easily get good results by trying to wing it without any assistance from the editor.) 
The Visual Basic Editor provides an individual Code window for each open project, for each document section within the project that can contain code, and for each code module and user form in the project. Each Code window is identified by the project name, the name of the module within the project, and the word _Code_ in parentheses. Figure 2.6 shows the Visual Basic Editor Code window with the Transpose_Word_Right macro open in it.

Figure 2.6 You edit macros in the Code window.

As you can see from the figure, two drop-down list boxes appear just below the title bar of the Code window:

 * The Object drop-down list box at the upper-left corner of the Code window provides a quick way of navigating between different objects.
 * The Procedure drop-down list box at the upper-right corner of the Code window lets you move quickly from procedure to procedure within the current module. Click the down arrow button to display the drop-down list of procedures. You'll see that the first procedure is (Declarations). Clicking this item in the list takes you to the Declarations area at the top of the current code sheet, which is where you declare public variables and other VBA information that multiple procedures need to know.

The Visual Basic Editor Code window provides a half-dozen features that help you edit code efficiently and accurately, as discussed in the following sections.

### Complete Word

The Complete Word feature completes the word you're typing into the Code window, once you've typed enough letters to distinguish that word from any other. If you haven't typed enough letters to distinguish the word, the Visual Basic Editor gives you the closest possibilities (see Figure 2.7). You can either "type down" (continue typing to narrow the selection) or scroll through the displayed list to find the one you want.

Figure 2.7 The Complete Word feature automatically completes a term when you've typed enough to identify it. If you haven't typed enough, you can choose from a short list.

The easiest way to activate Complete Word when you're typing code is to press Ctrl+spacebar. You can also choose Edit ⇒ Complete Word or click the Complete Word button on the Edit toolbar (see Figure 2.8). Note that the Edit toolbar isn't visible by default. Open it by choosing View ⇒ Toolbars ⇒ Edit or by right-clicking the toolbar area in the editor, then choosing Edit from the shortcut menu that appears.

Figure 2.8 The Edit toolbar contains features used when working in the Code window.

### Quick Info

The Quick Info feature displays a ScreenTip showing syntax information about the currently selected variable, function, method, statement, or procedure. (_Selected_ here just means the word in the code that's under or adjacent to the blinking insertion point.) If you type a command like MsgBox and then press the spacebar, the ScreenTip pops up to help you finish typing the command. The tip shows both the required and optional elements of that command. Optional elements are enclosed in square brackets.

Figure 2.9 shows an example of a Quick Info ScreenTip.

Figure 2.9 Use the Editor's Quick Info feature to see a VB language command's syntax or a quick readout of status.

To display Quick Info, use one of these methods:

 * Just type a space following a VB command. For example, type **msgbox** followed by a space.
 * Click the Quick Info icon on the Edit toolbar.
 * Right-click a VB command and choose Quick Info from the shortcut menu.
 * Position the insertion point in the command and press Ctrl+I.
 * Position the insertion point in the term and choose Edit ⇒ Quick Info.
 * If you're typing in actual commands from the VBA language (as opposed to, say, variables or objects), the easiest way to see Quick Info is just to type the command's name (such as **MsgBox**), then press the spacebar. Note that VB doesn't pay any attention to capitalization, so you can type **msgbox** or **MsgBox** or whatever variation you wish. Once you finish typing the line of code (by pressing Enter), the editor automatically capitalizes the command the standard way: MsgBox.

### Auto List Members

Many VB commands have properties (qualities) and methods (behaviors). Taken together, the properties and methods of an object are called its _members_.

For example, a message box can display various icons (such as a question mark, an exclamation point, and so on) to cue the user about the purpose of the message (question, warning, etc.). This icon is controlled by the _Buttons_ argument of the message-box command, which is specified right after the text message in the line of code. Therefore, when I type a comma to indicate that I'm now going to specify the icon for my message box, the Auto List Members feature opens a drop-down list of the choices available. As you can see in Figure 2.10, I'm choosing vbOKOnly, but there are a number of other possible choices, such as vbOKCancel, vbQuestion, and so on.

Figure 2.10 Use the Auto List Members command to enter code items quickly and accurately.

The Auto List Members list allows you to quickly complete the line of code. Auto List Members is switched on by default and is automatically displayed when you type a period in an object description or a comma, parenthesis, or other punctuation in a line of code. Notice in Figure 2.10 that I've typed a message-box command followed by the text Hello, Marvin! and then a comma. As soon as I typed the comma, the list of settings for the Buttons argument appeared. (These settings are called _constants_.)

Alternatively, you can display the list box by clicking the List Properties/Methods button on the Edit toolbar.

To use Auto List Members to insert your choice into your code, follow these steps:

1. Press the down arrow key to scroll down to the property or method, or scroll down with the mouse (see Figure 2.10). You can also type the first few letters of the property or method's name to jump to it.

2. Enter the property or method into the code by doing one of the following:

a. Press Tab, or double-click the property or method, if you want to continue adding to this line of code after entering the property or method. (There might be additional optional arguments you want to specify on this line.)

b. Press Enter if you want to start a new line after entering the property or method.
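To make this concrete, the line being built in Figure 2.10 ends up reading as follows once you pick a constant from the list (here vbOKOnly, as in the figure):

    Sub ShowGreeting()
        ' Auto List Members popped up the list of Buttons constants
        ' as soon as the comma after the message text was typed.
        MsgBox "Hello, Marvin!", vbOKOnly
    End Sub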
### List Constants

The List Constants feature displays a pop-up list box containing the constants available for a property you've typed so that you can quickly complete the expression. List Constants is switched on by default. Alternatively, you can display the list box by clicking the List Constants button on the Edit toolbar.

To use List Constants (see Figure 2.11), follow these steps:

1. Press the down arrow key to scroll down to the constant, type its first letter (or first few letters), or scroll down with the mouse.

2. Enter the constant in the code by doing one of the following:

a. Press Tab, or double-click the constant, if you want to continue working on the same line after entering the constant.

b. Press Enter if you want to start a new line after entering the constant.

Figure 2.11 The List Constants feature saves you time and effort, especially when typing complex constant names.

### Data Tips

The Data Tips feature displays a ScreenTip containing the value of a variable that the mouse pointer moves over when the Visual Basic Editor is in Break mode (a mode you use for testing and debugging macros, described later in this book). Figure 2.12 shows an example. The Data Tips feature is switched on by default, but you can switch it, and other features, off by choosing Tools ⇒ Options.

Figure 2.12 Use the Data Tips feature to check the value of a variable when you're running or stepping through code.

### Margin Indicators

The Margin Indicators feature lets you quickly set a breakpoint, the next statement, or a bookmark by clicking in the margin of the Code window. You'll look at setting breakpoints, setting the next statement, and setting bookmarks later. (You can right-click the gray margin on the left side of the Code window, then choose Toggle from the shortcut menu to manipulate breakpoints or bookmarks. You can also just left-click in the margin to toggle breakpoints.)

### Other Editing Features

Apart from these features, the Code window includes standard Office editing features such as copy and move, cut and paste, and drag and drop. You can drag code from one procedure or module to another.

## The Properties Window

The Visual Basic Editor provides a Properties window you can use to view and modify the properties of an object in VBA, such as a project, a module or class module, a user form, or a _control_ (a button or check box in a dialog box, for example). If the Properties window isn't visible in the Editor, press F4.

In the drop-down list at the top of the Properties window, you can select the object whose properties you want to view or modify. The Alphabetic option displays an alphabetical list of the properties of the item, and the Categorized option presents a list of the properties broken down into categories. Generally, I find the categorization less than useful because many properties don't really fit neatly into any particular category.

Figure 2.13 shows the Alphabetic option with the properties for an Excel workbook on the left and the Categorized page on the right. (Showing the Categorized page for the Excel workbook or worksheet isn't very helpful because all of the properties belong to a Misc category—miscellaneous. There's no categorization here at all.)

Figure 2.13 Use the Properties window to view the properties of a project, user form, module, class module, or control.

The purpose of most of the workbook properties is easy to grasp. For example, if the HasRoutingSlip property is set to False, the workbook does not have an email routing slip attached to it, and if the Saved property is set to True, the workbook does not contain any unsaved changes. You'll learn about the properties for user forms in Chapter 14, "Creating Simple Custom Dialog Boxes," and Chapter 15, "Creating Complex Forms."

* * *

Understanding Design Mode, Run Mode, and Break Mode

The Visual Basic Editor can be in one of three modes, reflecting three fundamental phases of programming—writing code, locating a bug, and fixing a bug:

**Design mode**

Also known as _design time_.
Anytime you're working in the Visual Basic Editor on your code, you're in Design mode. You don't have to be actively designing anything visually—such as a user control or form—although you often will be. You will also often just be typing in _source code_ —the commands that Visual Basic will execute when you switch to Run mode. Or you might be editing code you've recorded. + +**Run mode** + +Also known as _runtime_. When code is running, you're in Run mode. The macro will be executed just as if it had been launched from within an application like Word (using a shortcut key combination or via clicking a Quick Access Toolbar button). The purpose of Run mode in the Visual Basic Editor is to allow you to test and observe the code's behavior and interact with it if necessary, to see that it works as it's supposed to. This is known as _debugging_. If you do find any problem during runtime testing, you can stop the execution by pressing Ctrl+Break and then check the values in variables or otherwise attempt to track down _where_ in your code the error is located. VBA itself can also throw you into Break mode if it detects an error condition. + +**Break mode** + +When code is running but execution is temporarily suspended, you're in Break mode. Among other things, Break mode lets you step through your code one command or one procedure at a time (rather than running all the commands at once at full speed). Stepping is a very handy tool when you're debugging or otherwise critiquing your code. You'll explore debugging techniques in detail in Chapter 17, "Debugging Your Code and Handling Errors." + +* * * + +The Visual Basic Editor displays the Properties window by default, but you can close it by clicking its close button (the x button). To display the Properties window again, press F4 or choose View ⇒ Properties Window. + +To change a property, click the cell containing the property's name. If a down arrow button appears in the value cell, click it to choose a new value from a drop-down list. If no button appears, click the value cell to display the blinking insertion cursor and type in a new value. + +You'll be able to choose different values from drop-down lists, depending on the type of property. For a True/False property, you'll be limited to those two choices in the drop-down list. For a text property such as Name, you can enter any valid VBA name. + +By default, the Properties window is docked below the Project Explorer. You can adjust the relative heights of the Properties window or the Project Explorer window by dragging the border between them. Or you can widen both at once by dragging the border to their right. If you undock the Properties window (drag it), you can resize it by dragging its borders or corners to display more properties or to shrink the window so it takes up less space in the Visual Basic Editor. Undock interior windows (also called _panes_ , such as the Properties pane) by dragging them by their title bar or by double-clicking their title bar. Redock by double-clicking their title bar or dragging them back into position. + +## The Immediate Window + +Beyond the Project Explorer, the Code window, and the Properties window, the Visual Basic Editor includes a number of other windows that it doesn't display by default. Two of the key windows are the Object Browser (described earlier in this chapter) and the Immediate window, which you'll use during the discussion of the VBA language in Chapter 5, "Understanding the Essentials of VBA Syntax." 
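You'll meet the Immediate window properly in a moment, but as a one-line taste: assuming a document is open in Word, typing the following into the Immediate window and pressing Enter prints the document's name (the question mark is the editor's shorthand for Print):

    ? ActiveDocument.Name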
+ +The Immediate window, shown in Figure 2.14, is a small, unadorned window you can use as a virtual scratch pad to enter lines of code you want to test without entering them in an actual macro. When you type a line of code into the Immediate window and press the Enter key, the Visual Basic Editor executes that code. + +Figure 2.14 Use the Immediate window for on-the-fly work and information. + +To display the Immediate window, press Ctrl+G or choose View ⇒ Immediate Window. + +* * * + +Display Variables' Status during Debugging + +You can also use the Immediate window to display information to help you check the values of variables and expressions while code is executing. That is done by using the Debug.Print command, as in this example, which displays the value of the variable _x_ in the Immediate window: + + Sub ShowDebug() + Dim x As Integer + x = 12 + **Debug.Print x** + End Sub + +* * * + +# Setting Properties for a Project + +Each VBA project has several properties of its own that you can set, including its project name, its description, and whether it is locked against viewing. To examine or set the properties for a project, right-click the project or one of its components in the Project Explorer and choose the Properties item in the context menu to display the Project Properties dialog box. + +Both the menu item and the resulting dialog box are identified by the description of the project—for example, the properties dialog box for a template in Word is identified as TemplateProject – Project Properties, and the properties dialog box for an Excel workbook is identified as VBAProject – Project Properties. Figure 2.15 shows the Project Properties dialog box for an Excel workbook project. + +Figure 2.15 Use the Project Properties dialog box to view and set the properties for a project and to lock a project against change. + +Here's what you can do on the General tab of the Project Properties dialog box: + + * Set the project name in the Project Name text box. This name identifies the project in the Object Browser and, when necessary, in the Windows Registry. Make sure the name is unique to avoid confusion with any other project. Technically, the project name is the name of the type library for the project (a _type library_ describes the objects—such as modules and user forms—that the project contains); it is used to build the fully qualified class name of classes in the project (more on this later in the book). The project name can contain underscores but cannot contain spaces. + * Enter a description of the project in the Project Description text box. This description appears in the Description pane in the Object Browser to help the user understand what the project is. So be as concise, yet descriptive, as possible. + * Designate the Help file for the project by entering the name and path of the Help file in the Help File Name text box. Click the button marked with the ellipsis (...) to the right of the Help File Name text box to display the Help File dialog box. Then select the file and click the Open button to enter the name of the Help file in the text box. (Alternatively, you can type or paste in the name and path.) + * Specify the Help context for the project in the Project Help Context ID text box. The _Help context_ refers to a location in the Help file. The default Help context is 0, which causes the Help file to display its opening screen (the same screen you'll see if you run the Help file from the Run dialog box or by double-clicking the file in Explorer). 
You can specify a different help context to take the user to a particular topic—for example, one more relevant to the project on which they're seeking help. + * Specify any conditional compilation arguments needed for the project. + +Here's what you can do on the Protection tab of the Project Properties dialog box, shown in Figure 2.16: + + * Select the Lock Project For Viewing check box to prevent other people from opening the project, viewing it, and changing it without knowing the password. + * In the Password To View Project Properties group box, enter a password for the project in the Password text box, and then enter the same password in the Confirm Password text box. Click the OK button and then close the project. Now nobody can open and view (let alone change) the project if they don't know the password. That said, Office's password security has been weak and was easily cracked prior to Office 2007. Now superior encryption techniques are used, but the password is still crackable, albeit with far greater difficulty. More on this in Chapter 19, "Securing Your Code with VBA's Security Features." + +Figure 2.16 The Protection page of the Project Properties dialog box lets you lock your project with a password so that nobody can view or edit it + +* * * + +Select Lock Project For Viewing If You Want to Prevent Others from Opening It + +If you enter a password in the Password text box and the Confirm Password text box but you don't select the Lock Project For Viewing check box, the Visual Basic Editor will prompt you for the password the next time you try to display the Project Properties dialog box. However, you'll be able to open and view the project and its contents without supplying the password. + +* * * + +# Customizing the Visual Basic Editor + +Given how much time you're likely to spend in the Visual Basic Editor, you ought to customize it so you can work as efficiently and comfortably as possible. You can customize it as follows: + + * Choose editor and view preference settings in the Visual Basic Editor to control how it interacts with you + * Choose which windows to display in the Visual Basic Editor, and organize their layout so you can use your workspace as effectively as possible + * Customize the toolbar and menus in the Visual Basic Editor so the commands you need are at hand (without cluttering up your workspace) + * Customize the Toolbox so it contains the tools you need to build your user forms + +The following sections explain your options. + +* * * + +Customization Is Global across Applications + +Any customizing you do to the VBA Editor applies across all Office applications using the version of VBA you are customizing. For example, if you change the font in an instance of the Visual Basic Editor hosted by Excel, the font also changes for Editor instances hosted by Word, PowerPoint, Outlook, and so on. + +* * * + +## Choosing Editor and View Preferences + +To begin choosing editor and view preferences, choose Tools ⇒ Options to open the Options dialog box (see Figure 2.17). + +Figure 2.17 The Editor page of the Options dialog box + +### Editor Page Options + +The Editor page of the Options dialog box includes the following settings: + +**Auto Syntax Check** + +Controls whether VBA displays warning message boxes when it discovers errors while automatically checking your syntax as you type lines of code. Some people find this feature helpful because VBA instantly points out errors that could otherwise remain unnoticed until you tried to run or debug your code. 
But if your style is to move from one unfinished line of code to another (and ultimately finish all the lines at your convenience), you may want to turn off this feature to prevent the Visual Basic Editor from bombarding you with message boxes for errors you're aware of but prefer to fix later. This choice is similar to the difference between writers who like to fix spelling errors while they're typing (and thus leave Word's Check Spelling As You Type option active) and those who prefer to keep their eye on the ball and deal with minutia such as spelling after finishing their thoughts. + +* * * + +You'll Always Get a Code Red on Lines with Errors + +Even if you turn off Auto Syntax Check, the Visual Basic Editor still turns any offending lines of code red to draw your attention to them. It simply stops interrupting you with message boxes displaying error warnings each time you mistype something. + +* * * + +**Require Variable Declaration** + +Governs whether you must declare variables explicitly. Declaring variables explicitly is a little more work than declaring them implicitly, but many people believe that it's a good practice and will save you time down the road—so make sure that this check box is selected unless you have a strong preference otherwise. (Chapter 6, "Working with Variables, Constants, and Enumerations," discusses how to work with variables.) + +**Auto List Members** + +Described earlier in this chapter, this option controls whether the Auto List Members and List Constants features automatically suggest properties, methods, and constants as you work in the Code window. Most people find these features helpful, but some experienced programmers turn these features off because they know pretty much all the properties, methods, and constants they need and prefer not to be distracted by a busy interface. + +**Auto Quick Info** + +This option controls whether the Quick Info feature automatically displays information about functions and their parameters as you work with functions in the Code window. + +**Auto Data Tips** + +This option controls whether the Visual Basic Editor displays ScreenTips when you hover the mouse pointer over a variable or expression in Break mode, enabling you to check the value of a variable or expression quickly. (Alternatively, you can use the Locals, Immediate, or Watch window, but these take up more screen space.) + +**Auto Indent** + +Determines whether the Visual Basic Editor automatically indents subsequent lines of code after you've indented a line. When Auto Indent is switched on, the Visual Basic Editor starts each new line of code indented to the same level (the same number of tabs or spaces or the same combination of the two) as the previous line. When Auto Indent is switched off, the Visual Basic Editor starts each new line of code at the left margin of the Code window. Usually, automatic indentation is a time-saver, although it means that each time you need to decrease a new line's level of indentation, you must press Shift+Tab, click the Outdent button on the Edit toolbar, or delete the tabs or spaces. + +**Tab Width** + +Sets the number of spaces in a tab. You can adjust this setting from 1 to 32 spaces. The default setting is 4 spaces, which works well for the default font. If you choose to use a proportional font (such as Times or Arial) rather than a monospaced font (such as the default New Courier) for your code, you may want to increase the number of spaces a tab represents in order to clarify the levels of indentation in your code. 
+ +**Drag-And-Drop Text Editing** + +Controls whether the Visual Basic Editor supports drag-and-drop. Most people find this feature helpful. You can drag portions of your code around the Code window or from one Code window to another. You can also drag code into the Immediate window or drag an expression into the Watch window. + +**Default To Full Module View** + +Controls whether the Visual Basic Editor displays all the procedures in a module in one list (Full Module view) or displays them one at a time (Procedure view). If you're working with short procedures, you may find Full Module view useful. However, the individual view can provide a less cluttered and more workable context for lengthy procedures. When working in Procedure view, you open the procedure you want to work with by choosing it from the Procedure drop-down list at the top of the Code window. To toggle between Full Module view and Procedure view, click the Full Module View button or the Procedure View button in the lower-left corner of any Code window. + +* * * + +Use a Drop-Down List to Quickly Move Procedures + +You can also use the Procedures drop-down list when working in Full Module view to quickly move to a procedure by name. + +* * * + +**Procedure Separator** + +Controls whether the Visual Basic Editor displays horizontal lines to separate the procedures within a module shown in Full Module view in the Code window. Usually these lines are helpful, providing a quick visual cue showing where one procedure ends and the next begins. (If you're using Procedure view, this check box has no effect.) + +### Editor Format Page Options + +The Editor Format page of the Options dialog box, shown in Figure 2.18, controls how code appears in the Visual Basic Editor. + +Figure 2.18 The Editor Format page of the Options dialog box + +By default, comments in your code are rendered in green. This helps you easily recognize that type of text in the code window. You can change the default colors for various types of text by choosing a type of text in the Code Colors list box and then specifying its colors and typeface (font). You have control over Foreground, Background, and Indicator options via drop-down lists. However, I find the default choices sensible, so I don't change them. + +Here's what the Code Colors choices mean: + +**Normal Text** + +Takes care of much of the text in a typical procedure. You'll probably want to make this a conventional color (such as black, the default). + +**Selection Text** + +Affects the color of selected (highlighted) text. + +**Syntax Error Text** + +Affects the color VBA uses for offending lines. The default color is red. + +**Execution Point Text** + +Affects the color VBA uses for the line currently being executed in Break mode. You'll usually want to make this a highlighter color (like the fluorescent yellow the Visual Basic Editor uses as the default) so you can immediately see the current line. + +**Breakpoint Text** + +Affects the color in which VBA displays breakpoints (points where code execution is forced to stop). + +**Comment Text** + +Affects the color of comment lines. The default color is dark green. + +**Keyword Text** + +Affects the color of keywords (words recognized as part of the VBA language). Recall that in this book I'm using the term _command_ for the words in the VBA language. + +Such text accounts for a sizable portion of each procedure. 
You may want to display keywords in a different color than normal text because some people find it helpful to be able to distinguish keywords without needing to read the entire code. The default color is dark blue, which is a good choice—not so intrusive that the characters look like confetti, yet not so hard to see that you can't quickly visualize the underlying syntax of a line of code. + +**Identifier Text** + +Affects the color VBA uses for identifiers. Identifiers include the names of variables, constants, and procedures you define. + +**Bookmark Text** + +Affects the color VBA uses for the bookmarks in your code. + +**Call Return Text** + +Affects the color VBA uses for calls to other procedures. By default, the Visual Basic Editor uses lime green for call return text. + +You can change the font and size of all the types of text in the Code window by using the Font and Size drop-down lists on the Editor Format page. You can also prevent the display of the margin indicator bar (the zone in which items such as the Next Statement and Breakpoint icons appear) by clearing the Margin Indicator Bar check box. (Usually, these icons are helpful, but removing this bar slightly increases the code area onscreen.) + +### General Page Options + +The General page of the Options dialog box contains several categories of settings. The following sections discuss them in groups. I always leave these options set to the default settings, which are shown in Figure 2.19. + +Figure 2.19 The General page of the Options dialog box + +#### _Form Grid Settings Group Box_ + +The Form Grid Settings options control how the Visual Basic Editor handles user forms: + + * The Show Grid check box controls whether the Visual Basic Editor displays a grid pattern of dots on the user form in Design mode to help you place and align controls. This check box is selected by default. + * The Width and Height text boxes set the spacing of the dots that make up the grid. You can set any value from 2 points to 60 points (the default setting is 6 points). If you display the grid onscreen, you'll see the dots; if you don't display the grid, it still affects the Align Controls To Grid feature, discussed next. Experiment and find the coarseness of grid that you find easiest to work with. + * The Align Controls To Grid check box governs whether the Visual Basic Editor automatically snaps the edges of controls you place or move to the nearest grid line. This option lets you place controls in approximately the right positions rapidly and easily, but it prevents you from making extremely fine positional adjustments. The grid enforces certain positions, and you might find it frustrating when trying to improve the layout of controls you've already placed on a user form. (If so, one option is to clear the Align Controls To Grid check box; another is to leave it selected but to decrease the size of the grid—to allow finer adjustments.) + +#### _The Edit and Continue Group Box_ + +The Edit And Continue group box contains only one control—the Notify Before State Loss check box. This option controls whether the Visual Basic Editor warns you, when you're running code, if you try to take an action that requires VBA to reset the values of all variables in the module. 
+ +#### _Error Trapping Group Box_ + +The Error Trapping group box contains three option buttons you use to specify how VBA handles errors that occur when you're running code: + +**Break On All Errors** + +Tells VBA to enter Break mode when it encounters any error, no matter whether an error handler (a section of code designed to handle errors) is active or whether the code is in a class module. Break On All Errors is useful for pinpointing where errors occur, which helps you track them down and remove them. But if you've included an error handler in your code, you probably won't need this option. + +**Break In Class Module** + +This is arguably the most useful option for general use. When VBA encounters an unhandled error in a class module (a module that defines a type of object), VBA enters Break mode at the offending line of code. + +**Break On Unhandled Errors** + +The default setting, this is useful when you've constructed an error handler to deal with predictable errors in the current module. If there is an error handler, VBA allows the handler to trap the error and doesn't enter Break mode, but if there is no handler for the error generated, VBA enters Break mode on the offending line of code. An unhandled error in a class module, however, causes the project to enter Break mode on the line of code that invoked the offending procedure of the class, thus enabling you to identify (and alter) the line that caused the problem. + +#### _Compile Group Box_ + +The Compile group box controls when VBA compiles the code for a project into executable code. Before any code can be executed, it needs to be compiled, but not all the code in a project must necessarily be compiled before the Visual Basic Editor can start executing the first parts of the code. + +You can select the Compile On Demand check box if you want VBA to compile the code only as needed. VBA compiles the code in the procedure you're running before starting to execute that procedure, but it doesn't compile code in other procedures in the same module unless the procedure you're running calls them (transfers execution to them, a technique you'll learn later in this book). + +As a result, execution of the procedure you run first in a module can begin as soon as VBA finishes compiling the code for that procedure. If the procedure then calls another procedure in the module, VBA compiles the code for the second procedure when the first procedure calls it, not when you begin running the first procedure. + +Compile On Demand is usually a good choice. It's especially useful when you're building a number of procedures in a module and have unfinished code lying around in some of them. In contrast, if you clear the Compile On Demand check box, VBA compiles _all_ the code in _all_ the procedures in the module before starting to execute the procedure you want to run. This means that not only does the procedure start a little later (more code takes more time to compile, though most computers today are so fast you won't notice), but any language error or compile error in _any_ procedure in the entire module prevents you from running and testing the current procedure, even if the code in that procedure contains no errors. This is a problem when you've only sketched in some of the procedures, so they remain unfinished. + +Suppose you have a module named Compilation that contains two procedures, GoodCode and BadCode, which look like this: + + Sub GoodCode() + MsgBox "This code is working." 
+ End Sub + + Sub BadCode() + Application.Delete + End Sub + +GoodCode simply displays a message box to indicate that it's working, whereas BadCode contains an invalid statement (Application objects don't have a Delete method). GoodCode runs without causing a problem, but BadCode causes an error every time. + +If you try to run GoodCode with Compile On Demand switched on, the procedure runs fine: VBA compiles only the programming in the GoodCode procedure, finds no errors, and runs it. But if you try to run GoodCode with Compile On Demand switched off, VBA also compiles the code in BadCode before starting to run GoodCode—and VBA stops with a compile error at the bogus Application.Delete statement. This thorough checking before running any code is good for finished modules that work together, but it can slow you down and be annoying when you're just "sketching" code—experimenting with code in a module. + +On the other hand, you can see the advantage of compiling all the code in the module when GoodCode calls BadCode, as in the third line of this version of the procedure: + + Sub GoodCode() + MsgBox "This code is working." + BadCode + End Sub + +Here, compiling the code in BadCode before starting to run GoodCode is a good idea because doing so prevents GoodCode from running if BadCode contains an error. If you run this version of GoodCode with Compile On Demand switched on, VBA compiles GoodCode and starts to run it, displaying the message box in the second line. The BadCode call in the third line then causes VBA to compile BadCode, at which point VBA stops with the compile error. You don't want this to happen in the middle of a complex procedure; in such a case, you'd want Compile On Demand switched off. + +The Background Compile check box, which is enabled only when the Compile On Demand check box is selected, controls whether the Visual Basic Editor uses idle CPU time to compile further code while it's running the code that it has already compiled. Keep Background Compile switched on unless you notice and are bothered by any slowing of the execution of your code. With current computer speeds, and if your projects aren't huge, you'll likely be unaware of any bothersome difference in execution rate. + +#### _Show ToolTips and Collapse Proj. Hides Windows_ + +The final two options on the General page of the Options dialog box are Show ToolTips and Collapse Proj. Hides Windows. Also known as ScreenTips, ToolTips are text descriptions that appear when you hover the mouse pointer over a button or icon. The Show ToolTips check box controls whether the Visual Basic Editor displays ToolTips for its toolbar buttons. ToolTips tend to be useful unless you're desperate to save the memory and processor cycles they consume—which is very unlikely. + +The Collapse Proj. Hides Windows check box controls whether the Visual Basic Editor hides the Code window and other project windows that you collapse in the Project Explorer's tree. This check box is selected by default, and in general it's a useful choice. When you collapse a project in the Project Explorer, the Visual Basic Editor hides any Code windows or user form windows belonging to that project and removes them from the list that appears on the Window menu. When you expand the project again, the Visual Basic Editor displays the windows in their previous positions and restores them to the Window menu's list. 
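
Before we leave the General page, a minimal sketch may help you visualize the Error Trapping options described above. The procedure and its names here are hypothetical:

    Sub SafeDivide()
        On Error GoTo ErrHandler        ' activate the error handler
        Dim divisor As Double
        divisor = 0
        MsgBox 10 / divisor             ' raises a run-time error (division by zero)
        Exit Sub                        ' skip the handler when no error occurs
    ErrHandler:
        MsgBox "Error " & Err.Number & ": " & Err.Description
    End Sub

With Break On Unhandled Errors (the default) selected, the ErrHandler section traps the error and displays a friendly message. With Break On All Errors selected, the Visual Basic Editor enters Break mode on the division line even though a handler is active, which is exactly what you want when you're trying to pinpoint where a problem occurs.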
+ +### Docking Page Options + +The Docking page of the Options dialog box, shown in Figure 2.20, controls whether the various windows in the Visual Basic Editor are dockable—that is, whether they snap automatically and magnetically to a side of the window when you move them there. Keeping windows dockable usually makes for a more organized interface. However, you may want to make the windows undockable so you can drag them off the edge of the Visual Basic Editor if necessary and arrange them as you like on the screen. Contemporary monitors are becoming quite large, so you might have plenty of room to display various windows outside the primary editor window. + +Figure 2.20 The Docking page of the Options dialog box + +## Choosing and Laying Out the Editor Windows + +You can reposition the various windows (or _panes_ ) within the Visual Basic Editor. Your choice of layout depends largely on the size and resolution of your screen and your personal preferences, but here are a couple of suggestions: + + * Always make the Code window large—maximize it. If you write long lines of code, you'll want to have as much space in the Visual Basic Editor window as possible. That way your lines won't wrap and the code will be easier to read. + * Some people find that much of the time they're actively writing code, they can dispense with the Project Explorer, displaying it only when needed. As a handy way of restoring it, you can put the Project Explorer display command on the Code window, Code window break, Watch window, Immediate window, and Locals window context menus. (You'll learn how to customize the editor's menus in the next section.) You can also quickly display the Project Explorer by pressing its shortcut key, Ctrl+R. + * If you're using a multimonitor arrangement, you'll wish you could drag the child windows outside the Visual Basic Editor parent window and onto the second monitor. Unfortunately, they won't go far beyond the boundaries of the parent window. But you can achieve a similar effect by expanding the Visual Basic Editor window from your right-hand monitor onto the left-hand monitor and then docking the Properties window and the Project Explorer on the left-hand monitor. The appearance of the menu bar and toolbar will suffer, but you'll have more space for the Code window, and all three windows will be available. + +## Customizing the Toolbar and Menu Bar + +The Visual Basic Editor supports the same toolbar and menu bar customizations as the classic, pre-Ribbon Microsoft applications used to offer, such as those found in Office 2003. + +However, since the Ribbon was introduced in Office 2007, the lone toolbar is the Quick Access Toolbar, and there are no menus at all in the main application. But the Visual Basic Editor retains the older interface style—enabling you to customize its menus and toolbars in the classic fashion. + +To customize the Visual Basic Editor, choose View ⇒ Toolbars ⇒ Customize (or right-click a displayed toolbar or the menu bar and choose Customize from the context menu) to display the Customize dialog box, shown in Figure 2.21. + +Figure 2.21 Use the Customize dialog box to customize the Visual Basic Editor's menus, toolbars, and context menus. + +* * * + +Limitations of Menu and Keyboard Shortcuts + +The Visual Basic Editor doesn't let you create new menus of your own or customize its keyboard shortcuts. + +* * * + +You can customize the Visual Basic Editor's toolbars, menus, and context menus to suit the way you work. 
Above all, if you use the context menus, be sure to customize them so they provide the commands you need. + +In particular, you may want to add two key commands to the context menus: Comment Block and Uncomment Block. The Comment Block command adds a comment apostrophe (') to the beginning of each line of code in a multiline block of text you select. This transforms these lines into a multiline comment that VBA won't execute. + +The Uncomment Block command reverses the process. It removes the first comment apostrophe from each command in the selected block. This makes the lines executable. (Any line that was commented before you employed the Comment Block command helpfully remains commented after you run the Uncomment Block command. Run the Uncomment Block command again, and you remove further commenting.) + +These commands are available from the Edit toolbar in the normal configuration of the Visual Basic Editor, but you'll probably find it more convenient to make them available at all times from the Code window's context menu. + +The Visual Basic Editor offers the context menus listed in Table 2.1. To customize a context menu, right-click anywhere within the toolbars and menus area. Then choose Customize from the shortcut menu. Now click the Toolbars tab in the Customize dialog box. + +Table 2.1 Context menus in the Visual Basic Editor + +**Context Menu** | **Appears When You Right-Click In or On** +---|--- +MSForms | A user form +MSForms Control | A control on a user form +MSForms Control Group | A group of controls on a user form +MSForms MPC | A multipage control on a user form +Code Window | The Code window in Design mode +Code Window (Break) | The Code window in Break mode +Watch Window | The Watch window +Immediate Window | The Immediate window +Locals Window | The Locals window +Project Window | The Project window in Design mode +Project Window (Break) | The Project window in Break mode +Object Browser | The Object Browser +MSForms Palette | The clear space on a page in the Toolbox +MSForms Toolbox | The tab on a page in the Toolbox +MSForms DragDrop | An item on a user form that can be dragged and dropped elsewhere on the user form +Property Browser | A property in the Properties window +Docked Window | A docked window (for example, the Project Explorer) + +Select the Shortcut Menus check box in the Toolbars list on the Toolbars page of the Customize dialog box. Then click the Commands tab in the Customize dialog box and drag the command you want from the Commands page to the context menu (see Figure 2.22). + +Figure 2.22 Use the Shortcut Menus toolbar to put key commands on the context menus in the Visual Basic Editor. + +Here are some suggestions for customizing the Visual Basic Editor: + + * If you use the Locals window often to track the value of variables when stepping through your code to debug it, place a button for that window on a toolbar that you always keep displayed (the default button for Locals is located by default only on the Debug toolbar), or place an item for it on the context menus for the Code window (both in Design mode and in Break mode), Watch window, and Immediate window. + * Put the Watch window and the Immediate window options on the context menus for the windows from which you'll invoke them. + * If you have a medium-sized monitor, consider grouping all the toolbar buttons you commonly use on one toolbar so that you don't waste space by displaying multiple toolbars horizontally. 
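
To illustrate the Comment Block command mentioned earlier in this section: selecting these two (hypothetical) statements and clicking the Comment Block button

    Selection.TypeText Text:="Draft"
    Selection.TypeParagraph

turns them into comments that VBA skips over:

    'Selection.TypeText Text:="Draft"
    'Selection.TypeParagraph

Clicking the Uncomment Block button with the same lines selected strips the leading apostrophes and makes the statements executable again.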

## Customizing the Toolbox

You can also customize the Toolbox, a special pane that contains controls for building user forms. It can be made visible only when a user form is visible in the Visual Basic Editor. (Chapters 14 and 15 show you how to build user forms.)

You can customize this Toolbox by adding and removing controls and adding new Toolbox pages of your own. Some programmers put their most-used controls on the Toolbox, all on one page, to save themselves time. These controls can include customized variations on the regular Toolbox controls, and by putting them on the Toolbox, you avoid having to customize them again.

For example, many dialog boxes you create need an OK button that dismisses the dialog box, implements some code, and then continues execution of the procedure. Each OK button needs its Name property set to cmdOK, its Caption property set to OK, its Default property set to True, and its Height and Width properties set to a size smaller than the clunky dimensions the Visual Basic Editor assigns by default. Once you've thus customized a command button by modifying all these properties, you can place a copy of the special button on the Toolbox and easily just reuse it for subsequent forms. This saves time. Another candidate for this kind of customization is the TextBox. The default TextBox displays only a single line and uses a nearly unreadable font size of 8. To avoid having to modify these default properties each time you use a TextBox, create a custom TextBox that has multiple lines and is set to a font size of 11.

Another reason to customize the Toolbox is to add advanced controls that extend the things you can do with dialog boxes and user forms.

### Adding Controls to the Toolbox

The first way you'll probably want to add controls to the Toolbox is directly from a user form. For example, once you've created your custom OK and Cancel buttons, or a TextBox, you can copy them from the user form to the Toolbox so you can reuse them in any user forms you subsequently create.

To copy one of your custom controls from a displayed user form to the Toolbox, just drag it and drop it, as shown in Figure 2.23. (Chapter 14 shows you how to put controls onto user forms you create yourself.)

Figure 2.23 The quickest way to add a control to the Toolbox is to drag it there from a user form.

Microsoft and other vendors also provide a variety of prewritten controls you can add to your Toolbox. To add these controls, follow these steps:

1. Right-click in the Toolbox page where you want to add controls. (You'll learn how to add new pages to the Toolbox in the section "Adding Pages to the Toolbox" a little later in this chapter.)

2. Choose Additional Controls from the context menu to display the Additional Controls dialog box shown in Figure 2.24.

3. In the Available Controls list box, click the check boxes for the controls you want to add to the Toolbox, and then click the OK button.

Figure 2.24 In the Additional Controls dialog box, select the check boxes for the controls you want to add, and then click the OK button.

Once you are finished, if you would like to collapse the list to only the currently selected items, click the Selected Items Only check box in the Show group box.

Depending on your computer and what software is installed on it, you may find a variety of interesting and useful controls.
There are numerous controls, but these are among the most noteworthy: + + * A set of Microsoft Outlook controls + * A control for Apple's QuickTime + * A status-bar control + +Some of these controls can add important functionality to your macros. You can also search the Internet for additional specialized controls like calendars, security locks, and so on. Adding prebuilt controls can save you time because you simply drag and drop functionality onto your user forms—functionality that doesn't require you to spend days writing code. + +You can move a control from one page of the Toolbox to another by dragging it from the page it's on and moving the mouse pointer (still dragging) over the tab of the destination page to display that page. Then, move the mouse pointer down (again, still dragging) into the body of that page and drop the control. + +### Renaming a Toolbox Control + +When you move the mouse pointer over a control in the Toolbox, a ScreenTip appears, showing the name of that control. To rename a control, right-click it in the Toolbox and choose the Customize option from the context menu to display the Customize Control dialog box. + +Type the name for the control in the Tool Tip Text box in the Customize Control dialog box (delete or change the existing name as necessary). This name appears as a ScreenTip when the user moves the mouse pointer over the control in the Toolbox. Then, if you wish, assign a different picture to the control's Toolbox icon, as described in the next section. Otherwise, click the OK button to close the Customize Control dialog box. + +### Assigning a Picture to a Control's Toolbox Icon + +Each control in the Toolbox is identified by a picture. You can assign a new picture to the control by displaying the Customize Control dialog box, clicking the Load Picture button, and selecting the picture or icon in the resulting dialog box. + +You can edit the picture assigned to some controls by displaying the Customize Control dialog box, clicking the Edit Picture button, and using the Edit Image dialog box to color the pixels that make up the picture. + +### Removing Controls from the Toolbox + +To remove a control from the Toolbox, right-click it and choose Delete from the context menu. The item is identified by the name of the control—for example, if you right-click a control named Company Name Combo Box, the menu item is named Delete Company Name Combo Box. + +If the item is a custom control you created, this action gets rid of the control and you can't restore it (unless you have a copy elsewhere). If the item is one of the Microsoft-supplied controls that come with the Microsoft Forms 2.0 package (which is part of VBA), you can restore it to the Toolbox using the Additional Controls dialog box. Just select the check box for the appropriate object (for example, Microsoft Forms 2.0 CommandButton). + +You can also remove controls from the Toolbox by deleting the entire page they're on. See "Removing Pages from the Toolbox," later in this chapter. + +### Adding Pages to the Toolbox + +To add a page to the Toolbox, right-click the tab at the top of a page (or the label on the tab) and choose New Page from the context menu. The Visual Basic Editor adds a new page named New Page, to which it adds the Select Objects control. You'll probably want to rename the new page immediately. + +By the way, the Select Objects control (its icon is a black arrow) appears on _every_ page in the Toolbox, and you can't remove it. 
This is strange since you can go years without ever clicking it. This "control" is unlike others. It isn't added to a form. Instead, it must be selected in the Toolbox when you're resizing or repositioning, or when you otherwise need to select a true control on the form. However, when you merely click a control (and following many other actions), VBA automatically activates this "select object" feature—so you'll find that you never actually click it. + +### Renaming Pages in the Toolbox + +To change the name of a Toolbox page, right-click its tab or label and choose Rename from the context menu to display the Rename dialog box. Type the name in the Caption text box, type any control tip text in the Control Tip Text box, and click the OK button to close the dialog box. + +### Removing Pages from the Toolbox + +To remove a page from the Toolbox, right-click its tab or label and choose Delete Page from the context menu. The Visual Basic Editor removes the page from the Toolbox without any confirmation, regardless of whether the page contains controls. + +### Importing and Exporting Toolbox Pages + +If you want to share Toolbox pages, you can save them as separate files and distribute them to your colleagues. Toolbox pages have a .pag filename extension. + +To import a Toolbox page, right-click the tab or label on an existing page in the Toolbox and choose Import Page from the context menu to display the Import Page dialog box. Select the page you want to import and click the Open button in the dialog box. The Visual Basic Editor adds the new page after the last page currently in the Toolbox and names it New Page. + +Right-click the page's tab or label, choose Rename, type a new name and description, and then click the OK button. + +Likewise, you can export a Toolbox page by right-clicking its tab or label and choosing Export Page from the context menu to display the Export Page dialog box. Type a name for the page, choose the folder in which to save it, and then click the Save button to save it. Now anyone can import your page into their editor as described previously. + +### Moving Pages in the Toolbox + +To move a page in the Toolbox, right-click its tab or label and choose Move from the context menu to display the Page Order dialog box. In the Page Order list box, select the page or pages you want to move (Shift+click to select multiple contiguous pages, Ctrl+click to select multiple pages individually) and use the Move Up and Move Down buttons to rearrange the pages as desired. Click the OK button to close the Page Order dialog box when you've finished. + +# The Bottom Line + +**Open the Visual Basic Editor.** + +When you want to create a new macro by hand-programming (as opposed to recording) or need to modify or test a macro, the Visual Basic Editor is a powerful tool. + +**Master It** + +Open the Visual Basic Editor in Word and create a simple macro. + +**Open a Macro in the Visual Basic Editor.** + +You edit and test macro code in the Code window of the Visual Basic Editor. + +Master It + +Open the Visual Basic Editor and display a particular macro in the Code window. + +**Understand the Project Explorer's two views.** + +The Project Explorer window displays a tree of current projects. You can choose between viewing only the files or the folders and files. + +Master It + +Switch between folder and contents view in the Project Explorer. + +**Set properties for a project.** + +You can specify a project's name, an associated Help file, and other qualities of a project. 
+ +Master It + +Lock a project so others can't modify or even read its contents. + +**Customize the Visual Basic Editor.** + +The Visual Basic Editor can be customized in many ways, including personalizing classic menus and toolbars. + +Master It + +Undock the Properties window and change its size. Then redock it. +Chapter 3 + +Editing Recorded Macros + +In this chapter, you'll use the Visual Basic Editor to edit the Word and Excel macros you recorded with the Macro Recorder in Chapter 1, "Recording and Running Macros in the Office Applications." In addition, you'll create a new macro in PowerPoint and see how to edit it. Even if you're working with an application that doesn't include the Macro Recorder (such as PowerPoint), you may still want to read through this chapter because it shows you how to use some of the key editing features of the Visual Basic Editor. + +There are three reasons for working with macros in the Visual Basic Editor: + + * First, to fix any problems in the behavior of a macro you recorded. For example, if you accidentally hit the Enter key while recording the macro, the macro will keep performing that wrong instruction every time you run it unless you remove or change the instruction. You would want to delete this line of code in your macro: + + Selection.TypeParagraph + +(Alternatively, it's sometimes easier to just rerecord the macro.) + + * Second, to add further instructions to the macro to make it behave differently. This is a great way to get started learning VBA because sometimes by just making relatively small or simple changes to a recorded macro, you can greatly increase its power and flexibility. In the process, you become familiar with the language. + * Third, to create new macros by writing them in the Visual Basic Editor instead of recording them. You can write a new macro from scratch or paste in parts of an existing macro, as appropriate. + +In this chapter you will learn to do the following: + + * Test a macro in the Visual Basic Editor + * Set breakpoints and use comments + * Edit the recorded Word macro + * Edit the recorded Excel macro + * Edit a new PowerPoint macro + +# Testing a Macro in the Visual Basic Editor + +If a macro fails when you try to run it from the host application, the quickest way to find out what's going wrong is to open the macro in the Visual Basic Editor, run it, and see where in the code it fails: + +1. In the host application, press Alt+F8 or choose Tools ⇒ Macro ⇒ Macros to display the Macros dialog box. + +2. Select the macro, and then click the Edit button. The host application opens an instance of the Visual Basic Editor and displays the macro for editing. + +3. Start the macro running by pressing F5. Alternatively, you could choose Run ⇒ Run Sub/UserForm or click the Run Sub/UserForm button (a green arrow) on the Standard toolbar in the Visual Basic Editor (see Figure 3.1). + +Figure 3.1 Click the Run Sub/UserForm button on the Standard toolbar to start running the code. + +4. If the macro encounters an error and halts execution (goes into _Break mode_ ), VBA displays an error-message box onscreen and selects the offending statement in the Code window (displays white letters on a blue background). You can then edit the statement to fix the problem. Once you've done so, step through the macro as described in the next section. 
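
If you'd like to watch this behavior in a safe setting, run a trivial macro that's deliberately broken. This sketch is hypothetical and assumes you're working in Word with fewer than 999 documents open:

    Sub WatchItFail()
        MsgBox "About to hit a run-time error..."
        Documents(999).Activate   ' no such member of the collection, so VBA halts here
    End Sub

Press F5 to run it: VBA displays its error-message box, and clicking the Debug button drops you into Break mode with the offending statement highlighted.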
+ +* * * + +Understanding the VBA Editor Modes + +The VBA Editor is always in one of three modes: + + * _Design mode_ when you're designing a user form or writing code + * _Execution mode_ when you've pressed F5 and are running your code, usually to see how it behaves to test it + * _Break mode_ when execution has been halted (so you can examine variables or otherwise take a look at what's going on in the code) + +The Editor halts execution and enters Break mode in several ways: when you press Ctrl+Break, each time you press F8 to single-step through the code, when it encounters a breakpoint that you've set within the code (discussed shortly), or when certain types of errors occur. + +You can tell if you're in Break mode by looking at the Editor's title bar. If you see the word [ _break_ ], you're in Break mode. If it just says _Normal_ , you're in Design mode. When you're in Break mode, you can return to normal Design (editing) mode (so you can type in the Code window to revise and retest the macro) by clicking the Reset button on the Standard toolbar in the Visual Basic Editor (it's the blue square next to the equals sign (Break button)—see Figure 3.1). If you ever find yourself unable to type in the Editor, or the Editor is otherwise behaving strangely, remember to click this Reset button to get out of Break mode and restore normalcy. + +* * * + +* * * + +Test Macros Only on Files You Don't Care About + +Always test your macros on files (or copies of files) that you don't care about. There are few better ways to lose valuable work than to unleash untested macros on a document and watch it get mangled or worse. Store your code in a central location (such as Normal.dotm in Word or the Personal Macro Workbook in Excel) so that it's accessible to all your files rather than only the file that contains it. If you create a macro in the wrong file, export it from that file and import it into your centralized storage. To export the macro, right-click its module in the Project Explorer, choose Export File from the context menu, use the Export File dialog box to specify the folder and filename, and then click the Save button. To import a module, right-click the destination project in the Project Explorer, choose Import File, select the file in the Import File dialog box, and then click the Open button. + +* * * + +## Stepping through a Macro + +To see exactly what a macro does (and what it does wrong), you can _step through_ the macro—go through the macro, executing one command at a time—so that you can see the effect of each command. Stepping through a macro can be time-consuming—you're seeing the macro run in slow motion—but it's one of the best ways to identify problems and fix them. + +Usually debugging is a matter of finding out _where_ in the code something goes wrong. And although you generally already know _what_ goes wrong, you still need to figure out the location of the problem in your code; then you can figure out how the error happens. + +To step through a macro, follow these steps: + +1. Open the host application, and then open the macro for editing: press Alt+F8, select the macro, and then click the Edit button. + +2. Sometimes it's helpful to arrange the Visual Basic Editor window and the host application's window so that you can see them both simultaneously. Either arrange the windows manually or use a Windows command to do so. For example, stack the windows by right-clicking in open space on the Windows Taskbar and choosing Show Windows Stacked from the context menu. 
Alternatively, you can select Show Windows Side By Side. If you have any other applications currently running, minimize them so they won't be included in your stack. (If you have two monitors, you can dedicate one to the Editor and one to the application.) In Windows 7 or 8, the quickest way to display two windows is to drag one of them to the far left (drop it, and it will snap to that location and resize so it takes up 50 percent of the screen). Drag the other window to the right. + +3. Set up conditions the macro expects. Perhaps you need to have a document open. For example, to run properly, a macro that applies a style to a paragraph requires that a paragraph is actually available. + +4. Click somewhere in the macro code. The location of the insertion cursor is how the Editor decides which macro you want to work with. + +5. Press F8 to step through the macro command by command. Each time you press F8, one line of your VBA code will be executed. The Visual Basic Editor highlights each command as it's executed, and you can watch the effect in the application window to catch errors. + +* * * + +Pressing F8 Is the Easiest Way to Step Through Macros + +You can also step through a macro by choosing Debug ⇒ Step Into or clicking the Step Into button on the Debug toolbar, but the F8 key is easiest to use. After all, you'll often need to step repeatedly until you locate the problem. Pressing a single key is quite a bit more efficient than repetitively opening a menu. + +* * * + +Figure 3.2 provides an example of stepping through a macro recorded in Word. As you'll see, to catch what a macro is doing wrong, arrange the application window and the Visual Basic Editor window so that you can see them both. Then step through the macro by pressing the F8 key or using the Step Into command. + +Figure 3.2 Stepping through a macro recorded in Word + +You'll learn about debugging macros in detail in Chapter 17, "Debugging Your Code and Handling Errors." However, let me briefly introduce two additional important techniques that can help you locate bugs in your macros: setting breakpoints and commenting out lines. + +## Setting Breakpoints + +A _breakpoint_ can be set on a line of code to tell VBA to stop executing the macro there. By using a breakpoint, you can run quickly through known functional parts of a macro at full speed (press F5 to run), and then the Editor automatically stops at the breakpoint. You put a breakpoint just before where you suspect a bug is located in the code. That way, you don't have to step through _all_ your code. You can execute the macro at normal, rapid speed—but then halt near the suspicious location and begin pressing F8 to step through the code, executing it slowly, statement by statement, to closely observe the behaviors. You can set as many breakpoints as you wish. + +To toggle a breakpoint on or off, right-click in a line of executable code (not a comment line, described in the following section) and choose Toggle ⇒ Breakpoint from the context menu or click the Toggle Breakpoint button on the Edit toolbar. Even easier, just click in the gray margin indicator bar to the left of the line of code. + +A line of code on which you set a breakpoint is shaded red by default. The breakpoint itself is designated by a red circle in the margin indicator bar (see Figure 3.3). + +Figure 3.3 Use a breakpoint (the red circle that appears in the margin indicator bar) to stop code execution at a line of your choice. 
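
One related trick: because breakpoints aren't saved with your code (see the sidebar that follows), some programmers temporarily insert a Stop statement where they want execution to halt. Stop behaves like a hard-coded breakpoint that survives between editing sessions; just remember to remove it when you've finished debugging. A minimal, hypothetical sketch:

    Sub FillReport()
        ' ...statements you already trust run at full speed when you press F5...
        Stop    ' execution halts here and the Editor enters Break mode
        ' ...press F8 from here to step through the suspect statements...
    End Sub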
+ +* * * + +Breakpoints Are Not Persistent + +Breakpoints are temporary—the Visual Basic Editor doesn't save them with your code. You must specify them for each editing session. + +* * * + +## Commenting Out Lines + +Like most programming languages, VBA lets you add comments to your code so that it's easier to understand. Comments can be invaluable both when you're creating code and when you're revisiting your own code long enough after you've written it to forget what it does—or, worse, trying to figure out what someone else's code does. + +But there's another use for commenting. You can also _comment out_ lines of code to prevent the Visual Basic Editor from executing them. In other words, comments are normally just notes to self that are not part of the macro proper—they are not written in VBA. However, sometimes while debugging you'll want to comment out an actual line of executable code in your macro. That way during execution, this line is simply not executed. It's ignored. + +This can be a useful technique for temporarily skipping over suspect lines of code without actually removing them from the macro. Then you run the code and see what the difference is with the commented lines ignored. If the bug goes away, it's probably located within the lines that are commented out. + +To comment out a line manually, type an apostrophe (') at the very beginning of the line. Alternatively, you can use the Rem command instead of the apostrophe. ( _Rem_ is short for _remark_ , and comment lines are sometimes called remark lines.) To uncomment the line manually, just delete the apostrophe or Rem. + +The Visual Basic Editor provides the Comment Block and Uncomment Block commands for commenting out multiple lines automatically. Select the lines of code (or click in the single line you want to affect), and then click the Comment Block button on the Edit toolbar to place an apostrophe at the beginning of each line; to uncomment a line or a group of selected lines, click the Uncomment Block button, and the Visual Basic Editor removes an apostrophe from each line. + +The Comment Block and Uncomment Block commands work only with apostrophes, not with Rem lines. If you prefer to use Rem, you must comment and uncomment lines manually. Few people, though, use Rem these days. + +* * * + +Comment Block Commands Can Be Efficient + +The Comment Block command adds an apostrophe to the beginning of each line in the selected block, even for lines that are already commented off (this does no harm). Likewise, the Uncomment Block command removes apostrophes one at a time from each line in the selected block rather than removing all apostrophes at once. This behavior helps preserve comment lines and enables you to use different levels of commenting. + +* * * + +## Stepping Out of a Macro + +Once you've identified and fixed the problem with a macro, you probably won't want to step through the rest of the macro command by command. To run the rest of the macro and the rest of any macro that called it (triggered it), you can press the F5 key. Alternatively, you can click the Run Sub/UserForm button on the Standard toolbar or the Debug toolbar (see Figure 3.4), or you can choose Run ⇒ Continue. If you want to run only the rest of _this_ macro, and then return to stepping through the macro that called this one, use the Step Out command. 
The Step Out command finishes executing the current macro or procedure at full speed, but if the code then continues with another procedure, the Visual Basic Editor reverts to Break mode so you can examine that procedure's code. We'll explore what it means to _call_ procedures later in this book.

Figure 3.4 The Debug toolbar contains commands for running code, stepping into it and out of it, and displaying key windows for debugging.

To issue the Step Out command, press Ctrl+Shift+F8, click the Step Out button on the Debug toolbar, or choose Debug ⇒ Step Out.

# Editing the Word Macro

Now, edit the Transpose_Word_Right macro that you recorded in Word in Chapter 1, and use it to build another macro. To begin, open the macro in the Visual Basic Editor:

1. Start Word if it's not already running, or activate it.

2. Press Alt+F8 or choose Tools ⇒ Macro ⇒ Macros to display the Macros dialog box.

3. Select the Transpose_Word_Right macro, and then click the Edit button.

In the Code window, you should see code similar to Listing 3.1, except for the line numbers, which I'm using here to identify the lines of code.

**Listing 3.1**: The recorded transpose-words macro

     1. Sub Transpose_Word_Right()
     2. '
     3. ' Transpose_Word_Right Macro
     4. ' Transposes the current word with the word to its right. _
     5. 'Created 5/5/13 by Nanci Selest-Gomes.
     6. '
     7. Selection.Extend
     8. Selection.Extend
     9. Selection.EscapeKey
     10. Selection.Cut
     11. Selection.MoveRight Unit:=wdWord, Count:=1
     12. Selection.PasteAndFormat (wdFormatOriginalFormatting)
     13. Selection.MoveLeft Unit:=wdWord, Count:=1
     14. End Sub

Here's what the macro does:

 * Line 1 starts the macro with the Sub Transpose_Word_Right() statement, and line 14 ends the macro with the End Sub statement. The Sub and End Sub lines mark the beginning and end of the macro (as they do any macro).
 * Lines 2 and 6 are blank comment lines the Macro Recorder inserts to make your macro easier to read. You can use any number of blank lines or blank comment lines in a macro to help separate statements into groups. (A blank line doesn't have to be commented out—it can just be blank—but the Macro Recorder has added commenting to these blank lines to make it clear what they are.)
 * Lines 3 through 5 are comment lines that contain the name of the macro and its description. The Macro Recorder entered these lines from the information you typed into the Record Macro dialog box.
 * Line 7 records the first keystroke of the F8 key, which starts Extend mode—a way of selecting text in a Word document.
 * Line 8 records the second keystroke of the F8 key, which continues Extend mode and thereby selects the current word.
 * Line 9 records the keystroke of the Esc key, which cancels Extend mode.
 * Line 10 records the Cut command, which cuts the selection (in this case, the selected word) to the Clipboard.
 * Line 11 records the Ctrl+→ command, which moves the insertion point one word to the right.
 * Line 12 records the Paste command, which pastes the selection into the document at the current position of the insertion point. Whatever formatting was originally applied to the selection is retained (rather than applying the formatting in effect at the new location).
 * Line 13 records the Ctrl+← command, which moves the insertion point one word to the left.

## Stepping Through the Transpose_Word_Right Macro

Try stepping through this macro in Break mode using the Step Into command:

1.
Arrange your screen so you can see both the active Word window and the Visual Basic Editor window (for example, by right-clicking the Taskbar and choosing Show Windows Stacked from the context menu or by snapping each window to a side of the screen). + +2. Click in the Visual Basic Editor, and then click to place the blinking insertion point at the start (on the Sub) of the Transpose_Word_Right macro in the Code window. + +3. Press F8 to step through the code one active line at a time. You'll notice that VBA skips the blank lines and the comment lines because they're supposed to be ignored. VBA highlights the current statement each time you press F8, and you see the actions taking place in the Word window. + +The Visual Basic Editor leaves Break mode when it reaches the end of the macro (in this case, when you press F8 to execute the End Sub statement in line 14). The Editor returns to Design mode. You can also exit Break mode at any time by clicking the Reset button (blue square) on the Standard or the Debug toolbar or by choosing Run ⇒ Reset. + +## Running the Transpose_Word_Right Macro + +If the macro works fine when you step through it, you may also want to run it from the Visual Basic Editor. Just press F5. In Break mode, F5 executes the macro from the current instruction (where the insertion cursor is located). + +## Creating a Transpose_Word_Left Macro + +At this point we'll modify the macro. We'll create a Transpose_Word_Left macro by making minor adjustments to the Transpose_Word_Right macro. Follow these steps. + +1. In the Code window, select all the code for the Transpose_Word_Right macro, from the Sub Transpose_Word_Right() line to the End Sub line. You can select in three ways: by dragging with the mouse, by holding down Shift and using the arrow keys to extend the selection, or by positioning the insertion point at one end of the macro and then Shift+clicking the other end. + +2. Copy the code by issuing a Copy command (for example, by right-clicking and choosing Copy from the context menu or by pressing Ctrl+C or Ctrl+Insert). + +3. Click to move the insertion point to the line below the End Sub statement for the Transpose_Word_Right macro in the Code window. + +4. Paste the code by issuing a Paste command (by right-clicking and choosing Paste from the context menu or by pressing Ctrl+V or Shift+Insert). The Visual Basic Editor automatically enters a horizontal line between the End Sub statement for the Transpose_Word_Right macro and the new macro you've pasted. + +5. Change the name of the second Transpose_Word_Right macro to Transpose_Word_ **Left** by editing the Sub line: + + Sub Transpose_Word_Left() + +6. Edit the comment lines at the beginning of the macro accordingly—for example, + + 'Transpose_Word_Left Macro + 'Transposes the current word with the word to its left. _ + 'Created 5/5/13 by Nanci Selest-Gomes. + +7. Now all you need to do is replace the MoveRight method with the MoveLeft method. This will move the insertion point one word to the left instead of one word to the right. While you could do that by typing the correction or by using Cut and Paste to replace the Selection.MoveRight line with the commented-out Selection.MoveLeft line, try using the List Properties/Methods feature instead. Just for practice, follow these steps: + +a. Click to place the insertion point in the word MoveRight. + +b. Click the List Properties/Methods button on the Edit toolbar to display the list of properties and methods. It's the first button on the far left. 
Or just press Ctrl+J. (If the Edit toolbar isn't visible, right-click one of the existing toolbars and choose Edit from the context menu.)

c. Double-click the MoveLeft method in the list to make it replace the MoveRight method in the code line.

8. Now that you no longer need it, delete the line Selection.MoveLeft Unit:=wdWord, Count:=1 from the end of the macro.

You should end up with a macro that looks like Listing 3.2.

**Listing 3.2**: The edited transpose-words macro

    Sub Transpose_Word_Left()
    '
    ' Transpose_Word_Left Macro
    ' Transposes the current word with the word to its left. _
    'Created 5/5/13 by Nanci Selest-Gomes.
    '
    Selection.Extend
    Selection.Extend
    Selection.EscapeKey
    Selection.Cut
    Selection.MoveLeft Unit:=wdWord, Count:=1
    Selection.PasteAndFormat (wdFormatOriginalFormatting)
    End Sub

Try stepping through this macro to make sure it works. If it does, you're ready to save it—and perhaps to create a Quick Access Toolbar button, or keyboard shortcut, for it in Word if you plan to use it in your writing.

## Save Your Work

When you finish working with this or any other macro, choose File ⇒ Save (Ctrl+S) from the Visual Basic Editor to save the document or template that contains the macro and the changes you've made to it. Then press Alt+Q or choose File ⇒ Close And Return To Microsoft Word to close the Visual Basic Editor and return to Word.

# Editing the Excel Macro

In the following sections, you'll edit the Excel macro that you recorded in Chapter 1. This time, you won't create a new macro—instead, you'll add to the existing one.

## Unhiding the Personal Macro Workbook

Before you can edit the Excel macro, you'll need to unhide the Personal Macro Workbook if it's currently hidden:

1. Open the View tab on the Ribbon.

2. If the Unhide button is gray (disabled) in the Window group, then no workbooks are hidden, including Personal. You can skip the following steps. However, if the Unhide button is black (enabled), click it to display the Unhide dialog box.

3. Select PERSONAL.XLSM or PERSONAL.XLSB and click the OK button. If you stored the macro from Chapter 1 in another workbook, open that workbook before trying to proceed. To hide the Personal Macro Workbook again after editing the macro, click the Hide button on the Ribbon while the Personal Macro Workbook is active.

* * *

Creating a Backup Copy of Your Files

Eventually you'll have a collection of macros in the Personal workbook. It's a good idea to keep a backup copy of these files in case something happens—such as reinstalling your Office applications when you buy a new computer. You don't want to lose your macro collection. To create a backup file, just locate PERSONAL.XLSB in Windows 8 by pressing the Windows key+F (or in Windows 7, just by pressing the Windows key) to open the Windows Search field and typing in its name. Then right-click PERSONAL.XLSB in the search-results list and choose Open File Location.

Now you can copy the file, save it to another location, and rename it something like PERSONAL.BAK. You can also find PERSONAL.XLSB by using Windows Explorer to locate it in this folder: Users\ _YourNameHere_ \AppData\Roaming\Microsoft\Excel\XLStart.

Also make a backup copy of any other important macro collections, such as Word's Normal.dotm file.

* * *

## Opening the Macro for Editing

Now take the following steps to open the macro you recorded in Chapter 1 for viewing and editing:

1. Press Alt+F8 to display the Macros dialog box.
+ +2. Select the macro named New_Workbook_with_Months. + +3. Click the Edit button to display the macro for editing in the Visual Basic Editor. Listing 3.3 shows code similar to what you should be seeing. + +**Listing 3.3**: New "workbook with months added" macro + + 1. Sub New_Workbook_with_Months() + 2. ' + 3. ' New_Workbook_with_Months Macro + 4. ' Creates a new workbook with the months filled in for a year. + 5. ' + 6. ' + 7. Workbooks.Add + 8. Range("A1").Select + 9. ActiveCell.FormulaR1C1 = "Jan-2011" + 10. Range("B1").Select + 11. ActiveCell.FormulaR1C1 = "Feb-2011" + 12. Range("A1:B1").Select + 13. Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault + 14. Range("A1:L1").Select + 15. ActiveWorkbook.SaveAs Filename:= _ + "C:\Users\ _Richard_ \Documents\Sample Workbook.xlsx", FileFormat:= _ + xlOpenXMLWorkbook, CreateBackup:=False + + 16. End Sub + +(If you are using a version of Office prior to Office 2013, the file location specified in line 15 is likely C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\Sample Workbook.xlsx. Replace _Richard_ with your name.) + +Here's what happens in the macro in Listing 3.3: + + * Line 1 starts the macro with the Sub New_Workbook_with_Months() statement, and line 16 ends the macro with the End Sub statement. + * Lines 2, 5, and 6 are comment lines that the Macro Recorder automatically adds. (The comment line in line 6 seems superfluous. It's there because Excel allows you to enter two lines in the Description text box in the Record Macro dialog box, but this macro uses only one line. Delete any blank or comment lines you wish. They'll have no effect on the behavior of the macro, though removing them could make it less readable in the Editor. It's your call.) + * Line 3 is a comment line that gives the macro's name and describes it as a macro, and line 4 contains the description from the Record Macro dialog box. + * Line 7 creates a new blank workbook by using the Add method on the Workbooks collection object. (A _collection_ object, or more concisely a _collection_ , is an object that contains objects of a given type. For example, a worksheet will contain a PivotTables collection of all the PivotTables on that worksheet.) + * Line 8 selects the Range object A1, making cell A1 active. + * Line 9 enters Jan-2011 in the active cell. Notice that the Macro Recorder has stored the parsed date value rather than the text that you typed in (January 2011). Also, keep in mind that the date displayed in the cell may be in a different format than MMM. + * Line 10 selects the Range object B1, making cell B1 active, and line 11 enters Feb-2011 in that cell. + * Line 12 selects the range A1:B1. + * Line 13 performs a default AutoFill operation on the range A1:L1, and line 14 selects that range. Note how the Macro Recorder has recorded two separate actions, although in the Excel interface you performed only one action. + * Line 15 saves the workbook under the name and folder given. Note that the Macro Recorder has automatically broken this long statement onto three lines by using the continuation character, an underscore preceded by a space. You can break lines of code anywhere between keywords to make the lines of code a comfortable length for working within the Editor. Again, lines broken with an underscore at the end have no effect on macro execution. They're merely formatting issues, so it's your call. + +## Editing the Macro + +Now modify the macro by following these steps: + +1. Select lines 8 through 13. + +2. 
## Editing the Macro

Now modify the macro by following these steps:

1. Select lines 8 through 13.

2. Copy these lines by pressing Ctrl+C or right-clicking in the selection and choosing Copy from the context menu.

3. Click at the start of line 14 to move the insertion point there.

4. Paste the copied lines by pressing Ctrl+V, choosing Edit ⇒ Paste, or right-clicking at the insertion point and choosing Paste from the context menu.

5. If necessary, press the Enter key to move the line Range("A1:L1").Select down one line. (Press Enter if this code is red, indicating that it should be moved down one line rather than appended to line 13's code.)

Your new macro should look like Listing 3.4.

**Listing 3.4**: New extended version

     1. Sub New_Workbook_with_Months()
     2. '
     3. ' New_Workbook_with_Months Macro
     4. ' Creates a new workbook with the months filled in for a year.
        ' Recorded 5/5/13 by Abe Normal.
     5. '
     6. '
     7. Workbooks.Add
     8. Range("A1").Select
     9. ActiveCell.FormulaR1C1 = "Jan-2011"
    10. Range("B1").Select
    11. ActiveCell.FormulaR1C1 = "Feb-2011"
    12. Range("A1:B1").Select
    13. Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault
    14. Range("A1").Select
    15. ActiveCell.FormulaR1C1 = "Jan-2011"
    16. Range("B1").Select
    17. ActiveCell.FormulaR1C1 = "Feb-2011"
    18. Range("A1:B1").Select
    19. Selection.AutoFill Destination:=Range("A1:L1"), Type:=xlFillDefault
    20. Range("A1:L1").Select
    21. ActiveWorkbook.SaveAs Filename:= _
            "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\Sample Workbook.xlsx", _
            FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    22. End Sub

Now, change the macro by taking the following steps:

1. Delete line 6. It's not doing any good, just taking up space in the Code window.

2. Delete line 20. It's not necessary for what the macro does—you don't need the macro to select the range, because the AutoFill instruction in line 19 is enough to perform the AutoFill operation without selecting the range.

3. Change line 14 to select cell A2 instead of cell A1:

    Range("A2").Select

4. Change line 15 so that it enters the value 100 instead of Jan-2011:

    ActiveCell.FormulaR1C1 = 100

5. Change line 16 to select cell B2 instead of cell B1:

    Range("B2").Select

6. Change line 17 so that it enters the value 200 instead of Feb-2011:

    ActiveCell.FormulaR1C1 = 200

7. Change line 18 so that it selects the range A2:B2:

    Range("A2:B2").Select

8. Change line 19 so that it performs the AutoFill operation on the range A2:L2:

    Selection.AutoFill Destination:=Range("A2:L2"), Type:=xlFillDefault

9. Break line 13 with a space, underscore, and carriage return before the Type argument, as shown here. Indent the second line by one tab.

    Selection.AutoFill Destination:=Range("A1:L1"), _
        Type:=xlFillDefault

10. Similarly, break line 19 with a space, underscore, carriage return, and tab before the Type argument.

11. Click the Save button or choose File ⇒ Save to save the changes you made.

The macro should now read like Listing 3.5.

**Listing 3.5**: Streamlined macro

     1. Sub New_Workbook_with_Months()
     2. '
     3. ' New_Workbook_with_Months Macro
     4. ' Creates a new workbook with the months filled in for a year.
        ' Recorded 5/5/13 by Abe Normal.
     5. '
     6. Workbooks.Add
     7. Range("A1").Select
     8. ActiveCell.FormulaR1C1 = "Jan-2011"
     9. Range("B1").Select
    10. ActiveCell.FormulaR1C1 = "Feb-2011"
    11. Range("A1:B1").Select
    12. Selection.AutoFill Destination:=Range("A1:L1"), _
            Type:=xlFillDefault
    13. Range("A2").Select
    14. ActiveCell.FormulaR1C1 = 100
    15. Range("B2").Select
    16. ActiveCell.FormulaR1C1 = 200
    17. Range("A2:B2").Select
    18. Selection.AutoFill Destination:=Range("A2:L2"), _
            Type:=xlFillDefault
    19. ActiveWorkbook.SaveAs Filename:= _
            "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\temp.xlsx", _
            FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    20. End Sub

Now step through the macro and watch what happens: it creates the new workbook as before and enters the months, but then it enters the values 100 through 1200 in the second row of cells. This one is fun to watch on a split screen because you can see the cells fill with data as you step through it.

At the end, the macro attempts to save the workbook as before. However, an error message or dialog box warns that a previous workbook exists by this name (unless you've already deleted it). Later you'll see how to handle this type of error so the macro doesn't halt or confuse the user with these kinds of odd error messages or dialog boxes.
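Until then, if the prompt gets in your way while you're experimenting, one common workaround (shown here as a sketch, not part of the recorded macro) is to turn off Excel's alerts around the SaveAs call so the earlier copy is silently overwritten:

    ' Sketch: replace the SaveAs statement with this sequence to
    ' suppress the "file already exists" prompt during testing.
    Application.DisplayAlerts = False
    ActiveWorkbook.SaveAs Filename:= _
        "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\temp.xlsx", _
        FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False
    Application.DisplayAlerts = True

Remember to set DisplayAlerts back to True; leaving it off hides every warning Excel would normally show you.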
Range("B2").Select + 16. ActiveCell.FormulaR1C1 = 200 + 17. Range("A2:B2").Select + 18. Selection.AutoFill Destination:=Range("A2:L2"), _ + Type:=xlFillDefault + 19. ActiveWorkbook.SaveAs Filename:= _ + "C:\Users\Richard\AppData\Roaming\Microsoft\Excel\XLSTART\temp.xlsx", _ + FileFormat:=xlOpenXMLWorkbook, CreateBackup:=False + 20. End Sub + +Now step through the macro and watch what happens: it creates the new workbook as before and enters the months, but then it enters the values 100 through 1200 in the second row of cells. This one is fun to watch on a split screen because you watch the cells fill with data as you step through it. + +At the end, the macro attempts to save the workbook as before. However, an error message or dialog box warns that a previous workbook exists by this name (unless you've already deleted it). Later you'll see how to handle this type of error so the macro doesn't halt or confuse the user with these kinds of odd error messages or dialog boxes. + +## Save Your Work + +When you finish working with this macro, choose File ⇒ Save from the Visual Basic Editor to save the workbook that contains the macro and the changes you've made to it. Then press Alt+Q or choose File ⇒ Close And Return To Microsoft Excel to close the Visual Basic Editor and return to Excel. + +# Editing a PowerPoint Macro + +In this section, you'll edit a PowerPoint macro. PowerPoint no longer includes a macro recorder, so you'll either have to type in the code for the following example or, better, just copy and paste it from this book's Web page at www.sybex.com/go/masteringvba2013. + +Start by opening the PowerPoint Visual Basic Editor: + +1. Open PowerPoint, and choose the blank presentation template (in PowerPoint 2010 and earlier versions, the blank presentation is opened by default). Now add a shape by clicking the Insert tab on the Ribbon, then clicking the Shapes icon in the Illustrations section. + +2. Click a rectangle shape of your choice, and drag on the slide to create it. This will be object 1 in the Shapes collection, so we can refer to it in the code like this: + + ActiveWindow.Selection.SlideRange.Shapes(1).Select + +3. Open the PowerPoint Visual Basic Editor by pressing Alt+F11. + +4. Create a new, empty module by choosing Insert ⇒ Module in the Editor. Now you're ready to add some code. + +5. Type in (or paste from this book's web page) the code shown in Listing 3.6. + +**Listing 3.6**: Add a slide in PowerPoint + + 1. Sub Add_Slide_and_Format_Placeholder() + 2. ' + 3. ' Sample macro that adds a slide, formats its placeholder, + ' and adds text to it. Recorded 6/16/13 by Batfield Dial. + 4. ' + 5. ActiveWindow.View.GotoSlide Index:= _ + ActivePresentation.Slides.Add(Index:=2, _ + Layout:=ppLayoutText).SlideIndex + 6. ActiveWindow.Selection.SlideRange.Layout = ppLayoutTitle + 7. ActiveWindow.Selection.SlideRange.Shapes(1).Select + 8. With ActiveWindow.Selection.ShapeRange + 9. .IncrementLeft -6# + 10. .IncrementTop -125.75 + 11. End With + 12. ActiveWindow.Selection.ShapeRange.ScaleHeight 1.56, msoFalse, _ + msoScaleFromTopLeft + 13. ActiveWindow.Selection.SlideRange.Shapes(1).Select + 14. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select + 15. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Characters _ + (Start:=1, Length:=0).Select + 16. With ActiveWindow.Selection.TextRange _ + 17. .Text = "The quick brown dog jumped over the lazy fox" + 18. With .Font + 19. .Name = "Arial" + 20. .Size = 44 + 21. .Bold = msoFalse + 22. .Italic = msoFalse + 23. 
.Underline = msoFalse + 24. .Shadow = msoFalse + 25. .Emboss = msoFalse + 26. .BaselineOffset = 0 + 27. .AutoRotateNumbers = msoFalse + 28. .Color.SchemeColor = ppTitle + 29. End With + 30. End With + 31. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Characters _ + (Start:=1, Length:=42).Select + 32. With ActiveWindow.Selection.TextRange.Font + 33. .Name = "Impact" + 34. .Size = 54 + 35. .Bold = msoFalse + 36. .Italic = msoFalse + 37. .Underline = msoFalse + 38. .Shadow = msoFalse + 39. .Emboss = msoFalse + 40. .BaselineOffset = 0 + 41. .AutoRotateNumbers = msoFalse + 42. .Color.SchemeColor = ppTitle + 43. End With + 44. End Sub + +Here's what happens in the macro: + + * Line 1 starts the macro, and line 44 ends it. + * Lines 2 and 4 are blank comment lines used to set off the description of the macro, which appears in line 3. + * Line 5 adds the slide to the presentation. This statement is a little complicated, but don't worry about it too much just yet. For now, note two things: First, the statement uses the Add method with the Slides collection object to add a slide to the collection (in other words, to create a new slide in this case). This is similar to the way the Excel macro explored earlier in this chapter used the Add method to add a workbook to its Workbooks collection. Second, the layout of the slide is ppLayoutText, the VBA constant for the Text slide layout that PowerPoint uses for a default new slide. + * Line 6 applies the Title layout (ppLayoutTitle) that you chose when recording the macro. (If you chose a different slide layout, you'll see a different constant than ppLayoutTitle.) + * Line 7 selects the first shape in the Shapes collection on the active slide. (For the moment, don't worry about how you get to the active slide.) + * Lines 8 to 11 are a With block. This block begins with a With statement that specifies properties or behaviors ( _methods_ ) for the shape that has been selected (ActiveWindow.Selection.ShapeRange). A With statement is a way of simplifying object references, and everything between the With statement and the End With statement refers to the objects that the With statement first mentions. In this case, line 9 uses the IncrementLeft method with a negative value to move the shape to the left, and line 10 uses the IncrementTop method with a negative value to move the shape up the slide. + +* * * + +The With Command Has Two Uses + +With statements have two benefits: They simplify code (because you don't need to specify the object in each of the lines between the With and End With lines), and they make code run faster. + +* * * + + * Line 13 selects the first shape in the Shapes collection, and line 14 selects the TextRange object in the TextFrame object in the shape. When you're working interactively, PowerPoint makes this selection process seamless: You click in a shape displaying the legend "Click to add title" (or whatever), and PowerPoint selects the text range in the shape's text frame—but all you see is that the text in the shape becomes selected. In VBA, you have to go through a couple of unseen layers in the object model before getting to the text. + * When you select the placeholder text, PowerPoint gets rid of it. The same thing happens when you select the placeholder text via VBA. So line 15 makes a new selection at the beginning of the first character in the text range. The Length of the selection is 0, meaning that the selection is collapsed to an insertion point rather than containing any characters. 
Line 16 starts a With statement that continues until line 30. The With ActiveWindow.Selection.TextRange statement in line 16 lets line 17 reference the Text property of the TextRange object in the ActiveWindow object's Selection object much more simply (instead of ActiveWindow.Selection.TextRange.Text), and it lets line 18 reference the Font property of the TextRange object in the Selection object in the ActiveWindow object easily (instead of ActiveWindow.Selection.TextRange.Font). + * Line 17 sets the Text property of the ActiveWindow.Selection.TextRange object to the text typed. + * Line 18 then begins a nested With statement that sets the properties of the Font object for the TextRange object. Line 19 sets the Name property of the Font object to Arial; line 20 sets the Size property of the Font object to 44; line 21 sets the Bold property of the Font object to msoFalse, the Microsoft Office (mso) constant for False; and so on. These statements are not necessary for our purposes in this macro. But they're harmless, so you can leave them in your code or, if you wish, delete this entire With block (as we'll do shortly). Line 29 ends the nested With statement. + +* * * + +With Blocks Can Be Nested + +A nested With statement is one that is placed within another With statement and specifies an object within the object specified in the outer With statement. You can nest multiple-level With statements when necessary. You can see that the With block that begins on line 18 is nested within the outer With block that begins on line 16. + +* * * + + * Line 31 uses the Select method to select characters 1 through 42 in the text range. This is the same as pressing the Ctrl+Shift+Home key combination. Because this statement specifies the characters to select, you'll need to change it if you change the text that this macro inserts. (If you run the statement on a text range that has fewer than 42 characters, it will return an error. If you run it on a text range that has more than 42 characters, it will select only the first 42 characters in the text range—not what you want.) + * Line 32 begins another With statement that works with the Font object of the TextRange object. This With statement imitates what happens if the user opens and modifies the Font dialog box. + * Line 43 ends the With statement, and line 44 ends the macro. + +You can edit this macro by slimming it down a little and changing the text it inserts: + +1. Delete the unnecessary With statement in lines 18 through 29. + +2. Delete line 30. + +3. Change lines 16 and 17 into a single statement without With: + + ActiveWindow.Selection.TextRange.Text = _ + "The quick brown dog jumped over the lazy fox" + +4. Now change the text that the new line 16 inserts. Type text of your choice between the double quotation marks. + +5. Change line 31 to use the Select method on the text _range_ rather than specifying which characters to select. Delete Characters(Start:=1, Length:=42) to leave this statement: + + ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select + +6. By specifying a range rather than a particular character count, you avoid the problem discussed earlier of having to count characters anytime you change the message. Specifying a character count is called _hard-coding_ and it's to be avoided whenever possible. If there's a way—as there is here with the TextRange property—let the computer figure out the count rather than specifying it in your code. + +7. 
Click the Save button on the Standard toolbar or choose File ⇒ Save to save the changes you've made to the presentation. In the Save As dialog box, locate the Save As Type drop-down list and change it from the default .pptx type (which cannot contain macros) to the .pptm type (which can).

You should now have code that reads like Listing 3.7.

**Listing 3.7**: The macro slimmed down and modified

     1. Sub Add_Slide_and_Format_Placeholder()
     2. '
     3. ' Sample macro that adds a slide, formats its placeholder,
        ' and adds text to it. Recorded 6/16/13 by Batfield Dial.
     4. '
     5. ActiveWindow.View.GotoSlide Index:= _
            ActivePresentation.Slides.Add(Index:=2, _
            Layout:=ppLayoutText).SlideIndex
     6. ActiveWindow.Selection.SlideRange.Layout = ppLayoutTitle
     7. ActiveWindow.Selection.SlideRange.Shapes("Rectangle 4").Select
     8. With ActiveWindow.Selection.ShapeRange
     9.     .IncrementLeft -6#
    10.     .IncrementTop -125.75
    11. End With
    12. ActiveWindow.Selection.ShapeRange.ScaleHeight 1.56, msoFalse, _
            msoScaleFromTopLeft
    13. ActiveWindow.Selection.SlideRange.Shapes("Rectangle 4").Select
    14. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select
    15. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Characters _
            (Start:=1, Length:=0).Select
    16. ActiveWindow.Selection.TextRange.Text = "Welcome to Acme Industries"
    17. ActiveWindow.Selection.ShapeRange.TextFrame.TextRange.Select
    18. With ActiveWindow.Selection.TextRange.Font
    19.     .Name = "Impact"
    20.     .Size = 54
    21.     .Bold = msoFalse
    22.     .Italic = msoFalse
    23.     .Underline = msoFalse
    24.     .Shadow = msoFalse
    25.     .Emboss = msoFalse
    26.     .BaselineOffset = 0
    27.     .AutoRotateNumbers = msoFalse
    28.     .Color.SchemeColor = ppTitle
    29. End With
    30. End Sub

Now step through the changed macro and make sure it works as you expect it to. You may need to change Rectangle 4 in the code to match the actual name of the rectangle on your slide.

## Save Your Work

When you finish working with this macro, choose File ⇒ Save from the Visual Basic Editor to save the presentation that contains the macro and the changes you've made to it. Be sure to change the file type from the default .pptx to the macro-enabled .pptm file type. Then press Alt+Q or choose File ⇒ Close And Return To Microsoft PowerPoint to close the Visual Basic Editor and return to PowerPoint.

* * *

When Should You Use the Macro Recorder?

As you've seen so far in this book, you can create VBA code in two ways. First, you can use the Macro Recorder (in the two applications—Word and Excel—that provide one) to record a series of actions when working interactively in the application. Or, second, you can type VBA statements into the Code window in the Visual Basic Editor. You're probably wondering when you should record a macro and when you should create code from scratch. Writing a procedure from scratch is clearly more difficult and more advanced than recording a procedure—so should you always record if a Recorder is available?

Using the Macro Recorder has advantages and disadvantages. The advantages are as follows:

 * The Macro Recorder creates usable code every time (provided you run the macro under suitable conditions).
 * It is quick and easy to use.
 * It can help you discover which VBA objects, methods, and properties correspond to which part of an application's interface.
+ +And here are the disadvantages: + + * Code created in the Macro Recorder may contain unnecessary statements because the Macro Recorder records _everything_ you do in the application—including all the options in every built-in dialog box you use when recording the macro. For example, if you start the Macro Recorder from Word, choose Tools ⇒ Options to display the View page of the Options dialog box, click the Edit tab to display the Edit page, and change the Auto-Keyboard Switching setting, the Macro Recorder will record all the settings on the Edit page as well as all those on the View page. The result is about 40 lines of unnecessary code. (If you visit any other pages in the Options dialog box on the way to the Edit page, the Macro Recorder will record all the settings in those pages as well.) If you create the code manually in the Visual Basic Editor, you can achieve the same effect by using one statement rather than dozens. + * Code created by the Macro Recorder can work only in the active document because whichever document you're working with interactively automatically becomes the active document. Later in this book, you'll learn how to use objects in the applications' object models to work with documents other than the active document. Working with other documents can have advantages; for example, you can make your code run faster or hide from the user the manipulations you're performing. + * The Macro Recorder can create VBA code for only _some_ of the actions you perform in the host application. For example, if you want to display a dialog box or a user form in the course of a procedure, you need to write the appropriate statement manually—you can't record it. The subset of VBA actions available through the Macro Recorder is similar to the set of actions you can take in the host application when working interactively, so you can get a lot done with it. Still, you'll find it's limited compared to the full range of actions you can perform through VBA. + +However expert you become with VBA, consider the Macro Recorder a useful tool for creating either rough-and-ready macros or the basis of more complex procedures. You'll often find it makes sense to have the Macro Recorder handle as much of the strain of creating a procedure as possible. If you can save time by using the Macro Recorder to quickly identify the VBA object or property that you need, then do so. + +In addition, the Macro Recorder can show you how to write some code that you can't figure out how to write on your own. The Recorder always gets the syntax right. + +* * * + +# The Bottom Line + +**Test a macro in the Visual Basic Editor.** + +When you need to modify or debug a macro, the Visual Basic Editor is your best friend. It's filled with tools to make your job easier. + +Master It + +Open a macro; then step through it to see if anything goes wrong. + +**Set breakpoints and use comments.** + +Setting breakpoints allows you to press F5 to execute a macro, but forces the Editor to enter Break mode when execution reaches the line where the breakpoint resides. Comments help you understand the purpose of code—they describe it but are ignored during execution of the macro's code. "Commenting out" a line of code allows you to temporarily render it inactive to see what effect this has during execution. This is sometimes a good way to see if that line is causing the bug you're tracking down. + +Master It + +Set a breakpoint in, and add a comment to, a macro. + +**Edit a recorded macro.** + +Make some changes to a Word macro. 
+ +Master It + +With the Visual Basic Editor open, choose a macro and modify it. +Chapter 4 + +Creating Code from Scratch in the Visual Basic Editor + +In this chapter, you'll practice creating procedures from scratch in the Visual Basic Editor. The examples walk you through creating a procedure in Word, Excel, and PowerPoint. + +For the examples in this book, the Visual Basic Editor should be set up a certain way and (for good practice) set to require explicit declarations of variables. So we'll start off this chapter by ensuring that these conditions are met. + +The purpose of this chapter is to give you a feel for creating code in the Visual Basic Editor before you study the details of the language. You'll work briefly with VBA elements (such as objects, properties, methods, variables, and constants) that you'll learn about more fully later in this book. Along the way, you'll meet several of the many helpful tools that the Visual Basic Editor provides, including the Macro Recorder, the Object Browser, and the Help system. You'll explore these tools more thoroughly later in this book, as well. + +In this chapter you will learn to do the following: + + * Set up the Visual Basic Editor for creating procedures + * Create a procedure for Word + * Create a procedure for Excel + * Create a procedure for PowerPoint + * Create a procedure for Access + +# Setting Up the Visual Basic Editor for Creating the Procedures + +You'll find it easiest to follow the instructions in the following procedures—and in the rest of the book—if you have the Visual Basic Editor set up in a default configuration (like the layout you see the first time you display the Visual Basic Editor from a VBA host). Any changes you make to the VBA Editor will be in effect across all VBA-enabled Office applications. So, if you set up the Editor as described next, it will look like this whether you open it in Excel, Word, Access, Outlook, or PowerPoint. + +The following steps describe how to set up the Visual Basic Editor so it looks like Figure 4.1: + +1. If the Project Explorer isn't displayed, choose View ⇒ Project Explorer or press Ctrl+R to display it. + +2. If the Properties window isn't displayed, choose View ⇒ Properties Window or press the F4 key to display it. + +3. Unless you really prefer things otherwise, dock the Project Explorer in its conventional position at the upper-left corner of the main Visual Basic Editor area. Dock the Properties window below the Project Explorer, again in its default position. (To change docking, choose Tools ⇒ Options, click the Docking tab, and select the Docking options.) To dock an undocked (floating) window, double-click its title bar. + +4. Set up the Visual Basic Editor to require variables to be declared explicitly. The Editor will then enforce a rule that you must declare each variable formally before you can use it in the code. Choose Tools ⇒ Options to display the Options dialog box, select the Require Variable Declaration check box on the Editor page, and then click the OK button. More on variable declaration later in the book, but here's a brief summary. This setting makes the Visual Basic Editor automatically enter an Option Explicit statement for all modules and user forms you create from now on. 
And _that_ statement causes the Editor to check during runtime for any implicitly declared variables and remind you that you must declare them _explicitly_ , like this: + + Dim txtName As String + +Figure 4.1 The default configuration for the VBA Editor + +# Creating a Procedure for Word + +The procedure you'll create for Word causes the Track Changes feature to toggle (between Strikethrough and Hidden) how deleted text will be displayed. With this macro, you'll be able to switch instantly between having deleted text remain onscreen with a line through it or having it simply disappear. + +Start by using the Macro Recorder to provide the necessary object qualifications. Then you can modify the code by hand in the Editor to create the toggle behavior. + +Follow these steps to record the macro: + +1. Start Word. If Word is already running, exit it and restart it. + +2. Record a macro to get to the object qualifications (properties and settings) you need. (Remember that to some, recording may feel like cheating, but the Macro Recorder is truly a gift when it comes to finding objects and getting complicated syntax correctly coded.) Follow these substeps: + +a. Click the Developer tab on the Ribbon; then click the Record Macro button in the Code section to display the Record Macro dialog box. + +b. Either accept the macro name that the Macro Recorder automatically assigns (Macro1, Macro2, and so on) or create a scratch name of your own, such as Temp, that will remind you to delete the macro if you forget to do so. + +c. Leave the Store Macro In drop-down list set to All Documents (Normal.dotm). Leave the description blank. This is a temporary macro just for practice, so we won't add it to our permanent collection. + +d. Click the OK button to start recording the macro. + +e. Click the Review tab on the Ribbon, and then click the small arrow in the lower-right corner of the Tracking section. The Track Changes Options dialog box opens. In that box click the Advanced Options button. (Note that the Advanced Track Changes Options dialog box looks somewhat different in Office 2010 and earlier versions. And you open the first dialog box by clicking the bottom half of the Track Changes icon.) Now ensure that Strikethrough is selected in the Deletions drop-down list (see Figure 4.2), and then click OK twice to close the two Track Changes Options dialog boxes. (Strikethrough is the default, so it's probably already selected—but we want the Recorder to show us how this option is coded in VBA. Clicking OK to close a dialog box records all the current settings in that box.) + +Figure 4.2 The Advanced Track Changes Options dialog box in Word + +f. Repeat the preceding step (e.) to reopen the Track Changes Options dialog box. Now, select Hidden in the Deletions drop-down list, and again click OK to close the dialog box. + +g. Stop recording the macro by clicking the white recording button in the status bar or by clicking the Stop Recording button on the Developer tab on the Ribbon. + +3. Press Alt+F8 to display the Macros dialog box. Select the macro you just recorded and click the Edit button to open it for editing in the Visual Basic Editor. Your code should look like this: + + 1. Sub temp() + 2. ' + 3. ' temp Macro + 4. ' + 5. ' + 6. With Options + 7. .InsertedTextMark = wdInsertedTextMarkUnderline + 8. .InsertedTextColor = wdRed + 9. .DeletedTextMark = wdDeletedTextMarkStrikeThrough + 10. .DeletedTextColor = wdRed + 11. .RevisedPropertiesMark = wdRevisedPropertiesMarkNone + 12. 
.RevisedPropertiesColor = wdByAuthor
    13. .RevisedLinesMark = wdRevisedLinesMarkOutsideBorder
    14. .CommentsColor = wdRed
    15. .RevisionsBalloonPrintOrientation = _
            wdBalloonPrintOrientationPreserve
    16. End With
    17. ActiveWindow.View.RevisionsMode = wdMixedRevisions
    18. With Options
    19. .MoveFromTextMark = wdMoveFromTextMarkDoubleStrikeThrough
    20. .MoveFromTextColor = wdGreen
    21. .MoveToTextMark = wdMoveToTextMarkDoubleUnderline
    22. .MoveToTextColor = wdGreen
    23. .InsertedCellColor = wdCellColorLightBlue
    24. .MergedCellColor = wdCellColorLightYellow
    25. .DeletedCellColor = wdCellColorPink
    26. .SplitCellColor = wdCellColorLightOrange
    27. End With
    28. With ActiveDocument
    29. .TrackMoves = False
    30. .TrackFormatting = True
    31. End With
    32. With Options
    33. .InsertedTextMark = wdInsertedTextMarkUnderline
    34. .InsertedTextColor = wdRed
    35. .DeletedTextMark = wdDeletedTextMarkHidden
    36. .DeletedTextColor = wdRed
    37. .RevisedPropertiesMark = wdRevisedPropertiesMarkNone
    38. .RevisedPropertiesColor = wdByAuthor
    39. .RevisedLinesMark = wdRevisedLinesMarkOutsideBorder
    40. .CommentsColor = wdRed
    41. .RevisionsBalloonPrintOrientation = _
            wdBalloonPrintOrientationPreserve
    42. End With
    43. ActiveWindow.View.RevisionsMode = wdMixedRevisions
    44. With Options
    45. .MoveFromTextMark = wdMoveFromTextMarkDoubleStrikeThrough
    46. .MoveFromTextColor = wdGreen
    47. .MoveToTextMark = wdMoveToTextMarkDoubleUnderline
    48. .MoveToTextColor = wdGreen
    49. .InsertedCellColor = wdCellColorLightBlue
    50. .MergedCellColor = wdCellColorLightYellow
    51. .DeletedCellColor = wdCellColorPink
    52. .SplitCellColor = wdCellColorLightOrange
    53. End With
    54. With ActiveDocument
    55. .TrackMoves = False
    56. .TrackFormatting = True
    57. End With
    58. End Sub

4. That's a daunting amount of code for the few rather simple actions you took. Remember that this is because the Macro Recorder records the settings for _all_ of the possible options in the Track Changes Options dialog box that you visited, not just the option you selected. Look over the code briefly to see the many settings that were recorded from the options inside the dialog box displayed in Figure 4.2.

If you look at the figure, you can see how the code reflects the settings. For example, see the .SplitCellColor = wdCellColorLightOrange line of code and locate the setting it refers to in the dialog box.

5. A second set of nearly identical settings in the code represents your second visit to the dialog box. Notice lines 9 and 35 in particular; these are key. Line 35 reflects the change made on your second visit—specifying a hidden rather than a strikethrough mark for the DeletedTextMark property of the Options object. Notice, too, the two values for this property: wdDeletedTextMarkStrikeThrough (when you recorded the Deletions drop-down specifying Strikethrough) and wdDeletedTextMarkHidden (when you set it to Hidden).

6. Now in the Editor, select the entire recorded macro, from the Sub temp statement down to the End Sub statement, and press the Delete key to get rid of it.

7. Make sure the Visual Basic Editor is set up as described in the section "Setting Up the Visual Basic Editor for Creating the Procedures," earlier in this chapter.

8. In the Project Explorer window, right-click anywhere in the Normal item and choose Insert ⇒ Module from the context menu. The Visual Basic Editor inserts a new module in the Normal.dotm global template and displays a Code window for it.
9. Press the F4 key to activate the Properties window for the new module. (By _activate_ I mean _give the focus to_—whatever window has the focus is the one where typing will be displayed or mouse clicks will have an effect.) The Visual Basic Editor selects the (Name) property, the only property available for this new module. (Confusingly, the property's name is enclosed in parentheses.)

10. Type a name for the new module in the Properties window. For this example, delete the default name (Module1 or Module2 or whatever it is) and type the name Procedures_to_Keep_1.

11. Press the F7 key or click in the Code window to activate it.

12. Verify that the Visual Basic Editor has entered the Option Explicit statement in the declarations area at the top of the code sheet (the code area) in the Code window. If not, go back and complete step 4 in the list at the start of this chapter.

13. Below the Option Explicit statement, type the Sub statement for the procedure and press the Enter key. Name the procedure Toggle_Track_Changes_between_Hidden_and_Strikethrough:

    Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough

14. When you press the Enter key, the Visual Basic Editor inserts for you the required parentheses at the end of the Sub statement, a blank line, and the End Sub statement and places the insertion point on the blank line, ready for you to start typing in some programming:

    Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough()

    End Sub

15. Press the Tab key to indent the first line below the Sub statement.

16. Type **if options.** (in lowercase, and be sure to end with the period). Now the Editor displays the List Properties/Methods drop-down list.

17. Type down through the list (type **d**, **e**, and then **l**) and use the ↓ key, or simply scroll with the mouse, to select the DeletedTextMark entry.

18. Now just type **=** (the equal sign). The Visual Basic Editor enters the DeletedTextMark command for you, followed by the equal sign, and then displays the List Properties/Methods list of constants that can be used with the DeletedTextMark property (see Figure 4.3).

Figure 4.3 The Visual Basic Editor's List Properties/Methods list displays the constants available for the DeletedTextMark property.

19. Select the wdDeletedTextMarkHidden item and enter it into your code by pressing the Tab key or by double-clicking it.

20. Type **Then** and press the Enter key. Note that when you start the next line of code (by pressing Enter), the Visual Basic Editor checks the line of code for errors. If you used lowercase for the If Options part of the statement, the Visual Basic Editor applies capitalization (this is just for show—VBA pays no attention to capitalization when executing code). If there are no space characters on either side of the equal sign, the Visual Basic Editor adds them too.

21. Enter **Options.DeletedTextMark=wdDeletedTextMarkStrikeThrough**, using the assistance offered by the Visual Basic Editor's _Auto List Members_ feature (described earlier, in steps 16 through 18), and then press Enter.

22. Press the Backspace key or Shift+Tab to unindent the new line of code by one tab stop.

23. Type the **ElseIf** keyword, and then enter the rest of the procedure as follows:

    ElseIf Options.DeletedTextMark = wdDeletedTextMarkStrikeThrough Then
        Options.DeletedTextMark = wdDeletedTextMarkHidden
    End If

24.
Make sure your completed procedure looks like this: + + Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough() + If Options.DeletedTextMark = wdDeletedTextMarkHidden Then + Options.DeletedTextMark = wdDeletedTextMarkStrikeThrough + ElseIf Options.DeletedTextMark = wdDeletedTextMarkStrikeThrough Then + Options.DeletedTextMark = wdDeletedTextMarkHidden + End If + End Sub + +25. Press Alt+F11 to switch to Word, and then type in a line or two of text. + +26. Arrange the Word window and the Visual Basic Editor window side by side. In Word, click the Review tab on the Ribbon, and click the upper half of the Track Changes button (the graphic icon) to activate the feature that marks up (or otherwise handles) revisions. Delete a word in your text. Notice whether it is struck through or is simply hidden. You have a macro that toggles between these two behaviors, so in the Visual Basic Editor, press the F5 key or click the Run Sub/UserForm button (on the Standard and Debug toolbars) to run the macro. Back in Word, see what effect the deletion has now. You can also take a look at the Track Changes Options dialog box to see that the Deletions setting has changed. + +27. Click the Save button on the Standard toolbar in the Visual Basic Editor. + +Note that you could alternatively write this macro using a With statement for the Options object so that it looks like this: + + Sub Toggle_Track_Changes_between_Hidden_and_Strikethrough_2() + With Options + If .DeletedTextMark = wdDeletedTextMarkHidden Then + .DeletedTextMark = wdDeletedTextMarkStrikeThrough + ElseIf .DeletedTextMark = wdDeletedTextMarkStrikeThrough Then + .DeletedTextMark = wdDeletedTextMarkHidden + End If + End With + End Sub + +There are usually several ways to code a given behavior in VBA. Although formal (professional) programmers learn a set of "best practices," if you're just a hobbyist writing VBA for your own personal use, go ahead and code however you wish. Whatever works. + +# Creating a Procedure for Excel + +The procedure you'll create for Excel is short but helpful: When the user runs Excel, the procedure maximizes the Excel window and opens the last file used. The procedure also illustrates some useful techniques, including these: + + * Writing a macro that executes when an application first starts up + * Working with events + * Using the Object Browser to find the objects, methods, and properties you need + +Follow these steps to create the procedure: + +1. Start Excel if it's not already running. + +2. Press Alt+Tab to cycle through your workbooks to locate Personal.xlsb. If your Personal Macro Workbook is currently hidden, click the Unhide button in the Window section of the View tab on the Ribbon. Select PERSONAL.XLSB in the Unhide Workbook list box, and then click the OK button. + +3. Press Alt+F11 to open the Visual Basic Editor. + +4. Make sure the Visual Basic Editor is set up as described in the section "Setting Up the Visual Basic Editor for Creating the Procedures" earlier in this chapter. + +5. In the Project Explorer window, expand VBAProject (PERSONAL.XLSB) if it's collapsed. To expand it, either double-click its name or click the + sign to its left. + +6. Expand the Microsoft Excel Objects folder. + +7. Double-click the ThisWorkbook item to open its code sheet in a Code window. The ThisWorkbook object represents the current workbook. + +8. Verify that the Visual Basic Editor has entered the Option Explicit statement in the declarations area at the top of the code sheet. 
If not, go back and complete step 4 in the list at the start of this chapter. However, note that at the time of this writing, even if you select the Require Variable Declaration option (via the Tools ⇒ Options menu in the Excel version of the VBA Editor), Option Explicit is not automatically inserted into your Code window. + +9. In the Code window, type + + Private Sub Auto_Open + +10. and then press the Enter key. The Editor will add the required parentheses and the End Sub line. + +* * * + +Macros Have Scope + +The Private keyword limits the scope of a macro—the area in which it can operate. Private scope makes the macro available to all procedures in the module that contains it, but not to procedures in other modules. Chapter 6, "Working with Variables, Constants, and Enumerations," explains scope in more detail. + +* * * + +11. Open the Object Browser. Press the F2 key, choose View ⇒ Object Browser, or click the Object Browser button on the Standard toolbar to display the Object Browser window (see Figure 4.4). + +Figure 4.4 Use the Object Browser to find the objects, methods, and properties you need for a procedure. + +12. The first action we want to take in this macro is to maximize the Excel's application window. As in any application, VBA uses the Application object to represent the Excel application, but you need to find the correct property of this object to work with. Select Excel in the Project/Library drop-down list (see the label in Figure 4.4), type **maximize** in the Search Text box, and either click the Search button or press the Enter key. The Object Browser displays the result of the search (see Figure 4.5) in its Search Results pane (which was collapsed and not visible in Figure 4.4). The constant xlMaximized is a member of the class XlWindowState. + +Figure 4.5 The result of the search for "maximize" in the Object Browser + +13. Press the F7 key to activate the Code window. (Alternatively, click the Code window, choose View ⇒ Code, or choose the Code window from the Window menu.) + +14. Type **application.** (in lowercase and including the period) so that the Visual Basic Editor displays the drop-down list, type **w** to jump to the items beginning with _W_ , and select the WindowState item. + +15. Type **=** to enter the WindowState item in your code and to display the list of constants available for WindowState (see Figure 4.6). + +Figure 4.6 Use the list of constants to enter the constant quickly and easily. + +16. Select the xlMaximized item and press Enter to insert that property in the code, and move down a line to start writing a new statement. + +17. The second action for the macro is to open the last file used—file 1 on the recently used files list (this is the list that appears in the Recent Documents list when you click the Recent item in the File tab on the Ribbon). Press the F2 key to activate the Object Browser again. + +18. Leave Excel selected in the Project/Library drop-down list, type **recent** , and either press the Enter key or click the Search button. The Object Browser displays the results of the search (see Figure 4.7). The item you need is the RecentFiles property of the Application object. The RecentFiles property returns the RecentFiles _collection_ , an object that knows the information about the files in the recently used files list. + +Figure 4.7 The result of the search for "recent" in the Object Browser + +19. Press the F7 key to return to the Code window. 
Type **application.** and select RecentFiles from the List Properties/Methods drop-down list. Then type **(1).** to indicate the first item in the RecentFiles collection, and select the Open method from the List Properties/Methods list:

    Application.RecentFiles(1).Open

20. That's it. Your procedure should look like this:

    Private Sub Auto_Open()

        Application.WindowState = xlMaximized
        Application.RecentFiles(1).Open

    End Sub

21. Press Alt+Q or choose File ⇒ Close And Return To Microsoft Excel to return to Excel.

22. Click the File tab on the Ribbon and choose Save.

23. Click the Hide button in the Window section of the View tab on the Ribbon. This hides PERSONAL.XLSB from view.

24. Open a sample workbook, type something into one of the cells, save it, and close it.

25. Press Alt+F4 to exit Excel. If you are asked whether you want to save the changes you made to the current workbook and your Personal Macro Workbook, choose Yes.

26. Restart Excel. Notice how Excel automatically maximizes the application window and opens the most recently used file.

27. If you see an error message, it most likely means that you've renamed or moved the most recently used file. To prevent this problem, you can add some error-trapping code. We'll explore the On Error command thoroughly in Chapter 17, "Debugging Your Code and Handling Errors," but if you wish, you can make the following changes to your Auto_Open macro:

    Private Sub Auto_Open()

        **On Error GoTo Problem**

        Application.WindowState = xlMaximized
        Application.RecentFiles(1).Open

        **Exit Sub**

    **Problem:**

        **MsgBox "Error: " & Application.RecentFiles(1).Path & " can't be opened."**

    End Sub

The Auto_Open name is special. When you name a macro Auto_Open, VBA knows that whatever actions are in the macro code should be executed when Excel starts running. This is one of a handful of special names tied to Excel's _events_—things that happen to an object, in this case the Open event of the Excel application. (Notice that an object's _methods_ are actions it can take, such as a print method sending a document to the printer. Conversely, an object's _events_ are things that can happen to it, such as a user clicking a button or opening an application.)
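Excel also offers a true event procedure for this purpose: code named Workbook_Open placed in the ThisWorkbook code sheet runs automatically when that workbook opens. Here's a sketch of the same behavior written as an event procedure (an alternative to, not a replacement for, the steps above):

    Private Sub Workbook_Open()
        ' Runs automatically when the workbook containing this code opens.
        Application.WindowState = xlMaximized
        Application.RecentFiles(1).Open
    End Sub

A convenience of the event version is that the Editor can create the stub for you: with the ThisWorkbook code window active, choose Workbook in the left drop-down list at the top of the window and Open in the right one.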
* * *

How to Turn off Default Templates

The following section describes how to use a template that comes with PowerPoint. You've likely noticed that when you start Office 2013 applications, they display a set of templates. Some users are likely to never use these templates and would prefer the traditional Office applications' behavior: starting with a blank document and bypassing this display of templates. To turn this off, choose File ⇒ Options, then uncheck Show The Start Screen When This Application Starts.

* * *

# Creating a Procedure for PowerPoint

The procedure you'll create for PowerPoint is short and straightforward, but it can save the user enough effort over the long run to make it worthwhile. It adds a title slide to the active presentation, inserting a canned title that includes the current date and the company's name as the presenter.

Follow these steps to create the procedure:

1. Start PowerPoint. If PowerPoint is already running, close it and restart it. If PowerPoint creates a default presentation on startup, close the presentation (click the File tab and choose Close).

2. Create a new presentation based on the Contemporary Photo Album template. In Office 2013, locate the list of Suggested Searches at the top of the default templates window displayed when you first run PowerPoint. Then click Photo Albums. (In previous versions of Office, click the File tab and choose New And Sample Templates.) Make sure the default slide on the presentation has the Title Slide layout by right-clicking a blank area in the slide, then choosing Layout ⇒ Title And Content (it will be called _Title Slide_ in earlier versions of Office) to apply it to the default slide.

3. Press Alt+F11 to open the Visual Basic Editor.

4. Make sure the Visual Basic Editor is set up as described in the section "Setting Up the Visual Basic Editor for Creating the Procedures" earlier in this chapter.

5. In the Project Explorer window, right-click anywhere in the VBAProject(Presentation1) item and choose Insert ⇒ Module from the context menu. The Visual Basic Editor inserts a new module in the project, displays a Code window containing the code sheet for the module, and expands the project tree in the Project Explorer.

6. Verify that the Visual Basic Editor has entered the Option Explicit statement in the declarations area at the top of the code sheet. If not, go back and complete step 4 in the list at the start of this chapter.

7. Press the F4 key to activate the Properties window.

8. Replace the default name _Module1_ by typing (in the Properties window) **General_Procedures**.

9. Press the F7 key or click in the Code window to activate it.

10. Below the Option Explicit statement, type the Sub statement for the procedure and press the Enter key:

    Sub Add_Title_Slide

11. When you press Enter, the Visual Basic Editor enters the parentheses at the end of the Sub statement, a blank line, and the End Sub statement for you, and places the insertion point on the blank line:

    Sub Add_Title_Slide()

    End Sub

12. Press the Tab key to indent the first line below the Sub statement.

13. Now identify the objects you need by using the Help system. You'll be working with the active presentation, which is represented by the ActivePresentation object. As you'll see in Part 6 of this book, "Programming the Office Applications"—which is all about objects—there are several ways to get information when programming with objects. For now, let's try searching online help rather than using the Editor's built-in Object Browser. Using Google or Bing, search for **object model reference powerpoint 2013**. You should then be able to locate the details about the Application object's ActivePresentation property, as shown in Figure 4.8.

Figure 4.8 The ActivePresentation property screen

14. Click the _Presentation_ link in "Returns a Presentation object..." near the top, as shown in Figure 4.8. This link will take you to the Presentation object's Help screen. We're drilling down in this Help system to find example code and other assistance that will show us how to work with slides and related objects. All this will become much clearer to you in Part 6. For now, just follow along to get the general idea.

15. Now on the Presentation object's Help page, click the Presentation Object Members link (scroll to find it near the bottom of this web page), and then scroll down to locate the Slides object in the properties list. Click the Slides link (see Figure 4.9), then in the new web page that appears, click a Slides link again (it's near the top where it says "Returns a Slides collection...").
Now you see the information about the Slides Collection object, as shown in Figure 4.10. + +Figure 4.9 Select the Slides object from the list. + +Figure 4.10 The Slides Collection Object Help screen + +16. From this screen, you learn two pieces of information: first, that a slide is represented by a Slide object (stored in a Slides collection), and second, that you use the Add method to create a new slide. + +17. Type a declaration for an object variable of the Slide object type to represent the slide the procedure creates. Notice that after you type **as** and a space, the Visual Basic Editor displays the list of available objects. Type down through the list (type **s** and **l** ) until you have selected Slide, and then press the Enter key to complete the term and start a new line of code: + + Dim sldTitleSlide As Slide + +18. Use a Set statement to assign to the sldTitleSlide object a new slide you create by using the Add method. Type **set sld** and then press Ctrl+spacebar to make the Editor's Complete Word feature enter sldTitleSlide for you. Then type **= activepresentation.slides.add(** , using the Visual Basic Editor's assistance, so that the line reads as shown here: + + Set sldTitleSlide = ActivePresentation.Slides.Add( + +19. When you type the parenthesis, the Auto Quick Info feature displays the syntax for the Add method, as shown in Figure 4.11. + +Figure 4.11 The Auto Quick Info feature displays the syntax for the Add method when you type the parenthesis after the Add method + +20. Type the **Index** argument, a colon, an equal sign, the value **1** (because the title slide is to be the first slide in the presentation), and a comma: + + Set sldTitleSlide = ActivePresentation.Slides.Add(Index:=1, + +* * * + +Choosing between Labeled and Implied Argument Lists + +When a method uses arguments, as the Add method does here, you can choose between specifying the argument names or omitting them and letting VBA infer the arguments from the order of the values or constants. For example, in this case you can specify either Add(Index:=1, Layout:=ppLayoutTitle) or Add(1, ppLayoutTitle). The latter is more concise and easier to type in, but the former is much clearer to read. + +* * * + +21. Break the statement to the next line with a line-continuation character (an underscore preceded by a space). Then type a tab to indent the new line, type the **Layout** argument, a colon, and an equal sign, and pick the ppLayoutTitle constant from the List Properties/Methods drop-down list, as shown in Figure 4.12. + +Figure 4.12 Choose the ppLayoutTitle constant for the Layout argument. + +22. Type the parenthesis to end the statement: + + Set sldTitleSlide = ActivePresentation.Slides.Add(Index:=1, _ + Layout:=ppLayoutTitle) + +23. Press the Enter key to start a new line, and then press either the Backspace key or Shift+Tab to unindent the new line by one tab stop. + +24. You'll be working with the sldTitleSlide from here on, so create a With statement using it, and place the insertion point on the line between the With statement and the End With statement: + + With sldTitleSlide + + End With + +25. Next, the macro will manipulate the two items on the slide. To make it do so, you need to know the objects that represent them. You could use the Macro Recorder to find the objects, but this time try a more direct method: Place the insertion point on the line within the With statement and type . 
(a period) to display the List Properties/Methods drop-down list of available properties and methods for the Slide object.

26. Sometimes the List Properties/Methods drop-down list is of little help because it displays so many possibly relevant properties and methods that you can't identify the property you need. But if you scan the list in this case, you'll see that the Shapes property (which returns the Shapes collection) is the only promising item.

27. Press Ctrl+G, choose View ⇒ Immediate Window, or click the Immediate Window button on the Debug toolbar to display the Immediate window for a bit of testing.

28. Type the following exploratory statement into the Immediate window and press the Enter key to execute it:

    ActivePresentation.Slides(1).Shapes(1).Select

(The Immediate window is a quick way to test individual lines of code without having to run the entire macro.) Now press Alt+F11 or click the View Microsoft PowerPoint button on the Standard toolbar to switch to the PowerPoint window and verify that VBA has selected the first Shape object on the slide (it should have a frame drawn around it).

29. Okay, this is the right object to start with, but now you need to find out how to add text to the shape. Go back to the Code window (click in the Code window or press the F7 key). Press the Backspace key to delete the period, and then type it again to redisplay the list. Type **te** to jump down to the items in the list whose names start with _text_. Select the TextFrame item in the list, and then type a period to enter the term and display the next list. Scroll down the list, select the TextRange object, and type a period to enter the term and display the next list. In the next list, select the Text property. Type an equal sign to enter the term. Then type double quotation marks, the text to assign to the Text property (**Pollution Update:** with a space after it), closing double quotation marks, an ampersand, and the date (supplied by the Date function):

    .Shapes(1).TextFrame.TextRange.Text = "Pollution Update: " & Date

30. Assign information to the second Shape in the same way:

    .Shapes(2).TextFrame.TextRange.Text = "JMP Industrials"

31. The finished procedure should look like this:

    Sub Add_Title_Slide()
        Dim sldTitleSlide As Slide
        Set sldTitleSlide = ActivePresentation.Slides.Add(Index:=1, _
            Layout:=ppLayoutTitle)
        With sldTitleSlide
            .Shapes(1).TextFrame.TextRange.Text = _
                "Pollution Update: " & Date
            .Shapes(2).TextFrame.TextRange.Text = _
                "JMP Industrials"
        End With
    End Sub

32. Press F5 to test the procedure. Look at the slides in PowerPoint. There should be a new first slide in the collection of slides on the left. Then delete all slides from the presentation (select slides by pressing Shift while clicking a range of slides in the left pane, then press Delete).

33. If you wish, right-click the Quick Access Toolbar in the upper-left corner of PowerPoint's screen, then choose Customize Quick Access Toolbar. Then add a Quick Access Toolbar button for the Add_Title_Slide macro.

34. Save the presentation under a name such as Procedures.pptm. You might see a warning about personal-information risks; click OK to dismiss it.

35. Create a new presentation; then test the toolbar button or menu item for the procedure. If you see a security warning, read the sidebar titled "A Warning about Security" in Chapter 1.
Close the presentation without saving changes. + +# Creating a Procedure for Access + +Access has a long tradition of autonomy from the other Office applications, and this applies as well to its implementation of macros. It has no Recorder, for example, nor does it permit you to assign macros to shortcut key combinations. + +In addition, Access includes a legacy "Macro Builder," which you can take a look at by clicking the Macro button on the Create tab of the Ribbon. (Note that in Access there is no _Developer_ tab on the Ribbon. You can open the Visual Basic Editor from the Database Tools tab or press Alt+F11.) + +The Macro Builder utility has been generally unpopular over the years because the Visual Basic Editor offers far more options, objects, and features. The Builder is for nonprogrammers—a way to create simple macros via lists rather than actual programming. However, the Builder was somewhat improved in Access 2007, including provisions for error handling and the ability to embed macros within individual forms. And additional improvements were made for Access 2010, enough improvements that Microsoft renamed it the Macro Designer. But a rose by any other name is still a rose. If you're interested in details about the Macro Designer and its curious, some might say simplistic, reliance on repeated If...Then structures, see the sidebar titled "Using The Macro Builder" in Chapter 28, "Understanding the Access Object Model and Key Objects." + +For the reasons I mentioned, you will likely prefer to use the Visual Basic Editor rather than the Builder/Designer for any but the most elementary macros. After all, relying on a list of If queries is not only limiting, it's downright dated. + +Let's get a feel for writing real VBA macros in Access. In this example, you'll write a macro that displays today's date and time: + +1. Start Access. + +2. Double-click the Blank Desktop Database icon (in Access 2010 and earlier, double-click the Blank Database button). + +3. Press Alt+F11 to open the Visual Basic Editor. + +4. Right-click the database name in the Project Explorer, then choose Insert Module to open a new module in the Code window, where you can write macros. + +5. In the Code window, type the following macro: + + Sub ShowDate() + + MsgBox ("It is: " & Now) + + End Sub + +6. Click anywhere within this code, and then press F5 to execute the macro. You should see a message box that displays the current date and time. (Note that you don't type the End Sub; Access automatically inserts it for you.) + +We'll cover Access macro programming in depth in Chapter 28 and Chapter 29, "Manipulating the Data in an Access Database via VBA." Also, you might have noticed that the Editor automatically inserted a line of code at the top: Option Compare Database. This specifies a particular way to go about comparing text strings. + +# The Bottom Line + +**Set up the Visual Basic Editor for creating procedures.** + +How you arrange the various components of the Visual Basic Editor is your personal choice, but while using this book, it's easiest if you set up the Editor to resemble the way it appears in the book's figures. Besides, this arrangement is quite close to the default layout, which has proven to be the most effective one for the majority of programmers (according to various focus groups and polls) for the decades that Visual Basic has been used. + +Master It + +Press a single key to display, then hide, the Properties window. 
+
+**Create a procedure for Word.**
+
+Using the Help feature in any VBA-enabled application allows you to find code examples that you can copy and paste into your own code.
+
+Master It
+
+Open the Code window and use Help to find a code example.
+
+**Create a procedure for Excel.**
+
+Certain procedure names are special. In a previous Excel exercise, you added line numbering and gave that procedure a name of your own choice. But some procedure names have a special meaning—they are triggered by an _event_ in Excel itself. They will execute _automatically_ when that event takes place (you don't have to run events by choosing Run from the Macro dialog box or by assigning the macro to a keyboard shortcut or Quick Access Toolbar button). One such event is Excel's Auto_Open procedure.
+
+Master It
+
+Display a message to the user when Excel first executes.
+
+**Create a procedure for PowerPoint.**
+
+As you type a procedure, the Visual Basic Editor provides you with lists of objects' members (the Auto List Members feature) and with syntax examples, including both required and optional arguments (the Auto Quick Info feature). These tools can be invaluable in guiding you quickly to the correct object and syntax for a given command.
+
+Master It
+
+Use the Auto List Members and Auto Quick Info features to write a macro that saves a backup copy of the currently active presentation.
+
+**Create a procedure for Access.**
+
+Although Access includes a variety of macro-related features that are unique (such as its Macro Builder/Designer), its Visual Basic Editor is quite similar to the Visual Basic Editors in the other Office applications.
+
+Master It
+
+Open the Visual Basic Editor in Access and write a macro that displays today's date using the Date function rather than the Now function. Use the Access Visual Basic Editor Help system to understand the difference between these two functions.
+Part 2
+
+Learning How to Work with VBA
+
+  * **Chapter 5: Understanding the Essentials of VBA Syntax**
+  * **Chapter 6: Working with Variables, Constants, and Enumerations**
+  * **Chapter 7: Using Array Variables**
+  * **Chapter 8: Finding the Objects, Methods, and Properties You Need**
+
+Chapter 5
+
+Understanding the Essentials of VBA Syntax
+
+In this chapter, you'll learn the essentials of VBA syntax, building on what you learned via practical examples in the previous chapters. This chapter defines the key terms that you need to know about VBA to get going with it, and you'll practice using some of the features in the Visual Basic Editor.
+
+* * *
+
+If You Don't Understand a Programming Term, Look Ahead
+
+You'll find lots of definitions of programming terms as you work your way through this chapter. If you come across something that doesn't yet make sense to you, just keep going; you'll most likely find an explanation in the next few pages.
+
+* * *
+
+In this chapter you will learn to do the following:
+
+  * Understand the basics of VBA
+  * Work with procedures and functions
+  * Use the Immediate window to execute statements
+  * Understand objects, properties, methods, and events
+
+# Getting Ready
+
+To learn most efficiently in this next section, arrange the Visual Basic Editor in Word by performing the following steps. This chapter focuses on Word because it's the most widely distributed of the VBA-enabled applications. If you don't have Word, read along anyway without performing the actions on the computer; the examples are easy to follow.
(Much of this will work on any VBA host application, though many of the commands shown here are specific to Word.) Here are the steps: + +1. Start Word. + +2. Launch the Visual Basic Editor by pressing Alt+F11 or clicking the Developer tab on the Ribbon and then clicking the Visual Basic button. + +3. Arrange the Word window and the Visual Basic Editor window so that you can see both of them at once. For example, if these are the only two open windows that are not minimized, right-click the Taskbar and choose Show Windows Stacked or Show Windows Side By Side from the context menu to arrange the windows, or just drag them by their title bars to the right or left side. + +4. Display the Immediate window in the Visual Basic Editor by pressing Ctrl+G, choosing View ⇒ Immediate Window, or clicking the Immediate Window button on the Debug toolbar. Your setup should look like Figure 5.1. + +Figure 5.1 The Visual Basic Editor set up alongside a Word document. This is a good way to edit or debug macros. You can see where you are in the code and, often, the effect the macro is having. + +* * * + +Using Dual Monitors + +If you're using a multiple-monitor setup, you can dedicate one monitor to Word and another to the Visual Basic Editor. + +* * * + +# Procedures + +A _procedure_ in VBA is a named unit of code that contains a sequence of statements to be executed as a group. VBA itself has a library of procedures. + +For example, VBA contains a function (a type of procedure) named Left, which returns the left portion of a text string that you specify. For example, hello is a string of text five characters long. The statement Left("hello", 3) returns the leftmost three characters of the string: hel. (You could then display this three-character string in a message box or use it in code.) The name assigned to the procedure gives you a way to refer to the procedure. + +In addition, when you write a macro, you are writing a procedure of your own (as opposed to a procedure built into VBA already). + +Any executable code (your macros) in VBA must be contained in a procedure—if it isn't, VBA can't execute it and an error occurs. (The exception is statements you execute in the Immediate window, which take place outside a procedure. However, the contents of the Immediate window exist only during the current VBA session and are used for testing code. They cannot be executed from the host application via buttons, ribbons, or keyboard shortcuts.) + +A macro—in other words the code from Sub to End Sub—is a procedure. + +Procedures are contained within modules, which in turn are contained within project files, templates, or other VBA host objects, such as user forms. + +There are two types of procedures: functions and subprocedures (subs). + +## Functions + +A _function_ in VBA is one of two types of procedures. Like a sub, a function is a procedure designed to perform a specific task. For example, the built-in VBA Left function returns the left part of a text string, and the Right function, its counterpart, returns the right part of a text string. Each function has a clear task that you use it for, and it doesn't do anything else. To take a ridiculous example, you can't use the Left function to print a document in Word or make characters boldface—for those tasks, you need to use the appropriate functions, methods, and properties. Left just does its one, simple job. + +VBA comes with many built-in functions, but you can create your own as well. You'll create your own functions later in the book. 
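+
+To give you a taste of what's coming, here's a minimal sketch of a user-defined function; the name and the job it does are purely illustrative:
+
+    Function FirstWord(strText As String) As String
+        ' Return everything before the first space, or the
+        ' whole string if it contains no space at all.
+        Dim lngPos As Long
+        lngPos = InStr(strText, " ")
+        If lngPos = 0 Then
+            FirstWord = strText
+        Else
+            FirstWord = Left(strText, lngPos - 1)
+        End If
+    End Function
+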
They will begin with a Function statement and end with an End Function statement. + +_Each function returns a value_. For example, the Left function returns the left part of the string. Other functions return different kinds of results. Some, for example, just test a condition and return True if the condition is met and False if it is not met. But just remember that what distinguishes a function is that it returns some value. + +## Subprocedures + +A _subprocedure_ (also called a _sub_ or _subroutine_ ), like a function, is a complete procedure designed to perform a specific task, but unlike a function, a sub _does not return a value_. + +Note that many tasks need not return a result. For example, the Transpose_Word macros you created earlier in this book merely switch a pair of words in a document. There's no need for any value to be returned to VBA for further use. On the other hand, if your procedure calculates sales tax, there _is_ a result, the amount of tax, that must be returned by the procedure for display to the user or further manipulations by the VBA code. + +All the macros you record using the Macro Recorder are subprocedures, as are many of the procedures you'll look at in the rest of this book. + +Each subprocedure begins with a Sub statement and ends with an End Sub statement. + +* * * + +Functions Aren't Displayed in the Macros Dialog Box + +Only subprocedures appear in the Macros dialog box. Should you choose to write a function, it will not appear in that box. + +* * * + +# Statements + +When you create a macro in VBA, you're writing _statements_ , which are similar to sentences in ordinary speech. A _statement_ is a unit of code that describes an action, defines an item, or gives the value of a variable. VBA usually has one statement per line of code, although you can put more than one statement on a line by separating them with colons. (This isn't usually a good idea because it makes your code harder to read. Most programmers stick to one statement per line.) + +You can also break a lengthy line of code onto a second line or a subsequent line to make it easier to read (although this isn't usually necessary). You continue a statement onto the next line by using a line-continuation character: an underscore (_) preceded by a space (and followed by a carriage return; in other words, press the Enter key). You continue a line strictly for visual convenience; VBA still reads continued lines as a single "virtual" line of code. In other words, no matter how many line continuations you use for easy-to-read formatting, during execution it's still a single statement to VBA. + +So, think of VBA code as a series of sentences, each on its own line (or continued), that are usually executed one by one down from the top. + +* * * + +You Can't Break Strings with the Line-Continuation Character + +You can't break a string (text enclosed in quotation marks) with the line-continuation character. If you need to break a line that involves a long string in quotes, break the string into shorter strings and concatenate them using the & operator: "This" & "that". + +* * * + +VBA statements vary widely in length and complexity. A statement can range in length from a single word (such as Beep, which makes the computer beep, or Stop, which halts the execution of VBA code) to very long and complicated lines involving many components. But to make it easy to read your code, try to make your lines as brief as possible. + +That said, let's examine the makeup of several sample VBA statements in Word. 
Most of these will use the ActiveDocument object, which represents the active document in the current session of Word; a couple use the Documents collection, which represents all open documents (including the active document); and one uses the Selection object, which represents the current selection within a document (selected text or the location of the blinking insertion cursor). Don't worry if some of these statements aren't immediately comprehensible—you'll understand them soon enough.
+
+Here are some example statements for you to try:
+
+    Documents.Open "c:\temp\Sample Document.docm"
+    MsgBox ActiveDocument.Name
+    ActiveDocument.Words(1).Text = "Industry"
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+    Documents.Add
+    Selection.TypeText "The quick brown fox jumped over the lazy dog."
+    Documents.Close SaveChanges:=wdDoNotSaveChanges
+    Application.Quit
+
+Let's look at each of these statements in turn. The statement
+
+    Documents.Open "c:\temp\Sample Document.docm"
+
+uses the Open method of the Documents collection to open the specified document—in this case, Sample Document.docm. Enter this statement in the Immediate window, substituting the path and filename of a document that exists on your computer for **_c:\temp\Sample Document.docm_**.
+
+Press the Enter key, and VBA opens the document in the Word window. Just as when you open a document by hand while working interactively in Word, this statement in the macro makes this document the active document (the document whose window has the _focus_; in other words, the window that is currently selected and will therefore take input from keystrokes or mouse activity).
+
+The statement
+
+    MsgBox ActiveDocument.Name
+
+uses the MsgBox function (built into VBA) to display the Name property of the ActiveDocument object (in this example, Sample Document.docm). As an experiment, type this MsgBox statement into the Immediate window (type in lowercase, and use VBA's Help features as you choose) and press the Enter key. VBA displays a message box over the Word window. Click the OK button to dismiss the message box.
+
+Now you see how you can quickly test a statement using the Immediate window. You don't have to execute an entire macro; you can just try out a single statement (a single line of code) in the Immediate window if you want to see its effect.
+
+Next, the statement
+
+    ActiveDocument.Words(1).Text = "Industry"
+
+uses the _assignment operator_ (the equal [=] sign) to assign the value Industry to the Text property of the first item in the Words collection in the ActiveDocument object. Enter this statement in the Immediate window and press the Enter key. You'll see the word _Industry_ displayed in the current typeface at the beginning of the document you opened.
+
+Note that after this line executes, the blinking insertion point appears at the _beginning_ of this word rather than at the end of the word, where it would be if you'd typed the word. This happens because VBA manipulates the properties of the document (in this case the Words collection) directly rather than imitating "typing" into it.
+
+The statement
+
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+
+uses the Close method to close the ActiveDocument object. It uses one _argument_, SaveChanges, which controls whether Word saves the document that's being closed (if the document contains unsaved changes). In this case, the statement uses the constant wdDoNotSaveChanges to specify that Word shouldn't save changes when closing this document.
Enter this statement in the Immediate window and press the Enter key, and you'll see VBA make Word close the document. + +An _argument_ is information you send to a procedure. For example, in this next statement the argument is the text string show, which is sent to the built-in VBA MsgBox function: + + MsgBox ("show") + +A MsgBox function will display _any_ text. So you send it an argument: the particular text you want it to display. You'll learn more about arguments shortly. + +Now try entering this statement in the Immediate window: + + Documents.Add + +This statement uses the Add method of the Documents collection to add a new Document object to the Documents collection. In other words, it creates a new document. Because the statement doesn't specify which template to use, the new document is based on the default template (Normal.dotm). When you enter this statement in the Immediate window and press Enter, Word creates a new document. As usual, this new document becomes the active document. + +The statement + + Selection.TypeText "The quick brown fox jumped over the lazy dog." + +uses the TypeText method of the Selection object to type text into the active document at the position of the insertion point or current selection. (The Selection object represents the current selection, which can be either a "collapsed" selection—a mere insertion point with nothing actually selected, as in this example—or one or more selected objects, such as one or more words.) + +If text is selected in the active document, that selection is overwritten as usual—unless you've cleared the Typing Replaces Selected Text check box by pressing Alt+F then I, and then clicking the Advanced option in the left pane of the Word Options dialog box. In that case, the selection is collapsed to its beginning and the new text is inserted before the previously selected text. + +But in this example—because you just created a new document—nothing is selected. Enter the previous Selection.TypeText statement in the Immediate window and press the Enter key, and Word enters the text. Note that this time the insertion point ends up _after_ the inserted text; the TypeText method of the Selection object _is_ analogous to typing something into Word yourself. + +The statement + + Documents.Close SaveChanges:=wdDoNotSaveChanges + +is similar to an ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges statement except that it works on the Documents collection rather than the ActiveDocument object. The Documents collection represents _all_ open documents in the current Word session. So this statement closes all open documents and doesn't save any unsaved changes in them. Enter this statement in the Immediate window and press Enter, and you'll see that Word closes all the open documents. + +The statement + + Application.Quit + +uses the Quit method of the Application object to close the Word application. Enter the statement in the Immediate window and press the Enter key. Word closes itself, also closing the Visual Basic Editor in the process because Word is the host for the Visual Basic Editor. + +* * * + +Getting Help in Visual Basic for Applications + +The Visual Basic Editor offers comprehensive help for the Visual Basic for Applications programming language. To view it, choose Help ⇒ Microsoft Visual Basic For Applications Help from the Visual Basic Editor. You're taken to a website devoted to the current application (in this case, Word 2013). + +Pressing F1 works two ways. 
If your blinking cursor is on a blank space or an empty line in the Code window, F1 displays a generic Office Help page. This page contains the link "Welcome to the Visual Basic for Applications language reference for Office 2013." Click that link.
+
+Here's a second way to press F1 for help. Often the quickest way to get help is to click a keyword in your code, such as ActiveWindow or MsgBox. By clicking, you put the blinking insertion cursor in that command, "selecting" it. Now when you press F1, the Editor tries to locate online help for that particular command.
+
+Most of the built-in VBA statements and functions are illustrated with code examples, which can be particularly useful when you're creating and troubleshooting your own code. The samples show you how it's done.
+
+The Visual Basic Help files use a couple of conventions you should know about before you try to use them:
+
+  * Italics denote variables or values you'll need to change yourself.
+  * Brackets ([ ]) denote optional arguments.
+
+This book uses the same conventions, so you'll see them in use soon.
+
+If you don't find what you need by searching the Microsoft Visual Basic Help web pages, choose Help ⇒ MSDN On The Web. That's a more generic Office 2013 help site, with links for all the various Office applications and their object library references.
+
+* * *
+
+# Keywords
+
+A _keyword_ is a word that is part of the built-in VBA language. Here are some examples:
+
+  * The Sub keyword indicates the beginning of a subprocedure, and the End Sub keywords mark the end of a subprocedure.
+  * The Function keyword indicates the beginning of a function, and the End Function keywords mark the end of a function.
+  * The Dim keyword starts a declaration (for example, of a variable) and the As keyword links the item declared to its type, which is also a keyword. For example, in the statement Dim strExample As String, there are three keywords: Dim, As, and String.
+
+The names of functions and subprocedures are not keywords (neither the built-in procedures nor procedures you write). Note that in this book I sometimes use the term _command_ as a synonym for _keyword_.
+
+* * *
+
+Identifying Keywords by Color
+
+The Visual Basic Editor displays all keywords in blue. But if you wish, you can specify a different color for keyword text on the Editor Format tab of the Options dialog box (choose Tools ⇒ Options from the Visual Basic Editor). If you're not sure whether an item is a keyword, check whether the color the Visual Basic Editor gives the item is the same color as keywords such as Sub.
+
+* * *
+
+# Expressions
+
+An _expression_ involves multiple words. It consists of a combination of keywords, operators, variables, and/or constants that results in (or _resolves to_) a string, number, or object. For example, you could use an expression to do a math calculation or to compare one variable against another. Here's an example of a numeric expression (it's shown in boldface) that compares the variable _N_ to the number 4 by using the > (greater than) operator:
+
+    If **N > 4** Then
+
+The result of this expression will depend on whatever value is currently held in the variable _N_. If it holds 12, then the expression will result in True because 12 is greater than 4. More on expressions later.
+
+# Operators
+
+An _operator_ is a symbol you use to compare, combine, or otherwise work with values in an expression. VBA has four kinds of operators:
+
+  * _Arithmetic operators_ (such as + and –) perform mathematical calculations.
+ * _Comparison operators_ (such as < and >, less than and greater than, respectively) compare values. + * _Logical operators_ (such as And, Not, and Or) build logical structures. + * The _concatenation operator_ (&) joins two strings together. + +You'll look at the different kinds of operators and how they work in Chapter 11, "Making Decisions in Your Code." + +# Variables + +A _variable_ is a location in memory set aside for storing a piece of information that can change while a procedure is running. (Think of it as a named, resizable compartment within the memory area.) + +For example, if you need the user to input their name via an input or a dialog box, you'll typically store the name in a variable so you can work with it further down in some later statement in the procedure. Or perhaps you're adding several numbers that the user types in. You would have a variable that holds the current sum total—which keeps changing (varying) as the user types in more numbers. + +VBA uses several types of variables, including these: + +_Strings_ store text characters or groups of characters. + +_Integers_ store whole numbers (numbers without fractions). + +_Objects_ store objects. + +_Variants_ can store any type of data. Variant is the default type of variable. + +Either you can let VBA create Variant variables as the default type, or you can specify another data type if you wish. Specifying the types of variables has certain advantages that you'll learn about in due course. + +For the moment, try creating a variable in the Immediate window. Type the following line and press Enter: + + myVariable = "Some sample text" + +Nothing visible happens, but VBA has created the myVariable variable. It has set aside some memory and labeled that area myVariable. It also stored the text string Some sample text in that variable. Now, type the following line and press Enter: + + MsgBox myVariable + +This time, you can see the result: VBA goes to the memory area you specified (with the variable name myVariable) and retrieves the value, the string. A message box appears containing the text you had stored in the variable. + +You can declare variables either explicitly or implicitly. An _explicit_ declaration is a line of code that specifies the name you want to give the variable, and usually its type, before you use the variable in your code. Here's an explicit variable declaration: + + Dim myVariable As String + +An _implicit_ declaration means that you don't bother with that explicit declaration statement. Instead, you just use the variable name in some other statement. VBA then stores the data in a Variant variable. (You have not specified the type.) + +In other words, if you _just use_ a variable in your code without declaring it, it's implicit. + +Here's an example of implicit declaration: + + myVariable = "Some sample text" + +You never _explicitly_ declared this variable. The first time it appeared in your code, you just assigned some data, the text, to it. So VBA assumes that you want to create the variable implicitly. + +In the next few chapters, you'll use a few implicit variable declarations to keep things simple. In other words, you won't have to type in lines of code to declare implicit variables. VBA will create them for you when you first use them in an assignment or other statement. + +However, many educators and professional programmers insist on explicit declaration, so we'll do that for the most part in the later sections of this book. 
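+
+To see the two styles side by side, here's a minimal sketch (the variable names are hypothetical); the first assignment declares its variable implicitly, while the second variable is declared explicitly before it's used:
+
+    Sub Compare_Declarations()
+        ' Implicit: VBA creates the Variant variable Greeting
+        ' the moment this statement uses it.
+        Greeting = "Welcome back"
+
+        ' Explicit: strGreeting is declared, with a data type,
+        ' before anything is stored in it.
+        Dim strGreeting As String
+        strGreeting = "Welcome back"
+
+        MsgBox Greeting & " / " & strGreeting
+    End Sub
+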
+Explicit variable declarations make your code run faster and make it easier to understand. What's more, some types of errors can be avoided if you explicitly declare all your variables. So declaring is a good habit to get into.
+
+# Constants
+
+A _constant_ is similar to a variable. It's a named item that keeps a constant value while a program is executing. The constant's meaning _doesn't change_ during the macro's execution. (So in this way, it's unlike a _variable_.)
+
+VBA uses two types of constants: _intrinsic constants_, which are built into the VBA language itself (and individual Office applications' implementations of VBA), and _user-defined constants_, which you can create. For example, the built-in constant vbOKCancel is always available in VBA to be used with the MsgBox function. This constant creates a message box that contains an OK and a Cancel button. There are sets of built-in constants for colors, printing (vbTab, for example), and other properties.
+
+Concerning constants that you define, you might want to create one to store a piece of information that doesn't change, such as the name of a procedure or the distance between Boston and New York. In practice, the built-in _intrinsic_ constants are used quite often in VBA programming; user-defined constants, not so much. It's just as easy to put the distance between those cities in a variable, even though it won't vary.
+
+# Arguments
+
+An _argument_ is a piece of information—supplied by a constant, a variable, a literal, or an expression—that you pass to a procedure, a function, or a method. Some arguments are required; others are optional. The text hello there in this MsgBox function is an argument:
+
+    MsgBox "hello there"
+
+Here's another example. As you saw earlier, the following statement uses the optional argument SaveChanges to specify whether Word should save any unsaved changes while closing the active document:
+
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+
+This optional argument uses the built-in constant wdDoNotSaveChanges.
+
+* * *
+
+Understanding Literals
+
+A _literal_ can be used instead of a constant or variable, if you wish. With a literal, you just type the actual value into the argument. For example, you could display a message box that says "Hi there!" by using a variable:
+
+    txtMsg = "Hi there!"
+    MsgBox (txtMsg)
+
+Or you could simply avoid the variable and employ a literal (the actual text string) as the argument:
+
+    MsgBox ("Hi there!")
+
+Both of these approaches have the same result.
+
+* * *
+
+The Visual Basic Editor's helpful prompts and the Visual Basic Help file show the list of arguments for a function, a procedure, or a method in parentheses, with any optional arguments enclosed in brackets. If you have its Auto Quick Info feature activated, the Editor displays the argument list for a function, procedure, or method after you type its name followed by a space.
+
+Figure 5.2 shows the argument list for the Documents collection's Open method. Type **Documents.Open**, then press the spacebar to see the argument list.
+
+Figure 5.2 Optional arguments are enclosed within brackets.
+
+The FileName argument is required, so it isn't surrounded by brackets. All the other arguments (ConfirmConversions, ReadOnly, AddToRecentFiles, and so on) are optional and therefore are surrounded by brackets.
+
+If you don't supply a value for an optional argument, VBA uses the default value for the argument. (To find out the default value for an argument, consult the VBA Help file.
The default is usually the most commonly employed value.) The Visual Basic Editor uses boldface to indicate the current argument in the list; as you enter each argument, the next argument in the list becomes bold.
+
+## Specifying Argument Names vs. Omitting Argument Names
+
+You can add arguments in either of two ways:
+
+  * Enter the name of the argument (for example, ConfirmConversions), followed by a colon, an equal sign (ConfirmConversions:=), and the constant or value you want to set for it (ConfirmConversions:=True). For example, the start of the statement might look like this:
+
+    Documents.Open FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True, ReadOnly:=False
+
+  * Or enter the constant or value in the appropriate position in the argument list for the method, without entering the name of the argument. The previous statement would look like this:
+
+    Documents.Open "c:\temp\Example.docm", True, False
+
+When you use the first approach—naming the arguments—you don't need to put them in order because VBA looks at their names to identify them. The following statements are functionally equivalent:
+
+    Documents.Open ReadOnly:=False, FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True
+
+    Documents.Open FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True, ReadOnly:=False
+
+You also don't need to indicate to VBA which optional arguments you're omitting.
+
+By contrast, when you don't employ argument names, you're specifying which argument is which by its position in the list. Therefore, _the arguments must be in the correct order_ for VBA to recognize them correctly. If you choose not to use an optional argument but to use another optional argument that follows it, enter a comma (as a placeholder) to denote the omitted argument. For example, the following statement omits the ConfirmConversions argument and uses a comma to denote that the False value refers to the ReadOnly argument rather than the ConfirmConversions argument:
+
+    Documents.Open "c:\temp\Example.docm",, False
+
+Remember that when you type the comma in the Code or the Immediate window, Auto Quick Info moves the boldface to the next argument in the argument list to indicate that it's next in line for your attention.
+
+* * *
+
+Required Arguments Precede Optional Arguments
+
+Typically, required arguments are listed first in the argument list—before optional arguments. That way, you don't have to use commas to indicate the omission of optional arguments if you want to enter only the required arguments. You can just leave out all the rest of the items in the argument list.
+
+* * *
+
+## When to Include the Parentheses around the Argument List
+
+Most programmers enclose argument lists within parentheses. It makes the code easier to read. However, parentheses can be omitted in some circumstances. When you're assigning the result of a function to a variable or other object, you _must_ enclose the whole argument list in parentheses. For example, to assign to the variable objMyDocument the result of opening the document c:\temp\Example.docm, use the following statement (note the Set keyword, which VBA requires when assigning an object):
+
+    Set objMyDocument = Documents.Open(FileName:="c:\temp\Example.docm", _
+        ConfirmConversions:=True, ReadOnly:=False)
+
+However, when you aren't assigning the result of an operation to a variable or an object, you don't need to use the parentheses around the argument list, even though it's common practice to do so.
The following examples illustrate how you can either use or leave out parentheses when not assigning a result to a variable or other object: + + MsgBox ("Hi there!") + MsgBox "Hi there!" + +# Objects + +To VBA, each application consists of a series of _objects_. Here are a few examples: + + * In Word, a document is an object (the Document object), as is a paragraph (the Paragraph object) and a table (the Table object). Even a single character is an object (the Character object). + * In Excel, a workbook is an object (the Workbook object), as are the worksheets (the Worksheet object) and charts (the Chart object). + * In PowerPoint, a presentation is an object (the Presentation object), as are its slides (the Slide object) and the shapes (the Shape object) they contain. + +Most of the actions you can take in VBA involve manipulating objects. For example, as you saw earlier, you can close the active document in Word by using the Close method on the ActiveDocument object: + + ActiveDocument.Close + +# Collections + +A _collection_ is an object that contains other objects, the way an umbrella-stand object contains umbrella objects. Collections provide a way to access all their members at the same time. For example, the Documents collection contains all the open documents, each of which is an object. Instead of closing Document objects one by one, you can close all open documents by using the Close method on the Documents collection: + + Documents.Close + +Likewise, you can use a collection to change the properties of all the members of a collection simultaneously. + +Here's an example of some code that displays, in the Immediate window of the Editor, all the names of the objects in Word's CommandBars collection: + + 'fetch the number of commandbars + n = CommandBars.Count + + 'display all their names + For i = 1 To n + Debug.Print CommandBars(i).Name + Next i + +# Properties + +Each object has a number of _properties_. Think of properties as the qualities of an object, such as its color, size, and so on. + +For example, the current document in Word has properties such as the number of sentences in the document. Type this into the Immediate window, then press Enter: + + MsgBox (ActiveDocument.Sentences.Count) + +Here you're using the Count property of the Sentences collection to find out how many sentences are in the document. + +Likewise, even a single character has various properties, such as its font, font size, and various types of emphasis (bold, italic, strikethrough, and so on). + +# Methods + +A _method_ is something an object can _do_. A capability. Different objects have different methods, just as different people have different talents. 
For example, here's a list of some of the methods of the Document object in Word (many of these methods are also available to objects such as the Workbook object in Excel and the Presentation object in PowerPoint): + +**Activate** + +Activates the document (the equivalent of selecting the document's window with the keyboard or mouse) + +**Close** + +Closes the document (the equivalent of pressing Alt+F then C, or clicking the Close button after clicking the File tab on the Ribbon) + +**Save** + +Saves the document (the equivalent of pressing Alt+F then S, or clicking the Save button after clicking the File tab on the Ribbon) + +**SaveAs** + +Saves the document under a specified name (the equivalent of pressing Alt+F then A, or clicking the Save As button after clicking the File tab on the Ribbon) + +# Events + +When an _event_ occurs, VBA is aware that something happened, usually something that happened _to_ an object. For example, the opening of a file (either by a user or by a macro procedure) typically generates an event. The user clicking a button in the toolbar generates a Click event. Another way to put it is that when you click a button, you trigger that button's Click event, and VBA becomes aware that this has happened. + +By writing code for an event, you can cause VBA to respond appropriately when that event occurs. For example, let's say you display a user form (a window). You might write some code in an OK button's Click event. This code might check that all necessary settings were specified by the user when the user clicked the OK button to close the user form and apply the settings. You might write more code within that button's Click event that responded (perhaps by displaying a message box) if the user had failed to type in some required information. In essence, you can write code in an event to tell VBA what to do if that event is triggered. You don't have to write code for all events; sometimes you'll write code in only one of them. But if you put a button captioned "Display Results" on a user form, you'd better at least write some code in that button's Click event to display some results. + +* * * + +Objects and Their Components + +I'll have much more to say about objects throughout the rest of this book. For now, see if you can identify the three primary parts of a typical object: properties (its qualities), methods (ways you can make the object behave), and events (something that happens to an object while a program or application is executing). Collectively, these three components of an object are called the object's _members_. + +Take a look at the following code window. See if you can spot the members of the Document object—its properties, its methods, and an event. + +Here, you can see that the ThisDocument object is selected in the Project Explorer on the left. This object has available to it the many properties in the long list displayed in the Properties window on the left side. You can either modify those properties directly in the Visual Basic Editor or write code that modifies them on the fly while the macro executes. + +On the right side is a drop-down list of events—actions that can happen to a Document object, or at least happen while the document is in existence within the computer. You can write code in any of these events (in the Code window, each event will be a separate subprocedure, enclosed within the Sub and End Sub statements). 
Here, you can see that we're writing code that will execute when the Document_Close event is triggered: + + Private Sub Document_Close() + +In this example, I'm writing code to query users if they attempt to close the document. This code will execute anytime this document's Close event is triggered (when the user clicks the x button in the upper-right corner of the window, for instance). + +Only one _method_ is shown in the Code-window illustration. Can you spot it? It's in boldface in the following code example: + + Private Sub Document_Close() + Dim intAnswer As Integer + intAnswer = MsgBox("Do you want to check the spelling?", _ + vbOKCancel, "Document Is Being Closed") + If intAnswer = 1 Then ' they clicked OK. 1 = OK 2 = Cancel + ThisDocument. **CheckSpelling** + End If + End Sub + +As you can see, CheckSpelling is a method (a task that an object is able to carry out). + +* * * + +# The Bottom Line + +**Understand the basics of VBA.** + +VBA includes two types of procedures, used for different purposes. + +Master It + +Name the two types of procedures used in VBA (and indeed in most computer languages), and describe the difference between them. + +**Work with procedures and functions.** + +A procedure is a container for a set of programming statements that accomplish a particular job. + +Master It + +Write a subprocedure in the Visual Basic Editor that displays a message to the user. Then execute that subprocedure to test it. + +**Use the Immediate window to execute individual statements.** + +When you're writing code, you often want to test a single line (a statement) to see if you have the syntax and punctuation right or if it produces the expected result. + +Master It + +Open the Immediate window, type in a line of code, and then execute that line. + +**Understand objects, properties, methods, and events.** + +Object-oriented programming (OOP) means creating objects to use in your programming. OOP has become the fundamental paradigm upon which large programming projects are built. Generally speaking, macros are not large and therefore don't profit from the clerical, security, and other benefits that OOP offers—particularly for people who write large applications as a team. + +However, code libraries, such as the vast VBA set of objects and their members (not to mention the even vaster .NET libraries that tap into the power of the operating system itself) _are_ written by large groups of people, and written at different times. These libraries themselves are huge. There must be a way to organize their objects and functions—to categorize them and allow you to execute the methods and manage their properties and arguments. As a result, another aspect of OOP—taxonomy—is quite valuable even when writing brief macros. It's a way to quickly locate the members you're interested in. + +Master It + +Look up the Document object in the Visual Basic Editor's Help system; then look at its methods. +Chapter 6 + +Working with Variables, Constants, and Enumerations + +This chapter covers the basics of working with variables, constants, and enumerations. _Variables_ are used very often; they provide a way of storing and manipulating information. Variables come in several types, such as String variables for storing text, various numeric data types for storing numbers (for example, Integer variables for storing integer values), Date variables for storing dates and time, Boolean variables for storing True/False values, and even Object variables for storing objects. 
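+
+Here's a quick sketch of several of these types in use; the variable names are hypothetical, and the Object example assumes the code is running in Word:
+
+    Sub Show_Variable_Types()
+        Dim strCustomer As String    ' text
+        Dim intUnits As Integer      ' whole numbers
+        Dim datOrdered As Date       ' dates and times
+        Dim blnShipped As Boolean    ' True/False values
+        Dim objDoc As Object         ' an object reference
+
+        strCustomer = "Jane Magnolia"
+        intUnits = 12
+        datOrdered = Now
+        blnShipped = False
+        Set objDoc = ActiveDocument  ' objects are assigned with Set
+    End Sub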
+ +A _constant_ is a named item that stores a value that doesn't change. Constants, like variables, exist only while a program is executing. Most programmers rarely create their own constants; they just use variables instead. However, there is another kind of constant that the programmer does not create. And this type of constant is used all the time. Many useful constants are built into VBA, to represent elements in Access, text color options in Excel, styles in Word, and so on. + +For our purposes, the term _enumeration_ means a numbered list—like a list of all the items you need to buy to paint a room. The list contains both the numbers and the names of the items. So you can refer to each item either by its number in the list or by its name. Essentially, an enumeration is a group of related, predefined constants. But constants are more commonly identified by their names rather than their numbers in the list. That's because the name AnimationFlyIntoFromLeft is easier to use in your programming than its number, 1312. + +The one type of variable that this chapter doesn't discuss is the Array variable, which is used to store a set of multiple pieces of related information at the same time. It's similar to an enumeration. Arrays are so important in computer programming that I'll devote an entire chapter to them: Chapter 7, "Using Array Variables." + +In this chapter you will learn to do the following: + + * Understand what variables are and what you use them for + * Create and use variables + * Specify the scope and lifetime of a variable + * Work with constants + * Work with enumerations + +# Working with Variables + +Variables are used in nearly all computer programs, even short programs like macros. Think of a variable as a named area in the computer's memory that you use for storing data while a procedure is running. For example, in Chapter 5, "Understanding the Essentials of VBA Syntax," you created a variable that stored a simple string of text that you then displayed in a message box: + + myVariable = "Sample variable text" + MsgBox myVariable + +The first statement sets aside an area in memory, names it myVariable, and assigns the string Sample variable text to it. The second statement retrieves the contents (called the _value_ ) of myVariable from memory and uses the MsgBox function to display it in a message box. The contents of myVariable remain in memory, so you can use the value again if necessary while the macro is running. Or you can even change the contents. In other words, the value in a variable can _vary_ while the program runs. A _constant_ , by contrast, doesn't vary during program execution. + +## Choosing Names for Variables + +VBA imposes several constraints on how you name your variables: + + * Variable names must start with a letter and can be up to 255 characters in length. Usually, you'll want to keep them much shorter than this so that you can easily type them into your code and so that your lines of code don't rapidly reach awkward lengths. + * The Visual Basic Editor's AutoComplete feature helps make long variable names a little more manageable: Type enough of the variable's name to distinguish it from any keywords and other variable names, and press Ctrl+spacebar. If you've typed enough letters to uniquely identify the variable, the Visual Basic Editor inserts its name; if not, the Visual Basic Editor displays the drop-down list of keywords and names starting with those letters. 
+ * Variable names can't contain characters such as periods, exclamation points, mathematical operators (+, –, /, *), or comparison operators (=, <>, >, >=, <, <=), nor can they internally contain type-declaration characters (@, &, $, #). (You'll learn about the type-declaration characters later in this chapter.) + * Variable names can't contain spaces but can contain underscores, which you can use to make the variable names more descriptive by combining words. User_Response is one example. However, it's more common to just omit the underscore and let capitalization segregate the words, as in UserResponse. + +As a general rule, you're pretty safe if you stick with straightforward alphanumerics enlivened with the occasional underscore if you like underscores. + +For example, all of the following variable names are fine, although the last one is awkwardly long: + + * i + * John + * MyVariable + * MissionParameters + * The_String_the_User_Entered_in_the_Input_Box + +On the other hand, these variable names are not usable: + + * My Variable—Contains a space + * My!Variable—Contains an exclamation point + * Time@Tide—Contains a type-declaration character (@) + * 1_String—Does not start with a letter + +Each variable name must be unique within the scope in which it's operating (to prevent VBA from confusing it with any other variable). Typically, the scope within which a variable operates is a procedure, but if you declare the variable as public or private (discussed later in the chapter), its scope is wider. + +The other constraint on variable names is that you should avoid assigning to a variable a name that VBA already uses in its own language or the name of a built-in function, statement, or object member. Doing so is called _shadowing_ a VBA keyword. It doesn't necessarily cause problems, but it may prevent you from using that function, statement, or method without specifically identifying it to VBA by prefacing its name with VBA. For example, instead of Date, you'd have to use VBA.Date—no big deal, but worth avoiding in the first place. After all, why add this complexity when it's simpler to just make up your own, unique variable names? Why do things that provide you with no real benefit and have drawbacks like making your code harder to read? + +There's no reason to shadow a VBA keyword, but VBA has so many keywords that it's surprisingly easy to do so. + +Don't worry about accidentally creating a variable name that violates one of the rules listed in this section. VBA will throw you an error message if you use @ or start your variable name with 6 or try any other illegal moves. VBA will either report "Invalid Character" or separate your variable name into multiple words, such as changing 56nin into 56 nin, thinking you are trying to use line numbers in your code. (You can, if you wish, number your lines, and VBA will execute the code by just ignoring the line numbers. I number the lines in the code in this book so I can reference them in the text.) + +## Declaring a Variable + +Recall from Chapter 5 that VBA lets you declare variables either implicitly or explicitly. As you'll see shortly, each approach has its pros and cons. However, as you'll also see, explicit declarations are almost always a good idea, and when you've been working with VBA for even a little while, you'll probably use them all the time. For this reason, it's best to declare your variables explicitly right from the beginning. 
But this chapter also illustrates how to make implicit declarations so you know that technique if that's your preference. + +### Declaring a Variable Implicitly + +Declaring a variable implicitly means that you just use it in your code without first declaring it explicitly. When you declare a variable implicitly, VBA checks to make sure that there isn't already an existing variable with that name. It then automatically creates a variable with that name for you and assigns it the Variant data type, which can contain any type of data except a fixed-length string. + +For example, in the previous chapter, you declared the variable myVariable by using the following implicit declaration: + + myVariable = "Sample variable text" + +Here, myVariable is implicitly declared as a variable—because it is used in a statement rather than first being declared explicitly (usually with the Dim command). + +VBA assigns an implicitly declared variable to the Variant data type, which has a dozen or so subtypes. In this case, the variable's subtype is a string because it contains text. VBA usually assigns the variable the value Empty (a special value used to indicate Variant variables that have never been used) when it creates it, but in this case the variable receives a value immediately (because the string of text is assigned to it). VBA then assigns the string type because it can see you're storing a string in the variable. + +The advantage of declaring a variable implicitly is that you write less code. When you want a variable, you simply declare it on the spot by using it in a statement. But declaring a variable implicitly also has a couple of disadvantages: + + * It's easier to make a mistake typing the variable's name elsewhere in your code. For example, suppose you implicitly declare the variable FilesToCreate and then later type FllesToCreate instead. VBA doesn't query the latter spelling (with its double ll typo). No error messages are displayed. VBA merely creates another, new, different variable with the ll name. + +When you're working with a number of variables, it can be difficult and time-consuming to catch little typo mistakes like this. And a mistake like this (having two variables when you think you have only one) causes errors. The problem in this example is that you think you're referring to the FilesToCreate variable, but you're not. VBA can detect this kind of error, but only if _explicit declaration_ is enforced. ( _Enforced_ here means that if you try to get away with using an undeclared variable—the one with the typo was never formally declared—the Visual Basic Editor displays an error message and halts execution.) + + * The Variant variable type takes up more memory than other types of variables because it has to be able to store various types of data. This difference is negligible under most normal circumstances, particularly if you're using only a few variables or writing only short procedures. However, if you're using many variables in a huge program running on a computer with limited memory, the extra memory used by Variant variables might slow down a procedure or even run the computer out of memory. What's more important on an underpowered computer is that manipulating Variants takes longer than manipulating the other data types. This is because VBA has to keep checking to see what sort of data is in the variable. 
+
+You can get around this second disadvantage in a couple of ways: first, by using a type-declaration character to specify the data type when you declare a variable implicitly or, second (as you will see in the next section), by simply telling VBA to force you to declare variables explicitly—and to display an error message if you don't.
+
+A _type-declaration character_ is a character that you add to the end of a variable's name in an implicit declaration to tell VBA which data type to use for the variable. Table 6.1 lists the type-declaration characters.
+
+Table 6.1 Type-declaration characters
+
+**Character** | **Data Type of Variable** | **Example**
+---|---|---
+% | Integer | Quantity%
+& | Long | China&
+@ | Currency | Profits@
+! | Single | Temperature!
+# | Double | Differential#
+$ | String (variable length) | myMessage$
+
+So you could implicitly declare the String variable UserName with the following statement, which assigns the value Jane Magnolia to the variable:
+
+    UserName$ = "Jane Magnolia"
+
+And you could implicitly declare the Currency variable Price by using this statement:
+
+    Price@ = Cost * Margin
+
+You use the type-declaration character only when declaring the variable. Thereafter, you can refer to the variable by its name—UserName and Price in the previous examples.
+
+### Declaring a Variable Explicitly
+
+Declaring a variable explicitly means telling VBA that the variable exists before you use it. VBA allocates memory space to that variable and registers it as a known quantity. You can also declare the variable type at the same time—a good idea but not obligatory.
+
+You can declare a variable explicitly at any point in code before you use it, but custom and good sense recommend declaring all your variables at the beginning of the procedure that uses them. (Or, to give a variable greater scope, declare it in the General Declarations area up at the top of the Code window. More on scope later.)
+
+Locating all your declarations at the top of a procedure makes them easy to find, which helps anyone reading the code.
+
+Declaring variables explicitly offers the following advantages:
+
+  * Your code is easier to read and to debug, both for you yourself and for other programmers. When you write complex code, this is an important consideration.
+  * Forcing explicit variable declarations is accomplished by adding an Option Explicit statement at the top of a module—in the General Declarations section of the Code window. This enforcement makes it more difficult for you to create new variables unintentionally by mistyping the names of existing variables.
+  * It is more difficult for you to wipe out the contents of an existing variable unintentionally when trying to create a new variable.
+  * VBA can catch some data-typing errors at design time or compile time that with implicit declarations wouldn't surface until runtime.
+
+* * *
+
+Store the Correct Type of Value in a Variable
+
+A data-typing error occurs when you assign the wrong type of information to a variable. For example, if you declare an Integer variable and then assign a string of text to it, VBA triggers an error because it can't store string information in an Integer variable.
+
+* * *
+
+  * Your code runs a fraction faster because VBA won't need to determine each variable's type while the code is running.
+
+The disadvantage of declaring variables explicitly is that doing so takes a little more time, effort, and thought. For most code, however, this disadvantage is far outweighed by the advantages.
+ +To declare a variable explicitly, you use one of the following keywords: Dim, Private, Public, or Static. + +For example, the following statement declares the variable MyValue: + + Dim MyValue + +Dim is the most common keyword to use for declaring a variable, and you'll probably want to use it for most of your variable declarations. You use the other keywords to specify a different scope, lifetime, and data type for the variable in the declaration. In the previous example, the MyValue variable receives the default scope and lifetime and the Variant data type, which makes it suitable for general-purpose use. + +You can also declare multiple variables on the same line by separating the variable statements with commas: + + Dim Supervisor As String, ControllerCode As Long + +This can help you keep down the number of declaration lines in your code, but it makes the declarations harder to read, so it's not usually a good idea. + +Be warned that when you declare multiple variables on the same line, you must specify the data type for each, as in the previous example. You might be tempted to try a little abbreviation, like this, hoping for a couple of String variables: + + Dim strManager, strReportingEmployee As String + +This statement doesn't create two String variables: strReportingEmployee is a String variable, but strManager is a Variant because the As String part of the code applies only to strReportingEmployee. + +## Choosing the Scope and Lifetime of a Variable + +The _scope_ of a variable is the area in VBA where it can operate. Think of it as similar to your scope of activity at work: those areas in which you perform tasks and those areas in which you don't. Your scope might be the office-cubicles area of the building, but if you were found slinking around inside the walk-in safe, there would be trouble. Entering the safe is not part of your job description. + +The default scope of a variable is the procedure that declares the variable (either implicitly or explicitly). In other words, the scope is between the Sub and End Sub (or Function and End Function) that define the start and end of a procedure. Macros are most often fairly short, and thus their code is most often contained within a single procedure. For these typical macros, there's no reason for a variable to have a scope any larger than its own procedure. + +Here's an example of procedure-level scope. Suppose you have a module named Financial_Procedures that contains the procedures Breakeven_Table and Profit_Analysis_Table, each of which uses a variable named Gross_Revenue and another named Expenses. The variables in each procedure are distinct from the variables in the other procedure, so there is no danger of VBA confusing the two. (For the human reader, though, using the same variable names in different procedures rapidly becomes confusing when debugging. In general, it's a good idea to use unique variable names, even at the default procedure level.) + +The _lifetime_ of a variable is the period during which VBA remembers the value of the variable. You need different lifetimes for your variables for different purposes. A variable's lifetime is tied to its scope. Lifetime, here, refers to how long during program execution the variable is in existence. + +Sometimes you need to access a variable from outside the procedure in which it's declared. In these cases, you need to declare a different, wider scope for the variable. 
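+
+For instance, two procedures can share a single variable if you widen its scope, as in this minimal sketch (the names are hypothetical); the sections that follow spell out the rules:
+
+    ' In the General Declarations area, above the first procedure:
+    Private strSupervisor As String
+
+    Sub Set_Supervisor()
+        strSupervisor = "Paul Smith"
+    End Sub
+
+    Sub Show_Supervisor()
+        ' This works because strSupervisor is visible to every
+        ' procedure in this module.
+        MsgBox strSupervisor
+    End Sub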
* * *

Require Explicit Declarations for Variables

Most experts urge you to declare variables explicitly, and you can set VBA to require it. Most programmers and developers find this feature useful because it prevents you from declaring any variables implicitly, whether intentionally or otherwise.

To require variable declarations globally—so explicit declaration is automatically enforced in any new module you create—choose Tools ⇒ Options in the Visual Basic Editor to display the Options dialog box, select the Require Variable Declaration check box in the Code Settings area, and then click the OK button. (The Require Variable Declaration check box is cleared by default, enabling you to declare variables implicitly, which is usually the easiest way to learn how to work with variables.) The Visual Basic Editor then adds an Option Explicit statement to each new module that you create. This statement enforces explicit variable declarations for the module it's in.

When you select the Require Variable Declaration check box, the Visual Basic Editor doesn't add the Option Explicit statement to your existing modules. You must type the Option Explicit statement into your existing modules manually if you want to force explicit declarations in them too.

To require variable declarations only for specified modules, put an Option Explicit statement at the beginning of each module for which you want to require declarations. The Option Explicit statement must go before the Sub or Function statement for the first procedure in the module—if you put it inside a procedure, or between procedures, VBA gives an error when you try to run any of the code in the module. This zone—above the first procedure in a module—is called the _General Declarations_ area.

If you've set Option Explicit either globally or for a module, VBA checks the code before running it. More precisely, when VBA tries to compile the code and discovers that you haven't declared one or more of the variables, it warns you, as shown here in this screenshot. VBA also highlights the undeclared variable in your code.

If you get this message box, you can solve the problem either by declaring the variable or by turning off the requirement of variable declarations for the module. To turn off the requirement, remove the Option Explicit statement from the module by selecting and deleting the line that contains it or by commenting out the line by putting a single-quote symbol (') at the start of it.

* * *

A variable can have three types of scope:

 * Procedure
 * Private
 * Public

### Procedure Scope

A variable with _procedure scope_ (also known as _procedure-level scope_ or _local scope_) is available only to the procedure that contains it. As a result, the lifetime of a _local variable_ is limited to the duration of the procedure that declares it: As soon as that procedure stops running, VBA removes all local variables from memory and reclaims the memory that held them. This is true even if that same procedure is executed again later. Local variables don't _persist_ if execution moves outside their procedure.

Procedure scope is all you'll need for variables that operate only in the procedure in which they're declared.
For example, say you implicitly declare a Variant variable named Supervisor, like this:

    Supervisor = "Paul Smith"

You can then use the Supervisor variable in the rest of that procedure—for example, retrieving the text stored in it or changing that text. When the procedure stops running, VBA removes the variable and reclaims the memory it occupied.

* * *

Implicitly Declared Variables Are Always Local

When you declare a variable implicitly, it's automatically assigned procedure scope.

* * *

To explicitly declare a local variable, use the Dim keyword and place the declaration inside the procedure, like this:

    Sub Create_Weekly_Report()
        Dim strSupervisor As String
        Dim lngController As Long
        ...
    End Sub

Here, the second line declares the variable strSupervisor as the String data type, and the third line declares the variable lngController as the Long data type. (The section "Specifying the Data Type for a Variable," a bit later in this chapter, goes through the variable types.)

On the other hand, if you need to share any of these variables with another procedure that you call from the current procedure, procedure scope isn't sufficient—you need to use either private scope or public scope.

### Private Scope

A variable with private scope is available to all the other procedures in the module that contains it, but not to procedures in other modules. Using private variables enables you to pass the value of a variable from one procedure to another. Unlike local variables, which retain their value only as long as the procedure that contains them is running, private variables retain their value as long as the project that contains them is open.

To declare a variable with private scope, you can use either the Dim keyword or the Private keyword at the beginning of a module, placing it up top before the Sub statement for the first procedure in the module, like this:

    Dim strSupervisor As String
    Private blnConsultantAssigned As Boolean

    Sub Assign_Personnel()

The Visual Basic Editor displays the private declarations above the dividing line that appears between the General Declarations area and the code below it (see Figure 6.1).

Figure 6.1 Private variable declarations appear in the declarations area.

You'll notice that the Dim statement here uses exactly the same syntax as the earlier declaration for the local variable. The only difference is that to declare a private variable, you place the Dim statement in the declarations area rather than within a procedure. Because the Private statement has the same effect as the Dim statement for declaring private variables but can't be used within a procedure, it's clearer to use the Private statement for declaring private variables.

### Public Scope

A variable declared with _public_ scope is available anywhere in a project. It's accessible by all procedures in all modules in the project that contains it.

To declare a public variable, you use the Public keyword in the General Declarations area at the beginning of a module (up above the Sub statement for the first procedure in the module). Here's an example:

    Option Explicit
    Public intMyVar As Integer

The second statement declares the public variable intMyVar as the Integer type.

Like private variables, public variables retain their value as long as the project that contains them is open (still running).
For example, if you want to track the user's name through a series of operations in Word, you can create an AutoExec procedure that prompts users to enter their name when they start Word. (AutoExec is the built-in name for a procedure that runs automatically when Word starts. Word, when you start it, searches to see if there is a Sub named AutoExec and, if so, executes that procedure.)

* * *

The Declarations Area Appears at the Top of the Code Window as Necessary

The General Declarations area appears at the beginning of each module that contains declarations. For example, if you choose to use explicit variable declarations (by selecting the Require Variable Declaration check box on the Editor page of the Tools ⇒ Options dialog box), the Visual Basic Editor automatically enters the Option Explicit declaration at the start of each new module you create. If not, the Visual Basic Editor creates the declarations area when you first enter a statement there manually.

* * *

By storing the result of the user's input in a public variable, you can then retrieve the value for use anytime later in the same Word session. You can see how this would be handy if several macros needed the information contained in a variable. Remember that local variables (those declared inside a procedure) are destroyed as soon as that procedure reaches its End Sub statement and shuts down.

* * *

Use Prefixes to Identify Variable Types

You'll likely notice in the various examples in this chapter that it's common to employ prefixes to identify a variable's data type (more on this later in the chapter). For instance, instead of naming a variable CurrentUser in Listing 6.1, I named it strCurrentUser. This str prefix identifies CurrentUser as a variable that holds text strings. These prefixes make your code easier to read and modify because each variable, everywhere in the code, is identified as a particular type. Prefixes commonly used include str for String, int for Integer, var for Variant, lng for Long, obj for Object, and so on. As you'll see later in this book, a similar set of prefixes is used to identify controls you place on a user form: txt for Text, btn for Button, and so on. If you're interested in following this convention, you can find lists of prefixes at this location in _Wikipedia_:

* * *

Listing 6.1 shows an AutoExec procedure.

**Listing 6.1**: An AutoExec procedure

    1. Public strCurrentUser As String
    2.
    3. Sub AutoExec()
    4. strCurrentUser = InputBox("Please enter your name.", _
       "Current User Identity")
    5. End Sub
    6.
    7. Sub Identify_Current_User()
    8. MsgBox "The current user is " & strCurrentUser, _
       vbOKOnly + vbInformation, "Current User"
    9. End Sub

This code consists of three different parts:

 * Line 1 declares the public String variable strCurrentUser.
 * Lines 3 through 5 contain the AutoExec procedure. This procedure runs each time the user starts Word. Line 4 displays an input box that prompts the user to enter their name and stores their response in the public variable strCurrentUser.
 * Lines 7 through 9 contain the Identify_Current_User procedure, which simply displays a message box that gives the name of the user, along with lead-in text and an information icon and title bar for completeness.

You can test these procedures by stepping through (by pressing the F8 key) first the AutoExec procedure and then the Identify_Current_User procedure in the Visual Basic Editor.
But to see their effect, you'll have to create the procedures and then exit Word. When you restart Word, the AutoExec procedure displays the input box for you to enter your name. At any point thereafter (until you exit Word), you can access the value in the strCurrentUser variable. For example, you could run the Identify_Current_User procedure at any time (until you close Word itself), and VBA displays a message box with the name you entered. A public variable is said to _persist_.

* * *

A Large Number of Public Variables Can Clog Memory

Why not just make all variables public? When writing short programs like macros, this wouldn't cause as much difficulty as when writing large programs or programming professionally in a team. However, there are various reasons to keep the scope of variables as local as possible. For an interesting take on the advantages and disadvantages of public (global) variables, see this website:

* * *

### Using Static Variables

Besides declaring variables with Dim, Private, and Public, you can also use the Static keyword, which is special. You can use it to cause even a _local_ variable to persist. Use Static instead of Dim when you want to declare a _static_ variable—a variable whose value you want to preserve between calls to the procedure in which it is declared.

Static variables are similar to public variables in that their lifetime is not limited to the duration of the procedure that declares them. The difference is that static variables, once declared, are available only to the procedure that declared them, whereas public variables are available to all procedures once they've been declared. So, a static variable has the scope of a local variable but the _lifetime_ of a public or private variable. There is one particular situation where static variables come in handy: _toggling_.

Static variables are useful for maintaining information on a process that you need to run a number of times during a session of the application, either to maintain a running total (for example, a count of the times you performed a procedure) or to keep at hand a piece of information that may prove useful when you run a procedure a second or subsequent time. Typically you employ a static variable in a procedure that _toggles_ something between two states. For example, you could create a procedure that when first executed turns on italics, then when next executed turns italics off, then back on, and so on. Such a toggle would look something like this:

    Sub ToggleItal()

        Static switch As Boolean

        switch = Not switch

        If switch Then
            MsgBox "On"
        Else
            MsgBox "Off"
        End If

    End Sub

You can test this by stepping through it (pressing F8 after clicking the first line of the procedure). Each time you execute the procedure, you get a different message. The Not operator switches a Boolean variable back and forth between True and False. A Boolean variable has only those two possible values.

The following statement declares the static String variable strSearchTerm1:

    Static strSearchTerm1 As String

## Specifying the Data Type for a Variable

Table 6.2 explains the data types that VBA supports and the amount of memory each variable type requires.
Table 6.2 VBA variable data types

**Variable** | **Short Description** | **Memory Required**
---|---|---
Boolean | True or False | 2 bytes
Byte | An integer from 0 to 255 | 1 byte
Currency | A positive or negative number with up to 15 digits to the left of the decimal point and 4 digits to the right of it | 8 bytes
Date | A floating-point number with the date to the left of the decimal point and the time to the right of it | 8 bytes
Decimal | An unsigned integer scaled by a power of 10 | 12 bytes
Double | A floating-point number with a negative value between –1.79769313486231570E+308 and –4.94065645841246544E-324 or a positive value between 4.94065645841246544E-324 and 1.79769313486231570E+308 | 8 bytes
Integer | An integer from –32,768 to 32,767 | 2 bytes
Long | An integer from –2,147,483,648 to 2,147,483,647 | 4 bytes
Object | A reference to an object | 4 bytes
Single | A floating-point number with a negative value between –3.4028235E+38 and –1.401298E-45 or a positive value between 1.401298E-45 and 3.4028235E+38 | 4 bytes
Variable-length String | A string of text | 10 bytes plus the storage for the string
Fixed-length String | A string whose length doesn't change | Whatever size is specified for the length
Variant | Any type of data except a fixed-length string (the data is stored in a subtype of the Variant) | Variants containing numbers: 16 bytes; Variants containing characters: 22 bytes plus the storage for the characters

The next few pages discuss these data types in detail.

### Do You Need to Specify the Data Type?

Specifying the data type for each variable you create is a good idea, but it's not compulsory. You can almost always use the default Variant data type (as you've done a couple of times so far in this book's examples) and let VBA figure out which subtype to assign to the Variant.

There are four disadvantages to using the Variant data type like this:

 * Sometimes VBA makes a mistake when trying to interpret which kind of subtype you intended. This can cause rather obscure bugs.
 * Using the Variant data type causes your code to run more slowly. However, with short procedures (or long procedures involving relatively few variables), memory and speed are rarely an issue.
 * Your code is harder for humans to read and to debug. This can be more of a concern than memory or speed issues.
 * The Variant data type takes up more memory than any of the other data types except long strings.

### Boolean

A Boolean variable can be set only to True or False. You can use the keywords True and False to set the value of a Boolean variable, as in the second line in the following code (the first declares the Boolean variable blnProduct_Available):

    Dim blnProduct_Available As Boolean
    blnProduct_Available = True

You can then retrieve the result of the Boolean variable and take action accordingly:

    If blnProduct_Available = True Then
        MsgBox "The product is available."
    Else 'blnProduct_Available = False
        MsgBox "The product is not available."
    End If

When you convert a Boolean variable to another data type (such as a numeric value), True returns –1 and False returns 0. When you convert a numeric value to a Boolean value, 0 returns False and all other numbers (whether positive or negative) return True.

Boolean variables take up 2 bytes each in memory.

### Byte

A Byte variable takes up the least memory of any data type—just 1 byte—and can store a number from 0 to 255.

### Currency

The Currency data type is designed for use with money.
It allows for positive and negative numbers with up to 15 digits to the left of the decimal point and 4 digits to the right of it. Unlike the Single and Double data types, the Currency data type is exact, not rounded.

To implicitly declare a Currency variable, use the type-declaration character @. For example, you could work out your weekly salary with a little simple math:

    Sub Calculate_Weekly_Salary()
        Salary@ = InputBox("Enter your salary.", _
            "Calculate Weekly Salary")
        WeeklySalary@ = Salary / 52
        MsgBox WeeklySalary
    End Sub

Currency variables take up 8 bytes each.

### Date

The Date data type is relatively complex. VBA works with dates and times as floating-point numbers, with the date displayed to the left of the decimal point and the time to the right. VBA can handle dates from 1 January 100 to 31 December 9999 and times from 0:00:00 to 23:59:59.

* * *

Fixed-Point Numbers Are More Efficient

Computer programming typically stores a number in either of two ways: as a floating-point number or as a fixed-point number. A floating-point number is a number in which the quantity is given by one number multiplied by a power of the number base (for example, 10): the decimal point "floats" to different locations. A fixed-point number is one in which the decimal place remains in the same location. Fixed-point numbers should be used whenever practical because the computer can calculate with them more quickly, for the same reason that addition, multiplication, and subtraction are easier to learn in school than long division and fractions.

* * *

You can enter date variables as literal date values—such as **6/30/36** or **June 30, 1936**—by placing a **#** sign before and after the literal date value:

    #June 30, 1936#

When you move the insertion point from the line in the Code window in which you've entered a literal date value between # signs, VBA converts the data to a number and changes the display to the date format set in your computer. For example, if you enter **June 30, 1936**, VBA will probably display it as 6/30/36. Likewise, you can enter a literal time value (for example, **#10:15PM#**), and VBA converts it to a number and displays it according to the current time format (for example, 10:15:00 PM).

Date variables take up 8 bytes each.

* * *

Always Specify the Century When Managing Date Data

Always specify the century of the dates you use (such as 1909 or 2009), because VBA may supply the wrong century if you don't. When given a two-digit year, VBA assigns any year from 00 through 29 to the twenty-first century (2000–2029) and any year from 30 through 99 to the twentieth century (1930–1999).

* * *

### Decimal

The Decimal data type stores unsigned integers, scaled by powers of 10. _Unsigned_ means that the integers carry no plus or minus designation. Note that you can't declare a Decimal variable directly: you can use the Decimal data type only within a Variant data type (discussed later in this section).

Decimal variables take up 12 bytes each.

### Double

The Double data type is for floating-point numbers and can handle negative values from –1.79769313486231570E+308 to –4.94065645841246544E-324 and positive values from 4.94065645841246544E-324 to 1.79769313486231570E+308.

Some numbers in this range cannot be represented exactly in binary, so VBA rounds them.

_Double_ here stands for double-precision floating point—the way in which the number is handled by the computer.
_Single_ (discussed later) stands for single-precision floating point.

You can use the # type-declaration character to declare a Double variable implicitly. Double variables take up 8 bytes each.

### Integer

The Integer data type is the most efficient way of handling numbers within its range (from –32,768 to 32,767), a range that makes it useful for many procedures. For example, if you wanted to repeat an action 300 times, you could use an Integer variable for the counter, as in the following lines:

    Dim intMyVar As Integer
    For intMyVar = 1 To 300
        'repeat actions
    Next intMyVar

Integer variables take up 2 bytes each. The Integer is the most commonly used numeric data type for many programming tasks. This is because unless you're working with something like moon rockets or the national debt, most math will fall within the Integer type's range.

### Long

The Long data type is for the national debt. A Long can hold integer values larger or smaller than those the Integer data type can handle: Long variables can handle numbers from –2,147,483,648 to 2,147,483,647. (For numbers even larger or smaller than these, use the Double data type, but beware of its rounding.)

Long variables use the type-declaration character & for implicit declarations and take up 4 bytes each.

### Object

The Object data type is for storing addresses that reference objects (for example, objects in an application's object model), providing an easy way to refer to an object.

Object variables take up 4 bytes each.

### Single

The Single data type, like the Double data type, is for working with floating-point numbers. Single can handle negative values from –3.4028235E+38 through –1.401298E-45 and positive values from 1.401298E-45 through 3.4028235E+38.

Some numbers in this range cannot be represented exactly in binary, so VBA rounds them.

Use the exclamation-point type-declaration character to declare a Single variable implicitly (if you must use implicit declarations). Single variables take up 4 bytes each.

### String

The String data type is for handling text:

 * Variable-length String variables can contain up to about 2 billion characters. They take up 10 bytes plus the storage required for the string.
 * Fixed-length String variables can contain from 1 to about 64,000 characters. They take up the storage specified for their length. If the data assigned to the String variable is shorter than the fixed length, VBA pads the data with trailing spaces to make up the full complement of characters. If the data assigned to the String variable is longer than the fixed length, VBA truncates the data after the relevant character, counting from the left end of the string—for example, if you assign the string Output to a fixed-length String variable that's four characters long, VBA stores Outp. Fixed-length String variables are rarely used in most programming, with the exception of managing certain databases where there's a rule that a string cannot be longer than a specified length.
 * Strings can contain letters, numbers (digits), spaces, and punctuation, not to mention special characters like @ and *.
 * You can use the $ type-declaration character to declare a String variable implicitly, but (as usual) you'll do best to declare your String variables explicitly, along with all your other variables.

### Variant

The Variant data type, as mentioned earlier in this chapter, is the default type.
It's assigned by VBA to any variable whose data type isn't specified by you—so a declaration such as Dim myUntypedVariable creates a Variant. However, Dim intVariable As Integer creates a variable of the Integer data type. (You can also declare a Variant variable explicitly: Dim myVariant As Variant, for example.)

Variants can handle most of the different types of data, but there are a couple of characteristics of Variants to keep in mind:

 * Variants can't contain fixed-length string data. If you need to use a fixed-length string, you must specify a fixed-length String data type.
 * Variant variables can contain four special values: Empty (which means the variable hasn't yet been initialized), Error (a special value used for tracking errors in a procedure), Nothing (a special value used for disassociating a variable from the object it was associated with), and Null (which you use to indicate that the variable deliberately contains no data).

Variant variables take up more memory than other types. Variant variables that contain numbers take up 16 bytes, and Variant variables that contain characters take up 22 bytes plus the storage required for the characters.

### Deciding among Types for Variables

If you found the details of the different types of variables confusing, relax. First, you can usually avoid the whole issue of choosing a variable type by declaring the variable either implicitly or explicitly and letting VBA assign the Variant data type with the appropriate subtype. Second, if you do choose to specify data types for some or all of your variables, you can apply a few straightforward rules to direct your choices:

 * If the variable will contain only the values True and False, declare it as the Boolean data type.
 * If the variable will always contain an integer (if it will never contain a fraction), declare it as the Integer data type. (If the number may be too big for the Integer data type, declare it as the Long data type instead.)
 * If the variable will be used for calculating money, or if you need fractional values that must not be rounded, use the Currency data type.
 * If the variable may sometimes contain a fraction, declare it as the Single or Double data type.
 * If the variable will always contain a string, declare it as the String data type.

* * *

If You're Unsure, Test a Variable's Type Using a Variant

If you aren't sure what type of variable will best contain the information you're planning to use, start by declaring the variable as a Variant. Then step through the procedure in Break mode with the Locals window displayed (View ⇒ Locals Window). The Locals window displays local variables, their value, and their type. As you press F8 to step through your procedure, see what Variant subtype VBA assigns to the variable. You'll see the type, such as Variant/Double or Variant/String, in the Type column. Test the procedure a couple more times to make sure this subtype is consistent, and then try declaring the variable as the data type indicated by the subtype. Run the code a few times to make sure the new data type works.

* * *

# Working with Constants

A constant is a named item that keeps a constant value during execution of a program. VBA provides many built-in constants, but you can also declare your own constants to help you work with information that stays constant through a procedure. But recall that many programmers simply use variables rather than constants, even for values that won't change (such as the number of eggs in a dozen).
However, constants are available if you or your superiors find them of value.

## Declaring Your Own Constants

To declare your own constants, use the Const statement. By declaring a constant, you can simplify your code when you need to reuse a set value a number of times in your procedures.

### Syntax

The syntax for the Const statement is as follows:

    [Public/Private] Const _constant_ [As _type_] = _expression_

Here, Public and Private are optional keywords used for declaring public or private scope for a constant. You'll learn how they work in a moment. _constant_ is the name of the constant, which follows the normal rules for naming variables. _type_ is an optional argument that specifies the data type of the constant. _expression_ is a literal (a value written into your code), another constant, or a combination of the two.

As with variables, you can declare multiple constants in the same line by separating the statements with a comma:

    Const conPerformer As String = "Carmen Singer", _
        conTicketPrice As String = "$34.99"

### Example

Declaring a constant in VBA works in a similar way to declaring a variable explicitly, but you declare the value of the constant when you declare the constant (rather than at a later point of your choosing). You can't change its value afterward.

As an example, take a look at the following statements:

    Const conVenue As String = "Davies Hall"
    Const conDate As Date = #December 31, 2013#
    MsgBox "The concert is at " & conVenue & " on " _
        & conDate & "."

The first line declares the constant conVenue as a String data type and assigns it the data Davies Hall. The second line declares the constant conDate as a Date data type and assigns it the date December 31, 2013. (When you finish creating this line of code and move the insertion point to another line, VBA changes the date to the date format set on your computer—#12/31/2013#, for example.) The third line displays a message box containing a string concatenated from the three text items in double quotation marks, the conVenue string constant, and the conDate date constant.

## Choosing the Scope and Lifetime for Your Constants

Scope works the same way for constants as it does for variables. The default scope for a constant declared in a procedure is local—that is, its scope is the procedure that declares it. Consequently, its lifetime is the time for which the procedure runs. But you can set a different scope and lifetime for your constants by using the Public or Private keyword.

 * To declare a private constant, place the declaration at the beginning of the module in which you want the constant to be available. A private constant's lifetime isn't limited, but it's available only to procedures in the module in which it's declared:

    Private Const conPerformer As String = "Carmen Singer"

 * To declare a public constant, place the declaration at the beginning of a module. A public constant's lifetime isn't limited, and it's available to all procedures in all modules in the project in which it's declared:

    Public Const conTicketPrice As String = "$34.99"

# Working with Enumerations

In addition to constants you can create in your code, VBA includes sets of predefined constants. An _enumeration_ is a predefined list of unique integers (numbers) that have individual names. It's a set of items, related in some way.

Here's an enumeration, a set of items that you need to paint a room.
Note that another way to describe this is that it's a numbered list:

1. Brushes

2. Paint

3. Masking tape

4. Drop cloth

5. Sandpaper

You could now refer to any of these items either by their number in the enumeration or by their name.

An enumeration is typically used in your programming to specify a property of an object. Each integer in the enumeration has a meaning to VBA and a name that allows you to refer to it easily. The names that correspond to the integers in the enumeration are called _enumerated constants_.

For example, when you use the MsgBox function to display a message box using VBA, you can pick one of the enumerated constants in the VbMsgBoxStyle enumeration to specify the type of message box you want to show. If you require an icon in the message box, you can specify which icon from the list of available built-in icons. For example, one of the icons—a stop sign—is the enumerated constant vbCritical (or the integer 16). The enumerated constant vbQuestion (integer 32) displays a question-mark icon, and the enumerated constant vbExclamation (48) displays an exclamation-point icon. The enumerated constant vbInformation (64) refers to an information icon. However, in practice, the integers are rarely used. The enumerated constants (names like vbQuestion) are far easier for humans to grasp, read, and remember than the values (the various integers like 16, 32, 64, and so on) to which they are mapped. So, although you _could_ use the integers in your code, it's better to stick with the enumerated constants like vbQuestion.

VBA includes many built-in enumerations, and the Visual Basic Editor displays the list of available enumerated constants to help you select the appropriate integer value when you're creating code. To see such a list, type this into a procedure:

    msgbox("inga",

As soon as you type the comma, up pops the list of enumerated constants—all the available button and icon styles for a message box, including vbQuestion, vbYesNo, vbOKOnly, and so on. As you might guess, the vbOKOnly style displays only a single button, captioned OK. The vbYesNo style displays two buttons, one captioned Yes, the other No.

You just click one of these styles in the list of enumerated constants to enter it into your code. If you don't see the list, choose Tools ⇒ Options in the Visual Basic Editor, then select the Auto List Members check box.

You can also define your own enumerations in custom objects that you create.

# The Bottom Line

**Understand what variables are and what you use them for.**

Variables are a cornerstone of computer programming; they are extremely useful for the same reason that file folders are useful in the real world. You give a name to a variable for the same reason that you write a name on a folder: to identify its contents. And a folder can, over time, contain various different papers, just as the value contained in a programming variable can vary. In both cases, the contents vary; the name remains the same. It's good practice to always declare a variable explicitly before using it in your code. This is called _explicit declaration_.

Master It

Explicitly declare a variable named CustomersAge.

**Create and use variables.**

When creating (declaring) a new variable, you should avoid using words or commands that are already in use by VBA, such as **Stop** or **End**. There are also restrictions such as not using special characters.

Master It

This variable name cannot be used, for two reasons.
Fix it so it is a legitimate variable name:

    Dim 1Turn! as Integer

**Specify the scope and lifetime of a variable.**

Variables have a range of influence, depending on how you declare them.

Master It

Create a variable named AnnualSales that will be available to any procedure within its own module but not to other modules.

**Work with constants.**

Constants, like variables, are named locations in memory that contain a value. Unlike with variables, however, the value in a constant does not change during program execution.

Master It

Define a string constant using the Const command. Name your constant FirstPrez, and assign it the value George Washington.

**Work with enumerations.**

Enumerations provide a handy name for each item in a list, often a list of properties.

Master It

In the Project Explorer, click the ThisDocument object to select it. Then locate the JustificationMode property in the Properties window, and choose one of that property's enumerated constants by clicking the small down arrow that appears, then clicking one of the constants in the drop-down list.

Chapter 7

Using Array Variables

In this chapter, you'll learn how to use arrays—containers that can store multiple values at the same time. An array is a kind of super-variable.

You'll start by examining what arrays are and what you use them for. You'll then examine how to create them, populate them, and erase them. Along the way, you'll look at how to resize an array to make it contain more (or fewer) values, how to specify the scope for an array, and how to find out while your macro executes whether a particular variable name represents an array or just an ordinary, single-value variable.

In this chapter you will learn to do the following:

 * Understand what arrays are and what you use them for
 * Create and use arrays
 * Redimension an array
 * Erase an array
 * Find out whether a variable is an array
 * Sort an array
 * Search an array

# What Is an Array?

An _array_ is a variable on steroids—a variable that can contain multiple values (but they must be of the same data type).

You can access the array itself as a whole to work with all the values it contains at once. Or you can access any individual value stored within the array by specifying its index number, which indicates its position within the array.

If you're having difficulty visualizing what this means, try picturing an array as a numbered list, similar to an enumeration (as described in Chapter 6, "Working with Variables, Constants, and Enumerations"). Each item in the list is located in its own row and is identified by an index number, so you can access the value of the item by just specifying its index number. It's like houses on a street: they all share the same street name, such as Maple Drive, but each has a distinguishing number all its own. You'll see visual examples of arrays later in this chapter.

The previous description is of a simple array—a numbered list like a row of houses on a street. Such an array is said to have only one _dimension_. However, later in this chapter you'll see that you can construct more complicated arrays, which are called _multidimensional_. They're more like a crossword puzzle with both rows _and columns_.

* * *

Variant Arrays Can Store Values of Differing Data Types

An array with the Variant data type can store multiple subtypes of data. That's because a Variant permits any kind of data: strings, integers, and so on.
It's a shape-shifter, unique among data types in that it can contain data of all types.

* * *

For now, though, let's look at the qualities of the most common, and most easily visualized, array structure, the _one-dimensional array_.

* * *

Use Option Base 1 to Simplify Indexes

Although your code will be less portable—and other programmers who use other computer languages might object—if you're writing macros for your own private use you might want to employ the controversial Option Base 1 statement.

An array is _delimited_ (or bounded) by a lower bound and an upper bound. In other words, the array's index numbers start with 1 (the lower bound) and end with whatever number of items are in the array (the upper bound). An array representing the eggs in an egg carton would have a lower bound of 1 and an upper bound of 12. That's the simple way to construct and visualize an array, but there's a catch: many computer languages, including VBA, employ a lower bound of _zero_ rather than one by default.

This means that the first item in an array is indexed as zero—it's the zeroth item. This can be confusing, because it means that you're always working with an index number that's one lower than the item's position in the array. In such an array, January would be the zeroth month, February the first month (array index number 1), March the second (index 2), and so on. It's as if your shopping list looked like this:

0. Brushes

1. Paint

2. Masking tape

3. Drop cloth

4. Sandpaper

Nobody writes lists with a zeroth item, but this is just one of the kinks in computer programming caused by carelessness when programming languages were first invented.

However, _unlike_ most other computer languages, VBA allows you to normalize the way array indexes work: beginning them with index 1, the way humans count items in sets or lists.

VBA lets you make 1 the default index number of the first item in an array by entering an Option Base 1 statement at the beginning of a module. Type this option up in the General Declarations section of your Code window, and the index number for each item in the array is then the same as the item's position in the array, so the array is easier to work with—easier to visualize.

Why does the first item in an array default to zero anyway? Forty years ago, people who wrote programming languages decided to do this, and it has persisted. The major exception was the BASIC language, VBA's ancestor. It defaulted, sensibly, to 1 as the lower bound of any array. Eventually (with version 6 of Visual Basic), BASIC was modified to make it conform to the other languages, and those in charge changed VBA's lower bound to zero. But BASIC did preserve the programmer's option to specify the lower bound as 1 with this Option Base statement.

Arrays are lists, and we humans don't start lists with zero. We have a first birthday party, not a zeroth one. A winning team comes in first place, not zeroth place. Nonetheless, computer programmers have been wrestling with zero-based array indexing for decades now—and introducing countless bugs into their code as a result. You're fortunate to be working with VBA, where you have an option to avoid this problem if it bothers you. But note that if you are studying programming or plan to use other languages or program professionally, you will have to accustom yourself to the types of error messages generated by this zero index hitch.
Then you can say, "Oh, this is probably an indexing problem," and fiddle with an index number to fix it. Generally, you'll subtract 1 from the index number and that'll do the trick.

* * *

# Declaring an Array

An array is a kind of variable, so you declare an array by using the familiar keywords: Dim, Private, Public, and Static. To indicate that it's an array, however, you add a pair of parentheses after the array's name. For example, the following statement declares an array named varMonthProfit:

    Dim varMonthProfit()

If you had left off the parentheses, then you would have created an ordinary variable capable of holding only a single value:

    Dim varMonthProfit

Because no data type was specified in the declaration (Dim) of the preceding array example, this example creates a Variant array. VBA then assigns the appropriate data types (String, Integer, and so on) when you store data in the array.

But you can specify the data type of an array, just as you would for an ordinary variable. For example, the following statement declares an array named curMonthProfit and makes it the Currency data type:

    Dim curMonthProfit() As Currency

You can also specify the number of items in the array by using an _array subscript_. For example, the following statement declares the array named curMonthProfit, assigns the Currency data type, and specifies that the array contains 12 items:

    Dim curMonthProfit(11) As Currency

Now you can see one aspect of the zeroth problem. This array holds 12 items, but in its declaration we must specify 11! The array _subscript_ in the Dim curMonthProfit(11) As Currency statement is _11_ rather than 12 because by default an array's index starts at 0 rather than 1. That 0 index number gives this list _an extra element_. The 1st item is curMonthProfit(0), the 2nd is curMonthProfit(1), and the 12th is curMonthProfit(11). (You can avoid this counterintuitive approach by using the Option Base 1 statement.)

Figure 7.1 shows a simple representation of the single-dimensional array created by the Dim curMonthProfit(11) As Currency statement.

Figure 7.1 The single-dimensional array created by the statement Dim curMonthProfit(11) As Currency can be thought of as looking like this.

To make numbering start at 1, add an Option Base 1 statement to the declarations area at the beginning of the module in which you declare the array. Here is an example:

    Option Base 1 'at the beginning of the code sheet

    Dim curMonthProfit(12) As Currency

Figure 7.2 shows a simple representation of how this array would look.

Figure 7.2 The single-dimensional array created by the statement Dim curMonthProfit(12) As Currency with the Option Base 1 statement. Compare this to Figure 7.1.

* * *

Variants Can Be Inefficient under Extreme Circumstances

Recall that omitting the data type when declaring an array (and thus making VBA automatically use the Variant data type) causes slightly increased memory usage, which could (under extreme circumstances) slow the performance of the computer. Because an array needs storage for each item it contains, a very large array can consume a significant amount of memory. This is particularly true of the multidimensional arrays discussed later in this chapter.

* * *

You can also specify both the lower and upper bounds of an array explicitly.
This example code states that the lower bound is to be 1 and the upper bound is 12:

    Option Base 1 'at the beginning of the code sheet

    Dim curMonthProfit(1 To 12) As Currency

Because learning to use arrays is much easier for beginners if we start with an index of 1, the examples in the rest of this chapter use Option Base 1 statements.

# Storing Values in an Array

To assign a value to an item in an array, you use each item's index number to identify it. For example, the following statements assign the values London, Hong Kong, and Taipei to the first three items in an array named strLocations:

    Option Base 1

    Dim strLocations(6) As String

    strLocations(1) = "London"
    strLocations(2) = "Hong Kong"
    strLocations(3) = "Taipei"

Figure 7.3 shows how this array can be envisioned.

Figure 7.3 A simple String array with three values assigned

# Multidimensional Arrays

The curMonthProfit example in the previous section is a one-dimensional array, which is the easiest kind of array to use. But VBA supports arrays with up to 60 dimensions—enough to tax the visualization skills of anyone without a PhD in multidimensional modeling. You probably won't want to get this complicated with arrays—two, three, or four dimensions are enough for most purposes. In fact, one dimension is enough for many purposes.

To declare a multidimensional array, you separate the dimensions with commas. For example, the following statements declare a two-dimensional array named MyArray with three items in each dimension:

    Option Base 1
    Dim MyArray(3, 3)

Figure 7.4 shows how you might represent the resulting array. Note that inside each item in this figure's table you can see the pair of index numbers you would use to access it, such as item 1,2 or item 3,2.

Figure 7.4 You can think of a two-dimensional array as consisting of rows and columns.

Multidimensional arrays sound forbidding, but a two-dimensional array is quite straightforward if you think of it basically as a _table_ that consists of rows and columns.

In this example, the first series of three elements appears down the first column of the table, the second series of three elements appears down the second column, and so on.

The information in any series doesn't need to be related to the information in the other series, although it does need to be of the same data type. For example, you could assign three folder names to the first dimension of a String variable array (they would be in column 1), the names of your three cats to the second dimension (more strings), a list of the names of the Three Stooges to the third dimension (the third column in the table), and so on. You could then access the information in the array by specifying the position of the item you want to access—for instance, the second item in the first column of the table (item 1,2). You'll learn how to do this in just a minute.

Similarly, you could picture a three-dimensional array as being something like a workbook of spreadsheets—rows and columns, with further rows and columns in the third dimension (down, or away from you).

But that's about the limit of easily picturable arrays—four-dimensional and larger arrays start to tax the imagination. A row of honeycombs, a set of apartment buildings? It gets difficult.

# Declaring a Dynamic Array

You can declare both _fixed-size_ arrays and _dynamic_ arrays. The examples you've seen so far were fixed-size arrays. For instance, the curMonthProfit array was specified as having 12 items.
Dynamic arrays are useful when the number of values you need to store will vary. For example, for a procedure that arranges windows side by side, you might create an array to contain the name of each open window. But while writing the code, you can't know how many windows will be open when the macro runs, so you'll probably want to use a dynamic array to contain the information. That way the array can be sized to fit the situation.

To declare a dynamic array, you use a declaration statement _without_ specifying the number of items (you include the parentheses but leave them empty). For example, the following statement declares a dynamic array named arrTestArray and causes VBA to assign it the Variant data type (because no data type is specified):

    Dim arrTestArray()

# Redimensioning an Array

You can change the size of, or _redimension_, a dynamic array by using the ReDim statement. For example, to redimension the dynamic array arrTestArray declared in the previous example and assign it a size of five items, you could use the following statement:

    **ReDim** arrTestArray(5)

When you use ReDim to redimension an array like this, you lose the values currently in the array. If so far you've only declared the array as a dynamic array and it contains nothing, losing its contents won't bother you. There are no contents.

But in other situations an array might be full of data, so you'll want to increase the size of the array while keeping its current contents. To preserve the existing values in an array when you raise its upper bound, use a ReDim Preserve statement instead of a straight ReDim statement:

    ReDim **Preserve** arrTestArray(5)

If you use ReDim Preserve to reduce the size of the array (to lower its upper bound), you of course lose the information stored in any items not included in the redimensioned array. For example, if you have a five-subscript (five-item) array with information in each item and then you redimension it using ReDim Preserve so that it has only three subscripts, you lose the information in the fourth and fifth subscripts.

Note that ReDim Preserve works only for the last dimension of a multidimensional array. You can't preserve the data in the other dimensions of a multidimensional array.

# Returning Information from an Array

To get information from an array, you use an index number to specify the position of the information you want to return. For example, the following statement returns the fourth item in the array named arrMyArray and displays it in a message box:

    Option Base 1

    MsgBox arrMyArray(4)

The following statement returns the fifth item in the second dimension of a two-dimensional array named arrMy2DArray and displays it in a message box:

    Option Base 1

    MsgBox arrMy2DArray(2, 5)

To return multiple items from an array, specify each item individually.

# Erasing an Array

To erase the contents of an array, use the Erase statement with the name of the array. This statement reinitializes the items in a fixed-size array and frees the memory taken by items in dynamic arrays (completely erasing the array). For example, the following statement erases the contents of the fixed-size array named arrMyArray:

    Erase arrMyArray

# Finding Out Whether a Variable Is an Array

Because an array is a type of variable, you may occasionally need to check whether a particular variable name denotes an array or an ordinary variable (sometimes called a _scalar variable_).
To find out whether a variable is an array, use the IsArray function with the variable's name. For example, the following statements check the variable MyVariable and display the results in a message box:

    If IsArray(MyVariable) = True Then
        Msg = "MyVariable" & " is an array."
    Else
        Msg = "MyVariable" & " is not an array."
    End If
    MsgBox Msg, vbOKOnly + vbInformation, "Array Check"

# Finding the Bounds of an Array

To find the bounds of an array, you use the LBound function and the UBound function. LBound returns the _lower bound_, the index number of the first item; UBound returns the _upper bound_, the index number of the last item.

The LBound function and the UBound function have the following syntax:

    LBound( _array_ [, _dimension_])
    UBound( _array_ [, _dimension_])

Here, _array_ is a required argument specifying the name of the array, and _dimension_ is an optional variant specifying the dimension whose bound you want to return—1 for the first dimension, 2 for the second, and so on. (If you omit the _dimension_ argument, VBA assumes you mean the first dimension.)

For example, the following statement returns the upper bound of the second dimension in the array named arrMyArray and displays it in a message box:

    MsgBox UBound(arrMyArray, 2)

# Sorting an Array

You'll sometimes need to sort an array, especially when you load information into the array from an external source rather than assigning values one by one in your code.

Sorting is easy to understand conceptually: You simply rearrange things into the desired order. For example, you could sort the strings in one array into alphabetical order or reverse alphabetical order, or the numbers in another array into ascending order or descending order. But writing a program that sorts is much more difficult. So, don't write it. Just copy it from examples on the Internet, or from the following example.

This section shows you a simple form of sorting—the bubble sort, so called because the items that belong at the earlier positions in the array gradually bubble up to the top as the sort proceeds. The bubble sort consists of two _loops_ that compare two items in the array; if the second item belongs further up the list than the first item, the sort reverses their positions, and the comparisons continue until the whole list is sorted into order. The bubble sort is a relatively inefficient method of sorting items, but it's easy to grasp, and processor cycles are comparatively cheap these days. The bubble sort hasn't itself become any more efficient over the years, but processor speeds have sure ramped up.

This example also introduces you to a major element of programming: the _loop_. Loops are an important tool found in many procedures and projects. In effect, a loop repeats some action until a condition is met. It's like saying, "Keep rearranging these attendance cards until the stack is alphabetized." Chapter 12, "Using Loops to Repeat Actions," shows you how to work with loops.

Listing 7.1 contains the code for the bubble sort.

**Listing 7.1**: A bubble sort

    1. Option Explicit
    2. Option Base 1
    3.
    4. Sub Sort_an_Array()
    5.
    6. 'declare the array and other variables
    7. Dim strArray(12) As String
    8. Dim strTemp As String
    9. Dim strMsg As String
    10. Dim X As Integer, Y As Integer, i As Integer
    11.
    12. 'assign strings to the array
    13. strArray(1) = "nihilism"
    14. strArray(2) = "defeatism"
    15. strArray(3) = "hope"
    16. strArray(4) = "gloom"
    17. strArray(5) = "euphoria"
    18. strArray(6) = "despondency"
    19. strArray(7) = "optimism"
    20. strArray(8) = "pessimism"
    21. strArray(9) = "misery"
    22. strArray(10) = "happiness"
    23. strArray(11) = "bliss"
    24. strArray(12) = "mania"
    25.
    26. strMsg = "Current items in array:" & vbCr & vbCr
    27. For i = 1 To UBound(strArray)
    28. strMsg = strMsg & i & ":" & vbTab & strArray(i) & vbCr
    29. Next i
    30. MsgBox strMsg, vbOKOnly + vbInformation, "Array Sorting: 1"
    31.
    32. For X = LBound(strArray) To (UBound(strArray) - 1)
    33. For Y = (X + 1) To UBound(strArray)
    34. If strArray(X) > strArray(Y) Then
    35. strTemp = strArray(X)
    36. strArray(X) = strArray(Y)
    37. strArray(Y) = strTemp
    38. strTemp = ""
    39. End If
    40. Next Y
    41. Next X
    42.
    43. strMsg = "Items in sorted array:" & vbCr & vbCr
    44. For i = 1 To UBound(strArray)
    45. strMsg = strMsg & i & ":" & vbTab & strArray(i) & vbCr
    46. Next i
    47. MsgBox strMsg, vbOKOnly + vbInformation, "Array Sorting: 2"
    48.
    49. End Sub

Read through this code, and the explanation of it that follows, to see how much of it you can understand. At this point, you might not grasp much at all. But don't worry; things will become clearer as you progress through this book. What's more, you need never write a bubble sort from scratch anyway—just copy this one, modifying it a little to sort whatever array you're dealing with. And remember, you can copy all the code in this book from this book's website at www.sybex.com/go/masteringvba2013.

* * *

How to Locate Line Numbers in the Editor

In this book, code examples more than a few lines long are given line numbers so the lines can be referenced easily in the explanatory text. If you're following along with a code description in this book, you'll sometimes want to know what line the blinking cursor is on in the editor. Just look at the field at the far right of the editor's Standard toolbar, right next to the blue Help question mark. This field always displays the current line number and character number, as you can see in this screenshot.

* * *

Here's what happens in Listing 7.1:

 * Line 1 contains an Option Explicit statement to force explicit declarations of variables, and line 2 contains an Option Base 1 statement to make array index numbers start at 1 rather than 0. These two statements appear in the General Declarations zone of the code sheet, above any other procedure in the Code window. Line 3 is a spacer—a blank line inserted just to make the code easier to read. You can remove it if you wish, or add more spacers—it's your call. VBA ignores blank lines.
 * Line 4 begins the Sort_an_Array procedure. Line 5 is a spacer.
 * Line 6 is a comment line prefacing the declaration of the array and the variables. Line 7 declares the String array strArray with 12 subscripts (array items). Line 8 declares the String variable strTemp. Line 9 declares the String variable strMsg. Line 10 declares the Integer variables X, Y, and i. Line 11 is a spacer.
 * Line 12 is a comment line explaining that the next 12 statements (lines 13 through 24) assign strings to the array. The strings used are words describing various moods. Line 25 is a spacer.
 * Lines 26 through 30 build a string out of the strings assigned to the array and then display it in a message box. This section of code is included to help users easily see what's going on if they run the procedure rather than stepping through it.
Line 26 assigns introductory text and two carriage returns (two vbCr characters) to the String variable strMsg. Line 27 starts a For... Next loop that runs from i = 1 to i = UBound(strArray)—in other words, once for each item in the array. (The loop could also have run to i = 12 because the upper bound of the array is set, but using the upper bound is more flexible than hard-coding values.) Line 28 adds to strMsg the value of the counter variable i, a colon, a tab (vbTab), the contents of the array item currently referenced (strArray(i)), and a carriage return (vbCr). Line 29 concludes the loop, and line 30 displays a message box containing strMsg, as shown in Figure 7.5. Line 31 is a spacer. + +Figure 7.5 The Sort_an_Array procedure displays a message box of the unsorted terms so that the user can see how things start out. + + * The sorting part of the procedure takes place in lines 32–41. Here are the details: + * Line 32 begins a set of nested loops: one inside another. There's an outer loop and an inner loop. The outer For... Next loop ends in line 41 with the Next X statement. This loop runs from X = LBound(strArray) (in other words, X = 1) to X = (UBound(strArray) - 1) (in other words, X = 11, the upper bound of the array, minus 1). + * Line 33 begins the inner (nested) For... Next loop, which runs from Y = (X + 1) to Y = UBound(strArray). Line 40 ends this loop. + * Line 34 compares strArray(X) to strArray(Y). If strArray(X) is greater than strArray(Y)—in other words, if strArray(X) should appear after strArray(Y) in the alphabetized array—line 35 assigns strArray(X) to strTemp, line 36 assigns strArray(Y) to strArray(X), and line 37 assigns strTemp to strArray(Y), thus switching the values. Line 38 restores strTemp to an empty string. Line 39 ends the If statement. Line 40 ends the inner loop, line 41 ends the outer loop, and line 42 is a spacer. + * Lines 43 through 47 essentially repeat lines 26 through 30, displaying a message box (shown in Figure 7.6) of the now-sorted array so that the user can see that the sort has worked. + +Figure 7.6 When the Sort_an_Array procedure has finished sorting, it displays the sorted list in a second message box. + + * Line 48 is a spacer, and line 49 ends the procedure. + +# Searching through an Array + +Another task you sometimes need to perform with an array is searching to find a particular value in it. This is similar to rifling through a box of recipe cards until you find _Ralph's Jailhouse Chili_. + +The following sections show you two methods of searching—a linear search, which you can perform on either a sorted array or an unsorted array, and a binary search, which is faster but works only on a sorted array. + +## Performing a Linear Search through an Array + +A _linear_ search is a simple kind of search: You start at the beginning of the array and check each item until you find your target, or until you reach the end of the array and must report _not found_. + +Before executing this code, display the Immediate window in the editor by pressing Ctrl+G or choosing View ⇒ Immediate Window. This procedure prints information in the Immediate window so that you can see what's going on—and whether the code is running as intended. Using the Immediate window like this to check output is often preferable to displaying message boxes as we did in the previous section. With the Immediate window, you don't have to click the message boxes closed, and the window can also be scrolled, displaying as much information as you wish.
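If you haven't used Debug.Print before, here is a minimal sketch of the technique on its own (the procedure name is invented for this illustration); run it, and three test values appear in the Immediate window: + + Sub Demo_Immediate_Window() + + 'print a few test values to the Immediate window + Dim i As Integer + For i = 1 To 3 + Debug.Print "Test value: " & i + Next i + + End Sub + +Because Debug.Print sends its output only to the Immediate window, a user running the macro normally never sees it.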
+ +Listing 7.2 contains the code for a simple linear search through a one-dimensional array. + +**Listing 7.2**: A simple linear search + + 1. Option Explicit + 2. Option Base 1 + 3. + 4. Sub Linear_Search_of_Array() + 5. + 6. 'declare the array and the variables + 7. Dim intArray(10) As Integer + 8. Dim i As Integer + 9. Dim varUserNumber As Variant + 10. Dim strMsg As String + 11. + 12. 'add random numbers between 0 and 10 to the array + 13. 'and print them to the Immediate window for reference + 14. For i = 1 To 10 + 15. intArray(i) = Int(Rnd * 10) + 16. Debug.Print intArray(i) + 17. Next i + 18. + 19. Loopback: + 20. varUserNumber = InputBox _ + ("Enter a number between 1 and 10 to search for:", _ + "Linear Search Demonstrator") + 21. If varUserNumber = "" Then End + 22. If Not IsNumeric(varUserNumber) Then GoTo Loopback + 23. If varUserNumber < 1 Or varUserNumber > 10 Then GoTo Loopback + 24. + 25. strMsg = "Your value, " & varUserNumber & _ + ", was not found in the array." + 26. + 27. For i = 1 To UBound(intArray) + 28. If intArray(i) = varUserNumber Then + 29. strMsg = "Your value, " & varUserNumber & _ + ", was found at position " & i & " in the array." + 30. Exit For + 31. End If + 32. Next i + 33. + 34. MsgBox strMsg, vbOKOnly + vbInformation, "Linear Search Result" + 35. + 36. End Sub + +Here's what happens in Listing 7.2: + + * As in the previous listing, line 1 contains an Option Explicit statement to force explicit declarations of variables, and line 2 contains an Option Base 1 statement to make the index numbers of arrays start at 1 rather than 0. These two statements appear in the declarations part of the code sheet, before any other procedure. Line 3 is a spacer. + * Line 4 begins the Linear_Search_of_Array procedure. Line 5 is a spacer. + * Line 6 is a comment line prefacing the declaration of the array and the other variables that the code uses. Line 7 declares the Integer array intArray with 10 subscripts. Line 8 declares the Integer variable i (traditionally programmers use the name i for a loop's counter variable— _i_ for _increment_ or _iteration_ ). + * Line 9 declares the Variant variable varUserNumber, which the code uses to store the user's input from an input box. (More on this in a moment.) Line 10 declares the String variable strMsg. Line 11 is a spacer. + * The procedure declares the variable varUserNumber as a Variant rather than an Integer. This way, Visual Basic doesn't automatically halt execution and display an error message if the user enters something other than an integer (for example, text) in the input box. + * Lines 12 and 13 contain an extended comment on the code in lines 14 through 17. (These two lines can't be combined into one logical line with a continuation character, because VBA doesn't recognize line continuation inside comments; each line of an extended comment must begin with its own apostrophe.) + * Line 14 begins a For... Next loop that repeats 10 times: from i = 1 to i = 10. Line 15 assigns to the current item in the intArray array the integer result of a random number multiplied by 10: intArray(i) = Int(Rnd * 10). (The Rnd function generates a random number between 0 and 1 with a good number of decimal places. So the procedure multiplies that random number by 10 to get a number between 0 and 10 and then takes the integer portion of the number. In other words, the Int command strips off any fractional result, any values to the right of the decimal point.)
Line 16 then uses the Print method of the Debug object to print the current item in intArray to the Immediate window. This is an easy way for you, the programmer, to examine the values generated randomly for the array. The user never sees the Immediate window. Line 17 ends the loop with the Next i statement. Line 18 is a spacer. + * Line 19 contains a _label_ , named Loopback, used to return execution to this point in the code if the user's input does not meet required conditions (if it's not between 1 and 10). + * Line 20 assigns to the Variant variable varUserNumber the result of the user's input. An input box (shown in Figure 7.7) prompts the user to enter a number between 1 and 10. + * Line 21 then compares the contents of varUserNumber to an empty string—the result you get if the user clicks the Cancel button in the input box or clicks the OK button without entering anything in the text box. If varUserNumber is an empty string, the End statement ends execution of the procedure. + * Line 22 uses the IsNumeric function to see whether the contents of varUserNumber are numeric. If they're not, the GoTo Loopback statement returns execution to the Loopback label, after which the input box is displayed again for the user to try their luck once more. Line 23 checks to see if varUserNumber is less than 1 or greater than 10. If either is the case, another GoTo Loopback statement returns execution to the Loopback label, and the input box makes another appearance. Line 24 is a spacer. + +Figure 7.7 The Linear_Search_of_Array procedure displays an input box prompting the user to enter a number between 1 and 10. The array itself is printed in the Immediate window. + +* * * + +VBA Is Flexible + +Note the flexibility of VBA here: The code solicits user input and makes sure that it's a number between 1 and 10 (inclusive). Though that number is still stored in a Variant rather than explicitly converted to an Integer, VBA still performs the comparison needed. + +* * * + + * Line 25 assigns to the String variable strMsg a preliminary message stating that the value (which it specifies) was not found in the array. (If the code finds the value in the array, it changes the message before displaying it.) Line 26 is a spacer. + * Lines 27 through 32 contain the searching part of the procedure. Line 27 begins a For... Next loop that runs from i = 1 to i = UBound(intArray)—once for each subscript in the array. Line 28 compares intArray(i) to varUserNumber; if there's a match, line 29 assigns to strMsg a string telling the user at which position in the array the value was found, and line 30 uses an Exit For statement to exit the For... Next loop. (If line 28 finds no match, the Next i statement in line 32 causes the code to loop.) + * Line 33 is a spacer. Line 34 displays a message box containing strMsg to convey to the user the result of the linear search operation. Figure 7.8 shows the result of a successful search. Line 35 is a spacer, and line 36 ends the procedure. + +Figure 7.8 Line 34 of Listing 7.2 displays a message box telling the user the result of the linear search operation. + +* * * + +How to Generate Random Numbers + +Sharp-eyed readers will notice that a 0 sometimes appears in the array in the previous example, and what's more, 10 never appears. In other words, the code Int(Rnd * 10) randomly produces numbers ranging from 0 to 9. This is a byproduct of the truncation performed by the Int command: Rnd always returns a value less than 1, so Rnd * 10 is always less than 10, and Int simply drops the fractional part. Here's how to use the Rnd command to produce the exact range of numbers you want.
+ +When asking VBA for a random number, you specify the upper limit of the range of numbers you want and then multiply that number by Rnd. For example, if you want to simulate rolling dice, you need random numbers from 1 to 6, so 6 is the upper limit. You multiply the result that Rnd gives you by 6. _But then you must add 1_ to make the result range from 1 to this upper limit. (Otherwise, the result is a range between 0 and the upper limit, minus 1, as in the code in Listing 7.2, which provided numbers from 0 to 9 rather than 1 to 10.) + +The Int function must be used because Rnd provides only fractions. Here are some typical results when the Rnd function executes: + + * 0.4542078 + * 0.3570231 + * 0.1499811 + * 0.7043958 + * 0.928786 + +Because these are fractions, you multiply them to scale them up to the range you want, and the Int command then strips off any fractional part of the final result. So here is how you would get a random number from 1 to 50: + + X = Int(Rnd * 50 + 1) + +To get a range from 0 to an upper limit, specify as the upper limit a number 1 higher than you actually want. And don't add 1 inside the parentheses. This example provides a random number from 0 to 50: + + X = Int(Rnd * 51) + +* * * + +## Performing a Binary Search through an Array + +As you saw in the previous section, a linear search is easy to perform, but it's pretty simple and slow—it starts looking at the beginning of the array and then checks each element, each item, in turn. This approach works fine for small searches, such as the 10-subscript array you searched in the last example, but you wouldn't want to try it on anything the size of a phone book—even in a small town. For serious, heavy-duty searching, you need a smarter approach. + +For most purposes, a _binary search_ is a good way to approach searching a sorted array. A binary search formalizes the technique you probably use when searching for something like a lost TV remote control. You expect it to be in a given location—somewhere in the living room, probably near the couch. So you focus your attention on the relevant area and search it thoroughly. (With a _linear search_ , you search everywhere in the house, from start to finish, without any attempt to intelligently narrow the search area.) + +The binary search technique (technically called an _algorithm_ ) determines the most likely target area by dividing the sorted array in half, establishing which half will contain the search item, and then repeating the divide-and-interrogate procedure until it either finds the search item or reaches the last subdivisible unit of the array without finding it. Remember, this array is presorted, so if the algorithm is looking for the number 12 in a list from 1 to 20, a single comparison against the midpoint value, 10, tells it that the target must be in the second half of the list. + +Here's another example. Say that a binary search is looking for the value 789,789 in a million-subscript array that contains the numbers 1 through 1,000,000 in ascending order. It divides the array into two halves, each of which contains a half million subscripts. It establishes whether the search item is in the first half or the second half and then narrows the search to the appropriate half and divides it into new halves. It establishes whether the search item is in the first of these halves or the second and then focuses on that half, dividing _it_ into halves—and so on until it finds the term or has gotten down to a single subscript. + +This is a simple example, but a million is still a hefty number.
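To see how few steps that halving takes, you can do the math right in the Immediate window. The line below is a quick check you can type there (the ? prefix is shorthand for Print); VBA's Log function returns natural logarithms, so dividing by Log(2) converts the result to base 2: + + ?Int(Log(1000000) / Log(2)) + 1 + +The answer, 20, is the maximum number of halvings a binary search needs to pin down one item among a million, compared with up to a million comparisons for a linear search.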
Listing 7.3 makes things even simpler by using an array of a thousand subscripts that contains the numbers 1 through 1000 in order: The first subscript contains the number 1, the second subscript contains the number 2, and so on up to 1000. The example is unrealistic, but it makes it easy to see what's happening in the code. + +**Listing 7.3**: Searching through a large array + + 1. Option Explicit + 2. Option Base 1 + 3. + 4. Sub Binary_Search_of_Array() + 5. + 6. 'declare the array and the variables + 7. Dim intThousand(1000) As Integer + 8. Dim i As Integer + 9. Dim intTop As Integer + 10. Dim intMiddle As Integer + 11. Dim intBottom As Integer + 12. Dim varUserNumber As Variant + 13. Dim strMsg As String + 14. + 15. 'populate the array with numbers 1 to 1000, in order + 16. For i = 1 To 1000 + 17. intThousand(i) = i + 18. Next i + 19. + 20. 'prompt the user for the search item + 21. Loopback: + 22. varUserNumber = InputBox _ + ("Enter a number between 1 and 1000 to search for:", _ + "Binary Search Demonstrator") + 23. If varUserNumber = "" Then End + 24. If Not IsNumeric(varUserNumber) Then GoTo Loopback + 25. + 26. 'search for the search item + 27. intTop = UBound(intThousand) + 28. intBottom = LBound(intThousand) + 29. + 30. Do + 31. intMiddle = (intTop + intBottom) / 2 + 32. If varUserNumber > intThousand(intMiddle) Then + 33. intBottom = intMiddle + 1 + 34. Else + 35. intTop = intMiddle - 1 + 36. End If + 37. Loop Until (varUserNumber = intThousand(intMiddle)) _ + Or (intBottom > intTop) + 38. + 39. 'establish whether the search discovered the search item + 'or not and add the appropriate information to strMsg + 40. If varUserNumber = intThousand(intMiddle) Then + 41. strMsg = "The search found the search item, " _ + & varUserNumber & ", at position " & intMiddle _ + & " in the array." + 42. Else + 43. strMsg = "The search did not find the search item, " _ + & varUserNumber & "." + 44. End If + 45. + 46. MsgBox strMsg, vbOKOnly + vbInformation, "Binary Search Result" + 47. + 48. End Sub + +Here's what happens in Listing 7.3: + + * Line 1 contains an Option Explicit statement to force explicit declarations of variables, and line 2 contains an Option Base 1 statement to make the numbering of arrays start at 1 rather than 0. These two statements appear in the declarations part of the code sheet, before any procedure. + * Line 3 is a spacer. Line 4 declares the Binary_Search_of_Array procedure, and line 5 is another spacer. + * Line 6 is a comment line prefacing the declaration of the array (the thousand-subscript Integer array intThousand, declared in line 7) and the other variables that the procedure uses: the Integer variables i (line 8), intTop (line 9), intMiddle (line 10), and intBottom (line 11); the Variant variable varUserNumber (line 12); and the String variable strMsg (line 13). Line 14 is yet another spacer. + * Line 15 is a comment line announcing that lines 16 through 18 populate the array with the numbers 1 to 1000 in order. To do so, these lines use a For... Next loop that runs from i = 1 to i = 1000, assigning the current value of i to the subscript in the array referenced by i—in other words, assigning to each subscript the number that corresponds to its position in the array. Line 19 is a spacer. + * Line 20 is a comment line introducing the section of code (lines 21 through 24) that uses an input box (shown in Figure 7.9) to prompt users to enter a number to search for, and checks that they do so.
As in the previous listing, this section of code checks to make sure users don't enter an empty string in the input box (line 23) and terminates execution of the procedure if they do. It also uses a label named Loopback (in line 21), to which the code returns if what a user entered in the input box (in line 22) turns out not to be numeric when line 24 checks. Because this time you know which numbers the array will contain, you don't need to check to make sure that users enter a suitable value. If they want to enter a value that doesn't appear in the array, so be it. + +Figure 7.9 The Binary_Search_of_Array procedure prompts the user to enter a number between 1 and 1000. + + * Line 25 is a spacer, and line 26 is a comment that introduces the section of code that searches for the search item the user entered. Line 27 assigns to the intTop variable the upper bound of the array, and line 28 assigns to intBottom the lower bound. Line 29 is a spacer. + * Lines 30 through 37 contain a Do... Loop Until loop that performs the bulk of the binary searching. Here are the details: + * Line 30 starts the Do... Loop Until loop with the Do keyword, and line 37 ends it with the Loop Until keywords and the condition ((varUserNumber = intThousand(intMiddle)) Or (intBottom > intTop)). You'll look at loops in detail in Chapter 12; for now, all you need to know is that a Do... Loop Until runs once and then evaluates the condition in the Loop Until statement to determine whether it should end or run again. The condition here specifies that the loop continue until either the value of the subscript in the array identified by intMiddle (that is, intThousand(intMiddle)) matches the value in varUserNumber or the value of intBottom is greater than the value of intTop (intBottom > intTop). + * Line 31 sets the value of the Integer variable intMiddle to the sum of intTop and intBottom divided by 2: (intTop + intBottom) / 2. Doing so gives the midpoint for dividing the array. For example, in the thousand-subscript array, intTop has a value of 1000 on the first iteration of the loop, and intBottom has a value of 1, so intMiddle receives the value 500 (1001 divided by 2, rounded to an Integer when the result is assigned). + * Line 32 tests whether varUserNumber is greater than the value stored in the subscript identified by intMiddle—intThousand(intMiddle), the midpoint of the current section of the array. If it is, the search needs to work on the top half of the array, so line 33 resets intBottom to intMiddle + 1. If it's not, the Else statement in line 34 kicks in, and line 35 resets intTop to intMiddle - 1 so that the search works on the lower half of the array. + * Line 36 ends the If statement, and line 37 tests the condition and continues or terminates the loop, as appropriate. + * Line 38 is a spacer. Line 39 contains a two-line comment introducing the code in lines 40 through 44, which establish whether the search found the search item and assign suitable information to the strMsg String variable. Line 40 compares varUserNumber to intThousand(intMiddle); if it matches, line 41 assigns to strMsg a string telling the user where the search item was found in the array. If it doesn't match, line 43 assigns a string telling the user that the search did not find the search item. Line 45 is a spacer, and line 46 displays a message box telling the user the result of the search. Figure 7.10 shows examples—one successful, one otherwise—of the message box. + +Figure 7.10 The Binary_Search_of_Array procedure tells the user whether the search was successful (left) or not.
+ + * Line 47 is another spacer, and line 48 ends the procedure. + +The most complex part of the procedure is what happens in the loop. Download the code from the book's website at www.sybex.com/go/masteringvba2013. + +Copy the code, and paste it into the Visual Basic Editor (this code will work in any VBA-enabled application). Then open up the module and follow these steps: + +1. Display the Locals window (View ⇒ Locals Window) so that you can track the values of the variables intTop, intMiddle, and intBottom. Figure 7.11 shows the Locals window while the procedure is running. + +Figure 7.11 Use the Locals window to track the values of the intTop, intMiddle, and intBottom variables as the procedure runs. + +2. Set a breakpoint in the procedure on line 22 by clicking in the margin indicator bar next to the statement that begins varUserNumber = InputBox. (Because the statement is broken onto three lines, the Visual Basic Editor displays three red dots rather than one in the margin indicator bar, to indicate the breakpoint.) + +3. Press the F5 key (or choose Run ⇒ Run Sub/UserForm) to run the code up to the breakpoint. VBA creates and populates the array and then stops at line 22. + +4. Press the F8 key to step through the next statements. The first press displays the input box. Enter the value **67** for this example and click the OK button. + +5. As the code enters the Do loop and cycles through it, watch the values of the variables intTop, intMiddle, and intBottom in the Locals window. You'll see them change, as shown in the following list, which gives the values as they stand at the end of each iteration. (When the division produces a result ending in .5, VBA rounds it to the nearest even number as it assigns it to the Integer variable intMiddle.) + +**Iteration** | **intMiddle** | **intBottom** | **intTop** +---|---|---|--- +1 | 500 | 1 | 499 +2 | 250 | 1 | 249 +3 | 125 | 1 | 124 +4 | 62 | 63 | 124 +5 | 94 | 63 | 93 +6 | 78 | 63 | 77 +7 | 70 | 63 | 69 +8 | 66 | 67 | 69 +9 | 68 | 67 | 67 +10 | 67 | 67 | 66 + +At the end of the tenth iteration of the loop, intThousand(intMiddle) is equal to varUserNumber, so the loop ends. As you can see, breakpoints, single-stepping, and the Locals window are excellent debugging tools. Chapter 17, "Debugging Your Code and Handling Errors," further explores these and other debugging techniques. + +# The Bottom Line + +**Understand what arrays are and what you use them for.** + +Arrays play an important role in computer programming. In some ways they resemble a mini-database, and organized data is central to computing. Computers are sometimes called data processors for good reason, and arrays make it easier for you to manipulate variable data. + +**Master It** + +What is the difference between an array and an ordinary variable? + +**Create and use arrays.** + +When you create a new array, you _declare_ it and, optionally, specify the number of values it will contain. + +**Master It** + +There are four keywords that can be used to declare arrays. Name at least three of them. + +**Redimension an array.** + +If you want to resize an existing dynamic array, you can redimension it. + +**Master It** + +Redimensioning an array with the ReDim statement causes you to lose any values that are currently in that array. However, you can preserve these values using a special keyword. What is it? + +**Erase an array.** + +You can erase all the values in a fixed-size array or completely erase a dynamic array. + +**Master It** + +Write a line of code that erases an array named arrMyArray. + +**Find out whether a variable is an array.** + +An array is a type of variable, and you may occasionally need to check whether a particular variable name denotes an array or an ordinary _scalar variable_ (a variable that isn't an array). + +**Master It** + +Which built-in function can you use in VBA to find out whether a variable is an array or an ordinary, single-value variable?
+ +**Sort an array.** + +Visual Basic .NET includes array objects with built-in search and sort methods. In VBA, however, you must write a bit of code to search and sort the values in an array. + +**Master It** + +Name a popular, understandable, but relatively inefficient sorting technique. + +**Search an array.** + +Searching through an array can be accomplished in two primary ways. If you have a relatively small array, you can use the simpler, but less efficient technique. With large amounts of data, though, it's best to use the more robust approach. + +**Master It** + +Name two common ways to search an array. +Chapter 8 + +Finding the Objects, Methods, and Properties You Need + +In this chapter, you'll learn how to find the objects you need in the applications you're using. To learn the material in this chapter, you'll build on what you've learned in the earlier chapters. You'll start by examining the concepts involved: what objects and collections are, what properties are, and what methods are. You'll then learn how to find the objects, collections, properties, and methods you need to make your code work. To identify these items, you'll use a number of tools you've already read about, including the Object Browser (which you used briefly in Chapter 4, "Creating Code from Scratch in the Visual Basic Editor") and the VBA online Help resources. + +Along the way, this chapter explains how to use Object variables to represent objects in your code. + +In this chapter you will learn to do the following: + + * Understand and use objects, properties, and methods + * Use collections of objects + * Find objects, properties, and methods + * Use Object variables to represent objects + +# What Is an Object? + +VBA-enabled applications (and many other modern applications) consist of a number of discrete objects, each with its own characteristics and capabilities. + +## The Benefits of OOP + +Building an application out of objects is called _object-oriented programming_ ( _OOP_ ). In theory, object-oriented programming has a number of benefits—for example, the code is easier to build and maintain (update) because you break it down into objects of a manageable size. + +Object-oriented programs should also be easier to understand than monolithic programs because it's less difficult for most people to grasp the concept of individual objects with associated characteristics and actions than to remember a far longer list of capabilities for the application as a whole. + +Figuring out which commands to use to accomplish your programming goals can also be faster thanks to OOP taxonomy. For example, a table in Word is represented by a Table object, and a column is represented by a Column object. The Column object has a Width property that sets or returns its width. It's simpler to manage this information when it's broken down into small pieces than to deal with some complex command such as WordTableSetColumnWidth or WordTableGetColumnWidth. + +A third benefit of object-oriented programming is that the VBA language itself can be extended. The programmer can build custom objects to implement functionality that the language itself didn't originally contain. For example, you can use VBA to build your own objects that do things that the Office applications themselves can't do. + +Another, rather different, use for OOP is somewhat clerical: OOP can be of help when a group of programmers are working together on a single program. 
They can easily step on each other's toes in various ways—using the wrong version, changing each other's code, and so on. We'll look at the ways OOP is employed in team programming at the end of this chapter. + +Objects can—and frequently do—contain other objects. Typically, the objects in an object-oriented application are arranged into a hierarchy called the _object model_ of the application. This hierarchy is intended to make it easier to figure out where—within a large library of objects—you'll find a particular object that you want to use in your macros. It's similar to the way a biography is likely to be found in the library's nonfiction area. + +* * * + +Object Models Covered in Depth Later in the Book + +This chapter discusses object models only a little, at the conceptual level: You need to know what an object model is in order to make sense of what you'll be learning in the following chapters, but you don't need to know the specifics of each object model to manipulate the objects used in the examples. Part 5 of this book, "Creating Effective Code," examines the object models of each of the applications covered in this book in enough detail to get you started on exploring the depths of each object model on your own. + +* * * + +Most VBA host applications, including all the major Office applications, have an Application object that represents the application as a whole. The Application object has properties and methods for things that apply to the application as a whole. For example, many applications have a Quit method that exits the application and a Visible property that controls whether the application is visible or hidden. + +In a typical object model, the Application object essentially contains all the other objects (and collections—groups—of objects) that make up the application. For example, Excel has an Application object that represents the Excel application, a Workbook object (grouped into the Workbooks collection) that represents a workbook, and a Worksheet object (grouped into the Sheets collection) that represents a worksheet. The Workbook object is contained within the Application object because you normally need to have the Excel application open to work with an Excel workbook. + +In turn, the Worksheet object is contained within the Workbook object because you need to have an Excel workbook open to use a worksheet. Walking further down the object model, the Worksheet object contains assorted other objects, including Row objects that represent the individual rows in the worksheet, Column objects that represent columns in the worksheet, and Range objects (which represent ranges of cells). And these objects in turn contain further objects. + +To get to an object, you typically walk down through the hierarchy of the object model until you reach the object you're looking for. + +To get to a Range object in Excel, for example, you would go through the Application object to the Workbook object, through the Workbook object to the appropriate Sheet object, and then finally to the Range object. The following statement shows how to select the range A1 in the first worksheet in the first open workbook (more on this in a minute): + + Application.Workbooks(1).Sheets(1).Range("A1").Select + +## Understanding Creatable Objects + +The Application object, however, is optional and is usually left out of code lines. Why? 
Because you'd have to go through the Application object to get to pretty much _anything_ in the application, most applications _expose_ (make available to you) a number of _creatable_ objects. Creatable merely means that you can access something without having to type the word Application in your code. It's assumed. This is similar to the fact that you don't have to include the word _Earth_ when addressing an envelope. There's only that one possibility. + +These creatable objects are usually the most-used objects for the application, and by going through them, you can access most of the other objects without having to refer to the Application object. For example, Excel exposes the Workbooks collection as a creatable object, so you can use the following statement, which doesn't require that you type in Application. See the alternative example a couple of paragraphs earlier in this chapter. + + Workbooks(1).Sheets(1).Range("A1").Select + +Any object can have properties and methods. The next sections discuss these items in detail. + +## Properties + +In VBA, a _property_ is an attribute or characteristic of an object. Most objects have multiple properties, each specifying a different aspect of the object. + +Each property has a specific data type for the information it stores. For example, the objects that represent files (such as documents, workbooks, or presentations) typically have a Boolean property named Saved that stores a value denoting whether all changes in the object have been saved (a value of True) or not (a value of False). These two values encompass the entire range of possibilities for the object: it can either contain unsaved changes or not contain unsaved changes. There is no third state. And a Boolean data type is used because that type has only two possible values. + +Similarly, most objects that represent files have a Name property that contains the name of the file in question. The Name property contains a String data type because it needs to contain text. And that text can be just about anything, limited only by the 255-character length that Windows permits for a filename and by certain characters—such as colons and pipe (|) characters—that Windows forbids in filenames. + +To work with a property, you _get_ (fetch or return) it to find out its current value or _set_ (change) it to a value of your choosing. Many properties are _read/write_ , meaning that you can both _get_ and _set_ their values, but some properties are _read-only_ , meaning that you can view their values but not change them. + +The Saved property is read/write for most applications, so you can set it. This means that you can tell the application that a file contains unsaved changes when it really doesn't or that it contains no unsaved changes when it actually has some. (Changing the Saved property can be useful when you're manipulating a file without the user's knowledge.) But the Name property of a file object is read-only—you'll typically set the name by issuing a Save As command, after which you cannot change the name from within the application while the file is open. So you can get (read, return, or fetch) the Name property but not set it. You'll also encounter some write-only properties, properties that you can set but not get. + +When an object contains another object, or contains a collection, it typically has a property that you _call_ (invoke) to return the contained object or collection.
For example, the Word Document object includes a PageSetup property that returns the PageSetup object for the document (the PageSetup object contains settings such as paper size, orientation, lines per page, and margins for the document) and a Tables property that you call to return the Tables collection. Here's how you can _call_ the PageSetup object (which is contained in the Document object): + + Sub GetLinesPage() + + Dim sngLinesPerPage As Single + + **sngLinesPerPage = ActiveDocument.PageSetup.LinesPage** + + MsgBox sngLinesPerPage + + End Sub + +Each object of the same type has the same set of properties but stores its own particular values for them. For example, if you're running PowerPoint and have three Presentation objects open, each has its own Name property. The value in each Name property is specific to each Presentation object. In other words, the value in a property in one object has nothing to do with the value in that property in another object: each object is independent of the other objects. + +## Methods + +A _method is_ an action that an object can perform, a capability an object has. For example, the Document object in various applications has a Save method that saves the document. You can use the Save method on different Document objects—Documents(1).Save saves the first Document object in the Documents collection, and Documents(2).Save saves the second Document object—but the Save method does the same thing in each case. An object can have one or more methods associated with it. Some objects have several dozen methods to implement all the functionality they need. + +The Save method is very common. It appears in many applications, as do other methods, such as SaveAs (which saves the file with a different name, location, or both) and Close (which closes the file). + +But other methods are unique to each application. For example, the Presentation object in PowerPoint has an AddBaseline method that applies a baseline (consisting either of the active presentation or of a specified presentation file) that enables you to track changes for a merge. The Document object in Word has no AddBaseline method, but it has an AcceptAllRevisions method that accepts all revisions in the document. PowerPoint doesn't have an AcceptAllRevisions method. + +Just as methods like Save are common to multiple applications, some methods are found in more than one object. For example, the Delete method is associated with many different objects. As its name suggests, the Delete method usually deletes the specified object. But other implementations of the Delete method behave somewhat differently, depending on the object they're working with. So even if you're familiar with a method from using it with one object, you need to make sure that it will have the effect you expect when you use it with another object. + +Some methods take no arguments. Other methods take one or more arguments (to supply necessary information). Just as with built-in VBA functions like MsgBox, some methods' arguments are required, while others are optional. + +When a method applies to multiple objects, it may have different syntax for different objects. Again, even if you're familiar with a method, you need to know exactly what it does with the object for which you're planning to use it. + +To use a method, you access it through the object involved. 
For example, to close the ActivePresentation object, which represents the active presentation in PowerPoint, you use the Close method (but you must specify the ActivePresentation object, like this): + + ActivePresentation.Close + +* * * + +**Max the Dog: Visualizing Objects, Methods, and Properties** + +If you have a hard time getting a grip on objects, their properties, and methods, here's a somewhat strained comparison between the virtual objects, properties, and methods in VBA and physical objects, properties, and actions in the real world. Consider this example. + +Let's say you have a massive dog named Max—a Pyrenean mountain dog, white, 200 pounds, four years old, male, and not _fixed_. + +Max performs all the usual dog actions—sleep, run, eat, bark, growl, chew things, various unmentionable actions that we'll skip over—but also has a couple of unusual (for dogs) actions built in, such as slobbering on command, knocking people down, and biting mail carriers. + +If Max were implemented in VBA, he'd be a Dog object in a Dogs collection. The Dog object for Max would have properties such as these: + + Name This is a read-only String with a value of Max. + Sex This is a read-only String with a value of Male. + Fixed This is a read/write Boolean with a value of False. + Height This is a read/write Long with a value of 36. + Weight This is a read/write Long with a value of 200. + Age This is a read/write Integer with a value of 4. + Type This is a read/write String with a value of Pyrenean Mountain. + Color This is a read/write String with a value of White. + +Max would have methods such as Slobber, Bark, KnockDown, Intimidate, Chew, Run, and so on. Some of these methods would require arguments. The Slobber method would definitely need arguments like this, probably using Dog-specific constants that start with the dog designation: + + Dogs("Max").Slobber OnWhat:="MyKnee", How:=dogSlobberDisgustingly + +The Dog object would contain objects representing the many components of the dog—ears, eyes, tongue, brain, stomach, legs, tail, and so on. Each of these objects in turn would have its own properties and methods as appropriate. For example, the Tail object would need a Wag method, which you would probably invoke ( _call_ ) something like this: + + Dogs("Max").Tail.Wag Direction:=dogWagHorizontal, Frequency:=200 + +* * * + +# Working with Collections + +When an object contains more than one object of the same type, the contained set of objects is said to be grouped into a _collection_. For example, Word uses Document objects, which are grouped into the Documents collection; PowerPoint has a Presentations collection for Presentation objects, and Excel has the Workbooks collection. + +As in these examples, the name of a collection is usually simply the plural of the name of the objects it contains. There _are_ some exceptions, such as the Sheets collection in Excel that contains the Worksheet objects. But by and large the names of most collections are easy to derive from the name of the objects they contain—and vice versa. + +A collection—taken as a whole—is an object too and can have its own properties and methods. For example, many collections have a Count property that tells you how many objects are in the collection. This next example tells you how many documents are in the Documents collection: + + Sub GetDocCount() + + Dim lngCount As Long + + lngCount = Documents.Count + + MsgBox lngCount + + End Sub + +Collections tend to have fewer properties and methods than individual objects.
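Because a collection is an object, you can also walk through its members directly. Here's a minimal sketch for Word (the procedure name is invented for this example) that uses a For Each loop, one of the loop structures covered in Chapter 12, to list the names of all open documents in the Immediate window: + + Sub List_Open_Documents() + + 'visit each Document object in the Documents collection + Dim doc As Document + For Each doc In Documents + Debug.Print doc.Name + Next doc + + End Sub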
Most collections have an Add method for adding another object to the collection. Some collections, however, are read-only and do not have an Add method. Most collections have an Item property (the default property) for accessing an item within the collection. + +Most collections in VBA have the core group of properties listed in Table 8.1. + +Table 8.1 Core properties for collections in VBA + +**Property** | **Explanation** +---|--- +Application | A read-only property that returns the application associated with the object or collection—the root of the hierarchy for the document. For example, the Application property for objects in PowerPoint returns Microsoft PowerPoint. +Count | A read-only Long property that returns the number of items in the collection—for example, the number of Shape objects in the Shapes collection in a PowerPoint slide. +Creator | In Microsoft applications, a read-only Long property that returns a 32-bit integer indicating the application used to create the object or collection. +Item | A read-only property that returns a specified member of the collection. Item is the default property of every collection, which means that you seldom need to specify it. +Parent | In Microsoft applications, a read-only property that returns the parent object for the object or collection. The _parent_ object is the object that contains the object in question; the contained object is the _child_ object. For example, a Document object is a child of the Documents collection. + +## Working with an Object in a Collection + +To work with an object in a collection, you identify the object within the collection either by its name or by its position in the collection. For example, the following statement returns the first Document object in the Documents collection and displays its Name property in a message box: + + MsgBox Documents(1).Name + +* * * + +Most Collections Are One-Based + +Recall that arrays are zero-based by default in VBA. They employ a 0 index number for the first item in the array (unless you use the Option Base 1 statement to force the first index number to 1 as we did in Chapter 7, "Using Array Variables"). + +Fortunately, most VBA collections default to the more sensible 1 for the first item in the collection. This makes it easy to identify the object you need. For example, Documents(1) gives you the first document, Workbooks(2) gives you the second workbook, and so on. + +But notice the word _most_. Sadly, there are exceptions to this rule. Be warned that _some_ collections in VBA implementations are zero-based—their numbering starts at 0 (zero) rather than 1. For example, Access—nearly always the special case in VBA—employs zero-based collections. If you're not sure whether a particular collection is one- or zero-based, consult the Help topic for that collection. + +* * * + +You can optionally use the Item property to return an object from the collection, but because Item is the default property of a collection, you don't need to use it. It's assumed. The following two statements have the same effect, so there's no advantage to using the Item property: + + strName = Documents(1).Name + strName = Documents.Item(1).Name + +## Adding an Object to a Collection + +To create a new object in a collection, you add an object to the collection. In many cases, you use the Add method to do so.
For example, the following statement creates a new Document object in Word: + + Documents.Add + +# Finding the Objects You Need + +The Visual Basic Editor provides a number of tools for finding the objects you need: + + * The Macro Recorder, which you used to record macros in some Microsoft Office applications in Chapter 1, "Recording and Running Macros in the Office Applications" + * The Object Browser, which you used briefly in Chapter 4 + * The online Help system, which can provide detailed help on the objects in the application + * The Auto List Members feature in the Visual Basic Editor + +The following sections show you how to use these tools to find objects. + +## Using the Macro Recorder to Add Code for the Objects You Need + +If you're using a Microsoft application, chances are that the easiest way to find the objects you need is to run the Macro Recorder to record a quick macro using the objects you're interested in. While you perform various actions in the application, the Macro Recorder creates code that you can then open in the Visual Basic Editor, examine, and modify if necessary. + +In spite of its advantages, the Macro Recorder does have two drawbacks: + + * First, you can't record _every_ action that you might want. Let's say you're working in Excel and want to create a statement that performs an action on a specified workbook in the Workbooks collection rather than on the active workbook. With the Macro Recorder, you can record only actions performed on the active workbook. (This is the case because the Macro Recorder can record only those actions you can perform interactively in Excel, and you can't work interactively with any workbook other than the active one.) Here's another example: Some Ribbon actions are not recorded. In Word, clicking the Review ⇒ Show Markup Formatting feature to deselect it results in no recorded code. You would need to write the following code in the Editor yourself: + + ActiveWindow.View.ShowFormatChanges = False + + * Second, the Macro Recorder is apt to record more statements than you need, particularly when you're trying to record a setting in a dialog box. + +You saw an example of the second problem in Chapter 4. Here's another example. This time we'll record a macro to create an AutoCorrect entry. Let's say that you often have to type the word _references_ in your job. Dozens of times every day. You can speed up your work by merely typing **reffs** (or some other abbreviation of your choice). Then Word will automatically replace _reffs_ with _references_ as you type. Here's how to create this macro: + +1. Start Word. + +2. Click the Record Macro button on the status bar, or click the Developer tab on the Ribbon and then click the Record Macro button in the Code section. This displays the Record Macro dialog box. Type **Add_Item_to_AutoCorrect** in the Macro Name text box, and type a description in the Description text box. Make sure All Documents (Normal.dotm) is selected in the Store Macro In drop-down list, and then click the OK button to start recording. + +3. Press Alt+F then I. Then click the Proofing button and the AutoCorrect Options button to display the AutoCorrect dialog box. Type **reffs** in the Replace box and **references** in the With box, and click the Add button. Then click OK twice to close both open dialog boxes. + +4. Click the Stop Recording button on the Ribbon or the status bar to stop the Macro Recorder. 
+ +Now press Alt+F8 to display the Macros dialog box, select the Add_Item_to_AutoCorrect entry, and click the Edit button to open the macro in the Visual Basic Editor. The code should look like this: + + Sub Add_Item_to_AutoCorrect() + ' + ' Add_Item_to_AutoCorrect Macro + ' Change reffs to references + ' + **AutoCorrect.Entries.Add Name:="reffs", Value:="references"** + With Options + .AutoFormatAsYouTypeApplyHeadings = False + .AutoFormatAsYouTypeApplyBorders = True + .AutoFormatAsYouTypeApplyBulletedLists = True + .AutoFormatAsYouTypeApplyNumberedLists = True + .AutoFormatAsYouTypeApplyTables = True + .AutoFormatAsYouTypeReplaceQuotes = True + .AutoFormatAsYouTypeReplaceSymbols = True + .AutoFormatAsYouTypeReplaceOrdinals = True + .AutoFormatAsYouTypeReplaceFractions = True + .AutoFormatAsYouTypeReplacePlainTextEmphasis = False + .AutoFormatAsYouTypeReplaceHyperlinks = True + .AutoFormatAsYouTypeFormatListItemBeginning = True + .AutoFormatAsYouTypeDefineStyles = False + .TabIndentKey = True + End With + With AutoCorrect + .CorrectInitialCaps = True + .CorrectSentenceCaps = True + .CorrectDays = True + .CorrectCapsLock = True + .ReplaceText = True + .ReplaceTextFromSpellingChecker = True + .CorrectKeyboardSetting = False + .DisplayAutoCorrectOptions = True + .CorrectTableCells = True + End With + With OMathAutoCorrect + .UseOutsideOMath = False + .ReplaceText = True + End With + With Options + .AutoFormatApplyHeadings = True + .AutoFormatApplyLists = True + .AutoFormatApplyBulletedLists = True + .AutoFormatApplyOtherParas = True + .AutoFormatReplaceQuotes = True + .AutoFormatReplaceSymbols = True + .AutoFormatReplaceOrdinals = True + .AutoFormatReplaceFractions = True + .AutoFormatReplacePlainTextEmphasis = True + .AutoFormatReplaceHyperlinks = True + .AutoFormatPreserveStyles = True + .AutoFormatPlainTextWordMail = True + End With + Options.LabelSmartTags = False + End Sub + +Here, the Recorder has created dozens of lines of unnecessary code. The only statement you actually need to accomplish your task is this: + + AutoCorrect.Entries.Add Name:="reffs", Value:="references" + +This line shows you that to add an AutoCorrect entry, you need to work with the Entries collection object in the AutoCorrect object. You use the Add method on the Entries collection to add an AutoCorrect entry to the list. + +All the other lines of code specifying the status of various options are unnecessary because you are not interested in changing any of them in this macro. + +By removing these extraneous lines from this recorded macro, you can reduce it to just the single line it needs to contain (together with the comment lines, which you can also remove if you want): + + Sub Add_Item_to_AutoCorrect() + ' + ' Add_Item_to_AutoCorrect Macro + ' Change reffs to references + ' + AutoCorrect.Entries.Add Name:="reffs",Value:="references" + End Sub + +You used the Recorder to see the correct syntax for adding an entry to the AutoCorrect feature. There's no point to leaving in lines of code unrelated to your purposes. What's more, such extraneous code would make it harder at some future date to read and understand the macro's purpose. Even worse, these extra lines can set properties to conditions that you, or someone else using this macro, might not want. Let's say you run this macro in the future and you are working in a document that must not have any bullet symbols in it. 
So you've clicked the File tab on the Ribbon and chosen Options ⇒ Proofing ⇒ AutoCorrect Options ⇒ AutoFormat As You Type and turned off bullets. However, when you run this macro, bullets are turned back on by this unneeded line in the code: + + .AutoFormatAsYouTypeApplyBulletedLists = True + +In spite of its limitations, the Macro Recorder does provide quick access to the objects you need to work with, and you can always modify the resulting code in the Visual Basic Editor. What's more, the code that the Recorder generates is, if nothing else, guaranteed to execute without bugs. + +## Using the Object Browser + +For many programmers, the primary tool for writing code for objects is the Object Browser, which you used briefly in Chapter 4. In the following sections, you'll get to know the Object Browser better and learn to use it to find the information you need about objects. To see the Object Browser, press F2 in the Editor. + +### Components of the Object Browser + +The Object Browser provides the following information about both built-in objects and custom objects you create: + + * Classes (formal definitions of objects) + * Properties (the attributes of objects or aspects of their behavior) + * Methods (actions you can perform on objects) + * Events (for example, the opening or closing of a document) + * Constants (named items that keep a constant value while a program is executing) + +Figure 8.1 shows the components of the Object Browser. + +Figure 8.1 The Object Browser provides information on built-in objects and custom objects. Here, the application is Excel. + +Here's what the different elements of the Object Browser do: + + * The Project/Library drop-down list provides a list of object libraries available to the current project. (An _object library_ is a collection of objects made available to programs. There can be several libraries in use at a given time. For example, one library might contain objects that specialize in rendering graphics, a second library might contain objects that assist with security features, and so on.) Use the drop-down list to choose the object libraries you want to view. For example, you might choose to view only objects in Outlook by choosing Outlook in the Project/Library drop-down list. Alternatively, you could stay with the default choice of <All Libraries>. + * In the Search Text box, enter the string you want to search for: Either type it in or choose a previous string in the current project session from the drop-down list. Then either press Enter or click the Search button to find members containing the search string. + +* * * + +Improve Your Searches with These Techniques + +To make your searches less specific, you can use wildcards such as ? (to represent any single character) and * (to represent any group of characters). You can also choose to search for a whole word only (rather than matching your search string with part of another word) by right-clicking anywhere in the Object Browser (except in the Project/Library drop-down list or in the Search Text box) and choosing Find Whole Word Only from the context menu. The Find Whole Word Only choice has a check mark next to it in the context menu when it's active; to deactivate it, choose Find Whole Word Only again on the context menu. + +* * * + + * Click the Go Back button to retrace one by one your previous selections in the Classes list and the Members Of list. Click the Go Forward button to move forward through your previous selections one by one.
The Go Back button becomes available when you go to a class or member in the Object Browser; the Go Forward button becomes available only when you've used the Go Back button to go back to a previous selection. + * Click the Copy To Clipboard button to copy the selected item from the Search Results list, the Classes list, the Members Of list, or the Details pane to the Clipboard so that you can paste it into your code. + * Click the View Definition button to display a Code window containing the code for the object selected in the Classes list or the Members Of list. The View Definition button is available (undimmed) only for objects that contain code, such as procedures and user forms that you've created. + * Click the Help button to display any available help for the currently selected item. Alternatively, press the F1 key. + * Click the Search button to search for the term entered in the Search Text box. If the Search Results pane isn't open, VBA opens it at this point. + * Click the Show/Hide Search Results button to toggle the display of the Search Results pane on and off. + * The Search Results list in the Search Results pane contains the results of the latest search you've conducted for a term entered in the Search Text box. If you've performed a search, the Object Browser updates the Search Results list when you use the Project/Library drop-down list to switch to a different library. Choosing a different library in the Project/Library drop-down list is a handy way of narrowing, expanding, or changing the focus of your search. + * The Classes list shows the available classes in the library or project specified in the Project/Library drop-down list. + * The Members Of list displays the available elements of the class selected in the Classes list. A method, constant, event, property, or procedure that has code written for it appears in boldface. The Members Of list can display the members either grouped into their different categories (methods, properties, events, and so on) or ungrouped as an alphabetical list of all the members available. To toggle between grouped and ungrouped, right-click in the Members Of list and choose Group Members from the context menu; click either to place a check mark (to group the members) or to remove the check mark (to ungroup the members). + * The Details pane displays the definition of the member selected in the Classes list or in the Members Of list. For example, if you select a procedure in the Members Of list, the Details pane displays its name, the name of the module and template or document in which it's stored, and any comment lines you inserted at the beginning of the procedure. The module name and project name contain hyperlinks (jumps) so that you can quickly move to them. You can copy information from the Details pane to the Code window by using either copy and paste or drag and drop. + * Drag the three split bars to resize the panes of the Object Browser to suit yourself. (You can also resize the Object Browser window as needed or maximize it so that it docks itself in the Code window.) + +The Object Browser uses different icons to indicate the various types of object that it lists. Figure 8.1 shows several icons; Table 8.2 shows the full range of icons and what they represent. + +A blue dot in the upper-left corner of a Property icon or a Method icon indicates that that property or method is the default. 
+ +Table 8.2 Object Browser icons + +The icon images themselves aren't reproduced here; the Object Browser displays a distinct icon for each of the following item types: Property, User-defined type, Method, Global, Constant, Library, Module, Project, Event, Built-in keyword or type, Class, and Enum (enumeration). + +### Adding and Removing Object Libraries + +The default object libraries are sufficient for most typical macros, so you generally need not worry about adding any specialized libraries. If you get into some kinds of advanced macro programming, however, you will need to add other libraries (you'll modify the Ribbon in Chapter 31, "Programming the Office 2013 Ribbon," and to do that you have to add a special library). You can add and remove object libraries by choosing Tools ⇒ References in the editor and using the References dialog box to make your selections: + + * By adding object libraries, you can make additional objects available to work with. + * By removing object libraries that you don't need to view or use, you can reduce the number of object references that VBA needs to resolve when it is compiling the code in a project. This allows the code to run faster, though as I've mentioned before, today's computers are so fast that increasing the speed of macro execution is rarely an issue for most people. + +When you start the Visual Basic Editor, it automatically loads the object libraries required for using VBA and user forms with the host application. You don't have to change this set of object libraries until you need to access objects contained in other libraries. For example, if you create a procedure in Word that needs to employ a feature found in Excel, you'll have to add to Word's VBA Editor a reference to an Excel object library to make Excel's objects available. + +You can adjust the priority (or _order of precedence_ ) of different references by adjusting the order in which the references appear in the References dialog box. The priority of references matters when you use in your code an object whose name appears in more than one reference: VBA checks the References list to determine the order of the references that contain that object name and uses the first one unless specifically told to do otherwise by use of an unambiguous name. + +To add or remove object libraries, follow these steps: + +1. In the Visual Basic Editor, choose Tools ⇒ References to display the References dialog box (see Figure 8.2). You can also display the References dialog box by right-clicking in the Object Browser and choosing References from the context menu. + +Figure 8.2 You add and remove object libraries by using the References dialog box. + +2. In the Available References list box, select the check boxes for the object libraries you want to have access to, and clear the check boxes for the references you want to remove because you don't need them. You should find a reference for an object library for each application that supports automation and is installed on your computer. _Automation_ , in this context, means that an application permits the automation of tasks (in other words, macros). Another way to put this is that an application supporting automation _exposes its objects_ , meaning that the application makes its objects available to programmers. + +3. The references that are in use appear together at the top of the Available References list box, not in alphabetical order (in order of precedence, as described earlier in this chapter). + +4. 
Adjust the order of precedence of the references if necessary by selecting a reference and using the up- and down-arrow Priority buttons to move it up or down the list. Usually, you'll want to keep Visual Basic for Applications and the object library of the application you're working with at the top of your list. + +* * * + +Adding a Reference Library + +You can even add new reference libraries to the list of available references in the References dialog box by clicking the Browse button to display the Add Reference dialog box, selecting the library file, and then clicking the Open button. + +* * * + +5. Click OK to close the References dialog box and return to the Object Browser. + +### Navigating with the Object Browser + +To browse the objects available to a project, follow these steps: + +1. First, activate a code module by double-clicking it in the editor's Project Explorer. + +2. Display the Object Browser by choosing View ⇒ Object Browser, by pressing the F2 key, or by clicking the Object Browser button on the Standard toolbar. (If the Object Browser is already displayed, make it active by clicking it or by selecting it from the list at the bottom of the Window menu.) + +3. In the Project/Library drop-down list, select the name of the project or the library that you want to view. The Object Browser displays the available classes in the Classes list. + +4. In the Classes list, select the class you want to work with. For example, if you chose a project in step 3, select the module you want to work with in the Classes list. + +5. If you want to work with a particular member of the class or project, select it in the Members Of list. For example, if you're working with a template project, you might want to choose a specific procedure or user form to work with. + +Once you've selected the class, member, or project, you can perform the following actions on it: + + * View information about it in the Details pane at the bottom of the Object Browser window. + * View the definition of an object by clicking the View Definition button. Alternatively, right-click the object in the Members Of list and choose View Definition from the context menu. The View Definition button and the View Definition command are enabled (available, undimmed) only for objects that contain code, such as procedures and user forms that you've created. + +* * * + +A "Definition" Is Contained Code + +The definition of a procedure is the code that it contains. The definition of a module is all the code in all the procedures that it contains. The definition of a user form is the code in all the procedures attached to it. To see how the View Definition button works, type the name of one of your macros in the Object Browser's Search Text box (to the left of the Search button). Then click the Search button to locate this macro. Then click the View Definition button, and the Code window will open, displaying this macro's code. + +* * * + + * Copy the text for the selected class, project, or member to the Clipboard by clicking the Copy To Clipboard button or by issuing a standard Copy command (pressing Ctrl+C or Ctrl+Insert). + +## Using Help to Find the Object You Need + +VBA's online Help system provides another easy way to access the details of the objects you want to work with. The Help files provide a hyperlinked reference to all the objects, methods, and properties in VBA, including graphics that show how the objects are related to each other, and plenty of code samples to show you the correct syntax.
+ +The quickest way to access VBA Help is to press the F1 key while working in the Visual Basic Editor. + +### Pressing F1 to Go to a General VBA Help Page + +F1 works in two different ways. Press F1 with the cursor on a blank line, and you're taken to the VBA portal shown in Figure 8.3. However, press F1 with the cursor on a language keyword such as Variant or InputBox, and you're taken to a Help page with specific information about that particular keyword. + +Figure 8.3 The generic VBA portal + +First, try clicking a blank line in the Code window, then press F1. Your browser opens the generic Office website shown in Figure 8.3. + +For us Office programmers, the web page shown in Figure 8.3 contains two useful links: the _Office_ link under Platforms, down at the bottom, and the _Welcome to the Visual Basic for Applications language reference for Office 2013_ link in the middle of the page. + +Click the Office link and you're taken to the Office for Developers help page, shown in Figure 8.4. There you'll find many useful links to code samples, Office application–specific pages, video lessons, and whatnot. + +Figure 8.4 This Help page contains many valuable links. + +### Pressing F1 to Go Directly to a Command's Help Page + +The second way to use F1 takes you directly to the Help page for the keyword you're interested in. If you want to see how to manipulate the active window, for example, just type **activewindow** into the Editor's Code window, and then, with the blinking insertion cursor somewhere in that word, press F1. See Figure 8.5. + +Figure 8.5 Put your insertion cursor on a command, then press F1 to get context-sensitive help. + +After you press F1 on the activewindow command, as shown in Figure 8.5, the Help page for this command opens, as you can see in Figure 8.6. + +Figure 8.6 Here's the main Help page for the ActiveWindow property. + +Apart from the regular Help information you'll find in the Help pages online, here are a few additional ways to find help: + + * At the top of most Microsoft help windows, you'll see a field titled _Search MSDN With Bing_. Try this: Type **Word 2013 selection object** into the Bing search field. A page is displayed with plenty of links. Click the top link, and you'll see several helpful code examples. + * When looking for help, you can also try clicking the Help menu in the Editor, then choosing one of the two help options listed: Microsoft Visual Basic For Applications Help or MSDN On The Web. These two options open different entry points into the Help system, from which you can drill down until you locate the explanations or code samples you're after. + * Finally, when looking for help with objects, don't forget you can press F2 to display the built-in Object Browser. + +## Using the Auto List Members Feature + +You've already used the Auto List Members feature a couple of times in the previous chapters. To recap, in VBA code—as with most other programming languages—objects and their _members_ (properties and methods) are separated by periods. This punctuation helps you see the relationships between parent objects, child objects, and members. Notice the two periods in this code: + + sngLinesPerPage = ActiveDocument.PageSetup.LinesPage + +When you're entering a statement in the Visual Basic Editor and you type the period at the end of the current object, the Auto List Members feature displays a list of properties and methods appropriate to the statement you've entered so far.
(Turn this feature on in the Visual Basic Editor by choosing Tools ⇒ Options, then selecting the Auto List Members check box.) + +Technically, there's a distinction between Auto List Members and a somewhat similar List Properties/Methods feature. The former feature is triggered by typing a period (.) following the name of an object in a line of code. The latter is triggered by pressing Ctrl+J or by right-clicking the name of an object in a line of code and choosing List Properties/Methods from the menu that appears. Of the two, I find Auto List Members more useful. + +The Auto List Members feature provides a quick way of completing statements, but you need to know which object you should work with before you can work with its members. Sometimes using this feature is a bit like finding your way through a maze and being given detailed directions that end with the phrase, "But you can't get there from here." + +Once you know the object from which to start, though, you can easily find the property or method you need. For example, to put together the statement Application.Documents(1).Close to close the first document in the Documents collection in Word, you could work as follows: + +1. Place the insertion point on a fresh line in an empty procedure (between the Sub and End Sub statements). Create a new procedure if necessary. + +2. Type the word **application** , or type **appl** and press Ctrl+spacebar to have the Complete Word feature complete the word for you. + +3. Type the period (.) after **Application**. The Auto List Members feature displays the list of properties and methods available to the Application object. + +4. Choose the Documents item in the Auto List Members list. You can scroll to it using the mouse and then double-click it to enter it in the Code window, scroll to it by using the arrow keys and enter it by pressing Tab, or type the first few letters of its name (to automatically locate it) and then enter it by pressing Tab. The latter method is shown in Figure 8.7, which uses Word. + +Figure 8.7 Using the Auto List Members feature to enter code + +5. Type **(1).** after Documents. When you type this period, the Auto List Members feature displays the list of properties and methods available to a Document object. Note that without the (1), you're working with the Documents collection, but as soon as you add the (1), you're then working with a specific document, namely the first one in the collection. + +6. Choose the Close method in the Auto List Members list by scrolling to it with the mouse or with the down arrow key. Because this is the end of the statement, press the Enter key to enter the method and start a new line (rather than pressing the Tab key, which enters the method but continues the same line of code). + +* * * + +Automatic Selection Helps You Keep Your Hands on the Keyboard + +For most people, the quickest way to enter statements in the Code window is to keep their hands on the keyboard. After all, you're _typing_ your programming. To help you do this, the Visual Basic Editor automatically selects the current item in the Auto List Members list when you type a period or an opening parenthesis. In the previous example, you can type **Application.** to display the list, **Do** to select the Documents item, and **(** to enter the Documents item. + +* * * + +# Using Object Variables to Represent Objects + +As you learned in Chapter 6, "Working with Variables, Constants, and Enumerations," one of the data types available for variables in VBA is the _Object_ type.
You use an Object variable to represent an object in your code: instead of referring to the object directly, you can employ the Object variable to access or manipulate the object it represents. + +Here's one major benefit of this approach: Using Object variables makes your code easier to read. It's simpler to see which object a section of code is working with, especially when you're working with multiple objects in the same section of code. Plus, you can give names to these variables that are descriptive and easily understood. What's more, object variables are often a necessity when you need to manipulate collections of objects. + +For example, say you create a procedure that manipulates the three open workbooks in Excel, copying a range of cells from one to the other two. If you have only those three workbooks open, you'll be able to refer to them directly as Workbooks(1), Workbooks(2), and Workbooks(3), respectively, because they'll occupy the first (and only) three slots in the Workbooks collection. + +But if your procedure changes the order of the workbooks, closes one or more workbooks, or creates one or more new workbooks, things rapidly get confusing. If, however, you've created Object variables (named, say, xlWorkbook1, xlWorkbook2, and xlWorkbook3) to refer to those specific workbooks, it will be much easier to keep them straight. This is because no matter which workbook moves to first position in the Workbooks collection, you'll be able to refer to the object represented by the Object variable xlWorkbook1 and know that you'll be accessing the workbook you're after. In other words, when you create Object variables, you get to _name them_ , using words that are more easily understood than index numbers. More important, once it's named, an Object variable's name does not change. Index numbers can change. + +To create an Object variable, you declare it in almost exactly the same way as you declare any other variable, using a Dim, Private, or Public statement. For example, the following statement declares the Object variable objMyObject: + + Dim objMyObject As Object + +As usual for the Dim statement, if you use this declaration within a procedure, it creates a variable with local scope. If you use it in the declarations section at the top of a code sheet, it creates a variable with module-level private scope. Similarly, the Private and Public keywords create module-level private and public Object variables, respectively. + +Once you've declared the Object variable, you can assign an object to it. (Assigning objects works a bit differently from the way you use just an equal sign to assign a value to an ordinary variable.) To assign an object to an Object variable, you use a Set statement. The syntax for a Set statement is as follows: + + Set _objectvariable_ = {[New] _expression_ |Nothing} + +Here's how that syntax breaks down: + + * _objectvariable_ is the name of the Object variable to which you're assigning the object. + * New is an optional keyword that you can use to implicitly create a new object of the specified class. However, usually it's better to create objects _explicitly_ and then assign them to Object variables rather than use New to create them implicitly. + * _expression_ is a required expression that specifies or returns the object you want to assign to the Object variable. + * Nothing is an optional keyword that you assign to an existing Object variable to obliterate its contents and release the memory they occupied. 
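 + +The bullet list above mentions the New keyword, but the chapter's examples don't show it in action. Here's a minimal sketch of my own (the Collection class is just a convenient built-in object to demonstrate with, and the sample strings are invented) contrasting explicit creation with the implicit As New form: + + ' Explicit creation: declare the variable, then create the object with Set...New + Dim colNames As Collection + Set colNames = New Collection + colNames.Add "Sandra" + + ' Implicit creation: As New makes VBA create the object the first time you use it; + ' this form is usually avoided because you don't control when the object is created + Dim colCities As New Collection + colCities.Add "Tulsa" + +Either way, you can later assign Nothing to the variable to release the object, as described in the last bullet point.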
 + +For example, the following statements declare the Object variable objMyObject and assign to it the active workbook in Excel: + + Dim objMyObject As Object + Set objMyObject = ActiveWorkbook + +The following statement uses the Nothing keyword to release the memory occupied by the objMyObject Object variable: + + Set objMyObject = Nothing + +What's different about declaring an Object variable versus declaring other types of variables is that not only can you declare the Object variable as being of the type Object and then use the Set command, but you can also specify which type of object it is. For example, if an Object variable will always represent a Workbook object, you can declare it as being of the Workbook data type. The following statement declares the Object variable xlWorkbook1 as being of the Workbook data type: + + Dim xlWorkbook1 As Workbook + +Strongly associating a type with an Object variable like this has a couple of advantages. First, once you've _strongly typed_ (as it's called) the Object variable, the Visual Basic Editor can provide you with full assistance for the Object variable, just as if you were dealing with the object directly. For example, once you've created that Object variable xlWorkbook1 of the Workbook object type, the Visual Basic Editor displays the Auto List Members drop-down list when you type that Object variable's name followed by a period, as shown in Figure 8.8. + +Figure 8.8 When you strongly type your Object variables, you get the full benefit of the Visual Basic Editor's code-completion features for those Object variables. + +Second, when you strongly type an Object variable, you make it a bit harder to get things wrong in your code. If you try to assign the wrong type of object to a strongly typed Object variable, VBA gives an error. For example, if you create a Worksheet Object variable in Excel, as in the first of the following statements, but assign to it a Workbook object, as in the second statement, VBA displays a "Type Mismatch" error message when you execute this code—as well it should: + + Dim wksSheet1 As Worksheet + Set wksSheet1 = ActiveWorkbook + +Finding out at this testing stage that you've created a problem is usually preferable to finding out later (for example, when you go to manipulate the wksSheet1 object and discover it doesn't behave as you expect it to). + +The main argument for _not_ strongly typing an Object variable is that you might not be sure ahead of time (while writing the code) what kind of object that variable will eventually reference during execution or if the kind of object it will store may vary from one execution of the code to another. (If either is the case, your code will need to be flexible enough to accommodate objects of different types for the same Object variable.) Usually, though, you'll want to strongly type all your Object variables. + +If you're not sure which object type to use for an Object variable, start by declaring the Object variable as being of the Object data type. Then run through the code a couple of times with the Locals window (View ⇒ Locals) displayed, and note the data type that VBA assigns to the Object variable. For example, if you press F8 repeatedly to step through the following statements in a Visual Basic Editor session hosted by Excel, the readout in the Locals window at first identifies the Object variable wks only as Object (as shown on the left in Figure 8.9). That's not too useful.
However, press F8 again to execute the Set command, and you see loads of information (click the + icon next to wks). You now see Object/Sheet1 (as shown on the right in Figure 8.9) because executing the second statement assigns the first sheet in the active workbook to the variable. You also can see all the members, their current values, and their type. + + Dim wks As Object + Set wks = ActiveWorkbook.Sheets(1) + +Figure 8.9 You can use the Locals window to help identify the object type that an Object variable will contain. + +* * * + +There Are Drawbacks to Weakly Typed Variables + +As you learned earlier in the book, you can avoid specifying data types altogether. For example, the statement Dim varMyVariant creates a Variant variable because the statement does not specify a data type. Variant variables can contain objects as well as other data types—but as before, using Variants requires VBA to do a little more work each time it encounters the variable (because VBA has to determine what data type the variable currently is) and denies you the benefits of strongly typing your variables. Weak typing also makes your code harder to read. + +* * * + +# Team Programming and OOP + +VBA is used by individual programmers as well as teams. OOP can offer some advantages when you are trying to manage a group of programmers working together on a large, complex VBA solution. OOP can help people avoid stepping on each other's toes: duplicating global variable names, creating version problems (everyone's individual copy of the code is _their_ latest version but not the group's latest official version), and other kinds of interference. + +Group programming needs management, and OOP, among its other benefits, assists in avoiding chaos when a team needs to work together on a common goal. + +One feature of OOP is _encapsulation_. This means that an object is self-contained and sealed off. It's like a black box that is plugged into your video system to improve the picture. You don't _open_ the box. Nobody is supposed to _modify_ the innards. You just use it. + +As an example, say that the boss wants all documents from now on to emphasize the company's name. You give Sandra the task of creating an object that is supposed to italicize and capitalize all references to _ACME WINDOWORKS_ in all company documents. And you ask Joe to create an object that ensures that any use of the company name is displayed in green rather than the normal black letters. (In reality, you would likely want to code these simple manipulations into _functions_ —see Chapter 10, "Creating Your Own Functions"—rather than _objects_. Objects tend to perform multiple related jobs rather than a single, simple job like turning something green. But this is just an example, so we'll keep it simple here.) + +When this code is encapsulated into sealed-off objects, nobody has to worry that Sandra and Joe might use the same variable names or otherwise interfere with each other's code. Instead, within their totally separate, sealed-off objects, they can go ahead and write code as they please. This is because the scope of the code is local to the object, and also, neither Joe nor Sandra can view, much less modify, each other's code. + +A document is passed to Sandra's ItalAndCap object, and the document comes out the other end (returns) with all instances of _ACME WINDOWORKS_ italicized and capitalized. Then the document is passed to Joe's object and in turn spit out with _ACME WINDOWORKS_ in green.
Thus, each component of the overall solution, the larger program, does its own job without interference from any other component (object). You thus avoid a lot of problems if people are working on individual tasks with the assurance that nobody else will be able to mess with their code or accidentally interact with it in some unpredictable way. Also, it's easier to track down bugs because each job is isolated from other jobs—and if the company name is only turning green half the time, you can tell Joe to take another look at his object. + +It's true that over the years OOP theory has grown quite arcane, abstract, and academic. OOP can be, in the upper reaches of universities, a terribly complex subject of study. In fact, they say that, like quantum mechanics, advanced OOP theory is understood by only 12 people in the world—and _they're_ fooling themselves. Nonetheless, if you are in charge of a team that's responsible for building a large application for Office, take some time to employ OOP features. Each individual programmer will be responsible for how their object works. The other programmers can merely _use_ that object without worrying about debugging it. They are not even allowed to see its internal code. Consider the objects that are built into VBA itself, such as Word's Selection object. It was written by somebody at Microsoft. You can put this object in your code and ask it to do things for you, such as move the cursor one word to the left: + + Selection.MoveLeft Unit:=wdWord, Count:=1 + +But you never see the actual code within the Selection object. You aren't allowed to modify it. And its code does not interact with your code's variables or cause other unwanted side effects. In other words, the built-in VBA objects are encapsulated—usable as black boxes, but sealed off. + +To create your own encapsulated objects in VBA, you add _class modules_ to a project, which are distinct from regular code modules. You'll see how to do this in Chapter 16, "Building Modular Code and Using Classes." + +# The Bottom Line + +**Understand and use objects, properties, and methods.** + +Contemporary programming employs a hierarchical method of organization known as object-oriented programming (OOP). At the very top of the hierarchy for any given application is the Application object. You go through this object to get to other objects that are lower in the hierarchy. + +Master It + +By using _creatable_ objects, you can often omit the Application object when referencing it in code. What are creatable objects? + +**Use collections of objects.** + +Collections are containers for a group of related objects, such as the Documents collection of Document objects. + +Master It + +Are collections objects? Do they have their own methods and properties? + +**Find objects, properties, and methods.** + +The Visual Basic Editor offers several ways to locate objects' members and add them to your programming code. There's an extensive Help system, the Object Browser, a List Properties/Methods feature, and the Auto List Members tool. + +Master It + +How do you employ Auto List Members to find out which properties and methods are available for Word's Document object? + +**Use Object variables to represent objects.** + +You can create variables that contain objects rather than typical values like strings or numbers. + +Master It + +What keywords do you use to declare an Object variable? 
+Part 3 + +Making Decisions and Using Loops and Functions + + * **Chapter 9: Using Built-in Functions** + * **Chapter 10: Creating Your Own Functions** + * **Chapter 11: Making Decisions in Your Code** + * **Chapter 12: Using Loops to Repeat Actions** + +Chapter 9 + +Using Built-in Functions + +VBA comes with a large number of built-in functions that perform commonly needed operations—everything from determining whether a file exists to returning the current date and converting data from one format to another. (For example, you can use a function to convert numeric data into a text string.) + +This chapter demonstrates what functions are, what they do, and how to use them. Along the way, you'll get to know some of the key functions built into VBA—including functions that convert data from one data type to another, functions that manage file operations, functions that do math, and many others. + +You can also create custom functions of your own to supplement VBA's built-in functions. The next chapter tells you how to build your own when VBA's functions don't meet your needs. + +In this chapter you will learn to do the following: + + * Understand what functions are and what they do + * Use functions + * Use key VBA functions + * Convert data from one type to another + * Manipulate strings and dates + +# What Is a Function? + +A _function_ is a type of procedure. A function differs from a subroutine (subprocedure) in that a function always returns a value and a subroutine doesn't. And in common practice, a function almost always takes one or more arguments. Although subroutines _can_ be written to take arguments, most programmers don't write their code this way. + +So, to sum up, here are the key differences between functions and subroutines: + +**Subroutines** + +These never return values and are rarely sent arguments. Subs are also generally self-contained. + +**Functions** + +These communicate more with code outside their own, accepting incoming data from arguments, processing that data in some way, and sending back a result. + +You'll often use functions that are built into VBA. Typically, you feed information into a built-in function by sending it arguments. The built-in function then processes that info and returns a value for you to use. But you can also create your own functions in the Code window if you wish. + +Built-in functions are so essential to VBA that you've already used several in examples in this book. However, we'll now explore them more fully. For example, in Chapter 7, "Using Array Variables," you used the Rnd function to generate random numbers to fill an array named intArray, and the Int function to turn the random numbers into integers: + + intArray(i) = Int(Rnd * 10) + +Rnd is one of the rare functions that does not have to take one or more arguments. (Rnd _can_ take one optional argument, but the previous example doesn't use it.) + +The Int function, on the other hand, requires an argument—the number or expression that it's turning into an integer. The argument in this example is supplied by the expression Rnd * 10. Here the Rnd function returns a value that the Int function uses; the Int function then returns a value to the procedure, which uses it to populate a subscript in the array. + +An _argument_ is a piece of information that gets passed to a function. (Arguments are also passed to methods and other commands.) You can tell when arguments are optional in Help descriptions because they're shown enclosed within brackets.
When they are optional, you can either provide or omit the arguments displayed in the brackets. For example, the full Help syntax for the Rnd function looks like this: + + Rnd([ _number_ ]) As Single + +The brackets indicate that the _number_ argument is optional, and the As Single part of the syntax denotes that the value _returned_ by the function will be of the Single data type. + +Different functions return different data types suited to their job: Many functions return a Variant, but yes/no functions, such as the IsNumeric function used in Chapter 7, return a Boolean value, either True or False. When necessary, VBA may even sometimes convert the result of a function to a different data type needed by another function in the expression. + +If any pair of brackets contains two arguments, you have to use both of them at once (blessedly, this is quite rare). For example, the MsgBox function displays a message box. The syntax for the MsgBox function is as follows: + + MsgBox( _prompt_ [, _buttons_ ] [, _title_ ][, _helpfile, context_ ]) + +Here, _prompt_ is the only required argument: _buttons, title, helpfile_ , and _context_ are all optional. But notice that _helpfile_ and _context_ are enclosed within a single set of brackets instead of each having its own pair, meaning that you need to use either both of these arguments or neither of them; you cannot use one without the other. Chapter 13, "Getting User Input with Message Boxes and Input Boxes," shows you how to use the MsgBox function in your code. + +# Using Functions + +To use a function, you _call_ it (or _invoke_ it) from a procedure—either a subprocedure (Sub) or from another function (Function). + +To call a function, you can use a _call_ statement, either with the optional Call keyword or by just using the name of the function. Using the Call keyword allows you to search through all calls in your project by searching for "call " ( _call_ followed by a space). However, using the Call keyword is overkill for everyday functions; programmers rarely use it. + +The syntax for the Call statement is as follows: + + [Call] _name_ [ _argumentlist_ ] + +Here, _name_ is a required String argument giving the name of the function or procedure to call, and _argumentlist_ is an optional argument providing a comma-delimited list of the variables, arrays, or expressions to pass to the function or procedure. When calling a function, you'll almost always need to pass arguments (except for those few functions that take no arguments). + +The brackets around the Call keyword indicate that it is optional. If you do use this keyword, you need to enclose the _argumentlist_ argument in parentheses. In most cases, it's easier to read the code if you don't use the Call keyword when calling a function. + +For example, the following statement calls the MsgBox function, supplying the required argument _prompt_ (in this example, it's the string Hello, World!): + + MsgBox "Hello, World!" + +You could use the Call keyword instead, as shown in the following statement, but there's little advantage in doing so (and note that with Call, the parentheses around the argument become mandatory, as mentioned above): + + Call MsgBox("Hello, World!") + +Note that the MsgBox function is one of the few with which you can omit the parentheses around the argument list. + +You can assign to a variable the result returned by a function. For example, consider the following code fragment. The first two of the following statements declare the String variables strExample and strLeft10. The third statement assigns a string of text to strExample.
The fourth statement uses the Left function to return the leftmost 10 characters from strExample and assign them to strLeft10, which the fifth statement then displays in a message box (see Figure 9.1): + + Dim strExample As String + Dim strLeft10 As String + strExample = "Technology is interesting." + strLeft10 = Left(strExample, 10) + MsgBox strLeft10 + +Figure 9.1 Using the Left function to take the left part of a string—in this case, the first 10 characters of the string + +If you prefer, you can assign the result of each function to a variable, as in this next example. Here the first string variable, str1, is assigned the leftmost 13 characters from the string This is Pride and Patriotism. So after its code line executes, str1 holds the value This is Pride. Then str2 is assigned the rightmost 5 characters from str1, resulting in Pride. + + Dim str1 As String + Dim str2 As String + + str1 = Left("This is Pride and Patriotism", 13) + str2 = Right(str1, 5) + + MsgBox str2 + +However, after you become accustomed to working with functions, you can collapse them in various ways in your code. Instead of assigning the result of a function to a variable, you can insert it directly in your code or pass it (as an argument) to another function. This is a common shortcut. Take a look at the following statement. It does the same thing as the previous example but collapses the code into one line, avoiding the use of variables altogether: + + MsgBox Right(Left("This is Pride and Patriotism", 13), 5) + +This statement uses three functions: the MsgBox function, the Left function, and the Right function. (The Right function is the counterpart of the Left function and returns the specified number of characters from the right side of the specified string.) + +When you have multiple sets of parentheses in a VBA statement, the code is executed starting from the innermost pair of parentheses and working outward. This is the same way that nested parentheses are handled in math. + +So, in the previous example the Left function is evaluated first, returning the leftmost 13 characters from the string: This is Pride (the spaces are characters too). VBA passes this new string to the Right function, which in this case returns the rightmost five characters from it: Pride. VBA then passes this second new string to the MsgBox function, which displays it in a message box. + +* * * + +Limit Your Nesting + +You can nest functions to many levels without giving VBA any trouble, but multilevel nesting can become hard for us humans to read and troubleshoot. For most practical purposes, it's a good idea to limit nesting to only a few levels, if that. + +* * * + +## Passing Arguments to a Function + +When a function takes more than one argument, you can pass the arguments to it in any of three ways: + + * By supplying the argument values, without their names, _positionally_ (in the order in which the function expects them) + * By supplying the arguments, with their names, in the order in which the function expects them + * By supplying the arguments, with their names, in any order you choose + +The first method, supplying the arguments positionally without using their names, is usually the quickest way to proceed. The only disadvantage to doing so is that anyone reading your code may not know immediately which value corresponds to which argument—though they can look this up without trouble. To omit an optional argument, you place a comma where it would appear in the sequence of arguments. 
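 + +To see positional omission in practice, here's a minimal sketch of my own (the prompt and title strings are invented placeholders) that skips the optional _buttons_ argument of the MsgBox function by leaving its slot empty: + + ' The empty slot between the two commas skips the optional buttons + ' argument while keeping the title argument in its correct position + MsgBox "Processing complete.", , "Status Report" + +Because _buttons_ is omitted, VBA falls back on the default vbOKOnly button set.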
 + +It does take extra time to type in argument names, but it makes your code easier to read. And when you omit an argument from a named argument list, you don't need to use the comma to indicate that you're skipping it. + +There's no advantage to using named arguments out of order over using them in order unless you happen to find doing so easier. + +For example, the DateSerial function returns a Variant/Date containing the date for the given year, month, and day. The syntax for DateSerial is as follows: + + DateSerial( _year, month, day_ ) + +Here, _year_ is a required Integer argument supplying the year, _month_ is a required Integer argument supplying the month, and _day_ is a required Integer argument supplying the day. + +The following statement supplies the arguments positionally without their names: + + MsgBox DateSerial(2010, 12, 31) + +This statement is equivalent but supplies the arguments positionally with their names: + + MsgBox DateSerial(Year:=2010, Month:=12, Day:=31) + +The following statement supplies the arguments, with their names, out of order: + + MsgBox DateSerial(Day:=31, Year:=2010, Month:=12) + +All three of these statements work fine and achieve the same result. You'll cause a problem only if you list out-of-order arguments that you're supplying without names (positionally), if you name some arguments and don't name others, or if you omit required arguments. Figure 9.2 shows one of the errors you may encounter. In this case, I left out the required _month_ argument. + +Figure 9.2 An "Argument not optional" error occurs when you omit a required argument. + +# Using Functions to Convert Data + +Data-type conversion isn't needed frequently in VBA, but you might as well at least understand what it does. Some computer languages are pretty strict about requiring explicit data typing (sometimes called _strong data typing_ ). And there _are_ a few specialized situations where you will need to convert one variable type into another. For example, you might be using the InputBox command to get some information from the user. The user is typing on a keyboard, so all the data they input will be characters (text string) data. But if your macro needs to do any math with this input, such as using the + operator to add numbers, you must first convert the string data into numeric variables (or use the default Variant type). To convert a string to an integer number, you could use the CInt function. This same problem arises if you are importing data from another source, such as a database that stores everything as a string variable. + +VBA provides a full set of simple functions for converting data from one data type to another. Table 9.1 lists VBA's functions for simple data conversion. + +Table 9.1 VBA's functions for simple data conversion + +**Function (Arguments)** | **Data Type Returned** +---|--- +CBool( _number_ ) | Boolean +CByte( _expression_ ) | Byte +CCur( _expression_ ) | Currency +CDate( _expression_ ) | Date +CDec( _expression_ ) | Decimal +CDbl( _expression_ ) | Double +CInt( _expression_ ) | Integer +CLng( _expression_ ) | Long +CSng( _expression_ ) | Single +CStr( _expression_ ) | String +CVar( _expression_ ) | Variant + +For example, the following statements declare the untyped variable varMyInput and the Integer variable intMyVar and then display an input box prompting the user to enter an integer. In the third statement, the user's input is assigned to varMyInput, which automatically becomes a Variant/String.
The fourth statement uses the CInt function to convert varMyInput to an integer, assigning the result to intMyVar. The fifth statement compares intMyVar to 10, converts the result to Boolean by using the CBool function, and displays the result (True or False) in a message box. + + Dim varMyInput + Dim intMyVar As Integer + varMyInput = InputBox("Enter an integer:", "10 Is True, Other Numbers Are False") + intMyVar = CInt(varMyInput) + MsgBox CBool(intMyVar = 10) + +Recall that a Boolean variable can be only True or False. So in the final line of this example, you're saying in effect, "If the value in the variable intMyVar is 10, the Boolean result will be True. If the value is anything other than 10, the result will be False." + +VBA also has a set of functions that manipulate data in more complicated ways. Only two of these more complex manipulation functions—Format and Chr—are used much in VBA programming, so we'll explore them in depth in this chapter. + +Table 9.2 lists VBA's functions for more complex data manipulation. + +Table 9.2 VBA's functions for complex data conversion + +**Function (Arguments)** | **Returns** +---|--- +Asc( _string_ ) | The ANSI character code for the first character in the string. +Chr( _number_ ) | The string for the specified character code (a number between 0 and 255). +Format( _expression_ , _format_ ) | A variant containing _expression_ formatted as specified by _format_. (You'll see how Format works in "Using the Format Function to Format an Expression" later in the chapter.) +Hex( _number_ ) | A string containing the hexadecimal value of _number_. +Oct( _number_ ) | A string containing the octal value of _number_. +RGB( _number1_ , _number2_ , _number3_ ) | A Long integer representing the color value specified by _number1, number2_ , and _number3_. +QBColor( _number_ ) | A Long containing the RGB value for the specified color. +Str( _number_ ) | A Variant/String containing a string representation of _number_. Use the superior CStr function instead. +Val( _string_ ) | The numeric portion of _string_ ; if _string_ does not have a numeric portion, Val returns 0. Use the superior CInt function instead. + +## Using the Asc Function to Return a Character Code + +This function isn't used much. Asc tells you which numeric value has been assigned to a particular letter according to the ANSI character code that's used in Windows. A _character code_ is a list of numbers by which computers refer to letters of the alphabet. For example, the character code used in Windows for a capital _A_ is 65 and for a capital _B_ is 66; a lowercase _a_ is 97, and a lowercase _b_ is 98. + +The syntax for the Asc function is straightforward: + + Asc( _string_ ) + +Here, _string_ is any string expression. For example, Asc("A") returns 65. + +The following statements use the Asc function to return the character code for the first character of the current selection in the active document and display that code in a message box: + + strThisCharacter = **Asc** (Selection.Text) + MsgBox strThisCharacter, vbOKOnly, "Character Code" + +## Using the Val Function to Extract a Number from the Start of a String + +The Val function, like Asc, is not much used. But for completeness, I've included it. The Val function converts the numbers contained in a text string into a numeric value. Val follows these rules: + + * It reads only numbers in a string. + * It starts at the beginning of the string and reads only as far as the string contains characters that it recognizes as numbers (digits).
 + * It ignores tabs, line feeds, and blank spaces. + * It recognizes the period as a decimal separator, but not the comma. + +This means that if you feed Val a string consisting of tabbed columns of numbers, such as the second line here, it will read them as a single number (in this case, 445634.994711): + + Item# Price Available On Order Ordered + 4456 34.99 4 7 11 + +If, however, you feed it something containing a mix of numbers and letters, Val will read only the numbers and the strings it recognizes as numeric expressions (for example, Val("4E5") returns 400000 because it reads the expression as scientific notation). For example, if fed the address shown in the next example, Val returns 8661, ignoring the other numbers in the string (because it stops at the _L_ of _Laurel_ , the first character that isn't a number, a tab, a line feed, or a space): + + 8661 Laurel Avenue Suite 3806, Oakland, CA 94610 + +The syntax for Val is straightforward: + + Val( _string_ ) + +Here, _string_ is a required argument consisting of any string expression. + +The following statement uses Val to extract the street number from the string Address1 and assign it to the numeric variable StreetNumber: + + StreetNumber = Val(Address1) + +* * * + +Using CInt Instead of Val + +You should generally use the CInt function rather than the Val function when converting text to numbers. The reason is that CInt takes into account where you are located (the _regional settings_ in Windows). In America, for example, we use a comma to indicate thousands: 12,000. The CInt function can handle this; Val cannot (and converts "12,000" into 12): + + Dim StrVar As String + StrVar = "12,000" + MsgBox "Val = " & Val(StrVar) & " CInt = " & CInt(StrVar) + +When you execute this code, the message box reports that Val returns 12 while CInt returns 12000. This illustrates why you should use CInt rather than Val. + +Remember that Val stops when it reaches the first non-digit character. So that comma trips it up when trying to convert 12,000. + +* * * + +## Using the Str Function to Convert a Number into a String + +Just as you can use CInt to convert a text string into a numeric value as described in the previous section, you can also convert a numeric value to a string with the Str function. But you should use the newer CStr function rather than the Str function, for the same reasons that CInt is superior to the older Val command. + +You'll need to convert a number to a string when you want to _concatenate_ the information contained in a value with a string. Concatenation means appending one string to another, as in "123" & "654", which results in the text "123654". + +Concatenation cannot be accomplished by simply using the + operator because VBA would attempt to perform the mathematical operation addition rather than the string operation you want: concatenation. + +A text string is just that: text. It's one or more alphanumeric characters, such as "55"—and that's quite different from the number 55. You can't concatenate "55" and 55. They're not the same kind of data at all. + +Here's an example. Suppose you've declared a String variable named strYourAge and a numeric variable named intAge. You can't use a strYourAge + intAge statement to concatenate them because they're different data types. You first need to create a string from the intAge variable and then concatenate that string with the strYourAge string. (Alternatively, you can use the & operator to concatenate the two variables.) + +To convert a value to a string, use the CStr function.
The syntax for the CStr function is this: + + CStr( _number_ ) + +Here, _number_ is a variable containing a numeric expression (such as an Integer data type, a Long data type, or a Double data type). + +The following short procedure provides an example of converting a value to a string: + + Sub Age() + Dim intAge As Integer, strYourAge As String + intAge = InputBox("Enter your age:", "Age") + strYourAge = "Your age is " & **CStr** (intAge) & "." + MsgBox strYourAge, vbOKOnly + vbInformation, "Age" + End Sub + +* * * + +Using a Declaration Shortcut + +Notice in the example Sub Age how the Dim statement uses a kind of shorthand. Two different variables, separated by a comma, are declared on the same line using the same Dim command. This is equivalent to + + Dim intAge As Integer + Dim strYourAge As String + +* * * + +## Using the Format Function to Format an Expression + +The Format function is a powerful tool for changing numbers, dates and times, and strings into a format that you prefer. + +The syntax for the Format function is as follows: + + Format( _expression_ [, _format_ [, _firstdayofweek_ [, _firstweekofyear_ ]]]) + +These are the components of the syntax: + + * _expression_ is any valid expression. + * _format_ is an optional argument specifying a named format expression or a user-defined format expression. More on this in a moment. + * _firstdayofweek_ is an optional constant specifying the day that starts the week (for date information): The default setting is vbSunday (1), but you can also set vbMonday (2), vbTuesday (3), vbWednesday (4), vbThursday (5), vbFriday (6), vbSaturday (7), or vbUseSystem (0; uses the system setting). + * _firstweekofyear_ is an optional constant specifying the week considered first in the year (again, for date information), as shown in Table 9.3. + +Table 9.3 Constants that specify how a year starts + +**Constant** | **Value** | **Year Starts with Week** +---|---|--- +vbUseSystem | 0 | Use the system setting. +vbFirstJan1 | 1 | The week in which January 1 falls (the default setting). +vbFirstFourDays | 2 | The first week with a minimum of four days in the year. +vbFirstFullWeek | 3 | The first full week (seven days) of the year. + +You can define your own formats for the Format function as described in the following sections if none of the predefined numeric formats (described next) suit your needs. + +### Using Predefined Numeric Formats + +Table 9.4 lists the predefined numeric formats that you can use with the Format function. + +Table 9.4 Predefined numeric formats + +**Format Name** | **Explanation** | **Example** +---|---|--- +General Number | The number is displayed with no thousand separator. | 124589 +Currency | The number is displayed with two decimal places, a thousand separator, and the currency symbol appropriate to the system locale. | $1,234.56 +Fixed | The number is displayed with two decimal places and at least one integer place. | 5.00 +Standard | The number is displayed with two decimal places, at least one integer place, and a thousand separator (when needed). | 1,225.00 +Percent | The number is displayed multiplied by 100, with two decimal places and with a percent sign. | 78.00% +Scientific | The number is displayed in scientific notation. | 5.00E+00 +Yes/No | A nonzero number is displayed as Yes; a zero number is displayed as No. | Yes +True/False | A nonzero number is displayed as True; a zero number is displayed as False. | False +On/Off | A nonzero number is displayed as On; a zero number is displayed as Off. | Off
 + +For example, the following statement returns $12,345.00: + + Format("12345", "Currency") + +### Creating a Numeric Format + +If none of the predefined numeric formats suit your needs, you can create your own numeric formats by using your choice of a combination of the characters listed in Table 9.5. + +Table 9.5 Characters for creating your own number formats + +**Character** | **Explanation** +---|--- +[None] | Displays the number without any formatting. (You won't usually want to use this option.) +0 | Placeholder for a digit. If there's no digit, VBA displays a zero. If the number has fewer digits than the format has zeroes, VBA displays leading or trailing zeroes as appropriate. + +# | Placeholder for a digit. If there's no digit, VBA displays nothing. +. | Placeholder for a decimal. Indicates where the decimal separator should fall. The decimal separator varies by locale (for example, a decimal point in the United States, a comma in Germany). +% | Placeholder for a percent character. VBA inserts the percent character and multiplies the expression by 100. +, | Thousand separator (depending on locale, a comma or a period). +: | Time separator (typically a colon, but again this depends on the locale). +/ | Date separator. (Again, what you'll see depends on the locale.) +E- E+ e- e+ | Scientific format: E- or e- places a minus sign next to negative exponents. E+ or e+ places a minus sign next to negative exponents and places a plus sign next to positive exponents. +- + $ ( ) | Displays the literal character. +\\[character] | Displays the literal character. +"[string]" | Displays the literal string. Use Chr(34) (the character code for double quotation marks) to provide the double quotation marks. + +For example, the following statement returns a currency value formatted with four decimal places: + + Format("123456", "$00.0000") + +### Creating a Date or Time Format + +Similarly, you can create your own date and time formats by mixing and matching the characters listed in Table 9.6. + +Table 9.6 Characters for creating your own date and time formats + +**Character** | **Explanation** +---|--- +: | Time separator (typically a colon, but this depends on the locale). +/ | Date separator (also locale-dependent). +c | Displays the date (if there is a date or an integer value) in the system's short date format and the time (if there is a date or a fractional value) in the system's default time format. +d | Displays the date (1 to 31) without a leading zero for single-digit numbers. +dd | Displays the date with a leading zero for single-digit numbers (01 to 31). +ddd | Displays the day as a three-letter abbreviation (Sun, Mon, Tue, Wed, Thu, Fri, Sat) with no period. +dddd | Displays the full name of the day. +ddddd | Displays the complete date (day, month, and year) in the system's short date format. +dddddd | Displays the complete date (day, month, and year) in the system's long date format. +aaaa | Displays the full, localized name of the day. +w | Displays an integer from 1 (Sunday) to 7 (Saturday) containing the day of the week. +ww | Displays an integer from 1 to 54 giving the number of the week in the year. The number of weeks is 54 rather than 52 because most years start and end with partial weeks rather than having 52 start-to-finish weeks. +m | Displays an integer from 1 to 12 giving the number of the month without a leading zero on single-digit months. When used after h, returns minutes instead of months. +mm | Displays a number from 01 to 12 giving the two-digit number of the month. When used after h, returns minutes instead of months.
+mmm | Displays the month as a three-letter abbreviation (except for May) without a period. +mmmm | Displays the full name of the month. +oooo | Displays the full localized name of the month. +q | Displays a number from 1 to 4 giving the quarter of the year. +y | Displays an integer from 1 to 366 giving the day of the year. +yy | Displays a number from 00 to 99 giving the two-digit year. +yyyy | Displays a number from 0100 to 9999 giving the four-digit year. +h | Displays a number from 0 to 23 giving the hour. +hh | Displays a number from 00 to 23 giving the two-digit hour. +n | Displays a number from 0 to 59 giving the minute. +nn | Displays a number from 00 to 59 giving the two-digit minute. +s | Displays a number from 0 to 59 giving the second. +ss | Displays a number from 00 to 59 giving the two-digit second. +ttttt | Displays the full time (hour, minute, and second) in the system's default time format. +AM/PM | Uses the 12-hour clock and displays AM or PM as appropriate. +am/pm | Uses the 12-hour clock and displays am or pm as appropriate. +A/P | Uses the 12-hour clock and displays A or P as appropriate. +a/p | Uses the 12-hour clock and displays a or p as appropriate. +AMPM | Uses the 12-hour clock and displays the AM or PM string literal defined for the system. + +For example, the following statement returns Thursday, April 01, 2010: + + Format(#4/1/2010#, "dddddd") + +### Creating a String Format + +The Format function also lets you create custom string formats using the options shown in Table 9.7. + +Table 9.7 Characters for creating your own string formats + +**Character** | **Explanation** +---|--- +@ | Placeholder for a character. Displays a character if there is one, and a space if there is none. +& | Placeholder for a character. Displays a character if there is one, and nothing if there is none. +< | Displays the string in lowercase. +> | Displays the string in uppercase. +! | Causes VBA to fill placeholders from left to right instead of from right to left (the default direction). + +For example, the following statement assigns to strUser a string consisting of four spaces if there is no input in the input box: + + strUser = Format(InputBox("Enter your name:"), "@@@@") + +## Using the Chr Function and Constants to Enter Special Characters in a String + +To insert special characters (such as a carriage return or a tab) into a string, specify the built-in constant (for those special characters that have built-in constants defined) or enter the appropriate character code using the Chr function. The syntax for the Chr function is straightforward: + + Chr( _charactercode_ ) + +Here, _charactercode_ is a number that identifies the special character you want to add. + +Table 9.8 lists the most useful character codes and character constants.
+
+## Using the Chr Function and Constants to Enter Special Characters in a String
+
+To insert special characters (such as a carriage return or a tab) into a string, specify the built-in constant (for those special characters that have built-in constants defined) or enter the appropriate character code using the Chr function. The syntax for the Chr function is straightforward:
+
+    Chr( _charactercode_ )
+
+Here, _charactercode_ is a number that identifies the special character you want to add.
+
+Table 9.8 lists the most useful character codes and character constants.
+
+Table 9.8 VBA character codes and character constants
+
+**Code** | **Built-in Character Constant** | **Character**
+---|---|---
+Chr(9) | vbTab | Tab
+Chr(10) | vbLf | Line feed
+Chr(11) | vbVerticalTab | Soft return (Shift+Enter)
+Chr(12) | vbFormFeed | Page break
+Chr(13) | vbCr | Carriage return
+Chr(13) + Chr(10) | vbCrLf | Carriage return/line feed combination
+Chr(14) | — | Column break
+Chr(34) | — | Double straight quotation marks (")
+Chr(39) | — | Single straight quotation mark/apostrophe (')
+Chr(145) | — | Opening single smart quotation mark (')
+Chr(146) | — | Closing single smart quotation mark/apostrophe (')
+Chr(147) | — | Opening double smart quotation mark (")
+Chr(148) | — | Closing double smart quotation mark (")
+Chr(149) | — | Bullet
+Chr(150) | — | En dash
+Chr(151) | — | Em dash
+
+Here's a practical example that puts these character constants to work. Say you wanted to build a string containing a person's name and address from individual strings containing items of that information. You also wanted the individual items separated by tabs in the resulting string so that you could insert the string into a document and then easily convert it into a table.
+
+To do this, you could use the following code:
+
+    Sub FormatTabular()
+
+    Dim strFirstName As String
+    Dim strLastName As String
+    Dim strAddress As String
+    Dim strCity As String
+    Dim strState As String
+    Dim strAllInfo As String
+
+    strFirstName = "Phil"
+    strLastName = "Mortuqye"
+    strAddress = "12 Batwing Dr."
+    strCity = "Tulsa"
+    strState = "OK"
+
+    strAllInfo = strFirstName & vbTab & strLastName _
+        & vbTab & strAddress & vbTab & strCity _
+        & vbTab & strState & vbCr
+
+    Selection.TypeText strAllInfo
+    End Sub
+
+The individual strings (strFirstName, strLastName, and so on) are concatenated into the string strAllInfo with tabs—vbTab characters—between them. The final character added to the built string is vbCr (a carriage-return character), which creates a new paragraph.
+
+The final line enters the strAllInfo string into the current document, thus building a tab-delimited list containing the names and addresses. This list can then be easily converted into a table whose columns each contain one item of information: The first column contains the strFirstName string, the second column the strLastName string, and so on.
+
+# Using Functions to Manipulate Strings
+
+String variables are often useful for holding text. You can use them to store any quantity of text, from a character or two up to a large number of pages from a Word document or other text document. You can also use strings to store specialized information, such as filenames and folder names. Once you've stored text in a string, you can manipulate it according to your needs.
+
+Table 9.9 lists VBA's built-in functions for manipulating strings. Because many of these functions are useful, and some are complex, you'll find detailed examples after the table.
+
+Table 9.9 VBA's string-manipulation functions
+
+**Function (Arguments)** | **Returns**
+---|---
+InStr( _start_ , _string1_ , _string2_ , _compare_ ) | A Variant/Long giving the position of the first instance of the search string ( _string2_ ) inside the target string ( _string1_ ), starting from the beginning of the target string
+InStrRev( _stringcheck_ , _stringmatch_ , _start_ , _compare_ ) | A Variant/Long giving the position of the first instance of the search string ( _stringmatch_ ) inside the target string ( _stringcheck_ ), starting from the end of the target string
+LCase( _string_ ) | A String containing the lowercased _string_
+Left( _string_ , _number_ ) | A Variant/String containing the specified number of characters from the left end of _string_
+Len( _string_ ) | A Long containing the number of characters in _string_
+LTrim( _string_ ) | A Variant/String containing _string_ with any leading spaces trimmed off it
+Mid( _string_ , _start_ , _length_ ) | A Variant/String containing the specified number of characters from the specified starting point within _string_
+Right( _string_ , _number_ ) | A Variant/String containing the specified number of characters from the right end of _string_
+RTrim( _string_ ) | A Variant/String containing _string_ with any trailing spaces trimmed off it
+Space( _number_ ) | A Variant/String containing _number_ of spaces
+StrComp( _string1_ , _string2_ , _compare_ ) | A Variant/Integer containing the result of comparing _string1_ and _string2_
+StrConv( _string_ , _conversion_ , _LCID_ ) | A Variant/String containing _string_ converted as specified by _conversion_ for the (optional) specified Locale ID ( _LCID_ )
+String( _number_ , _character_ ) | A Variant/String containing _number_ of instances of _character_
+StrReverse( _expression_ ) | A String containing the characters of _expression_ in reverse order
+Trim( _string_ ) | A Variant/String containing _string_ with any leading spaces or trailing spaces trimmed off it
+UCase( _string_ ) | A String containing the uppercased _string_
+
+## Using the Left, Right, and Mid Functions to Return Part of a String
+
+Frequently, you'll need to use only part of a string in your macros. For example, you might want to take only the first three characters of the name of a city to create a location code.
+
+VBA provides several functions for returning from strings the characters you need:
+
+ * The Left function returns a specified number of characters from the left end of the string.
+ * The Right function returns a specified number of characters from the right end of the string.
+ * The Mid function returns a specified number of characters starting from a specified location inside a string.
+
+* * *
+
+Some String Functions Come in Two Flavors
+
+VBA provides two versions of a number of string functions, including the Left, Right, and Mid functions: the versions shown here, which return String-type Variant values, and versions whose names end with $ (Left$, Right$, Mid$, and so on), which return pure String values.
+
+The functions that return the pure Strings run faster (though you're not likely to notice any difference with normal use) but return an error if you use them on a Null value. The functions that return the String-type Variants can deal with Null values with no problem. Which approach you employ can depend on, for example, the type of data you're manipulating. Some databases employ Null, some do not.
+
+* * *
+
+### Using the Left Function
+
+The Left function returns the specified number of characters from the left end of a string. The syntax for the Left function is as follows:
+
+    Left( _string, length_ )
+
+Here, the _string_ argument is any string expression—that is, any expression that returns a sequence of contiguous characters. Left returns Null if _string_ contains no data. The _length_ argument is a numeric expression specifying the number of characters to return. _length_ can be a straightforward number (such as 4, 7, or 11), or it can be an expression that results in a number. For example, if the length of a word were stored in the variable named LenWord and you wanted to return two characters fewer than LenWord, you could specify the expression LenWord - 2 as the _length_ argument; to return three characters more than LenWord, you could specify LenWord + 3 as the _length_ argument.
+
+One way to use the Left function would be to separate the area code from a telephone number that was provided as an unseparated 10-digit number from a database. In the following statements, the telephone number is stored in the String variable strPhone, which the code assumes was created earlier:
+
+    Dim strArea As String
+    strArea = Left(strPhone, 3)
+
+These statements create the variable strArea and fill it with the leftmost three characters of the variable strPhone.
+
+### Using the Right Function
+
+The Right function is the mirror image of the Left function. Right returns a specified number of characters from the right end of a string. The syntax for the Right function is as follows:
+
+    Right( _string, length_ )
+
+Again, the _string_ argument is any string expression, and _length_ is a numeric expression specifying the number of characters to return. And, again, Right returns Null if _string_ contains no data, and _length_ can be a number or an expression that results in a number.
+
+To continue the previous example, you could use the Right function to separate the last seven digits of the phone number stored in the string strPhone from the area code:
+
+    Dim strLocalNumber As String
+    strLocalNumber = Right(strPhone, 7)
+
+These statements create the variable strLocalNumber and fill it with the rightmost seven characters from the variable strPhone.
+
+### Using the Mid Function
+
+The Left and Right functions extract a substring from the left or right side of a string. The Mid function fetches a substring out of the middle of a string.
+
+The Mid function returns the specified number of characters from inside the given string. You specify a starting position in the string and the number of characters (to the right of the starting position) that you want extracted.
+
+The syntax for the Mid function is as follows:
+
+    Mid( _string, start_ [, _length_ ])
+
+Here are the elements of the syntax:
+
+ * As in Left and Right, the _string_ argument is any string expression. Mid returns Null if _string_ contains no data.
+ * _start_ is a numeric value specifying the character position in _string_ at which to start extracting characters. If _start_ is larger than the number of characters in _string_ , VBA returns a zero-length string. In code, an empty string is typed as two quotation marks with nothing inside: strState = "".
+ * _length_ is an _optional_ numeric expression specifying the number of characters to return.
If you omit _length_ or use a _length_ argument greater than the number of characters in _string_ , VBA returns all the characters from the _start_ position to the end of _string_. _length_ can be an ordinary literal number or an expression that results in a number.
+
+Using the phone-number example, you could employ Mid to pluck the local exchange code out from within a 10-digit phone number (for instance, extract the 555 from 510 **555** 1212), like this:
+
+    Dim strPhone As String
+
+    strPhone = "5105551212"
+    MsgBox Mid(strPhone, 4, 3)
+
+This statement displays three characters from the variable strPhone, starting at the fourth character.
+
+* * *
+
+**Don't Torture Your Users—Accept a Variety of Formats**
+
+All too often programmers, for no good reason, make it hard for users to succeed. How many times have you tried to type your phone number in a website and been told that the only acceptable format is xxx-xxx-xxxx? Or (xxx) xxx-xxxx? Or numbers only! You _will_ do things our way. Well...why?
+
+Because the programmer was lazy and refused to permit a variety of input.
+
+People write down their phone number various ways. Some type it in like this: (xxx) xxx-xxxx; others favor variations like xxx xxx-xxxx. Have you seen those instructions that say "use no hyphens" or "you must use hyphens"?
+
+This is simply slothful programming. The programmer doesn't want to take a little extra time to deal with varying input, so they transfer the work to the user. Make life easier for your users by writing a little extra code to translate various typical formats into whatever your program expects. Don't force the users to provide data "just so."
+
+Avoid such user frustration by simply writing some code that tests the user's input. Here are a few easy solutions:
+
+Use the InStr function (described later in this chapter) to check for parentheses or hyphens. Or use Mid to extract only the numeric values in the user's string entry—ignoring whatever blank spaces or non-numeric characters the user might have typed in. Your program's goal is to end up with 5105551212. After discarding the non-digits, you can then show the user a _useful_ error message if they have not entered the necessary 10 digits. (You'll find a minimal sketch of this digits-only approach right after this sidebar.)
+
+Test the number with the Len function to see if there are 10 digits. If not, tell the user they made a mistake and to please reenter the phone number because there are not enough (or too many) digits.
+
+Your error message should also display the user's entry so they can see the problem. But you're just being lazy and annoying if you tell them they can't use parentheses or hyphens or _must_ use those punctuation marks—to satisfy _you_. Who are you?
+
+Your code should accept several predictable variations of user input. There's no need to reject legitimate user input simply because that input is punctuated in a different way than your data store or your code prefers. After all, why waste the time of perhaps thousands of users when it only takes a little extra coding to accommodate them?
+
+* * *
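+
+Here's a minimal sketch of that digits-only approach. It walks the user's entry one character at a time with Mid, keeps only the digits, and then uses Len to check the count (the variable names and messages are invented for illustration):
+
+    Dim strRaw As String, strDigits As String, i As Integer
+    strRaw = InputBox("Enter your phone number:")
+    For i = 1 To Len(strRaw)
+        ' Keep a character only if it's a digit
+        If Mid(strRaw, i, 1) >= "0" And Mid(strRaw, i, 1) <= "9" Then
+            strDigits = strDigits & Mid(strRaw, i, 1)
+        End If
+    Next i
+    If Len(strDigits) <> 10 Then
+        MsgBox "A 10-digit phone number is needed; you entered " & _
+            Len(strDigits) & " digits: " & strDigits
+    End If
+
+This way, entries such as (510) 555-1212, 510 555 1212, and 5105551212 all end up as the same 10 digits.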
We've seen how to extract a substring using Mid. But this function has another use as well. You can also use Mid to _find_ the location of a character within a string. In the following snippet, the Do Until... Loop walks backward through the string strFilename (which contains the FullName property of the template attached to the active document in Word) until it reaches the first backslash (\), storing the resulting character position in the Integer variable intLen. The message box then displays that part of strFilename to the right of the backslash (determined by subtracting intLen from the length of strFilename)—the name of the attached template without its path:
+
+    Dim strFilename As String, intLen As Integer
+    strFilename = ActiveDocument.AttachedTemplate.FullName
+    MsgBox strFilename
+
+    intLen = Len(strFilename)
+    Do Until Mid(strFilename, intLen, 1) = "\"
+        intLen = intLen - 1
+    Loop
+    MsgBox Right(strFilename, Len(strFilename) - intLen)
+
+This example is more illustrative than realistic for two reasons: First, you can get the name of the template more easily by just using the Name property rather than the FullName property. Second, there's a function called InStrRev (discussed next) that returns the position of one string within another by walking backward through it.
+
+## Using InStr and InStrRev to Find a String within Another String
+
+You can use the Mid function to find an individual character within a string, but what if you need to find a set of characters within a string? The InStr function is designed to find one string within another string. For example, you could check, say, the current paragraph to see if it contained a particular word. If it did, you could take action accordingly—for instance, replacing that word with another word or selecting the paragraph for inclusion in another document. Maybe your company has changed its name and you need to do a search and replace in a large number of document templates.
+
+The InStrRev function is the counterpart of the InStr function, working in a similar way but in the reverse direction.
+
+The syntax for InStr is as follows:
+
+    InStr([ _start_ , ] _string1, string2_ [, _compare_ ])
+
+Here are the arguments:
+
+ * _start_ is an optional argument specifying the starting position in the first string, _string1_. If you omit _start_ , VBA starts the search at the first character in _string1_ (which is usually where you want to start). However, you do need to use _start_ when you use the _compare_ argument to specify the type of string comparison to perform.
+ * _string1_ is a required argument specifying the string expression in which to search for _string2_.
+ * _string2_ is a required argument specifying the string expression for which to search in _string1_.
+ * _compare_ is an optional argument specifying the type of string comparison you want to perform. Text can be compared two ways: a _binary comparison_ , which is case sensitive, or a _textual comparison_ , which is not case sensitive. The default is a binary comparison, which you can specify by using the constant vbBinaryCompare or the value 0 for _compare_. Although specifying this value isn't necessary (because it's the default), you might want to include it to make your code ultra-clear. To specify a textual, case-insensitive comparison, use the constant vbTextCompare or the value 1 for _compare_.
+
+* * *
+
+Use Textual Comparisons with Unpredictable String Data
+
+A textual comparison is a useful weapon when you're dealing with data that may arrive in a variety of ways, like the telephone-number punctuation problem described in this chapter's Real World Scenario. Here's another example: If you wanted to search a selection for instances of a name, you'd probably want to find _all_ instances of the name—uppercase, lowercase, or title case (initial caps). Otherwise, you'd find only the name with exactly the same capitalization as you specified in the _string2_ argument.
+
+* * *
+
+Another way to use InStr is to find the _location_ of a certain string within another string so that you can then _change_ that substring. You might want to do this if you needed to move a file from its current position in a particular folder or subfolder to another folder that had a similar subfolder structure. For instance, suppose you work with documents stored in a variety of subfolders beneath a folder named _In_ (such as z:\Documents\In\), and after you're finished with them, you save them in corresponding subfolders beneath a folder named _Out_ (z:\Documents\Out\). The short procedure shown in Listing 9.1 automatically saves the documents in the _Out_ subfolder.
+
+**Listing 9.1**: Changing a file path
+
+    1. Sub Save_in_Out_Folder()
+    2. Dim strOName As String, strNName As String, _
+           intToChange As Integer
+    3. strOName = ActiveDocument.FullName
+    4. intToChange = InStr(strOName, "\In\")
+    5. strNName = Left(strOName, intToChange - 1) & "\Out\" _
+           & Right(strOName, Len(strOName) - intToChange - 3)
+    6. ActiveDocument.SaveAs strNName
+    7. End Sub
+
+The code in Listing 9.1 works as follows:
+
+ * Line 1 begins the procedure, and line 7 ends it.
+ * Line 2 declares the String variable strOName (as in _original name_ ), the String variable strNName (as in _new name_ ), and the Integer variable intToChange. Line 3 then assigns strOName the FullName property of the ActiveDocument object: the full name of the active document, including the path to the document (for example, z:\Documents\In\Letters\My Letter.docm).
+ * Line 4 assigns to the variable intToChange the value of the InStr function that finds the string \In\ in the variable strOName. Using the example path from the previous paragraph, intToChange will be assigned the value 13 because the 1st character of the \In\ string is the 13th character in the strOName string.
+ * Line 5 assigns to the variable strNName the new filename created in the main part of the statement. This breaks down as follows:
+ * Left(strOName, intToChange - 1) takes the left section of the strOName string, returning the number of characters specified by intToChange - 1—the number stored in intToChange minus one.
+ * & "\Out\" adds to the partial string specified in the previous bullet item (to continue the previous example, z:\Documents) the characters \Out\, which effectively replace the \In\ characters, thus changing the directory name (z:\Documents\Out\).
+ * & Right(strOName, Len(strOName) - intToChange - 3) completes the partial string by adding the right section of the strOName string, starting from after the \In\ string (Letters\My Letter.docm), giving z:\Documents\Out\Letters\My Letter.docm. The number of characters to take from the right section is determined by subtracting the value stored in intToChange from the length of strOName and then subtracting 3 from the result. The value 3 works because intToChange stores the position of the first backslash of \In\, so you need to count only the _I_ , the _n_ , and the second backslash to reach its end.
+ * Line 6 saves the document using the name in the strNName variable.
+
+The syntax for InStrRev is similar to that of InStr:
+
+    InStrRev( _stringcheck, stringmatch_ [, _start_ [, _compare_ ]])
+
+These are the arguments:
+
+ * _stringcheck_ is a required String argument specifying the string in which to search for _stringmatch_.
+ * _stringmatch_ is a required String argument specifying the string for which to search.
+ * _start_ is an optional numeric argument specifying the starting position for the search. If you omit _start_ , VBA starts at the last character of _stringcheck_.
+ * _compare_ (as for InStr) is an optional argument specifying how to search: vbTextCompare for a textual comparison, vbBinaryCompare for a binary comparison.
+
+## Using LTrim, RTrim, and Trim to Remove Spaces from a String
+
+Often you'll need to trim strings before concatenating them to avoid ending up with extra spaces in inappropriate places, such as in the middle of eight-character filenames.
+
+Data can contain appended or prepended spaces. And always remember that users might randomly type spaces in various ways when entering data. You never know. Your programming (and databases), however, need data in a predictable format (so the data can easily be searched, sorted, and otherwise manipulated).
+
+For example, if 500 users entered their zip code, some might type a space before entering the digits. Any such entries would be placed at the start of a list after the list was alphabetically sorted (the space character is seen as "lower" than ordinary characters by a sorting function), so the sort would produce an inaccurate result. It's easy, though, to use the Trim functions to get rid of spaces.
+
+As you saw in Table 9.9, VBA provides three functions specifically for trimming leading spaces and trailing spaces from strings:
+
+ * LTrim removes leading spaces from the specified string.
+ * RTrim removes trailing spaces from the specified string.
+ * Trim removes both leading and trailing spaces from the specified string.
+
+* * *
+
+Trim Is Often the Only Space-Removal Function You Need
+
+In many cases, you can simply use Trim instead of figuring out whether LTrim or RTrim is appropriate for what you expect a variable to contain. At other times, you'll need to remove either leading or trailing spaces while _retaining_ spaces on the other end. In those special cases, you'll need to use either LTrim or RTrim. RTrim is especially useful for working with fixed-length String variables, which will contain trailing spaces if the data assigned to them is shorter than their fixed length.
+
+* * *
+
+The syntax for the LTrim, RTrim, and Trim functions is straightforward:
+
+    LTrim( _string_ )
+    RTrim( _string_ )
+    Trim( _string_ )
+
+In each case, _string_ is any string expression.
+
+You could use the Trim function to remove both leading and trailing spaces from a string derived from the current selection in the active document in Word. The first line in this next code example declares strUntrimmed and strTrimmed as String variables. The second line assigns the data in the current selection to the strUntrimmed string. The third line assigns the trimmed version of the strUntrimmed string to the strTrimmed string:
+
+    Dim strUntrimmed As String, strTrimmed As String
+    strUntrimmed = Selection.Text
+    strTrimmed = Trim(strUntrimmed)
+
+## Using Len to Check the Length of a String
+
+To find out how long a string is, use the Len function. The syntax for the Len function is straightforward:
+
+    Len( _string_ )
+
+Here, _string_ is any valid string expression. (If _string_ is Null, Len also returns Null.)
+
+One use for Len is to make sure a user's entry in an input box or in a text box on a form is of a suitable length. A United States phone number must be 10 digits, for instance.
+
+The CheckPassword procedure shown in Listing 9.2 uses Len to make sure a password the user enters is long enough to be difficult to guess, but not too long.
+
+**Listing 9.2**: Testing password length with the Len function
+
+    1. Sub CheckPassword()
+    2. Dim strPassword As String
+    3. BadPassword:
+    4. strPassword = InputBox _
+       ("Enter the password to protect this item from changes:" _
+       , "Enter Password")
+    5. If Len(strPassword) = 0 Then
+    6. End
+    7. ElseIf Len(strPassword) < 6 Then
+    8. MsgBox "The password you chose is too short." _
+       & vbCr & vbCr & _
+       "Choose a password between 6 and 15 characters in length.", _
+       vbOKOnly + vbCritical, "Unsuitable Password"
+    9. GoTo BadPassword
+    10. ElseIf Len(strPassword) > 15 Then
+    11. MsgBox "The password you chose is too long." _
+        & vbCr & vbCr & _
+        "Choose a password between 6 and 15 characters in length.", _
+        vbOKOnly + vbCritical, "Unsuitable Password"
+    12. GoTo BadPassword
+    13. End If
+    14. End Sub
+
+Listing 9.2 ensures that a password contains between 6 and 15 characters (inclusive). Here's how the code works:
+
+ * Line 2 declares a String variable named strPassword.
+ * Line 3 contains the label BadPassword, to which the GoTo statements in line 9 and line 12 redirect execution if the password fails either of the checks. Labels are locations within code that you might need to jump to during execution. A label is a word on its own line in the code that ends with a colon. Labels are discussed in Chapter 11, "Making Decisions in Your Code."
+ * Line 4 assigns to strPassword the result of an input box that invites the user to enter the password for the item.
+ * Lines 5 through 13 then use an If statement to check that the password is an appropriate length. First, line 5 checks strPassword for zero length, which would mean that the user clicked either the Cancel button or the close button on the input box or clicked the OK button with no text entered in the input box. If the length of strPassword is zero, the End statement in line 6 terminates the procedure. If the password passes that test, line 7 checks to find out if its length is less than 6 characters; if so, the procedure displays a message box alerting the user to the problem and then redirects execution to the BadPassword label. If the password is 6 or more characters long, line 10 checks to see if it's more than 15 characters long; if it is, the user is shown another message box, and execution again returns to the BadPassword label.
+
+## Using StrConv, LCase, and UCase to Change the Case of a String
+
+If you need to change the case of a string, use the StrConv (whose name comes from _string conversion_ ), LCase, and UCase functions. Of these, the easiest to use is StrConv, which can convert a string to a variety of different formats varying from straightforward uppercase, lowercase, or _propercase_ (as VBA refers to initial capitals, also known as title case) to the Japanese _hiragana_ and _katakana_ phonetic characters.
+
+### Using StrConv
+
+The StrConv function has the following syntax:
+
+    StrConv( _string, conversion_ )
+
+Here, the _string_ argument is any string expression, and the _conversion_ argument is a constant or value specifying the type of conversion required. The most useful conversion constants and values are shown in Table 9.10.
+
+Table 9.10 The most common conversion constants
+
+**Constant** | **Value** | **Effect**
+---|---|---
+vbUpperCase | 1 | Converts the given string to uppercase characters
+vbLowerCase | 2 | Converts the given string to lowercase characters
+vbProperCase | 3 | Converts the given string to propercase (aka title case—the first letter of every word is capitalized)
+vbUnicode | 64 | Converts the given string to Unicode using the system's default code page
+vbFromUnicode | 128 | Converts the given string from Unicode to the system's default code page
+
+For example, suppose you received from a database program a string called strCustomerName containing a person's name. You could use StrConv to make sure that it was in title case by using a statement such as this:
+
+    strProperCustomerName = StrConv(strCustomerName, vbProperCase)
+
+* * *
+
+StrConv Ignores the Capitalization You Feed It
+
+Note that StrConv doesn't care about the _case_ of the string you feed it—it simply returns the case you asked for. For example, feeding StrConv uppercase and asking it to return uppercase doesn't cause any problem.
+
+* * *
+
+### Using LCase and UCase
+
+If you don't feel like using StrConv, you can alternatively use the LCase and UCase functions, which convert a string to lowercase and uppercase, respectively.
+
+LCase and UCase have the following syntax:
+
+    LCase( _string_ )
+    UCase( _string_ )
+
+Here, _string_ is any string expression.
+
+For example, the following statement lowercases the string MyString and assigns it to MyLowerString:
+
+    MyLowerString = LCase(MyString)
+
+## Using the StrComp Function to Compare Apples to Apples
+
+As you've seen already, you can compare one item to another item by simply using the = operator:
+
+    If 1 = 1 Then MsgBox "One is one."
+
+This straightforward comparison with the = operator also works with two strings, as shown in the second line here:
+
+    strPet = InputBox("Is your pet a dog or a cat?", "Pet")
+    If strPet = "Dog" Then MsgBox "We do not accept dogs."
+
+The problem with this code as written is that the strings need to match exactly in capitalization for VBA to consider them equal. If the user enters dog or DOG (not to mention dOG, doG, dOg, or DoG) rather than Dog, the condition isn't met. Again, permit your users a variety of correct responses—don't enforce pointless capitalization and punctuation rules.
+
+To accept variations of capitalization, you could use the Or operator to hedge your bets:
+
+    If strPet = "Dog" Or strPet = "dog" Or strPet = "DOG" _
+        Then MsgBox "We do not accept dogs."
+
+As you can see, such code rapidly becomes clumsy, and it still omits some variations, such as dOG. Or you could change the case of one or both strings involved to make sure their case matched, but it's simpler to just use the StrComp function, which is designed to permit you to ignore case. The syntax for StrComp is as follows:
+
+    StrComp( _string1, string2_ [, _compare_ ])
+
+Here, _string1_ and _string2_ are required String arguments specifying the strings to compare, and _compare_ is an optional argument specifying textual comparison (vbTextCompare) or binary comparison (vbBinaryCompare).
+
+The following statement uses StrComp to settle the pet question once and for all:
+
+    If StrComp(strPet, "dog", vbTextCompare) = 0 Then _
+        MsgBox "We do not accept dogs."
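+
+Note that the comparison is to 0, not True: StrComp returns 0 when the two strings are considered equal under the chosen comparison, -1 when _string1_ sorts before _string2_ , and 1 when _string1_ sorts after it (and Null if either string is Null). Here's a quick sketch of the three outcomes:
+
+    MsgBox StrComp("apple", "banana", vbTextCompare)   ' -1: "apple" sorts first
+    MsgBox StrComp("DOG", "dog", vbTextCompare)        ' 0: equal, ignoring case
+    MsgBox StrComp("pear", "apple", vbTextCompare)     ' 1: "pear" sorts later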
+
+# Using VBA's Mathematical Functions
+
+VBA provides a solid suite of functions for standard mathematical operations. Table 9.11 lists these functions with examples.
+
+Table 9.11 VBA's mathematical functions
+
+**Function (Argument)** | **Returns** | **Example**
+---|---|---
+Abs( _number_ ) | The absolute value of _number_ —the unsigned magnitude of the number. | Abs(-100) returns 100.
+Atn( _number_ ) | The arctangent of _number_ in radians. | Atn(dblMyAngle)
+Cos( _number_ ) | The cosine of angle _number_. | Cos(dblMyAngle)
+Exp( _number_ ) | e, the base of natural logarithms, raised to the power of _number_. | Exp(5) returns 148.413159102577.
+Fix( _number_ ) | The integer portion of _number_ (without rounding). If _number_ is negative, returns the negative number greater than or equal to _number_. | Fix(3.14159) returns 3. Fix(-3.14159) returns -3.
+Int( _number_ ) | The integer portion of _number_ (again, without rounding). If _number_ is negative, returns the negative number less than or equal to _number_. | Int(3.14159) returns 3. Int(-3.14159) returns -4.
+Log( _number_ ) | The natural logarithm of _number_. | Log(dblMyValue)
+Rnd([ _number_ ]) | A random Single value greater than or equal to 0 and less than 1; the optional _number_ controls how the sequence is seeded. | Rnd returns a random number such as 0.7055475.
+Sgn( _number_ ) | -1 if _number_ is negative, 0 if _number_ is 0, 1 if _number_ is positive. | Sgn(7) returns 1. Sgn(-7) returns -1. Sgn(0) returns 0.
+Sin( _number_ ) | The sine of the angle specified by _number_ (measured in radians). | Sin(dblMyAngle)
+Sqr( _number_ ) | The square root of _number_. If _number_ is negative, VBA gives a runtime error. | Sqr(9) returns 3.
+Tan( _number_ ) | The tangent of the angle specified by _number_ (measured in radians). | Tan(dblMyAngle)
+
+# Using VBA's Date and Time Functions
+
+VBA provides a full complement of date and time functions, as listed in Table 9.12. The table provides brief examples of working with the functions. The sections after the table provide longer examples showing how to use some of the more complex functions.
+
+Table 9.12 VBA's date and time functions
+
+**Function (Arguments)** | **Returns** | **Example**
+---|---|---
+Date | A Variant/Date containing the current date according to your computer | MsgBox Date might display 04/01/2010. (The format depends on your Windows date settings.)
+DateAdd( _interval_ , _number_ , _date_ ) | A Variant/Date containing the date of the specified interval after the specified date | DateAdd("m", 1, "6/3/2010") returns 7/3/2010.
+DatePart( _interval_ , _date_ ) | The part (specified by _interval_ ) of the specified date | See the example in the next section.
+DateSerial( _year_ , _month_ , _day_ ) | A Variant/Date containing the date for the specified year, month, and day | dteCompanyFounded = DateSerial(1997, 7, 4)
+DateValue( _date_ ) | A Variant/Date containing the specified date | dteDeath = DateValue("July 2, 1971")
+Day( _date_ ) | A Variant/Integer between 1 and 31, inclusive, representing the day of the month for _date_ | If Day(Date) = 1 And Month(Date) = 1 Then MsgBox "Happy new year!"
+Hour( _time_ ) | A Variant/Integer between 0 and 23, inclusive, representing the hour for _time_ | dteHour = Hour(dteLoggedIn)
+Minute( _time_ ) | A Variant/Integer between 0 and 59, inclusive, representing the minute for _time_ | dteMinute = Minute(dteLoggedIn)
+Month( _date_ ) | A Variant/Integer between 1 and 12, inclusive, representing the month for _date_ | strThisDate = Month(Date) & "/" & Day(Date)
+MonthName( _month_ ) | A String containing the name of the month represented by _month_ | MsgBox MonthName(Month(Date)) displays a message box containing the current month.
+Now | A Variant/Date containing the current date and time according to your computer | MsgBox Now might display 04/01/2010 9:25:15 PM. (The format of date and time will depend on your Windows date settings.)
+Second( _time_ ) | A Variant/Integer between 0 and 59, inclusive, representing the second for _time_ | dteSecond = Second(dteLoggedIn)
+Time | A Variant/Date containing the current time according to your computer | MsgBox Time might display 9:25:15 PM. (The time format and time will depend on your Windows date settings.)
+Timer | A Single giving the number of seconds that have elapsed since midnight | If Timer > 43200 Then MsgBox _ "This code only works in the morning.": End
+TimeSerial( _hour_ , _minute_ , _second_ ) | A Variant/Date containing the time for the specified hour, minute, and second | TimeSerial(11, 12, 13) returns 11:12:13 AM. (The format will depend on your Windows date settings.)
+TimeValue( _time_ ) | A Variant/Date containing the time for _time_ | TimeValue(Now)
+Weekday( _date_ ) | A Variant/Integer containing the day of the week represented by _date_ | See the next entry.
+WeekdayName( _weekday_ ) | A String containing the weekday denoted by _weekday_ | WeekdayName(Weekday(#4/1/2013#)) returns Monday, the day of the week for April Fool's Day 2013.
+
+## Using the DatePart Function to Parse Dates
+
+The DatePart function lets you take a date and separate it into its components. You can often achieve the same results by using other date functions, but DatePart is a great tool to have in your VBA toolbox.
+
+The syntax for DatePart is as follows:
+
+    DatePart( _Interval, Date_ [ _, FirstDayOfWeek_ [ _, FirstWeekOfYear_ ]])
+
+The components of the syntax are as follows:
+
+ * _Interval_ is a required String expression giving the unit in which you want to measure the interval: yyyy for year, q for quarter, m for month, y for the day of the year, d for day, w for weekday, ww for week, h for hour, n for minute (because m is for month), and s for second.
+ * _Date_ is a required Variant/Date giving the date you want to examine.
+ * _FirstDayOfWeek_ is an optional constant specifying the day that starts the week (for date information). The default setting is vbSunday (1), but you can also set vbMonday (2), vbTuesday (3), vbWednesday (4), vbThursday (5), vbFriday (6), vbSaturday (7), or vbUseSystem (0; this uses the system setting).
+ * _FirstWeekOfYear_ is an optional constant specifying the week considered first in the year. Table 9.13 shows the options for this constant.
+
+Table 9.13 The options for the FirstWeekOfYear constant
+
+**Constant** | **Value** | **Year Starts with Week**
+---|---|---
+vbUseSystem | 0 | Use the system setting.
+vbFirstJan1 | 1 | The week in which January 1 falls (the default setting).
+vbFirstFourDays | 2 | The first week with a minimum of four days in the year.
+vbFirstFullWeek | 3 | The first full week (7 days) of the year.
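+
+The two optional arguments matter mainly for the w and ww intervals. As a quick illustration (a minimal sketch; the date is arbitrary), the following statement asks for the week number of July 4, 2010, counted using Monday-based weeks and starting the year at the first week containing at least four days:
+
+    ' Week-of-year calculation with explicit week rules
+    MsgBox DatePart("ww", #7/4/2010#, vbMonday, vbFirstFourDays)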
+
+For example, the following statement assigns the current year to the variable dteThisYear:
+
+    dteThisYear = DatePart("yyyy", Date)
+
+## Using the DateDiff Function to Figure Out a Time Interval
+
+The DateDiff function returns the interval (the number of days, weeks, hours, and so on) between two specified dates. The syntax for DateDiff is as follows:
+
+    DateDiff( _interval_ , _date1_ , _date2_ [, _firstdayofweek_ [, _firstweekofyear_ ]])
+
+Here are the components of the syntax:
+
+ * _interval_ is a required String expression giving the unit in which you want to measure the interval: yyyy for year, q for quarter, m for month, y for the day of the year, d for day, w for weekday, ww for week, h for hour, n for minute (because m is for month), and s for second.
+ * _date1_ and _date2_ are the dates between which you're calculating the interval.
+ * _firstdayofweek_ is an optional constant specifying the day that starts the week (for date information). The default setting is vbSunday (1), but you can also set vbMonday (2), vbTuesday (3), vbWednesday (4), vbThursday (5), vbFriday (6), vbSaturday (7), or vbUseSystem (0; this uses the system setting).
+ * _firstweekofyear_ is an optional constant specifying the week considered first in the year. Table 9.13 shows the options for this constant.
+
+For example, the following statement returns the number of weeks between June 3, 2009, and September 30, 2009:
+
+    MsgBox DateDiff("ww", "6/3/2009", "9/30/2009")
+
+## Using the DateAdd Function to Add or Subtract Time from a Date
+
+The DateAdd function lets you easily add an interval of time to, or subtract an interval of time from, a specified date, returning the resulting date. The syntax for DateAdd is as follows:
+
+    DateAdd( _interval, number, date_ )
+
+Here are the components of the syntax:
+
+ * _interval_ is a required String expression giving the unit of measurement for the interval: yyyy for year, q for quarter, m for month, y for the day of the year, d for day, w for weekday, ww for week, h for hour, n for minute, and s for second.
+ * _number_ is a required numeric expression giving the number of intervals to add (a positive number) or to subtract (a negative number). If _number_ isn't already of the data type Long, VBA rounds it to the nearest whole number before evaluating the function.
+ * _date_ is a required Variant/Date or literal date giving the starting date.
+
+For example, the following statement returns the date 10 weeks from May 27, 2010:
+
+    DateAdd("ww", 10, #5/27/2010#)
+
+# Using File-Management Functions
+
+The following sections demonstrate how to use a couple of key VBA file-management functions: the Dir function, which you use to find out whether a file exists, and the CurDir function, which returns the current path.
+
+## Using the Dir Function to Check Whether a File Exists
+
+Often when managing files, you'll need to first check whether a particular file already exists. For instance, if you're about to save a file, you may want to make sure the save operation won't overwrite an existing file—a file with the same name in the same location on the hard drive.
+
+Or if you're about to open a file, you may want to see if that file exists before you use the Open method; otherwise, VBA will give an error.
+
+To test whether a file exists, you can use a straightforward procedure such as the one shown in Listing 9.3.
+
+**Listing 9.3**: Checking if a file exists with the Dir function
+
+    1. Sub Does_File_Exist()
+    2.
Dim strTestFile As String, strNameToTest As String, _
+       strMsg As String
+    3. strNameToTest = InputBox("Enter the file name and path:")
+    4. If strNameToTest = "" Then End
+    5. strTestFile = Dir(strNameToTest)
+    6. If Len(strTestFile) = 0 Then
+    7. strMsg = "The file " & strNameToTest & _
+       " does not exist."
+    8. Else
+    9. strMsg = "The file " & strNameToTest & " exists. "
+    10. End If
+    11. MsgBox strMsg, vbOKOnly + vbInformation, _
+        "File-Existence Check"
+    12. End Sub
+
+The procedure in Listing 9.3 uses the Dir function to check whether a file exists and displays a message box indicating whether it does or doesn't. Figure 9.3 shows examples of the message box. This message box is for demonstration purposes only. In a real-world macro you'd likely use the result of the test to branch (execute different code blocks) based on whether the file exists. Branching is covered in Chapter 11.
+
+Figure 9.3 You can use the Dir function to check whether a file exists so that you don't accidentally overwrite it or cause an error by trying to open a nonexistent file.
+
+Here's how the code works:
+
+ * Line 2 declares the string variables strTestFile, strNameToTest, and strMsg.
+ * Line 3 then displays an input box prompting the user to enter a filename and path; VBA assigns the result of the input box to strNameToTest.
+ * Line 4 compares strNameToTest to a blank string (which means the user clicked the Cancel button in the input box or clicked the OK button without entering any text in the text box) and uses an End statement to end the procedure if it gets a match.
+ * Line 5 assigns to strTestFile the result of running the Dir function on the strNameToTest string. If Dir finds a match for strNameToTest, strTestFile will contain the name of the matching file; otherwise, it will contain an empty string.
+ * Line 6 begins an If... Then statement by testing the length of the strTestFile string. If the length is 0, the statement in line 7 assigns to strMsg text saying that the file doesn't exist; otherwise, VBA branches to the Else statement in line 8 and runs the statement in line 9, assigning text to strMsg saying that the file does exist. Line 10 ends the If statement.
+ * Line 11 displays a message box containing strMsg. Line 12 ends the procedure.
+
+* * *
+
+Garbage In, Garbage Out
+
+The code shown in Listing 9.3 isn't bulletproof because Dir is designed to work with wildcards as well as regular characters. As long as you're working with a simple text filename in strNameToTest, you'll be fine because Dir compares that text to the existing filenames on the hard drive and the result lets you know whether you have a match. But if strNameToTest contains wildcards (say it's c:\temp\*.*, the asterisks specifying _any filename_ ), Dir reports that the file exists. However, there's no file by that name, just one or more files that match the wildcard. You can guard against this by checking whether the name Dir returns in line 5 exactly matches the filename you asked about, using a case-insensitive comparison (Windows filenames aren't case sensitive); a sketch of that check follows this sidebar. This literalness of Dir is a nice illustration of GIGO (garbage in, garbage out)—from the computer's (and VBA's) point of view, it's doing what you asked it to, but the result is far from what you intended.
+
+* * *
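+
+Here's a minimal sketch of that extra check (the variable names follow Listing 9.3; strJustName is invented for illustration). Dir returns only a bare filename, so the InStrRev call first strips any path from the user's input before the comparison:
+
+    Dim strJustName As String
+    ' Dir returns just the filename, so strip any path from the input
+    strJustName = Mid(strNameToTest, InStrRev(strNameToTest, "\") + 1)
+    If StrComp(strTestFile, strJustName, vbTextCompare) = 0 Then
+        MsgBox "Exact match: no wildcards were involved."
+    End If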
+
+## Returning the Current Path
+
+You can find out the current path (the location on the hard drive to which the host application is currently pointed) on either the current drive or a specified drive by using the CurDir function. Often, you'll need to change the current path (using the ChDir statement) to make sure the user is saving files in, or opening files from, a suitable location.
+
+To return the current path, use CurDir without an argument:
+
+    CurDir
+
+To return the current path for a specified drive, enter the drive letter as an argument. For example, to return the current path on drive D, use this statement:
+
+    CurDir("D")
+
+# The Bottom Line
+
+**Understand what functions are and what they do.**
+
+A function is a unit of code, a procedure, that performs a task _and returns a value_.
+
+You can write your own functions by writing code between Function and End Function in the VBA Editor. Chapter 10, "Creating Your Own Functions," explores how to write such custom functions. But in addition to functions you might write, there are many functions already prewritten in VBA—ready for you to call them from your macros to perform various tasks.
+
+Master It
+
+A function is quite similar to a subroutine, but there is a significant difference. What is it?
+
+**Use functions.**
+
+In a macro, you can call a built-in function by merely typing in its name and providing any required arguments.
+
+Master It
+
+You can combine multiple functions in a single line of code. The MsgBox function displays a message box containing whatever data you request. The only required argument for this function is the _prompt_. The Now function returns the current date and time. Write a line of code that calls the MsgBox function and uses the Now function as its argument.
+
+**Use key VBA functions.**
+
+VBA offers the services of hundreds of built-in functions. You'll find yourself using some of them over and over. They are _key_ to programming.
+
+Master It
+
+What built-in function is used quite often to display information in a dialog box to the user while a procedure runs?
+
+**Convert data from one type to another.**
+
+It's sometimes necessary to change a value from one data type to another. Perhaps you used an input box to ask the user to type in a String variable, but then you need to change it into an Integer type so you can do some math with it. (You can't do arithmetic on pieces of text.)
+
+Master It
+
+What built-in function would you use to convert a string such as "12" (which, in reality, is two text _characters_ , the digits 1 and 2) into an Integer data type, the actual _number_ 12, that you can manipulate mathematically?
+
+**Manipulate strings and dates.**
+
+VBA includes a full set of functions to manage text and date data.
+
+Master It
+
+Which built-in function would you use to remove any leading and trailing space characters from a string? For example, you want to turn
+
+    " this "
+
+into
+
+    "this"
+
+Chapter 10
+
+Creating Your Own Functions
+
+In Chapter 9, "Using Built-in Functions," you learned how to use VBA's built-in functions. In this chapter, you'll learn how to create your own functions. You create a function the same way you create a subprocedure: by typing in the Code window. (You can't _record_ a function in Excel and Word—the applications that provide a Macro Recorder. Instead, you have to write functions yourself because the Recorder creates only subprocedures.)
+
+It's important to recall that, although both are procedures, functions differ from subs. The primary difference is that functions interact more with other procedures: They accept arguments (incoming data) from the procedure that calls them, and they return a value (outgoing data) back to the procedure that calls them.
Subs, by contrast, normally don't require arguments and _never_ return any data.
+
+But functions are used in VBA far less often than subs. Most macros are self-contained subs. That's because most macros are small, brief automations: They perform simple, quick jobs like inserting a date into a document or saving a document using a particular filename.
+
+But you aren't limited to brief macros. You are free to create more complex, larger, and more sophisticated programs in VBA. And if you do create a large project, you'll want to use multiple procedures, not just one sub. This allows you to divide your work into multiple logical units that can each be individually tested and more easily modified. When you're using multiple procedures, however, they must work together and need to communicate among themselves. This is why you often use functions in large projects. Remember that the key feature of a function is that it facilitates _communication_ —sending values back and forth—among multiple procedures.
+
+This chapter will cover several ways to employ functions with the various Office 2013 applications. I'll start by explaining the components of a function and showing you how to put them together. You'll then create some functions that work in any VBA host and some functions that are specific to Word, Excel, and PowerPoint.
+
+In this chapter you will learn to do the following:
+
+ * Understand the components of a function statement
+ * Create a generic function
+ * Create a function for Word
+ * Create a function for Excel
+ * Create a function for PowerPoint
+ * Create a function for Access
+
+# Components of a Function
+
+To create a function, you use a Function statement. This is essentially the same way you create a Sub: just type in the word **Function** followed by the name you're giving the function.
+
+The syntax for the Function statement is as follows:
+
+    [Public | Private] [Static] Function _function_name_ [( _argument_list_ )] [As _type_ ]
+    [ _statements_ ]
+    [ _function_name_ = _expression_ ]
+    [Exit Function]
+    [ _statements_ ]
+    [ _function_name_ = _expression_ ]
+    End Function
+
+This syntax, most of which is optional, breaks down like this:
+
+ * Public is an optional keyword that you can use to make the function publicly accessible—accessible to all other procedures in all loaded modules. (If you need to limit the function's scope to the project that contains it, you can override this public availability by putting an Option Private Module statement in the module that contains the function.)
+ * Private is an optional keyword that you can use to make the function accessible only to the other procedures in the module that contains it. The function is hidden from procedures in any other module.
+ * Static is an optional keyword that you can use to make local variables in the function retain their value between calls to the function.
+ * _function_name_ is required. It specifies a name for the function so you can refer to it elsewhere in your project (so you can _call_ the function—in other words, start it running). Functions follow the same naming rules as other VBA items, such as the rules for variable names: alphanumerics and underscores are fine, but no spaces, symbols, or punctuation. Note that a function _passes data back_ to whatever procedure called (executed) the function. It passes data back by assigning a value to its (the function's) name.
If the function's name is AddStateTax, you would do some calculations to add the tax, then assign the result to the function's name:
+
+    Function AddStateTax(SubTotal)
+
+    AddStateTax = SubTotal * 1.07 'do the math and assign the result
+                                  'to the function name so it gets passed back
+
+    End Function
+
+ * _argument_list_ is an optional argument supplying the list of variables that represent arguments passed to the function when it is invoked. _argument_list_ takes the syntax shown here:
+
+    [Optional] [ByRef | ByVal] [ParamArray] _variable_name_ [( )] [As _type_ ] [= _default_value_ ]
+
+Here's a description of the elements of the _argument_list_ :
+
+ * Optional is an optional keyword that you can use to denote that an argument is optional—in other words, that it is not required. Once you've used Optional to declare an optional argument, any subsequent arguments in the _argument_list_ also have to be optional. That means you must put any _required_ arguments before the optional arguments, the same way VBA does with its built-in functions' argument lists. Also, it's a good idea to give optional arguments a default value.
+ * ByRef is an optional keyword that you can use to specify that an argument be passed _by reference_ ; ByVal is an optional keyword that you can use to specify that an argument be passed _by value_. You can pass an argument either by reference or by value.
+ * ParamArray is an optional keyword you can use as the last argument in _argument_list_ to denote an optional array of Variants. You can't use ParamArray with ByVal, ByRef, or Optional.
+
+* * *
+
+You Can Pass to a Function Either a Value's Address or a Copy of the Actual Value
+
+When a procedure (either a function or a subroutine) passes an argument to a function _by reference_ , the recipient procedure gets access to the actual memory location where the original variable is stored and can thus _change_ the value held in the original variable. By contrast, when an argument is passed _by value_ , the function gets only a copy of the information in the variable and therefore can't change the value held in the original variable (the recipient procedure doesn't even know where the original variable is located). By reference is the default way to pass an argument, and there is rarely any reason to pass by value, so just use the default.
+
+* * *
+
+ * _variable_name_ is the name of the variable that you want to use for this argument. When the function is called and a value is supplied for this argument, this variable can be used in your code.
+ * _type_ is an optional keyword giving the data type of the argument (Byte, Boolean, Currency, Date, Decimal, Double, Integer, Long, Object, Single, variable-length String, or Variant). For nonoptional arguments, you can also specify an object type (for example, a Worksheet object) or a custom object (one you've created).
+ * _default_value_ is an optional _literal_ (the value itself spelled out, such as "Sacramento"), constant, or constant expression that you use to specify a default value for optional parameters. You'll see how to provide a default value shortly.
+
+ * _type_ is an optional argument specifying the data type of the value that the function returns: Byte, Boolean, Currency, Date, Decimal, Double, Integer, Long, Object, Single, variable-length String, Variant, or a custom type.
+ * _statements_ represents the statement or statements in the function (the code that does the job the function is supposed to accomplish).
In theory, _statements_ is optional, but in practice, most functions will need one or more statements.
+ * _expression_ represents the value the function returns. _expression_ is also optional.
+
+# Creating a Function
+
+The following sections walk you through the process of creating a function.
+
+## Starting a Function Manually
+
+The easiest way to start creating a function is to type into the VBA Code window the word **Function** followed by the name you want to give to the function and any necessary arguments in parentheses, and then press Enter. VBA automatically enters a blank line and an End Function statement for you and places the insertion point on the blank line ready for you to create the programming code inside the new function.
+
+For example, if you type the following line and press Enter, the Visual Basic Editor displays what you see in Figure 10.1:
+
+    Function MyFunction(MaxTemp, MinTemp)
+
+Figure 10.1 When you type a Function statement and press Enter, the Visual Basic Editor automatically inserts a blank line and an End Function statement for you.
+
+## Starting a Function by Using the Add Procedure Dialog Box
+
+If you like to make the Visual Basic Editor work for you as much as possible (and prefer the slow way of doing things), you can also start creating a new function by using the Add Procedure dialog box:
+
+1. Choose Insert ⇒ Procedure to display the Add Procedure dialog box (see Figure 10.2).
+
+Figure 10.2 You can also use the Add Procedure dialog box to specify elements of a new function.
+
+2. Type the name for the procedure in the Name text box.
+
+3. Select the Function option button in the Type group box.
+
+4. Select the Public option button or the Private option button (as appropriate) in the Scope group box.
+
+5. If you want all local variables in the function to be of the static type (which you usually won't), select the All Local Variables As Statics check box.
+
+6. Click OK to enter the stub for the function, and then enter any arguments for the function in the parentheses manually.
+
+## Passing Arguments to a Function
+
+The arguments that will be passed to a function are listed in parentheses, separated by commas. In the following example code, the function states that it requires an argument named MaxTemp and an argument named MinTemp. This data must be passed to (sent to) the function for it to work:
+
+    Function GetTemps(MaxTemp, MinTemp)
+
+If somewhere in your code you attempt to call this function without passing the data it requires, VBA will display the error message "Argument not optional."
+
+You can also specify the data type of the arguments if you want by including an As statement with the data type after the argument's name. For example, you could use the following statement to set the MaxTemp and MinTemp arguments to the Double numeric data type:
+
+    Function GetTemps(MaxTemp As Double, MinTemp As Double)
+
+Passing an argument by reference (the default) is useful when you want to manipulate the variable in the recipient procedure and then return the variable to the procedure from which it originated. Alternatively, passing an argument by value is useful when you want to use the information stored in the variable in the recipient procedure and at the same time ensure that the original information in the variable doesn't change (but this isn't typically necessary).
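+
+To see the difference concretely, here's a minimal sketch (the procedure and function names are invented) in which the ByRef version changes the caller's variable while the ByVal version leaves it untouched:
+
+    Sub ShowPassingStyles()
+        Dim curPrice As Currency, curResult As Currency
+        curPrice = 100
+        curResult = AddTaxByRef(curPrice)
+        MsgBox curPrice    ' Displays 107: the caller's variable was changed
+        curPrice = 100
+        curResult = AddTaxByVal(curPrice)
+        MsgBox curPrice    ' Displays 100: only a copy was changed
+    End Sub
+
+    Function AddTaxByRef(ByRef curAmount As Currency) As Currency
+        curAmount = curAmount * 1.07    ' Modifies the caller's variable
+        AddTaxByRef = curAmount
+    End Function
+
+    Function AddTaxByVal(ByVal curAmount As Currency) As Currency
+        curAmount = curAmount * 1.07    ' Modifies only the local copy
+        AddTaxByVal = curAmount
+    End Function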
Because _by reference_ is the default way of passing an argument, both of the following statements pass the argument MyArg by reference:

    Function PassByReference(MyArg)
    Function PassByReference(ByRef MyArg)

As you see, you can omit the default ByRef keyword. However, to pass an argument by value, you must use the ByVal keyword. The following statement passes the ValArg argument by value:

    Function PassByValue(ByVal ValArg)

If necessary, you can pass some arguments for a procedure by reference and others by value. The following statement passes the MyArg argument by reference and the ValArg argument by value:

    Function PassBoth(ByRef MyArg, ByVal ValArg)

In practice, though, you're likely to simply use the default ByRef approach for most, if not all, of your programming.

## Declaring the Data Types of Arguments

You can explicitly declare the data types of arguments. This conserves memory (although this is rarely an issue anymore) and ensures that the outside (calling) procedures are passing the correct type of information to your function. For this second reason, it's always a good idea to specify the data type. You avoid some kinds of errors that way.

When passing an argument, you want to ensure that the data type of the argument you're passing matches the data type expected in the procedure. For example, if you declare a string and try to pass it as an argument when the receiving function specifies that it is expecting a Variant, VBA displays an error message.

To declare the data type of an argument, just include the usual data-type declaration in the argument list. The following statement declares MyStrArg, specifying with As that a string must be passed, and declares VarArg as a Variant:

    Function PassType(MyStrArg As String, VarArg As Variant)

## Specifying an Optional Argument

You can specify an optional argument by using the Optional keyword:

    Function PassBoth(ByRef MyArg As String, ByVal ValArg As Variant, _
        Optional ByVal strName As String)

When you specify an optional argument, it's a good idea to assign a default value to it. Doing so makes the code less susceptible to errors and gives the programmer a clue as to what kind of information is used here. To assign the default value, type an equal sign after the variable's definition, and then type the default value (use double quotation marks for a String value). For example, the following function statement declares the strName optional argument and assigns the default value if no value is passed:

    Function PassBoth(ByRef MyArg As String, ByVal ValArg As Variant, _
        Optional ByVal strName As String = "Sacramento")

What happens here is that this macro is being used by a company located in Sacramento, so they most often use that city's name for the literal value in this particular macro. Your default literal will differ, depending on what your macro is supposed to accomplish.

## Controlling the Scope of a Function

Like a subroutine, a function can have private or public scope. Private scope makes the function available only to procedures in the module that contains it, and public scope makes the function available to procedures in all modules in your project.

If you don't specify whether a function is private or public, VBA makes it public by default, so you don't need to specify the scope of a function unless you want it to have private scope.
However, if you do use explicit Public declarations for those functions you want to be public, your code will be somewhat easier to read:

    Private Function MyFunction(MaxTemp, MinTemp)
    Public Function AnotherFunction(Industry, Average)

# Examples of Functions for Any VBA-Enabled Office Application

This part of the chapter contains two examples of functions that will work in any application that hosts VBA. That's because these functions don't access objects particular to any specific Office application.

Later in this chapter, you'll see examples of functions that employ resources or features particular to a specific Office application.

To start, first declare the function and its arguments. The following statement declares a function named NetProfit:

    Function NetProfit(Gross As Double, Expenses As Double) As Double

NetProfit uses two arguments, Gross and Expenses, declaring each as the Double data type, a floating-point number (one that can have a decimal point).

At the end of this statement, we have specified that our function returns a Double value. It's important to explicitly specify the variable types of the arguments and the type of the value that the function returns to the caller. This avoids unpleasant surprises (bugs) in your code because VBA catches and reports any attempt to pass the wrong data type to the function or send the wrong type of data back to whatever code called (executed) your function.

Armed with the arguments (and their type, if you _explicitly type_ them as I'm suggesting you do), you _call_ (execute) your NetProfit function the same way you would execute a prewritten function that's built into VBA (like MsgBox). You simply use the function's name and supply the two arguments it needs, like this:

    MyProfit = NetProfit(44000, 34000)

Here, the variable MyProfit is assigned the value of the NetProfit function. In other words, after this function finishes its job and execution resumes in the _caller_ (the procedure that invoked the function), the returned value is assigned to the variable MyProfit.

In this example, the NetProfit function is provided with a Gross argument of 44000 and an Expenses argument of 34000.

Once you've created a function, the Visual Basic Editor displays its argument list when you type the name of the function in a caller procedure, as shown in Figure 10.3.

Figure 10.3 The Visual Basic Editor displays an Auto Quick Info ToolTip for functions you create as well as for its built-in functions.

Listing 10.1 contains an example of calling a function: The ShowProfit procedure calls the NetProfit function and displays the result in a message box.

**Listing 10.1**: How to call a function

     1. Sub ShowProfit()
     2.     MsgBox (NetProfit(44000, 34000)),, "Net Profit"
     3. End Sub
     4.
     5. Function NetProfit(Gross As Double, Expenses As Double) As Double
     6.     NetProfit = (Gross - Expenses) * 0.9
     7. End Function

In Listing 10.1, lines 1 through 3 contain the ShowProfit procedure, which simply calls the NetProfit function in line 2, passes it the arguments 44000 for Gross and 34000 for Expenses, and displays the result in a message box titled Net Profit. Notice that in line 2 we have employed a shortcut: using the function call inside an argument list. Line 2 does the same thing as this longer version:

    Dim Result As Double
    Result = NetProfit(44000, 34000)
    MsgBox (Result),, "Net Profit"

Lines 5 through 7 contain the NetProfit function.
Line 5 declares the function as working with two Double arguments, Gross and Expenses, telling VBA what to do with the two arguments that line 2 has passed to the function.

Line 6 calculates NetProfit to be 90 percent (0.9) of the value of Gross minus Expenses.

## How Functions Return Information

It's important to notice what else happens in line 6: the information calculated by the function is being assigned to the name of the function. This is how _the information gets passed back_ to the ShowProfit procedure that called the function.

To make this process a bit clearer, let's write the code in a more verbose way. We'll do this the long way, without using the shortcut of doing both the calculating and assigning all on the same line. Here's how a function goes about its business.

There are three main steps: calculation, assignment, and return. They are labeled as lines 1, 2, and 3 in the following listing.

The function first does some computing—in this case, calculating a net profit. Then, second, it assigns the results of the calculation to its own name (NetProfit in this case). This assignment is how the data gets passed back to the caller. And finally, third, with the End Function statement, it sends the results back to whatever procedure called the function:

    Function NetProfit(Gross As Double, Expenses As Double) As Double
    Dim Result As Double

    1. Result = (Gross - Expenses) * 0.9 'do the calculating

    2. NetProfit = Result 'store the information to be sent back

    3. End Function 'send the information back

## Returning Text Data from a Function

Listing 10.2 contains a function that returns a String value.

**Listing 10.2**: A function that returns a string

     1. Sub TestForSmog()
     2.     Dim intCYear As Integer, strThisCar As String
     3. BadValueLoop:
     4.     On Error GoTo Bye
     5.     intCYear = InputBox("Enter the year of your car.", _
            "Do I Need a Smog Check?")
     6.     strThisCar = NeedsSmog(intCYear)
     7.     If strThisCar = "Yes" Then
     8.         MsgBox "Your car needs a smog check.", _
                vbOKOnly + vbExclamation, "Smog Check"
     9.     ElseIf strThisCar = "BadValue" Then
    10.         MsgBox "The year you entered is in the future.", _
                vbOKOnly + vbCritical, "Smog Check"
    11.         GoTo BadValueLoop
    12.     Else
    13.         MsgBox "Your car does not need a smog check.", _
                vbOKOnly + vbInformation, "Smog Check"
    14.     End If
    15. Bye:
    16. End Sub
    17.
    18. Function NeedsSmog(CarYear As Integer) As String
    19.     If CarYear > Year(Now) Then
    20.         NeedsSmog = "BadValue"
    21.     ElseIf CarYear <= Year(Now) - 3 Then
    22.         NeedsSmog = "Yes"
    23.     Else
    24.         NeedsSmog = "No"
    25.     End If
    26. End Function

Listing 10.2 contains the procedure TestForSmog (lines 1 through 16) and the NeedsSmog function (lines 18 through 26). The TestForSmog procedure calls the NeedsSmog function, which returns a value indicating whether the user's car needs a smog check. TestForSmog uses this value to display a message box (see Figure 10.4) informing users whether or not their car needs a smog check.

Figure 10.4 The TestForSmog procedure prompts for the car's year and then displays a message box stating whether the car needs a smog test.

Here's how the code works:

  * TestForSmog starts by declaring the Integer variable intCYear and the String variable strThisCar in line 2.
  * Line 3 contains the BadValueLoop label, to which execution returns from line 11 if the user has entered an unsuitable value for the year of the car.
We'll want to display the input box again, to see if they can get it right this time. Note that if you want execution to jump to a particular zone in your code, you just type in a name for the location, such as BadValueLoop here, and end with a colon. This name-plus-colon is called a _label_ , and it provides a way for you to transfer execution to a specific location within your macro. Then elsewhere in your code you can transfer execution to this label by using the GoTo command like this:

    GoTo BadValueLoop

  * Line 4 contains an On Error statement to transfer execution to the Bye label in line 15 if an error occurs. An error occurs if the user cancels the upcoming input box or clicks its OK button with no value entered in its text box.
  * Line 5 displays an input box prompting the user to enter the year of the car. This line assigns to the intCYear variable the value the user enters in the input box.
  * Line 6 then sets the value of the String variable strThisCar to the result of the NeedsSmog function running on the intCYear integer variable.
  * Execution now shifts to the NeedsSmog function (line 18), which evaluates intCYear and returns the value for strThisCar. Line 18 declares the function as returning a String value. The function takes one argument, CarYear, which is declared as the Integer data type.
  * Line 19 checks to see whether CarYear is greater than the value of the current year using Year(Now). If so, line 20 sets the value of NeedsSmog to BadValue, which is used to indicate that the user has entered a date in the future. If not, the ElseIf statement in line 21 runs, checking if the value of CarYear is less than or equal to Year(Now) - 3, the current year minus three. If so, line 22 sets the value of NeedsSmog to Yes; if not, the Else statement in line 23 runs, and line 24 sets the value of NeedsSmog to No. Line 25 ends the If statement, and line 26 ends the function.
  * Execution then returns to the calling line (line 6) in the TestForSmog procedure. The NeedsSmog function returns the value it has assigned to its own name, and line 6 stores that value in the strThisCar variable.
  * The rest of the TestForSmog procedure then works with the strThisCar variable. Line 7 compares strThisCar to Yes. If it matches, line 8 displays a message box stating that the car needs a smog check. If strThisCar doesn't match Yes, line 9 compares strThisCar to BadValue. If it matches, line 10 displays an alert-message box, and line 11 returns execution to the BadValueLoop label in line 3. If strThisCar doesn't match BadValue, the Else statement in line 12 runs, and line 13 displays a message box stating that the car doesn't need a smog check.
  * Line 14 ends the If statement, line 15 contains the Bye label, and line 16 ends the procedure.

Functions can be more complex than the simple, stand-alone examples shown here. For instance, you can include a function as part of a larger expression. You could add the results of the functions NetProfit and CurrentBalance (which takes a single argument) by using a statement such as this:

    CurrentEstimate = NetProfit(44000, 33000) + CurrentBalance(MainAccount)

# Creating a Function for Word

Functions such as those shown in the previous section work in any VBA-hosting application because they do not call any application-specific features. This section and the following three sections show you examples of functions that are specific to applications.
The task accomplished by the example program shown in Listing 10.3 is to remove some special types of formatting (hyperlinks, bookmarks, and fields) but retain any text in those special zones.

* * *

Creating Custom Function Libraries

Some programmers like to keep functions they write (that aren't application-specific) in separate modules in the Editor. These little libraries represent your own collections of tested, useful, generic procedures. Need to calculate sales tax? Don't reinvent the wheel. Just import your library of math functions, which includes just such a procedure. You can export a module as a file with the .bas filename extension and import it into whichever application needs the functions. Choose File ⇒ Export File (or press Ctrl+E). For example, you might maintain separate modules that contain your math equations, your string-manipulation functions, and other custom functions that work in any VBA host. A .bas file is merely an ordinary text file containing a module's source code (its subroutines and functions). You can read it in Notepad, but you can also use the File Import feature to add it to a VBA project. When imported, it will appear in the Project Explorer as a new module.

* * *

The function shown in Listing 10.3 is for Word and—unusually for a function—returns no information (technically, because nothing is ever assigned to the function's name, it returns an empty value). The function's main purpose is to perform several operations on the specified document. So no data needs to be returned to the caller.

**Listing 10.3**: A function that returns no value

     1. Option Explicit
     2.
     3. Function Strip_Hyperlinks_Bookmarks_Fields()
     4.     Dim myLink As Hyperlink
     5.     Dim myBookmark As Bookmark
     6.     Dim myField As Field
     7.     With ActiveDocument
     8.         For Each myLink In .Hyperlinks
     9.             myLink.Delete
    10.         Next myLink
    11.         For Each myBookmark In .Bookmarks
    12.             myBookmark.Delete
    13.         Next myBookmark
    14.         For Each myField In .Fields
    15.             myField.Unlink
    16.         Next myField
    17.     End With
    18. End Function
    19.
    20. Sub Clean_Up_Document_for_Conversion()
    21.     Call Strip_Hyperlinks_Bookmarks_Fields
    22.     'other cleanup functions here
    23. End Sub

Here's how the code works:

  * Line 1 contains the Option Explicit statement for the module to force explicit declarations of all variables. Line 2 is a spacer.
  * Line 3 starts the function named Strip_Hyperlinks_Bookmarks_Fields, which removes all hyperlinks, bookmarks, and fields from the active document. The function continues until the End Function statement in line 18.
  * Line 4 declares a variable named myLink as being of the Hyperlink type. Line 5 declares a variable named myBookmark as being of the Bookmark type. Line 6 declares a variable named myField as being of the Field type.
  * Line 7 begins a With statement that works with the ActiveDocument object and continues until the End With statement in line 17. This With statement contains three For Each...Next loops.
  * The first For Each...Next loop, in lines 8 through 10, goes through each myLink object in the current document's Hyperlinks collection. Line 9 uses the Delete method to delete each of the links in turn. Deleting a hyperlink removes the link from the document but leaves the text that was displayed for the hyperlink.
  * The second For Each...Next loop, in lines 11 through 13, works with each myBookmark object in the Bookmarks collection. Line 12 uses the Delete method to delete each of the bookmarks in turn.
Deleting a bookmark removes the marker from the document but leaves any text or other object that the bookmark contained.
  * The third For Each...Next loop, in lines 14 through 16, works with each myField object in the Fields collection. Line 15 uses the Unlink method to unlink each of the fields in turn. Unlinking a field leaves the field's contents in the document as text or as an object but removes the field link.
  * Line 17 contains the End With statement that ends the With statement, and line 18 contains the End Function statement that ends the function. Line 19 is a spacer.
  * Lines 20 through 23 contain a short subprocedure that simply calls the Strip_Hyperlinks_Bookmarks_Fields function. Line 22 contains a comment stating that the subprocedure would call other cleanup functions. But the code to call other functions hasn't yet been written. It's a reminder.

# Creating a Function for Excel

This section shows you a function for Excel. The function in Listing 10.4 checks whether a workbook contains any unused sheets.

**Listing 10.4**: An Excel function

     1. Option Explicit
     2.
     3. Function BlankSheetsInWorkbook(ByRef WorkbookToTest As Workbook) As Boolean
     4.     Dim objWorksheet As Worksheet
     5.     BlankSheetsInWorkbook = False
     6.     For Each objWorksheet In WorkbookToTest.Worksheets
     7.         If Application.WorksheetFunction.CountBlank _
                (objWorksheet.Range("A1:IV65536")) = 16777216 Then
     8.             BlankSheetsInWorkbook = True
     9.             Exit Function
    10.         End If
    11.     Next objWorksheet
    12. End Function
    13.
    14. Sub Check_Workbook_for_Blank_Worksheets()
    15.     If BlankSheetsInWorkbook(ActiveWorkbook) = True Then
    16.         MsgBox "This workbook contains one or more blank worksheets." & _
                vbCr & vbCr & "Please remove all blank worksheets before" & _
                " submitting the workbook.", vbOKOnly + vbExclamation, _
                "Check Workbook for Blank Worksheets"
    17.     End If
    18. End Sub

Here's how the code works:

  * Line 1 contains the Option Explicit statement for the module to force explicit declarations of all variables. Line 2 is a spacer.
  * Line 3 starts the function named BlankSheetsInWorkbook, which it declares as a Boolean function. The function works on an object named WorkbookToTest, which has the type Workbook—in other words, it's a workbook.
  * Line 4 declares a variable named objWorksheet that is of the Worksheet type.
  * Line 5 sets the value of the BlankSheetsInWorkbook function to False.
  * Line 6 starts a For Each...Next loop that runs for each objWorksheet object (each worksheet) in the Worksheets collection in the WorkbookToTest object—that is, with each worksheet in the workbook that is passed to the function.
  * Line 7 uses the CountBlank worksheet function to count the number of blank cells in the range A1:IV65536 in the worksheet being tested by the loop. If the number of blank cells is 16777216, the worksheet is blank, because that is the total number of cells in this range. (A1:IV65536 is the full grid of Excel 2003 and earlier; worksheets in later versions have a larger grid, so any content outside this range won't be detected by this test.) Line 8 then sets the value of the BlankSheetsInWorkbook function to True, and line 9 uses an Exit Function statement to exit the function. This is because there is no need to test any more worksheets once the function has found that at least one worksheet is blank.
  * Line 10 contains the End If statement that ends the If statement. Line 11 contains the Next objWorksheet statement that ends the For Each...Next loop. And line 12 contains the End Function statement that ends the function. Line 13 is a spacer.
  * Line 14 begins a short subprocedure named Check_Workbook_for_Blank_Worksheets.
Line 15 runs the BlankSheetsInWorkbook function on the ActiveWorkbook object, which represents the active workbook in the Excel session. If the BlankSheetsInWorkbook function returns True, line 16 displays a message box that points out to the user that the workbook contains one or more blank worksheets and tells the user to remove them.

# Creating a Function for PowerPoint

This section includes an example function for PowerPoint. The function in Listing 10.5 checks that all the text on a slide is at least the minimum font size specified and displays an error-message box if any font is too small. (If, when you press Alt+F11 to open the VBA Editor, you see nothing in the Code window, choose Insert ⇒ Module so you'll have a container for your code.)

**Listing 10.5**: A function in PowerPoint

     1. Option Explicit
     2.
     3. Function CheckMinFontSize(objPresentation As Presentation) As Boolean
     4.
     5.     Dim objSlide As Slide
     6.     Dim objShape As Shape
     7.
     8.     CheckMinFontSize = True
     9.
    10.     For Each objSlide In objPresentation.Slides
    11.         objSlide.Select
    12.         objSlide.Shapes.SelectAll
    13.         For Each objShape In Windows(1).Selection.ShapeRange
    14.             If objShape.Type = msoPlaceholder Then
    15.                 If objShape.TextFrame.TextRange.Font.Size < 14 Then
    16.                     CheckMinFontSize = False
    17.                     Exit Function
    18.                 End If
    19.             End If
    20.         Next objShape
    21.     Next objSlide
    22. End Function
    23.
    24. Sub Font_Check()
    25.     If CheckMinFontSize(ActivePresentation) = False Then
    26.         MsgBox "Some of the fonts in this presentation are too small." _
                & vbCr & vbCr & "Please change all fonts to 14 points or larger.", _
                vbCritical + vbOKOnly, "Font Size Check"
    27.     End If
    28. End Sub

Here's how the code works:

  * Line 1 contains the Option Explicit statement for the module to force explicit declarations of all variables. Line 2 is a spacer.
  * Line 3 declares the function named CheckMinFontSize as Boolean and specifies that it works on a variable named objPresentation, which is of the Presentation type. Line 4 is a spacer.
  * Line 5 declares a variable named objSlide that is of the Slide type. Line 6 declares a variable named objShape that is of the Shape type. Line 7 is a spacer.
  * Line 8 sets the value of the CheckMinFontSize function to True. This indicates that the font sizes are the minimum size or larger. Line 9 is a spacer.
  * Line 10 starts a For Each...Next loop that continues until line 21 and works with each objSlide object in the Slides collection in the objPresentation object. This loop makes the function examine each of the Slide objects in the presentation that is passed to the function.
  * Line 11 selects the current objSlide object, and line 12 uses the SelectAll method of the Shapes collection to select all the shapes on that slide.
  * Line 13 starts a nested For Each...Next loop that runs once for each of the objShape objects in the ShapeRange object in the Selection object in the first window, using Windows(1). The ShapeRange object contains all of the Shape objects within the selection. Here, the Shape objects are represented by the objShape variable.
  * Line 14 uses an If statement to see if the Type property of the current Shape object is msoPlaceholder, the type that indicates a placeholder used for text. If the shape is a placeholder, line 15 checks if the font size used in the TextRange object within the TextFrame object within the Shape object is smaller than 14 points.
If so, line 16 assigns the value False to the CheckMinFontSize function, and line 17 uses an Exit Function statement to stop execution of the function. This is because once a font smaller than the minimum permitted size has been found, there is no need to check further.
  * Line 18 contains the End If statement that ends the nested If structure, and line 19 contains the End If statement that terminates the outer If structure.
  * Line 20 contains the Next objShape statement that ends the nested For Each...Next loop, and line 21 contains the Next objSlide statement that ends the outer For Each...Next loop.
  * Line 22 contains the End Function statement that ends the function. Line 23 is a spacer.
  * Lines 24 through 28 contain a subroutine named Font_Check that runs the CheckMinFontSize function on the ActivePresentation object. If the function returns False, the subprocedure displays a message box alerting the user to the problem.

# Creating a Function for Access

You can create functions for Access the same way you do for any other VBA-enabled Office 2013 application—just type in the word **Function** and give this function a name.

However, Access often has special ways of programming, and it has several unique aspects to its object model. The first thing you'll notice is a general-purpose object named DoCmd. This object has no properties, but it has lots of methods that accomplish such common tasks as launching other applications, locating records, and opening reports and forms.

Before we create a macro to illustrate how to use the DoCmd object, it's necessary to have a little database set up that you can experiment with. Access comes with several templates, so we'll use one of them. Follow these steps:

1. Run Access.

2. In Office 2013, the various applications such as Word and Access display on startup a set of common templates.

3. Double-click _Desktop_ Contacts to open that database template. (Don't choose the Contacts template, which includes online features that will complicate this example.)

4. If you see a security warning message (a yellow strip below the Ribbon), click the Enable Content button.

5. Click Create. If you see an offer to watch some videos from Microsoft at this point, click the x in the upper-right corner to close that window.

6. Type in some random data by clicking the (New) link in the Open column on the left side, as shown in Figure 10.5. A Contact Details dialog box opens (not shown in the figure).

Figure 10.5 Type in some data—any data will do—so you can experiment with Access's DoCmd object.

7. Click the Save And New button in the Contact Details dialog box each time you add a new contact. Add about three contacts.

Now you can use the DoCmd object to move to a particular record (in this case, a new, blank record). Press Alt+F11 to open the Visual Basic Editor in Access; then right-click the database name (it's the one in boldface) in the Project Explorer. Choose Insert ⇒ Module from the context menu. In your new module, type the following code, which will move the insertion pointer to a new record:

    1. Function MoveToNew()
    2.
    3.     DoCmd.OpenForm "Contact List"
    4.     DoCmd.GoToRecord , , acNewRec
    5.
    6. End Function

To test this macro, click somewhere in one of the existing records so the blinking insertion cursor is located above the New record line. Then switch to the VBA Editor and click inside the MoveToNew function to place the Editor's cursor there. Press F5.
Then go back to Access, and you should see that the blinking cursor has moved to the New record.

Here's how the code works:

  * Line 3 ensures that the correct form is open. Because you've just started working with this Contacts database and filled in some information in the Contact List form, the correct form is open and has the focus. However, it's possible that later additional forms will be added. It's always a good idea to specify which form, table, or other object you want to work with. You can't assume that a macro will always be executed in a specific context (such as with the correct form having the focus). In other words, if you omit line 3, this macro will act on whatever form is currently open in Access.
  * Line 4 employs the GoToRecord method of the DoCmd object. The acNewRec constant specifies a new, rather than an existing, record.

# The Bottom Line

**Understand the components of a function statement.**

Arguments can be passed from the calling code to a function in one of two ways: by reference or by value.

**Master It**

Describe the difference between passing data by reference and passing data by value.

**Create a generic function.**

You can write, and save (File ⇒ Export File), sets of generic functions that work in any VBA-enabled application.

**Master It**

Create a function that displays the current year in a message box. This function will require no arguments, nor will it return any value.

**Create a function for Word.**

Word contains a whole set of objects and members unique to word-processing tasks. Functions that are specific to Word employ one or more of these unique features of the Word object model.

**Master It**

Write a function that displays the number of hyperlinks in the currently active document. Use Word's Hyperlinks collection to get this information.

**Create a function for Excel.**

Excel uses an ActiveWorkbook object to represent the currently selected workbook. You can employ a full set of built-in methods to manipulate the features of any workbook.

**Master It**

Using the Sheets collection of Excel's ActiveWorkbook object, write a function that displays the number of sheets in the current workbook.

**Create a function for PowerPoint.**

PowerPoint's object model includes an ActivePresentation object, representing the currently selected presentation. Functions can make good use of this object and its members.

**Master It**

Write a function that returns how many slides are in a presentation. Pass the ActivePresentation object as an argument to this function; then display the number of slides the presentation contains. Call this function from a subroutine.

**Create a function for Access.**

Access often works a little differently from other VBA-enabled Office applications. For example, some common tasks are carried out by using methods of the special DoCmd object rather than methods of a Form or Table object.

**Master It**

Write a function that closes Access by using the DoCmd object's Quit method. Ensure that all data is saved by employing the acQuitSaveAll constant as an argument for the Quit method.

Chapter 11

Making Decisions in Your Code

Computers behave intelligently in large part because programming languages include commands that test conditions. Then, based on the results of that test, the code jumps ( _branches_ ) to an appropriate area within the program. This is similar to human decision-making: if it's raining, then take an umbrella. If not, leave it at home.
This chapter covers what are called _conditional_ expressions. VBA uses these to create decision structures to direct the flow—the path of execution—of your procedures.

By using decision structures, you can cause your procedures to branch to different sections of code depending on such things as the value of a variable or expression or whether the user clicks the OK or Cancel button in a message box.

VBA offers two types of decision structures: If blocks and Select Case blocks. There is a set of various kinds of If statements suitable for making typical decisions. For more complicated decision-making, you'll want to use the heavy-duty Select Case block structure. It's more efficient when working with truly involved decisions.

The chapter starts by introducing you to the comparison operators and logical operators you can use when building conditional expressions and logical expressions. Then it covers the different types of If blocks, which take up the bulk of the chapter. At the end of the chapter, you'll learn how to use Select Case.

In this chapter you will learn to do the following:

  * Use comparison operators
  * Compare one item with another
  * Test multiple conditions
  * Use If blocks
  * Use Select Case blocks

# How Do You Compare Things in VBA?

To compare things in VBA, you use _comparison operators_ to specify what type of comparison you want: whether one variable or expression is equal to another, whether one is greater than another, whether one is less than or equal to another, and so on.

VBA supports the comparison operators shown in Table 11.1.

Table 11.1 VBA's comparison operators

**Operator** | **Meaning** | **Example**
---|---|---
= | Equal to | If strMyString = "Hello" Then
<> | Not equal to | If x <> 5 Then
< | Less than | If y < 100 Then
> | Greater than | If strMyString > "handle" Then
<= | Less than or equal to | If intMyCash <= 10 Then
>= | Greater than or equal to | If Time >= #12:00:00 PM# Then MsgBox "It's afternoon." Else MsgBox "It's morning."
Is | Is the same object variable as | If Object1 Is Object2 Then

The first six comparison operators shown in Table 11.1 are straightforward. Numeric expressions are evaluated as you would expect. Alphabetical expressions are evaluated in alphabetical order: for example, because _ax_ comes before _handle_ in alphabetical order, it's considered "less than" _handle_.

So, "ax" < "handle" would evaluate to True. And whether an evaluation results in True or False determines what happens in an If...Then block. (In other words, the code in the Then section is executed when something is True. And it is not executed if something is False. Think If it's raining, Then take an umbrella. Otherwise, don't.)

Mixed expressions (numbers and letters) are evaluated in alphabetical order as well: _Office 97_ is "greater than" _Office 2013_ because 9 is greater than 2.

Is, the seventh comparison operator, is less familiar and less often used. You use Is to compare object variables to establish whether two object variables represent the same object (a named object, not an object such as a document or a range).

For example, the following statements declare two objects—objTest1 and objTest2—and assign to each ActiveDocument.Paragraphs(1).Range, the range consisting of the first paragraph in the active document in Word.
The next statement then compares the two objects to each other, returning False in the message box because the two objects are different even though their contents are the same:

    Dim objTest1 As Object
    Dim objTest2 As Object
    Set objTest1 = ActiveDocument.Paragraphs(1).Range
    Set objTest2 = ActiveDocument.Paragraphs(1).Range
    'the next statement returns False because the objects are different
    MsgBox objTest1 Is objTest2

However, if both object variables refer to the same object, the Is comparison returns True, as in the following example, in which both objTest1 and objTest2 refer to the object variable objTest3:

    Dim objTest1 As Object
    Dim objTest2 As Object
    Dim objTest3 As Object
    Set objTest3 = ActiveDocument.Paragraphs(1).Range
    Set objTest1 = objTest3
    Set objTest2 = objTest3
    'the next statement returns True because
    'objTest1 and objTest2 refer to the same object
    MsgBox objTest1 Is objTest2

When using Is, keep in mind that it isn't the specific _contents_ of the object variables that are being compared, but which _object_ they refer to.

# Testing Multiple Conditions by Using Logical Operators

Often, you'll need to test two or more conditions before taking an action: If statement X is True and statement Y is True, then do this; if statement X is True or statement Y is True, then do the other; if statement X is True and statement Y isn't True, then find something else to do; and so on. For example, if it's raining _and_ you have a cold, put on your warmest rain gear.

To test multiple conditions, you use VBA's logical operators to link the conditions together. Table 11.2 lists the logical operators that VBA supports, with short examples and comments.

Table 11.2 VBA's logical operators

**Operator** | **Meaning** | **Example**
---|---|---
And | Conjunction: True if both conditions are True | If x > 5 And y < 10 Then
Or | Disjunction: True if either condition (or both) is True | If x > 5 Or y < 10 Then
Not | Negation: turns True into False and False into True | If Not blnFound Then
Xor | Exclusion: True if one condition is True and the other is False | If blnA Xor blnB Then
Eqv | Equivalence: True if both conditions have the same value | If blnA Eqv blnB Then
Imp | Implication: False only when the first condition is True and the second is False | If blnA Imp blnB Then

Of these six logical operators, you'll probably use the conjunction (And), disjunction (Or), and negation (Not) operators the most, with the other three thrown in on special (in other words, rare) occasions. (If the Imp logical operator doesn't make sense to you at this point, you probably don't need to use it.)

* * *

**VBA Doesn't Do Short-Circuit Evaluation**

Here's something to beware of when evaluating multiple conditions: VBA doesn't do short-circuit evaluation in logical expressions (unlike other programming languages, such as C and C++).

_Short-circuit evaluation_ is the formal term for a simple logical technique most people use several times a day when making decisions in their daily lives: If the first of two or more dependent conditions is false, you typically don't waste time evaluating any other conditions contingent upon it.

For example, suppose your most attractive coworker says they'll take you to lunch if you get the product out on time _and_ get a promotion. If you don't get the product out on time, you've blown your chances—it doesn't much matter if you get the promotion because even if you do, your lunch will still be that brown bag you forgot to put in the department fridge. There's no point in evaluating the second condition because it depends on the first, and the first condition wasn't met. You can just short-circuit any further condition testing.

VBA doesn't think that way. It evaluates the second condition (and any subsequent conditions) whether or not it needs to. Evaluating all conditions takes a little more time (which isn't usually an issue), but it _can_ introduce unexpected complications in your code (which can be an issue).
For example, the following snippet produces an error when the selection is only one character long. The error occurs because the code ends up calling the Mid function with a starting position of 0 (the one-character selection's length minus one), which VBA doesn't allow. The error appears even though you wouldn't expect this condition to be evaluated when the first condition is not met (because the length of the selection is not greater than 1):

    Dim strShort As String
    strShort = Selection.Text
    If Len(strShort) > 1 And _
        Mid(strShort, Len(strShort) - 1, 1) = "T" Then
        MsgBox "The second-last character is T."
    End If

To avoid problems such as this, use _nested_ If blocks. In the following code example, the first condition isn't met (again, for a one-character selection), so the second condition isn't evaluated. Notice that one of the If blocks here is nested within (contained within) the other If block:

    If Len(strShort) > 1 Then
        If Mid(strShort, Len(strShort) - 1, 1) = "T" Then
            MsgBox "The second-last character is T."
        End If
    End If

* * *

* * *

Using Not to Toggle Boolean Properties

Here's a useful tip. The Not command is a handy way of turning True to False and False to True. By using Not with a Boolean variable or property, you can toggle the state of the variable or property without even needing to check what the current state is. For example, in Excel, you could create an If structure to toggle the value of the Boolean property Saved (which controls whether Excel thinks the document in question contains unsaved changes) by using code such as this:

    If ActiveWorkbook.Saved = True Then
        ActiveWorkbook.Saved = False
    Else
        ActiveWorkbook.Saved = True
    End If

But you can achieve the same toggling effect much more simply by using Not as shown in the following code:

    ActiveWorkbook.Saved = Not ActiveWorkbook.Saved

* * *

## _If_ Blocks

As in most programming languages, If blocks in VBA are among the most immediately useful and versatile commands for making decisions.

In the sections that follow, you'll look at three variations on the If statement:

  * If...Then
  * If...Then... Else
  * If...Then... ElseIf... Else

## _If...Then_

If...Then statements tell VBA to make the simplest of decisions. If the condition is met, execute the following statement (or statements). If the condition isn't met, skip to the line immediately following the _conditional block_.

An If statement block begins with If and concludes with End If. (However, a short If...Then statement can be written entirely on a single line, in which case the End If is omitted.)

### Syntax

Simple If...Then statements can be expressed entirely on a single line. A one-line If...Then statement looks like this:

    If _condition_ Then Code to be executed goes here

If the condition is met, VBA executes the statement or statements that follow _on that same logical line_. If the condition isn't met, VBA doesn't execute the statement or statements.

But you can also write multi-line If...Then blocks. A multiple-line If...Then statement (the lines of code between If and End If are more properly known as a _block_ ) looks like this:

    If _condition_ Then
        Code to be executed goes here
    End If

If the condition is met, VBA executes all the code within the block (the statements enclosed between the If and End If). If the condition isn't met, VBA skips over the enclosed line or lines of code and resumes execution at the line after the End If statement.
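As a quick sketch that combines the block form with the logical operators from the previous section (the variable name and values here are invented for illustration), the following If block runs its MsgBox statement only when both conditions are True:

    Dim sngTemp As Single
    sngTemp = 72.5
    If sngTemp >= 65 And sngTemp <= 80 Then
        MsgBox "The temperature is comfortable."
    End If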
* * *

Single-Line If...Then Statements Don't Use End If

Remember that a single-line If...Then statement has no End If to end it, whereas the If block requires an End If. VBA knows that a single-line If condition will end on the same line on which it starts. But an If block needs to have its end clearly specified so VBA knows which code to skip over if the condition evaluates to False. If blocks tend to be easier for humans to read.

* * *

### Examples

In the previous chapters, you've already encountered a number of If blocks—they're so necessary in programming (not to mention in life itself) that it's hard to get anything done without them. The following sections show you some further examples.

#### _One-Line_ If _Statements_

Here's an example of a one-line If statement:

    Dim bytAge As Integer
    bytAge = InputBox("Enter your age.", "Age")
    If bytAge < 21 Then MsgBox "You may not purchase alcohol.",, "Underage"

The first line declares the Integer variable bytAge. The second line prompts the user to enter their age in an input box and stores the answer in the variable. The third line checks the value held in bytAge and displays an Underage message box if bytAge is less than 21.

You can include multiple statements on a single line if you separate the statements by a colon. A single-line If statement can sometimes be a good candidate for a multi-statement line of code. What you are doing is specifying that more than one action should be taken if the expression in the If...Then statement evaluates to True.

For example, let's say you wanted to halt the macro after displaying the Underage message box. You could include the End statement after a colon on the same line, as shown here:

    If bytAge < 21 Then MsgBox "You may not purchase alcohol.",, "Underage": End

VBA executes this as follows:

1. First, it evaluates the condition.

2. If the condition is met, it executes the first statement after Then—in this case, it displays the Underage message box. Then it proceeds to execute any further statements on that line. Notice that _all_ statements on a single-line If structure are conditional based on (depend on) that If statement. They are executed (or not) based on whether the condition is true or false.

3. Once the user has dismissed the Underage message box (by clicking the OK button, the only button it has), VBA executes the statement after the colon: End.

If you wanted, you could even add several more statements on the same "logical" line, separated by colons. End would have to be the last one because it ends the procedure. (By the way, a _logical_ line means that VBA sees this as a single line of code to be executed, no matter how many real-world, _physical_ lines the code takes up on your monitor.)

You could even add another If statement if you felt like it:

    If bytAge < 21 Then If bytAge >= 18 Then MsgBox _
        "You may vote but you may not drink.",, "Underage": End

As you'll see if you're looking at this line in the Visual Basic Editor, there are a couple of problems with this approach:

  * First, you need to break long lines of code with the line-continuation character or else they go off the edge of the Code window in the Editor, forcing you to scroll horizontally to read the ends of each line. You _could_ hide all windows except the Code window, use a minute font size for your code, or buy a larger monitor, but you're probably still not going to have any fun working with long lines of code.
So, in practice, you don't want to pile statements onto a single code line. The brief End statement is probably the most you'll want to add.
  * Second, long lines of code (broken or unbroken) that involve a number of statements tend to become visually confusing. Even if everything is obvious to you when you're entering the code, you may find the code hard to read when you have to debug it a few months later. Usually it's better to use If blocks rather than complex one-line If statements.

#### If _Blocks_

Block If constructions work the same way as one-line If statements except blocks contain multiple lines—typically with one command to each line—and they require an End If statement at the end. For example, the one-line If statement from the previous section could also be constructed as an If block like this:

    If bytAge < 21 Then
        MsgBox "You may not purchase alcohol.",, "Underage"
        End
    End If

If the condition in the first line (the line with the If command) is True, VBA executes the statements within the block If. VBA displays the message box and then executes the End statement.

As you can see from this example, If blocks are much easier to read (and thus easier to debug) than one-line If statements. This is especially true when you nest If statements within one another, which you'll need to do fairly often.

To make If blocks easier to read, the convention is to press the Tab key to indent the lines within the block (VBA ignores the indentation during execution). You can see this indentation in the previous code example.

With short If blocks, like the ones shown in this section, indentation doesn't make a great deal of difference. But with complex If statements, it can make all the difference between clarity and incomprehensibility, as you'll see in "Nesting If Blocks" later in this chapter.

## _If...Then... Else_ Statements

If...Then statements are good for taking a single course of action based on a condition, but often you'll need to decide between two courses of action. To do so, you use the If...Then... Else statement.

By using an If...Then... Else statement, you can take one course of action if a condition is True and another course of action if it's False. It's the equivalent of ordinary language, such as If it's raining, Then take an umbrella, Else wear sunscreen.

For example, If...Then... Else statements are a great way to deal with two-button message boxes. If the user clicks the OK button, the code will do one thing. If they click the Cancel button, it will do something different.

* * *

Use If...Then... Else with Clear-Cut True/False Situations

The If...Then... Else statement is best used with clear-cut binary conditions—those that lend themselves to a true/false analysis. (Recall that a binary condition is like a two-position light switch—if it's not switched on, it must be switched off.) For more complex conditions, such as switches that can have three or more positions, you need to use a more complex logical statement, such as If...Then... ElseIf... Else or Select Case. We'll get to these structures later in this chapter.

* * *

### Syntax

The syntax for the If...Then... Else statement is as follows:

    If _condition_ Then
        _statements1_
    Else
        _statements2_
    End If

If the condition is True, VBA executes _statements1_ , the first group of statements. If the condition is False, VBA moves execution to the Else line and executes _statements2_ , the second group of statements.
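For instance, a two-button message box maps naturally onto this structure. As a minimal sketch (the prompt and the actions taken here are invented for illustration), MsgBox displays Yes and No buttons and returns vbYes or vbNo, and the If...Then... Else block branches on that result:

    Dim lngAnswer As Long
    lngAnswer = MsgBox("Apply the changes now?", vbYesNo + vbQuestion, _
        "Apply Changes")
    If lngAnswer = vbYes Then
        MsgBox "Changes applied.", vbOKOnly, "Apply Changes"
    Else
        MsgBox "No changes were made.", vbOKOnly, "Apply Changes"
    End If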
Again, you have the option of creating one-line If...Then... Else statements or block If...Then... Else statements. However, it makes more sense to create block If...Then... Else statements because they're much easier to read and debug and because an If...Then... Else structure is inherently longer than an If...Then structure and thus certain to result in an awkwardly long line.

### Example

As a straightforward example of an If...Then... Else statement, consider the Electronic_Book_Critic procedure shown in Listing 11.1.

**Listing 11.1**: A simple If...Then...Else example

     1. Sub Electronic_Book_Critic()
     2.
     3.     Dim intBookPages As Integer
     4.
     5.     intBookPages = InputBox _
            ("Enter the number of pages in the last book you read.", _
            "The Electronic Book Critic")
     6.     If intBookPages > 1000 Then
     7.         MsgBox "That book is seriously long.", vbOKOnly _
                + vbExclamation, "The Electronic Book Critic"
     8.     Else
     9.         MsgBox "That book is not so long.", vbOKOnly _
                + vbInformation, "The Electronic Book Critic"
    10.     End If
    11.
    12. End Sub

Here's what happens in Listing 11.1:

  * Line 1 starts the procedure, and line 12 ends it. Lines 2, 4, and 11 are spacers.
  * Line 3 declares the Integer variable intBookPages. Line 5 then assigns to intBookPages the result of an input box prompting users to enter the number of pages in the last book they read.
  * Line 6 checks to see if intBookPages is greater than 1000. If it is, the statement in line 7 runs, displaying a message box that states that the book is long.
  * If intBookPages is not greater than 1000, VBA branches to the Else statement in line 8 and executes the statement following it, which displays a message box telling the user that the book wasn't so long.
  * Line 10 ends the If condition.

## _If...Then... ElseIf... Else_ Statements

The last variation of the If command that you'll look at here is the If...Then... ElseIf... Else block, which you can use to help VBA decide between multiple courses of action. You can use any number of ElseIf lines, depending on how complex the condition is that you need to check.

Again, you could create either one-line If...Then... ElseIf... Else statements or If...Then... ElseIf... Else blocks. However, in almost all cases, If...Then... ElseIf... Else blocks are easier to construct, to read, and to debug. As with the other If statements, one-line If...Then... ElseIf... Else statements don't need an End If statement, but If...Then... ElseIf... Else blocks do need one.

### Syntax

The syntax for If...Then... ElseIf... Else is as follows:

    If _condition1_ Then
        _statements1_
    ElseIf _condition2_ Then
        _statements2_
    [ElseIf _condition3_ Then
        _statements3_ ]
    [Else
        _statements4_ ]
    End If

If the condition expressed in _condition1_ is True, VBA executes _statements1_ , the first block of statements, and then resumes execution at the line after the End If clause. If _condition1_ is False, VBA branches to the first ElseIf clause and evaluates the condition expressed in _condition2_. If this is True, VBA executes _statements2_ and then moves to the line after the End If line; if it's False, VBA moves to the next ElseIf clause (if there is one) and evaluates its condition (here, _condition3_ ) in turn.

If _all_ the conditions in the ElseIf statements prove False, VBA branches to the Else statement (if there is one) and executes the statements after it (here, _statements4_ ).
The End If statement then terminates the conditional statement, and execution resumes with the line after the End If.

The Else clause is optional, although in many cases it's a good idea to include it to let VBA take a different course of action if none of the conditions specified in the If and ElseIf clauses turns out to be True.

You can have any number of ElseIf clauses in an If block, each with its own condition. But if you find yourself needing to use If statements with large numbers of ElseIf clauses (say, more than 5 or 10), you may want to try using the Select Case command instead, which you'll look at toward the end of the chapter.

### Examples

This section shows you two examples of If...Then... ElseIf... Else statements:

  * A simple If...Then... ElseIf... Else statement for taking action based on which button the user clicks in a three-button message box
  * An If...Then... ElseIf statement without an Else clause

#### _A Simple_ If...Then... ElseIf... Else _Statement_

A simple If...Then... ElseIf... Else statement, as used in Listing 11.2, is perfect for dealing with a three-button message box.

**Listing 11.2**: Understanding the If...Then...ElseIf...Else structure

     1. Sub Creating_a_Document()
     2.
     3.     Dim lngButton As Long
     4.     Dim strMessage As String
     5.
     6.     strMessage = "Create a new document based on the " & _
            "VP Report project?" & vbCr & vbCr & _
            "Click Yes to use the VP Report template." & vbCr & _
            "Click No to use a blank document." & vbCr & _
            "Click Cancel to stop creating a new document."
     7.
     8.     lngButton = MsgBox _
            (strMessage, vbYesNoCancel + vbQuestion, "Create New Document")
     9.
    10.     If lngButton = vbYes Then
    11.         Documents.Add Template:="z:\public\template\vpreport.dotm"
    12.     ElseIf lngButton = vbNo Then
    13.         Documents.Add
    14.     Else 'lngButton is vbCancel
    15.         End
    16.     End If
    17.
    18. End Sub

The Creating_a_Document procedure in Listing 11.2 displays a Yes/No/Cancel message box inviting the user to create a new document based on the VP Report project. The user can choose the Yes button to create such a document, the No button to create a blank document, or the Cancel button to cancel out of the procedure without creating a document at all.

Here's what happens:

  * Line 1 starts the procedure, and line 18 ends it.
  * Line 2 is a spacer, after which line 3 declares the Long variable lngButton and line 4 declares the String variable strMessage. Line 5 is another spacer.
  * Line 6 assigns to the String variable strMessage a long string that contains all the text for the message box. Line 7 is another spacer.
  * Line 8 displays the message box, using strMessage as the prompt, specifying the vbYesNoCancel constant to produce a Yes/No/Cancel message box, and applying a suitable title (Create New Document). It assigns the result of the message box to the Long variable lngButton. Line 9 is a spacer.
  * Line 10 starts the If...Then... ElseIf... Else statement, comparing the value of lngButton to vbYes.
  * If line 10 matches, line 11 uses the Add method of the Documents object to create a new document based on the vpreport.dotm template. If not, the ElseIf condition in line 12 is evaluated, comparing the value of lngButton to vbNo. If you run this procedure and choose the Yes button in the message box, you will need to have a template named vpreport.dotm in the folder z:\public\template\ for line 11 to run. If you don't have the template, you'll get an error.
Given that you're unlikely to have this template, you might want to change the path and filename to a template that you do have.

  * If this second comparison matches, line 13 uses the Add method of the Documents object to create a new blank document. If not, the Else statement in line 14 is activated because the user must have chosen the Cancel button in the message box. The End statement in line 15 ends execution of the procedure.
  * Line 16 ends the If statement. Line 17 is a spacer.

This example is a little unusual in that the If structure is limited to three possible branches because that's the number of possible responses from a message box—Yes, No, and Cancel.

Because the If statement checks for the vbYes response and the ElseIf statement checks for the vbNo response, only the vbCancel response will trigger the Else statement.

In other circumstances, the Else statement can serve as a catchall for _anything_ not caught by the If and ElseIf statements above the Else, so you need to make sure the If and ElseIf statements cover all the contingencies you want evaluated _before_ the Else statement kicks in. So, put the Else statement at the bottom of the block. For example, if you quiz the reader about the colors of the US flag, you must provide If and ElseIf statements for red, white, and blue. If you omit, for example, _white_ (one of the possibilities), and the user types in _white_ , your code will fall through to the Else statement, which might display an incorrect message such as "The color you entered is not on the flag."

#### _An_ If...Then... ElseIf _Statement without an_ Else _Clause_

You can use an If...Then... ElseIf statement without an Else clause when you don't need to take an action if none of the conditions in the If statement proves True. In the previous example, the situation had three clearly defined outcomes: the user could choose the Yes button, the No button, or the Cancel button in the message box. So you were able to use an If clause to test whether the user chose the Yes button, an ElseIf clause to test whether the user chose the No button, and an Else clause to test whether neither was chosen (meaning that the Cancel button was chosen). (Clicking the close button [x] on the title bar of a message box is the equivalent of choosing the Cancel button in the message box.)

As an example of a situation in which you don't need to take action if no condition is True, consider the If statement in the Check_Password procedure in Listing 11.3. This procedure checks to ensure that the password a user enters to protect an item is of a suitable length.

**Listing 11.3**: Taking no action when no condition is true

     1. Sub Check_Password()
     2.
     3.     Dim strPassword As String
     4.
     5. BadPassword:
     6.
     7.     strPassword = InputBox _
            ("Enter the password to protect this item from changes:", _
            "Enter Password")
     8.
     9.     If Len(strPassword) = 0 Then
    10.         End
    11.     ElseIf Len(strPassword) < 6 Then
    12.         MsgBox "The password you chose is too short." & vbCr _
                & vbCr & "Please choose a password between " & _
                "6 and 15 characters in length.", _
                vbOKOnly + vbCritical, "Unsuitable Password"
    13.         GoTo BadPassword
    14.     ElseIf Len(strPassword) > 15 Then
    15.         MsgBox "The password you chose is too long." & vbCr _
                & vbCr & "Please choose a password between " & _
                "6 and 15 characters in length.", _
                vbOKOnly + vbCritical, "Unsuitable Password"
    16.         GoTo BadPassword
    17.     End If
    18.
End Sub + +This procedure forces users to enter an acceptable password. Here's what happens: + + * Line 1 starts the procedure, and line 19 ends it. + * Line 2 is a spacer, after which line 3 declares the String variable strPassword. + * Line 4 is a spacer. Line 5 contains a label, BadPassword, to which VBA will loop if the password the user enters proves to be unsuitable. Line 6 is another spacer. + * Line 7 displays an input box prompting the user to enter a password, which VBA stores in the variable strPassword. Line 8 is a spacer. + * Line 9 checks strPassword to see if its length is zero, which means it's an empty string. This could mean that either the user clicked the Cancel button in the input box or the user clicked the OK button without entering any text in the text box of the input box. Either of these actions causes VBA to branch to line 10, where it executes the End statement that ends execution of the procedure. + * If the length of strPassword isn't zero (that is, the user has entered text into the text box of the input box and clicked the OK button), the If clause in line 9 is False and VBA moves to line 11, where it checks to see if the length of strPassword is less than 6 characters. + * If the length of strPassword is less than 6 characters, VBA executes the code in lines 12 and 13. Line 12 displays a message box telling the user that the password is too short and specifying the length criteria for the password. This message box contains only an OK button, so when the user clicks it to continue, VBA continues with line 13, which returns execution to the BadPassword label on line 5. From there the procedure repeats itself, redisplaying the input box so that the user can try again. + * If the length of strPassword isn't less than 6 characters, execution passes from line 11 to the second ElseIf clause in line 14, where VBA checks to see if the length of strPassword is more than 15 characters. + * If the length of strPassword is more than 15 characters, VBA executes the code in lines 15 and 16: Line 15 displays a message box (again, with only an OK button) telling the user that the password is too long, and line 16 returns execution to the BadPassword label, again displaying the input box. + +There's no need for an Else statement in this case because once the user has supplied a password that doesn't trigger the If clause or either of the ElseIf clauses, execution moves out of the If block and continues at the line after the End If statement. + +## Creating Loops with _If_ and _GoTo_ + +So far in this book, you've seen several examples of For... Next loops and For Each... Next loops. (Chapter 12, "Using Loops to Repeat Actions," shows you how to construct these types of loops and other types, such as Do loops.) You can also create loops with If statements and the GoTo statement, as you did in the last example. + +Many teachers and programmers frown upon making loops with If and GoTo. It's bad practice because If... GoTo loops can create "spaghetti code" (execution paths that jump around and are hard to visualize). Such paths can be not only grotesque in themselves, but also a nightmare to debug. + +However, _simple_ versions of If and GoTo loops can work perfectly well, so even if you choose not to use this technique yourself, you should at least know how such loops work. Whether or not to ban GoTo from your code is a matter of personal preference, company policy, or your teacher's beliefs.
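+ +For comparison, here's a minimal GoTo-free sketch of the same password check. This is my illustration rather than one of the chapter's listings; it uses a Do loop (Do loops are covered in Chapter 12) and assumes the same length rules as Listing 11.3: + + Sub Check_Password_NoGoTo() + 'Loop until the user enters a 6-15 character password or cancels + Dim strPassword As String + Do + strPassword = InputBox _ + ("Enter the password to protect this item from changes:", _ + "Enter Password") + If Len(strPassword) = 0 Then End 'Cancel button or empty entry + If Len(strPassword) >= 6 And Len(strPassword) <= 15 Then Exit Do + MsgBox "Please choose a password between " & _ + "6 and 15 characters in length.", _ + vbOKOnly + vbCritical, "Unsuitable Password" + Loop + End Sub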
+ +If nothing else, you might one day be responsible for working with someone else's code—someone whose standards aren't as rigorous as yours regarding the notorious GoTo command. So let's take a brief look at how GoTo can be used. + +### Syntax + +The GoTo statement is straightforward, and can be useful—it's already been used several times in the examples you've looked at so far in this book (in Listings 7.2 and 9.2, for example). The syntax is as follows: + + GoTo _line_ + +Here, the line argument can be a line label (or, rarely these days, a line number) within the current procedure. + +A line number is simply a number placed at the beginning of a line to identify it. For example, consider this demonstration of GoTo: + + Sub Demo_of_GoTo() + 1 + If MsgBox("Go to line 1?", vbYesNo) = vbYes Then + GoTo 1 + End If + End Sub + +The second line here contains only the line number 1, which identifies the line. The third line displays a message box offering the choice of going back to line 1; if the user chooses the Yes button, VBA executes the GoTo 1 statement and returns to the line labeled 1, after which it displays the message box again. (If the user chooses the No button, the If block is exited.) + +However, it's usually better to use a line _label_ than a line number. A line label is a name for a line. A label starts with a letter and ends with a colon. Between the letter and the colon, the label can consist of letters, numbers, and underscores (no spaces or other punctuation). For example, earlier in this chapter you saw the label BadPassword: used to loop back to an earlier stage in a procedure when certain conditions were met. Perhaps the quintessential example of a label is the Bye: label traditionally placed at the end of a procedure for use with this GoTo statement: + + GoTo Bye + +When this label is placed just above the End Sub statement, it simply exits the macro. + +GoTo is usually used with a condition. If you use it without a condition to go back to a line earlier in the code than the GoTo statement, you're apt to create an _infinite loop_ (this bug is discussed in Chapter 12). And if you were to use the GoTo Bye statement without a condition, you would guarantee that your procedure would stop executing—no statement after this line would ever be executed. You would be jumping to the end of the macro. + +### Example + +As an example of a GoTo statement with a condition, you might use a GoTo Bye statement together with a message box that makes sure that the user wants to run a certain procedure: + + Response = MsgBox("Do you want to create a daily report for " & _ + "the head office from the current document?", _ + vbYesNo + vbQuestion, "Create Daily Report") + If Response = vbNo Then GoTo Bye + +If the user chooses the No button in the message box that the first line displays, VBA executes the GoTo Bye statement, branching to the Bye: label located at the end of the subroutine. + +## Nesting _If_ Blocks + +You can _nest_ If blocks (put one inside another) as needed to manage any contortions required in your code. Each nested If block must be complete in and of itself. (This means each nested block must start with an If and conclude with its own End If.) + +For example, if you nest one If block within another If block (but forget the End If that concludes the nested If), VBA assumes that the End If line for the outer If actually pairs with the nested If. The result is a compile error or, worse, logic that doesn't do what you intended. + +To make your If blocks easy to read, indent them to different levels. This is particularly important when nesting If blocks.
Indenting provides you with visual cues, making it clear which If line is paired with each End If line. In other words, indentation makes the various If blocks stand out. + +To see how this is done, check out the following nested If statements: + + 1. **If** condition1 Then 'start of first If + 2. **If** condition2 Then 'start of second If + 3. **If** condition3 Then 'start of third If + 4. _statements1_ + 5. ElseIf condition4 Then 'ElseIf for third If + 6. _statements2_ + 7. Else 'Else for third If + 8. _statements3_ + 9. **End If** 'End If for third If + 10. Else 'Else for second If + 11. **If** condition5 Then 'start of fourth If + 12. _statements4_ + 13. **End If** 'End If for fourth If + 14. **End If** 'End If for second If + 15. **Else** 'Else for first If + 16. _statements5_ + 17. **End If** 'End If for first If + +By following the layout, you can easily trace the flow of execution. For example, if condition1 in line 1 is False, VBA branches to the Else statement in line 15 and continues execution from there. If _condition1_ in line 1 is True, VBA evaluates the nested _condition2_ in line 2, and so on. + +The indentation is for visual clarity only—VBA pays no attention to it—but it can be a great help to the human reader. The previous nested If commands are also annotated with comments so that you can see which Else, ElseIf, and End If line belongs with which If line. However, with the indentation, commenting is unnecessary. + +By contrast, check out the unindented version of these nested blocks. This version is hard for the human eye to follow—and is even harder when it's buried in a morass of other code: + + 1. If condition1 Then + 2. If condition2 Then + 3. If condition3 Then + 4. _statements1_ + 5. ElseIf condition4 Then + 6. _statements2_ + 7. Else + 8. _statements3_ + 9. End If + 10. Else + 11. If condition5 Then + 12. _statements4_ + 13. End If + 14. End If + 15. Else + 16. _statements5_ + 17. End If + +There's seldom a pressing need to nest multiple If blocks. Often, you'll need only to nest a simple If...Then statement within an If...Then... Else statement or within an If...Then... ElseIf... Else statement. Listing 11.4 shows an example using Word. + +**Listing 11.4**: Nesting an If...Then block + + 1. Selection.HomeKey Unit:=wdStory + 2. Selection.Find.ClearFormatting + 3. Selection.Find.Style = ActiveDocument.Styles("Heading 5") + 4. Selection.Find.Text = "" + 5. Selection.Find.Execute + 6. If Selection.Find.Found Then + 7. lngResponse = MsgBox("Make this into a special note?", _ + vbOKCancel, "Make Special Note") + 8. If lngResponse = vbOK Then + 9. Selection.Style = "Special Note" + 10. End If + 11. End If + +The code in Listing 11.4 searches through the active document for the Heading 5 style and, if it finds the style, displays a message box offering to make it into a special note by applying the Special Note style. Here's what happens: + + * Line 1 starts by returning the insertion point to the beginning of the document. + * Line 2 clears formatting from the Find command (to make sure that it isn't searching for inappropriate formatting). + * Line 3 sets Heading 5 as the style for which the Find command is searching, and Line 4 sets the search string to an empty string (""), which causes Find to search for formatting only. + * Line 5 then runs the Find operation. + * Lines 6 through 11 contain the outer If...Then block. Line 6 checks to see if the Find operation in line 5 found a paragraph in Heading 5 style. If it did, VBA runs the code in lines 7 through 10.
+ + * Line 7 displays a message box asking if the user wants to make the paragraph into a special note. + * Line 8 begins the nested If...Then statement and checks the user's response to the message box. + * If the user's response is vbOK—if the user chose the OK button—VBA executes the statement in line 9, which applies the Special Note style (which I'll assume is included in the styles available to the current document or template) to the paragraph. + * Line 10 contains the End If statement for the nested If...Then block, and line 11 contains the End If statement for the outer If...Then block. + +If you expect a document to contain more than one instance of the Heading 5 style, use a Do While... Loop loop to search for each instance. See Chapter 12 for details on Do While... Loop loops. + +# _Select Case_ Blocks + +The Select Case block provides an effective alternative to complex multiple If...Then blocks or multiple ElseIf statements. Select Case offers the same decision-making capability as If constructions but with tighter and more readable code. + +Use the Select Case statement when the decision you need to make is complicated because it involves more than two or three different values to evaluate. + +Select Case blocks are easier to read than complex If...Then blocks, mostly because there's less code. This also makes Select Case blocks easier to modify: when you need to adjust one or more of the values used, you have less code to wade through. + +## Syntax + +The syntax for Select Case is as follows: + + Select Case _TestExpression_ + Case _Expression1_ + _Statements1_ + [Case _Expression2_ + _Statements2_ ] + [Case Else + _StatementsElse_ ] + End Select + +Here's how the syntax breaks down: + + * Select Case starts the block, and End Select ends it. + * _TestExpression_ is the expression that determines which of the Case statements executes. + * _Expression1, Expression2,_ and so on are the expressions against which VBA matches TestExpression. + +For example, you might test to see which of a number of buttons in a user form the user chose. The _TestExpression_ would be tied to a button that's been chosen; if it were the first button, VBA would match that to _Expression1_ and would run the statements in the lines following Case _Expression1;_ if it were the second button, VBA would match that to _Expression2_ and would run the statements in the lines following Case _Expression2;_ and so on for the rest of the Case blocks. + +Case Else is similar to the Else clause in an If block. Case Else is an optional clause that (if it's included) runs if none of the given expressions is matched. + +## Example + +As an example of a Select Case block, consider Listing 11.5, which prompts users to enter their typing speed and then displays an appropriate response. + +**Listing 11.5**: Working with a Select Case structure + + 1. Sub Check_Typing_Speed() + 2. + 3. Dim varTypingSpeed As Variant + 4. Dim strMsg As String + 5. + 6. varTypingSpeed = InputBox _ + ("How many words can you type per minute?", "Typing Speed") + 7. **Select Case** varTypingSpeed + 8. **Case** "" + 9. End + 10. **Case Is** < 0, 0, 1 To 50 + 11. strMsg = "Please learn to type properly before " & _ + "applying for a job." + 12. **Case** 50 To 60 + 13. strMsg = "Your typing could do with a little brushing up. " + 14. **Case** 60 To 75 + 15. strMsg = "We are satisfied with your typing speed." + 16. **Case** 75 To 99 + 17. strMsg = "Your typing is more than adequate. " + 18. **Case** 100 To 200 + 19.
strMsg = "You wear out keyboards with your blinding speed." + 20. **Case Is** > 200 + 21. strMsg = "I doubt that's true." + 22. **End Select** + 23. + 24. MsgBox strMsg, vbOKOnly, "Typing Speed" + 25. + 26. End Sub + +Here's what happens in the Check_Typing_Speed procedure in Listing 11.5: + + * Line 1 starts the procedure, and line 26 ends it. + * Line 2 is a spacer. Line 3 declares the Variant variable varTypingSpeed, and line 4 declares the String variable strMsg. Line 5 is another spacer. + * Line 6 displays an input box prompting the user to enter their typing speed. It stores this value in the variable varTypingSpeed. + * Line 7 begins the Select Case block, predicating it on the variable varTypingSpeed. + * Next, VBA evaluates each of the Case clauses in turn until it finds one that proves True. The first Case clause, in line 8, compares varTypingSpeed to an empty string (" ") to see if the user chose the Cancel button in the input box or clicked the OK button without entering a value in the text box. If Case " " is True, VBA executes the End statement in line 9, ending the procedure. + * If Case " " is False, VBA moves execution to the next Case clause—line 10 in this example—where it compares varTypingSpeed to three items: less than 0 (Is < 0), 0, and the range 1 to 50 words per minute. Notice three things here: + +1. You can include multiple comparison items in the same Case statement by separating them from each other with commas. + +2. Using the Is keyword with the comparison operator (here, the _less than_ operator) checks the relation of two numbers to each other. + +3. The To keyword denotes the range of values. + + * If varTypingSpeed matches one of the comparison items in line 10, VBA assigns to the String variable strMsg the text on line 11 and then continues execution at the line after the End Select statement. + * If varTypingSpeed isn't within this range, VBA moves to the next Case clause and evaluates it in turn. When VBA finds a Case clause that's True, it executes the statement following that clause (in this case, assigning a text string to the strMsg variable) and then continues execution at the line after the End Select statement. + * For any case other than that in line 8 (which ends the procedure), line 24 displays a message box containing the text stored in the statement strMsg. + +A Select Case block can be a good way of specifying which action to take based on the user's choice from a ListBox or ComboBox control (these controls are explored in Chapter 14, "Creating Simple Custom Dialog Boxes"). Typically, a list box or combo box displays a list of many different options, such as all the states in the USA. After the user clicks to select an item within a ListBox or ComboBox control, the chosen item appears in the control's Value property. Your macro could then check this Value property as the test expression in your Select Case block and take action accordingly. + +## When Order Matters + +One final point about complex test structures. You need to ensure that your Select Case and If...Then... Else statements (or other multiple If structures) evaluate their test conditions in the appropriate order. This means that each condition to be evaluated must _exclude_ all the conditions that follow it. + +Let's say you're asking the user how old they are. And you set up your test cases like this: + + 1. Age = InputBox ("How old are you?") + 2. + 3. Select Case Age + 4. + 5. **Case < 50** + 6. strMsg = "You're nearing retirement." + 7. + 8. **Case < 12** + 9. 
strMsg = "Hello, youngster." + +This is a logic bug. And a bad one. Line 8 can never execute because everyone under 50, including those younger than 12, will trigger line 5. (The expression "less than 50" _includes_ "less than 12.") + +To work properly, these tests must be reversed, like this: + + **Case < 12** + strMsg = "Hello, youngster." + + **Case < 50** + strMsg = "You're nearing retirement." + +You can avoid this problem entirely by testing for equality or a range, as illustrated in Listing 11.5: + + Case 50 To 60 + +# The Bottom Line + +**Use comparison operators.** + +Comparison operators compare items using such tests as _greater than_ or _not equal to_. + +Master It + +Write a line of code that uses a _less than_ comparison to test whether a variable named Surplus is less than 1200. + +**Compare one item with another.** + +You can compare strings using _less than_ and _more than_ comparison operators. + +Master It + +What symbol do you use to determine if VariableA is lower in the alphabet than VariableB? + +**Test multiple conditions.** + +To test multiple conditions, you use VBA's _logical operators_ to link the conditions together. + +Master It + +Name two of the most commonly used logical operators. + +**Use If blocks.** + +If blocks are among the most common programming structures. They are often the best way to allow code to make decisions. To test two conditions, use If... Else... EndIf. + +Master It + +Write an If... Else... End If block of code that displays two message boxes. If the temperature (the variable Temp) is greater than 80, tell the user that it's hot outside. Otherwise, tell the user that it's not that hot. + +**Use Select Case blocks.** + +Select Case structures can be a useful alternative to If blocks. + +Master It + +When should you use a Select Case structure? +Chapter 12 + +Using Loops to Repeat Actions + +As in life, so in macros. Sometimes, you'll want to repeat an action a predetermined number of times: break six eggs to make an omelet, or create two new documents. + +More often, though, you'll just repeat an action until a certain condition is met: break eggs until the pan is full, or buy two lottery tickets a week until you hit it big, or subtract five from every instance of a value in an Excel spreadsheet. In these situations, you don't know in advance when you'll triumph against the wretched odds of the lottery, or how many times the value will appear in the spreadsheet—your code must simply carry on until the condition is met. + +In VBA, you use _loops_ to repeat actions. VBA provides a number of ways to use loops in your code. In this chapter, you'll learn about the different types of loops and typical uses for each. + +In this chapter you will learn to do the following: + + * Understand when to use loops + * Use For... loops for fixed repetitions + * Use Do... loops for variable numbers of repetitions + * Nest one loop within another loop + * Avoid infinite loops + +# When Should You Use a Loop? + +To repeat an action or a series of actions in VBA, you could record the repetition itself into a macro by using the Macro Recorder (if the application you're using supports the Macro Recorder—remember that only Word and Excel do). + +Or you could copy some code and paste it back into the macro multiple times to repeat the behavior. 
For example, you could record a macro containing the code for creating a new Word document based on the default template, open the macro in the Visual Basic Editor, and then copy this new-document code and paste it five times to create a procedure that makes six new documents. + +It's almost always much better, however, to just write a loop block (structure) to repeat the commands as necessary. + +Loops have several straightforward advantages over repetitive, redundant code: + + * Your procedures are shorter—they contain less code and fewer instructions—and are thus easier to understand. + * Your procedures are more flexible: instead of hard-coding the number of repetitions, you can vary the number as necessary. (_Hard-coding_ means writing fixed code as opposed to flexible, variable code, such as _Create 6 new documents_ versus _Create x number of new documents_, thereby allowing the user or the code to supply the value of _x_.) + * Your procedures are easier to test, debug, and modify, particularly for people other than you. + +That said, if you just need to repeat one or more actions two or three times in a procedure and that procedure will always need to repeat the action this same number of times, there's nothing wrong with hard-coding the procedure by repeating the code. It'll work fine, it's easy to do, and you won't have to spend time considering the logic of loops. The code will be longer and a tad harder to maintain, but that's no big deal in simple situations. + +# Understanding the Basics of Loops + +In VBA, a loop is a structure (block of code) that repeats a number of statements, looping back to the beginning of the structure once it has finished executing them. Each cycle of execution of a loop is called an _iteration_. + +There are two basic categories of loops: + + * _Fixed-iteration loops_ repeat a set number of times (six eggs). + * _Indefinite loops_ repeat a flexible number of times (enough eggs to fill whatever pan is being used). + +The execution of either type of loop is controlled by the _loop invariant_, also called the _loop determinant_. This can be either a numeric expression or a logical expression. Fixed-iteration loops typically use numeric expressions, whereas indefinite loops typically use logical expressions. For example, a fixed-iteration loop might specify that the loop will iterate five times, while an indefinite loop might continue iterating until the end of a document is reached. + +Table 12.1 explains the types of loops that VBA provides. + +Table 12.1 VBA's loop types + +**Loop** | **Type** | **Explanation** +---|---|--- +For...Next | Fixed | Repeats an action or a sequence of actions a given number of times. +For Each... Next | Fixed | Repeats an action or a sequence of actions once for each object in a VBA collection. +Do While... Loop | Indefinite | Performs an action or a sequence of actions if a condition is True and continues to perform it until the condition becomes False. +While... Wend | Indefinite | Performs an action or a sequence of actions if a condition is True and continues to perform it until the condition becomes False. This type of loop is similar to Do While... Loop but is now almost obsolete. +Do Until... Loop | Indefinite | Performs an action or sequence of actions while a condition is False and continues to perform it until the condition becomes True. +Do... Loop While | Indefinite | Performs an action or a sequence of actions once and then repeats it while a condition is True until it becomes False. +Do...
Loop Until | Indefinite | Performs an action or a sequence of actions once and repeats it while a condition is False until it becomes True. + +# Using For... Loops for Fixed Repetitions + +For... loops execute a fixed number of times. For...Next loops repeat for the number of times of your choosing, while For Each... Next loops execute once for each element in a specified VBA collection. + +## _For...Next_ Loops + +A For...Next loop repeats an action or a sequence of actions a given number of times. How many times it loops is specified by a _counter variable_. The counter variable can be hard-coded into the procedure, passed from an input box or dialog box, or passed from a value generated either by a different part of the procedure or by a different procedure. + +### Syntax + +The syntax for For...Next loops is as follows: + + For _counter_ = _start To end_ [Step _stepsize_ ] + [ _statements_ ] + [Exit For] + [ _statements_ ] + Next [ _counter_ ] + +Here's what happens in a For...Next loop (refer to the syntax): + +1. When VBA enters the loop at the For statement, it assigns the _start_ value to _counter_. It then executes the statements in the loop. When it reaches the Next statement, it increments _counter_ by 1 or by the specified _stepsize_ and loops back to the For statement. + +2. VBA then checks the _counter_ variable against the _end_ variable. When _stepsize_ is positive, if _counter_ is greater than _end_, VBA terminates the loop and continues execution of the procedure with the statement immediately after the Next statement (which could be any action or the end of the procedure). If _counter_ is less than or equal to _end_, VBA repeats the statements in the loop, increases _counter_ by 1 or by _stepsize_, and loops back to the For statement again. (For a loop in which _stepsize_ is negative, the loop continues while _counter_ is greater than or equal to _end_ and ends when _counter_ is less than _end_. In other words, when the stepsize is negative, the loop counts _down_ rather than up.) + +3. The Exit For statement exits the For... loop early. You'll look at how to use the Exit For statement, and examples of the different uses of For...Next loops, later in this chapter. + +Table 12.2 explains the components of the For...Next loop syntax. As usual, brackets enclose optional items and italicized words are placeholders—elements in the code that are to be replaced by you, the programmer. + +Table 12.2 Components of the syntax for a For...Next loop + +**Component** | **Description** +---|--- +_Counter_ | A numeric variable or an expression that produces a number. By default, VBA increases the counter value by an increment of 1 with each iteration of the loop, but you can change this increment by using the optional Step keyword and _stepsize_ argument. _counter_ is required in the For statement and is optional in the Next statement, but it's a good idea to also include _counter_ in the Next statement to make your code easy to read. This is particularly important when you're using multiple For...Next statements in the same procedure or nesting one For...Next statement within another. +_Start_ | A numeric variable or numeric expression giving the starting value for _counter_. +_End_ | A numeric variable or numeric expression giving the ending value for _counter_. +_Stepsize_ | A numeric variable or numeric expression specifying how much to increase or decrease the value of _counter_. To use _stepsize_, use the Step keyword and specify the _stepsize_ variable.
_stepsize_ is 1 by default, but you can use any positive or negative value. +Exit For | A statement for exiting a For... loop. +Next | The keyword indicating the end of the loop. Again, you can specify the optional _counter_ here to make your code clear. + +### Straightforward _For...Next_ Loops + +In a simple For...Next loop, you first specify a _counter_ variable and the starting and ending values for it: + + Dim i As Integer + For i = 1 To 200 + +Here, i is the _counter_ variable, 1 is the starting value, and 200 is the ending value. Because VBA by default increases the _counter_ variable by 1 with each iteration of the loop, the counter variable in this example will count 1, 2, 3, and so on up to 200. Once the value in counter reaches 201, the looping ends and execution continues at the line below the loop's Next statement. + +You can also use the Step keyword to specify a different increment, either positive or negative; more on this in the next section. + +* * * + +i Is the Traditional Counter Variable Name for For...Next Loops + +i is the classic integer _counter_ variable used in a For...Next loop; after using i, the convention is to use j, k, l, m, and n for any subsequent _counter_ variables (if you're nesting loops within the i loop). These short variable names derive from the days of key-card computation, when memory was at a premium and longer names represented a significant extravagance. These days, computer memory is abundant, so using long variable names is common practice for _most_ variables. But not with loop counters. Using i as the loop counter is pervasive, even in languages like Java and C++. So stick with i. + +* * * + +After the previous two statements (Dim and For), you specify whatever actions you want carried out within the loop, followed by the Next keyword to end the loop: + + Application.StatusBar = _ + "Please wait while Excel checks for nonuniform prices: " & i & "..." + Next i + +This code displays (on the status bar) Excel's progress in checking your spreadsheet for improbable values. + +As another example, say you need to check every paragraph in Word documents you receive from contributors to make sure there's no unsuitable formatting. By using a loop that runs from 1 to the number of paragraphs in the active document, you can check each paragraph in turn and let the user view the progress in the status bar. The number of paragraphs in a document is stored in the Count property of the Paragraphs collection in the ActiveDocument object: + + Dim i As Integer + **For** i = 1 To ActiveDocument.Paragraphs.Count + + ' CheckParagraphForIllegalFormatting + + DoEvents + + Application.StatusBar = _ + "Please wait while Word checks the formatting in " _ + & "this document: Paragraph " & i & " out of " _ + & ActiveDocument.Paragraphs.Count & "..." + Selection.MoveDown Unit:=wdParagraph, _ + Count:=1, Extend:=wdMove + **Next** i + +This code snippet executes a CheckParagraphForIllegalFormatting procedure. We've not yet written this procedure, so I just wrote a comment line indicating that the procedure needs to be called from inside this loop. + +Next we use the DoEvents command. This allows multitasking. It interrupts the loop to see if something else is going on in the computer (the user typing something, the status bar in Word being updated, or whatever). This prevents your loop from hogging the computer's microprocessor. + +Then the loop continues executing.
The message is displayed in the status bar, indicating which paragraph out of the total number it's working on, and then the loop moves down a paragraph. When VBA reaches the Next statement, it increases the i counter by the default value, 1 (because no _stepsize_ variable is specified in the For statement) and loops back to the For statement, where it compares the value of i to the value of ActiveDocument.Paragraphs.Count. The procedure continues to loop until i has reached the value of ActiveDocument.Paragraphs.Count, which is the final iteration of the loop. Notice here how the counter variable is used twice: once to keep track of the loop's iterations, and again _within_ the loop to display the current paragraph number: + + "Paragraph " & **i** & " out of " + +In a similar way you could use a simple For...Next loop to quickly build the structure of a timesheet or work log in Excel. The following statements use a For...Next loop to insert the labels 1:00 through 24:00 in the current column in the active sheet of the active workbook: + + Dim i As Integer + For i = 1 To 24 + ActiveCell.FormulaR1C1 = **i** & ":00" + ActiveCell.Offset(RowOffset:=1, ColumnOffset:=0).Select + Next i + +Here, the ActiveCell.FormulaR1C1 statement inserts the current value of the counter i together with a colon and two zeroes (to create a time format). The ActiveCell.Offset(RowOffset:=1, ColumnOffset:=0).Select statement selects the cell in the next row and the same column. The loop runs from i = 1 to i = 24 and stops when the automatic increase takes i to 25. Again, the counter variable is used within the loop. This is quite common. + +### For...Next Loops with Step Values + +If increasing the _counter_ variable by the default 1 doesn't suit your purpose, you can use the Step keyword to specify a different increment or decrement. + +For example, the following statement increases the _counter_ variable by 20, so the sequence is 0, 20, 40, 60, 80, 100: + + For i = 0 To 100 Step 20 + +You can also decrement by specifying a negative Step value: + + For i = 1000 To 0 Step -100 + +This statement produces the sequence 1000, 900, 800, and so on, down to 0. + +Instead of the "x out of y" progress message given in the example in the previous section, you could produce a countdown running from ActiveDocument.Paragraphs.Count to zero: + + Dim i As Integer + For i = ActiveDocument.Paragraphs.Count To 0 **Step -1** + ' CheckParagraphForIllegalFormatting + Application.StatusBar = _ + "Please wait while Word checks the formatting in this document: " & i + Selection.MoveDown Unit:=wdParagraph, Count:=1, Extend:=wdMove + Next i + +### Using an Input Box to Drive a For...Next Loop + +Sometimes you'll be able to hard-code the number of iterations into a For...Next loop (six eggs). You'll know the number of iterations when writing your code, so you can just type in the end condition number, like the 100 here: + + For i = 0 To 100 + +Other times, though, you can't know in advance how many loop iterations are needed. This information only becomes available during program execution (called _runtime_) rather than when you're writing the code (called _design time_). + +Often you'll take a number from another operation during execution, such as the ActiveDocument.Paragraphs.Count property in the previous example. + +You'll want to use this macro with many documents in the future, and the number of paragraphs varies from document to document. So you can't know when writing your code how many times it should loop.
Your macro itself has to gather that information at runtime. + +Frequently you ask the user to specify the number of loop repetitions. The easiest way of doing this is to display an input box, requesting the user to enter a value. + +For example, Listing 12.1 contains a simple procedure named CreatePresentations that displays an input box prompting users to enter the number of presentations they want to create. It then uses a For...Next loop to create the documents in PowerPoint. + +**Listing 12.1**: Letting the user specify the number of iterations + + 1. Sub CreatePresentations() + 2. Dim intPresentations As Integer + 3. Dim i As Integer + 4. intPresentations = InputBox _ + ("Enter the number of presentations to create:", _ + "Create Presentations") + 5. For i = 1 To intPresentations + 6. Presentations.Add + 7. Next i + 8. End Sub + +Here's what happens in the CreatePresentations procedure in Listing 12.1 + + * Line 2 declares the Integer variable intPresentations, and line 3 declares the Integer variable i. + * Line 4 displays an input box prompting users to enter the number of presentations they want to create. + * Lines 5 through 7 contain a For...Next loop that runs from i = 1 to i = intPresentations with the default increment of 1 per iteration. Each iteration of the loop executes the Presentations.Add statement in line 6, creating a new presentation based on the default template. + +* * * + +**Control a For...Next Loop with User Input via a Dialog Box** + +An input box returns only a single value. Sometimes you need multiple values from the user. So, for those occasions when an input box won't suffice, you can easily get input from a dialog box to drive a For...Next loop. This book hasn't yet shown you how to create dialog boxes, but in this section you'll get a sneak preview by looking at a procedure named Create_Folders. You aren't expected to build and test this example; just read the code to get an idea of how it accepts user input and then employs that information in the loop. + +This example procedure reduces the tedium of creating multiple folders with predictable names, such as when I had to create 31 folders, a folder for each chapter in this book. + +Say that you're using a four-digit number to identify the project, the letter _s_ for section, and a two-digit number to identify the section. So you'd end up with folders named 1234s01, 1234s02, 1234s03, and so on—simple enough to create manually, but tedious if you need more than a dozen or so. + +In its simplest form, this dialog box would provide a text box for the number of folders to be created (though you could also use a drop-down list for this, or even a spinner control) and a text box for the project number. The following illustration is an example of how this dialog box might look. + +You display a dialog box by using the Show method in a separate macro, perhaps using a Load statement first, like this: + + Sub makefolders() + + Dialogs(wdDialogFileSaveAs).Show + + **Load** frmCreateFolders + + frmCreateFolders. **Show** + + End Sub + +You might have noticed the Dialogs command in this code. It's quite useful, but we'll discuss it at the end of this sidebar. For now, our focus is on looping techniques. + +I named the example dialog box frmCreateFolders. However, any valid VBA name will work. The first text box—identified with the Number Of Folders To Create label—is named txtFolders; the second text box is named txtProjectNumber. 
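+ +Assuming the form is named frmCreateFolders and the text boxes keep the names just described, you could give both controls sensible defaults in the form's Initialize event so that the user can often just accept them. This sketch is mine, not part of the example, and the default values are hypothetical: + + Private Sub UserForm_Initialize() + 'Hypothetical defaults; adjust to suit your own projects + txtFolders.Value = "10" + txtProjectNumber.Value = "1234" + End Sub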
+ +The Cancel button here has an End statement attached to its Click event so that if the user clicks it, VBA ends the procedure: + + Private Sub cmdCancel_Click() + End + End Sub + +The OK button in the dialog box has the following code attached to its Click event: + + 1. Private Sub cmdOK_Click() + 2. + 3. Dim strMsg As String + 4. Dim strFolder As String + 5. Dim i As Integer + 6. + 7. frmCreateFolders.Hide + 8. Unload frmCreateFolders + 9. strMsg = "The Create_Folders procedure has created " _ + & "the following folders: " & vbCr & vbCr + 10. + 11. **For** i = 1 To **txtFolders.Value** + 12. strFolder = **txtProjectNumber.Value** & "s" & Format(i, "0#") + 13. MkDir strFolder + 14. strMsg = strMsg & " " & strFolder & vbCr + 15. **Next** i + 16. + 17. MsgBox strMsg, vbOKOnly + vbInformation, _ + "Create Folders" + 18. + 19. End Sub + +Let's pause here a minute for a pep talk. You might read the preceding code and say, "Hey! I'll never be able to remember all this stuff about Format and Hide and vbCr and vbOKOnly." Don't pout. Nobody memorizes all the variations of the Format command, or all the vb constants like vbCr. Remember, there are tons of sample code examples on the Internet and in books like this one. What's more, the VBA Editor itself displays lists of constants and object members as you type in a line of code. (Look up "Auto List Members" in this book's index. Or search the VBA Editor's Help index to locate online resources.) + +Now back to our regular programming. Notice that the Value properties of the two text boxes are used in this loop. The value in txtFolders specifies the loop's number of iterations. The value in txtProjectNumber specifies the first part of the name for each newly created folder. + +The cmdOK_Click procedure runs when the user clicks the OK button in the dialog box: + + * Line 1 declares the cmdOK_Click subroutine, and line 19 ends it. Line 2 is a spacer. + * Line 3 declares the String variable strMsg, which is used to contain a string to display in a message box at the end of the procedure. + * Line 4 declares the String variable strFolder, which will contain the name of the current folder to create in each iteration of the loop. + * Line 5 declares the Integer variable i, which will be the _counter_ variable for the For...Next loop. + * Line 6 is a spacer. + * Line 7 hides frmCreateFolders. + * Line 8 unloads frmCreateFolders from memory. + * Line 9 assigns some introductory text to strMsg, ending it with a colon and two vbCr carriage-return characters to make the start of a list. + * Line 10 is a spacer. + * Lines 11 through 15 contain the For...Next loop that creates the folders. Line 11 causes the loop to run from i = 1 to i = txtFolders.Value, the value supplied by the user in the Number Of Folders To Create text box. Line 12 assigns to the strFolder String variable the Value property of the txtProjectNumber text box, the letter _s_, and the value of i formatted via the Format function to include a leading zero if it's a single digit (so that 1 will appear as 01, and so on). Line 13 uses the MkDir command with strFolder to create a folder (that is, make a directory—the old DOS command mkdir lives on in VBA) of that name. Line 14 adds some spaces (for an indent), the contents of strFolder, and a vbCr character to strMsg. Line 15 then loops back to the For statement, incrementing the i counter. VBA then compares the i counter to txtFolders.Value and repeats the loop as necessary.
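+ +To see exactly what the Format call contributes, here's a quick sketch (mine, not part of the example) that prints the folder names the loop would build for a project numbered 1234; the output appears in the Immediate window: + + Sub Demo_Format_Counter() + Dim i As Integer + For i = 1 To 5 + '"0#" pads single digits with a leading zero: 01, 02, ... + Debug.Print "1234" & "s" & Format(i, "0#") + Next i + End Sub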
+ +This procedure creates a set of new subfolders within whatever is the current folder, without giving the user a choice of location. Chances are you won't want to do this in real-life situations. You might want to change a folder to a set location (so as to keep all the project files together), but more likely you'll want to let the user choose a suitable location—for example, by displaying a common dialog box, such as the Save As dialog box used by most Windows applications. These built-in dialog boxes can be very useful because everyone who uses Windows is familiar with them and because they contain quite a bit of functionality. You display, for example, the classic Windows SaveAs dialog box like this: + + Dialogs(wdDialogFileSaveAs).Show + +When the user closes this dialog box, whatever folder the user specifies becomes the current folder and the document is automatically saved. You can find out more about how to use common dialog boxes in Chapter 14, "Creating Simple Custom Dialog Boxes," and also at this Microsoft web page: + + + +I wanted you to be aware that common dialog boxes exist, but in this example, perhaps a more direct way of allowing the user to specify the path for the new directories would be to use the ChDir (change directory) command, like this: + + Dim strDir As String + + strDir = InputBox("Type the full path where you want new folders to be stored") + + **ChDir** (strDir) + +* * * + +## For Each... Next Loops + +The For Each... Next loop, which is unique to the various versions of Visual Basic, including VBA, is similar to the For...Next loop. With For Each, however, the iterations are based on the number of objects in a collection, such as the Slides collection in a presentation or the Documents collection of Word documents. So, using For Each means that you, the programmer, don't necessarily know the number of loop iterations in advance, but VBA will know during execution because it will query an object's Count property. + +For example, you can choose to take an action for each Slide object in a presentation. During design time while writing your macro you don't need to know how many slides are in the collection. (If there are none, nothing happens.) + +### Syntax + +The syntax for the For Each... Next statement is straightforward: + + For Each _object_ In _collection_ + [ _statements_ ] + [Exit For] + [ _statements_ ] + Next [ _object_ ] + +VBA starts by evaluating the number of objects in the specified collection. It then executes the statements in the loop for the first of those objects. When it reaches the Next keyword, it loops back to the For Each line, reevaluates the number of objects, and performs further iterations as appropriate. + +Here's an example: The Documents collection contains the open documents in Word. So you could create a straightforward procedure to close all the open documents by using a For Each... Next loop like this: + + Dim Doc As Document + **For Each** Doc in Documents + Doc.Close SaveChanges:=wdSaveChanges + **Next** + +VBA closes each open document in turn by using the Close method. The statement uses the wdSaveChanges constant for the SaveChanges argument to specify that any unsaved changes in the document be saved when the document is closed. As long as there are open documents in the Documents collection, VBA repeats the loop, so it closes all open documents and then terminates the procedure. + +This example provides a straightforward illustration of how a For Each... 
Next loop works, but you probably wouldn't want to use the example in practice. Instead, you'd probably use the Close method with the Documents collection (this collection contains all the open documents) to close all the open documents. It's a simpler approach. However, you might use a For Each... Next loop to check each document for certain characteristics before closing it. + +## Using an _Exit For_ Statement + +As you saw earlier in this chapter when looking at the syntax for For statements, you can use one or more Exit For statements to exit a For... loop if a certain condition is met. Exit For statements are optional and are seldom necessary. If you find yourself needing to use Exit For statements in all your procedures, there's probably something wrong with the way you're constructing your loops. That said, you may sometimes find Exit For statements useful—for example, to respond to an error that happens within a loop or if the user chooses to cancel a procedure. + +On those occasions when you do need Exit For statements to exit a loop early, you'll typically use them with straightforward conditions. For example, in Word, if you wanted to close open windows until you reached a certain document that you knew to be open, you could use an Exit For statement like this: + + Dim Doc As Document + For Each Doc In Documents + If Doc.Name = "Document1" Then **Exit For** + Doc.Close + Next Doc + +This For Each... Next statement checks the Name property of the document to see if it's Document1; if it is, the Exit For statement causes VBA to exit the loop. Otherwise, VBA closes the document and returns to the start of the loop. + +* * * + +Use Multiple Exit For Statements If You Wish + +You can also use multiple Exit For statements if you need to. For example, you might need to check two or more conditions during the actions performed in the loop. + +* * * + +# Using _Do..._ Loops for Variable Numbers of Repetitions + +Do loops give you more flexibility than For... loops in that you can test for conditions and direct the flow of the procedure accordingly. VBA includes several types of Do loops: + + * Do While... Loop + * Do... Loop While + * Do Until... Loop + * Do... Loop Until + +These loops break down into two categories: + + * Loops that test a condition at the start of the loop, before executing any of the statements contained inside the loop. Do While... Loop and Do Until... Loop loops fall into this category. In other words, if the test fails, the code within the loop block will not execute even once. + * Loops that test a condition at the end of the loop. This type of loop executes the code within the loop block before testing a condition. Do... Loop While and Do... Loop Until fall into this category. This type of loop will execute at least once. + +The difference between the two types of loop in each category is that each While loop repeats itself _while_ a condition is True (until the condition becomes False), whereas each Until loop repeats itself _until_ a condition becomes True (while the condition remains False). + +This means that you can get by to some extent using only the While loops or only the Until loops—you'll just need to set up some of your conditions the other way around. For example, you could use a Do While... Loop loop with a condition of x < 100 or a Do Until... Loop loop with a condition of x = 100 to achieve the same effect. Put another way: _loop while_ x _is less than 100_ is equivalent to _loop until_ x = _100_ —as long as you start looping below 100.
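+ +Here's a minimal sketch of that equivalence (my illustration, not one of the chapter's listings); both loops count x up from 0 and leave it at 100: + + Dim x As Integer + x = 0 + Do While x < 100 'loop while x is less than 100 + x = x + 1 + Loop + + x = 0 + Do Until x = 100 'loop until x equals 100 + x = x + 1 + Loop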
+ +The following sections describe all the different kinds of Do loops so that you can know when to use each. + +## _Do While... Loop_ Loops + +In a Do While... Loop loop, you specify a condition that has to remain True for the actions (statements) inside the loop to be executed. If the condition isn't True, the actions aren't executed and the loop ends. When a loop ends, the code _below_ the loop block then executes. + +For example, you might want to search a document for an instance of a particular word or phrase and take action after you find it. Figure 12.1 shows a Do While... Loop loop. + +Figure 12.1 Do While... Loop loop tests for a condition before performing the actions contained in the loop. + +### Syntax + +The syntax for the Do While... Loop loop is straightforward: + + Do While _condition_ + [ _statements_ ] + [Exit Do] + [ _statements_ ] + Loop + +While the _condition_ is met (Do While the condition remains True), the statements in the loop are executed. The Loop keyword returns execution to the Do While line, which is then reevaluated. If the _condition_ is still True, the loop continues—it iterates again. + +However, if the _condition_ is False, execution jumps to the code below the loop block, starting with the statement on the line after the Loop keyword. + +You can use one or more optional Exit Do statements if you want to exit the loop without waiting until the condition turns False. + +Say you wanted to construct a glossary from a lengthy Word document that highlights the main terms by italicizing them. These terms are located in the body text as well as within bulleted or numbered lists. However, you want to avoid picking up italicized terms used in other elements of the document, such as headings or captions. In this situation, body text is in the Times New Roman font, but the captions and headlines are in other fonts. + +You could command Word to search for Times New Roman text with the italic attribute. If Word found instances of the text, it would take the appropriate actions, such as selecting the sentence containing the term, together with the next sentence (or the rest of the paragraph), and copying it to the end of another document. Then it would continue the search, performing the loop until it no longer found instances of italic Times New Roman text. + +Listing 12.2 shows an example of how such a procedure might be constructed with a Do While... Loop structure. This listing includes a number of commands that you haven't learned about yet, but you should easily be able to see how the loop works. + +**Listing 12.2**: Understanding how Do While works + + 1. Sub GenerateGlossary() + 2. + 3. Dim strSource As String + 4. Dim strDestination As String + 5. Dim strGlossaryName As String + 6. + 7. strSource = ActiveWindow.Caption + 8. strGlossaryName = InputBox _ + ("Enter the name for the glossary document.", _ + "Create Glossary") + 9. If strGlossaryName = "" Then End + 10. + 11. Documents.Add + 12. ActiveDocument.SaveAs FileName:=strGlossaryName, _ + FileFormat:=wdFormatDocument + 13. strDestination = ActiveWindow.Caption + 14. Windows(strSource).Activate + 15. + 16. Selection.HomeKey Unit:=wdStory + 17. Selection.Find.ClearFormatting + 18. Selection.Find.Font.Italic = True + 19. Selection.Find.Font.Name = "Times New Roman" + 20. Selection.Find.Text = "" + 21. Selection.Find.Execute + 22. + 23. **Do While** Selection.Find.Found + 24. Selection.Copy + 25. Selection.MoveRight Unit:=wdCharacter, _ + Count:=1, Extend:=wdMove + 26. Windows(strDestination).Activate + 27. 
Selection.EndKey Unit:=wdStory + 28. Selection.Paste + 29. Selection.TypeParagraph + 30. Windows(strSource).Activate + 31. Selection.Find.Execute + 32. **Loop** + 33. + 34. Windows(strDestination).Activate + 35. ActiveDocument.Save + 36. ActiveDocument.Close + 37. + 38. End Sub + +The GenerateGlossary procedure in Listing 12.2 copies italic items in the Times New Roman font from the current document and inserts them in a new document that it creates and saves. Here's what happens: + + * Line 1 begins the procedure, and line 2 is a spacer. + * Lines 3, 4, and 5 declare the String variables strSource, strDestination, and strGlossaryName, respectively. Line 6 is a spacer. + * Line 7 assigns the Caption property of the active window to the String variable strSource. The procedure uses this variable to activate the document when it needs to work with it. + * Line 8 displays an input box asking the user to enter a name for the document that will contain the glossary entries pulled from the current document. It stores the string the user enters in the String variable strGlossaryName. + * Line 9 then compares strGlossaryName to an empty string ("") to make sure the user hasn't clicked the Cancel button to cancel the procedure or clicked the OK button in the input box without entering a name in the text box. If strGlossaryName is an empty string, line 9 uses an End statement to terminate execution of the procedure. + * Provided line 9 hasn't stopped the procedure in its tracks, the procedure rolls on. Line 10 is a spacer. Line 11 then creates a new blank document. (This document is based on the Normal.dotm global template because no Template argument is used to specify a different template.) This document will become the glossary document. + * Line 12 saves the document with the name the user specified in the input box. + * Line 13 stores the Caption property of this document in the strDestination variable, again making it available to activate this document as necessary throughout the procedure. You now have the source document identified by the strSource variable and the destination document identified by the strDestination variable. + * Line 14 uses the Activate method to activate the strSource window. Line 15 is a spacer. + * Line 16 uses the HomeKey method of the Selection object with the wdStory unit to move the insertion point to the beginning of the document, which is where the procedure needs to start working to catch all the italicized words in Times New Roman. + * Lines 17 through 20 detail the _Find_ operation the procedure needs to perform: Line 17 removes any formatting applied to the current Find item, line 18 sets the Find feature to find italic formatting, line 19 sets _Find_ to find Times New Roman text, and line 20 specifies the search string, which is an empty string ("") that causes _Find_ to search only for the specified formatting. + * Line 21 then performs the Find operation by using the Execute method. Line 22 is a spacer. + * Lines 23 through 32 implement the Do While... Loop loop. Line 23 expresses the condition for the loop: While Selection.Find.Found (while the Find operation is able to find an instance of the italic Times New Roman text specified in the previous lines). While this condition is met (is True), the commands contained in the loop will execute. + * Line 24 copies the selection (the item found with italic Times New Roman formatting).
+ * Line 25 moves the insertion point one character to the right, effectively deselecting the selection and getting the procedure ready to search for the next instance in the document. You need to move the insertion point off the selection to the right so that the next _Find_ operation doesn't find the same instance. (If the procedure were searching up through the document instead of down, you'd need to move the insertion point off the selection to the left instead by using a Selection.MoveLeft statement.) + * Line 26 activates the strDestination window, putting Word's focus on it. + * Line 27 then moves the insertion point to the end of the glossary document, and line 28 pastes the copied item in at the position of the insertion point. Moving to the end of the document isn't strictly necessary here, provided that the Normal.dotm global template doesn't contain any text—if Normal.dotm is empty, the new document created in line 11 will be empty too, and the start and end of the document will be in the same position. And after each paste operation, Word positions the insertion point after the pasted item. However, if Normal.dotm _does_ contain text, then this step is necessary. + * Line 29 uses the TypeParagraph method of the Selection object to enter a paragraph after the text inserted by the paste operation. + * Line 30 activates the strSource document once more, and line 31 repeats the Find operation. + * The Loop statement in line 32 then loops execution of the procedure back to line 23, where the Do While Selection.Find.Found condition evaluates whether this latest Find operation was successful (True). + * If it was successful, the loop continues; if it wasn't, execution of the procedure continues at line 34, which activates the glossary document again. Line 35 saves the active document (the glossary document, because it was just activated), and line 36 closes it. + * Line 37 is a spacer, and line 38 ends the procedure. + +## _Do... Loop While_ Loops + +A Do... Loop While block is similar to a Do While... Loop, except that in the Do... Loop While loop, the statements contained within the loop are executed at least once. + +Whether the condition is True or False, the loop executes at least the first time through because the condition isn't tested until the end of the loop block. + +If the condition is True, the loop continues to run until the condition becomes False. Figure 12.2 shows a Do... Loop While loop. + +Figure 12.2 In a Do... Loop While loop, the actions in the loop run once before the condition is tested. + +The Do While... Loop block described earlier probably made immediate sense to you, but this Do... Loop While block may seem odd. You're going to execute the contained statements _before_ checking the condition? + +But you'll find that Do... Loop While loops can be very useful, although they lend themselves to different situations than Do While... Loop loops. + +Consider the lottery example from the beginning of the chapter. In that situation, you execute the action before you check the condition that controls the loop. First you buy a lottery ticket, and then you check to see if you've won. If you haven't won, or you've won only a small sum, you loop back and buy more tickets for the next lottery. (Actually, this is logically a Do... Loop Until loop rather than a Do... Loop While loop because you continue the loop while the condition is False; when you win a suitably large amount, the condition becomes True.) 
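+ +Sketched in code (my illustration; compare Listing 12.3 later in this chapter, which tests its condition at the top instead), the lottery logic looks like this: + + Dim intWin As Integer + Do + intWin = Rnd * 2100 'take the action first: buy a ticket + Loop Until intWin > 2000 'then check whether you have won enough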
+ +Likewise, in programming it's not uncommon to take an action and then check whether you need to repeat it. For example, you might want to apply special formatting to the current paragraph and then check to see if other paragraphs need the same treatment. + +### Syntax + +The syntax for a Do... Loop While loop is as follows: + + Do + [ _statements_ ] + [Exit Do] + [ _statements_ ] + Loop While _condition_ + +VBA performs the statements included in the loop, after which the Loop While line evaluates the condition. If it's True, VBA returns execution to the Do line and the loop continues to execute; if it's False, execution continues at the line after the Loop While line. + +As an example of a Do... Loop While loop, consider this crude password checker that you could use to prevent someone from executing a macro without supplying the correct password: + + Dim varPassword As Variant + **Do** + varPassword = InputBox _ + ("Enter the password to start the procedure:", _ + "Check Password 1.0") + Loop While varPassword <> "CorrectPassword" + +Here the Do... Loop While loop first displays an input box for the user to enter the password. The Loop While line compares the value from the input box, stored in varPassword, against the correct password (here, CorrectPassword). If the two aren't equal (varPassword <> ″CorrectPassword″), the loop continues, displaying the input box again. + +This loop is just an example—you wouldn't want to use it as it is in real life. Here's why: Choosing the Cancel button in an input box causes it to return a blank string, which also doesn't match the correct password, causing the loop to run again. The security is perfect; the problem is that the only way to end the loop is for users to supply the correct password. If they're unable to do so, they will see the input box again and again. There's no way out of the loop. This is called an _endless loop_ and it's really bad programming. The user can get hopelessly trapped with the code repeating endlessly (in this case if they can't remember the password). Such loop stalls are also called _infinite loops_. More on these at the end of this chapter. + +You should build a more friendly password-checking procedure. You might specify a number of incorrect password guesses that the user could enter (perhaps three) and then if they still haven't gotten it right, make the procedure terminate itself. Or you could simply use an End statement to terminate the procedure if the user entered a blank string, like this: + + Do + varPassword = InputBox _ + ("Enter the password to start the procedure:", _ + "Check Password 1.0") + If varPassword = "" Then **End** + Loop While varPassword <> "CorrectPassword" + +## _Do Until... Loop_ Loops + +A Do Until... Loop loop is similar to a Do While... Loop loop. The difference is how the condition works. In a Do Until... Loop loop, the loop runs while the condition is False and stops running when it's True. So this is the opposite of the way that the condition works in a Do While... Loop loop. + +Figure 12.3 shows a Do Until... Loop loop. + +Figure 12.3 A Do Until... Loop loop runs while the condition is False and stops running when the condition becomes True. + +* * * + +Do Until...Loop Blocks Execute Until a Condition Becomes True + +Note that Do Until... Loop loops are useful if you prefer to work with a condition that's False and keep it looping until the condition becomes True. Otherwise, you can achieve the same effects using Do While...
Loop loops and inverting the condition. In other words, these two approaches to looping are functionally the same; it's just a matter of how you want to manage the condition. It's the difference between "sweep the porch _until_ it's clean" versus "sweep the porch _while_ it's still dirty." Same idea, expressed differently. + +* * * + +### Syntax + +The syntax for Do Until... Loop loops is as follows: + + Do Until _condition_ + _statements_ + [Exit Do] + [ _statements_ ] + Loop + +When VBA enters the loop, it checks the _condition_. If the _condition_ is False, VBA executes the statements in the loop, encounters the Loop keyword, and loops back to the beginning of the loop, reevaluating the _condition_ as it goes. If the _condition_ is True, VBA terminates the loop and continues execution at the statement after the Loop line. + +For example, here's the lottery example once again, but now employing a Do...Until loop in Listing 12.3. + +**Listing 12.3**: Using Do...Until loops + + 1. Sub Lottery_1() + 2. Dim intWin As Integer + 3. Do Until intWin > 2000 + 4. intWin = Rnd * 2100 + 5. MsgBox intWin, , "Lottery" + 6. Loop + 7. End Sub + +Here's how Listing 12.3 works: + + * Line 2 declares the Integer variable intWin. Line 3 then starts a Do Until... Loop loop with the condition that intWin > 2000—the value of the intWin variable must be larger than 2000 for the loop to end. Until then, the loop will continue to run. + * Line 4 assigns to intWin the result of 2100 multiplied by a random number produced by the Rnd function, which generates random numbers between 0 and 1. (This means that the loop needs to receive a random number of a little more than .95 to end—a chance of a little less than 1 in 20, considerably better than most lotteries.) + * Line 5 displays a simple message box containing the current value of the intWin variable so that you can see how lucky you are. + * Line 6 contains the Loop keyword that completes the loop. + * Line 7 ends the procedure. + +Listing 12.4 shows a more useful example of a Do Until... Loop loop in Word. + +**Listing 12.4**: A practical example showing how to employ Do Until in Word + + 1. Sub FindNextHeading() + 2. **Do Until** Left(Selection.Paragraphs(1).Style, 7) = "Heading" + 3. Selection.MoveDown Unit:=wdParagraph, _ + Count:=1, Extend:=wdMove + 4. **Loop** + 5. End Sub + +Listing 12.4 contains a short procedure that moves the insertion point to the next heading in the active document in Word. Here's how it works: + + * Line 2 starts a Do Until... Loop loop that ends with the Loop keyword in line 4. The condition for the loop is that the seven leftmost characters in the name of the style for the first paragraph in the current selection—Left(Selection.Paragraphs(1).Style, 7)—match the string Heading. This will match any of the Heading styles (the built-in styles Heading 1 through Heading 9, or any style the user has defined whose name starts with _Heading_ ). + * Until the condition is met, VBA executes the statement in line 3, which moves the selection down by one paragraph. + +## _Do... Loop Until_ Loops + +The Do... Loop Until loop is similar to the Do Until... Loop structure except that in the Do... Loop Until loop, the statements contained within the loop block are executed at least once, whether the condition is True or False. If the condition is False, the loop continues to run until the condition becomes True. Figure 12.4 shows a Do... Loop Until loop. + +Figure 12.4 In a Do...
Loop Until loop, the actions in the loop are run once before the condition is tested. + +### Syntax + +The syntax for Do... Loop Until loops is as follows: + + Do + [ _statements_ ] + [Exit Do] + [ _statements_ ] + Loop Until _condition_ + +VBA enters the loop at the Do line and executes the _statements_ in the loop. When it encounters the Loop Until line, it checks the _condition_. If the condition is False, VBA loops back to the Do line and again executes the _statements_. If the condition is True, VBA terminates the loop and continues execution at the line after the Loop Until line. + +As an example, say you want to repeatedly display an input box that adds new worksheets to a workbook until the user clicks the Cancel button or enters an empty string in the text box. You could use code like that shown in Listing 12.5. + +**Listing 12.5**: Use Do Loop to execute the code at least once + + 1. Sub Create_Worksheets() + 2. Dim strNewSheet As String + 3. **Do** + 4. strNewSheet = InputBox _ + ("Enter the name for the new worksheet " _ + & "(31 characters max.):", "Add Worksheets") + 5. If strNewSheet <> "" Then + 6. ActiveWorkbook.Worksheets.Add + 7. ActiveSheet.Name = strNewSheet + 8. End If + 9. **Loop Until** strNewSheet = "" + 10. End Sub + +Here's what happens in the Create_Worksheets procedure: + + * Line 2 declares the String variable strNewSheet. + * Line 3 begins a Do... Loop Until loop. + * Line 4 displays an input box asking the user to enter the name for the new worksheet. + * Line 5 uses an If statement to make sure that strNewSheet is not an empty string. If it's not, line 6 adds a new worksheet to the active workbook, and line 7 assigns the value of strNewSheet to the Name property of the active sheet (the new sheet). Line 8 ends the If statement. + * Line 9 contains a Loop Until strNewSheet=″″ statement that causes the procedure to loop back to the Do line until the user enters an empty string in the input box. The user can enter an empty string either by leaving the text box in the input box blank and clicking the OK button or by clicking the Cancel button. + * Line 10 ends the procedure. + +## Using an _Exit Do_ Statement + +As with an Exit For statement in a For...loop, you can use an Exit Do statement to exit a Do loop without executing the statements below the Exit line. The Exit Do statement is optional, and you'll probably seldom want to use Exit Do statements in your loops—at least if the loops are properly designed. + +When you do need an Exit Do statement, you'll generally use it with its own condition. The example shown in Listing 12.6 makes the lottery a little more interesting by adding an If condition with an Exit Do statement to take effect if the win is less than $500. + +**Listing 12.6**: How to use the Exit Do command + + 1. Sub Lottery_2() + 2. Dim intWin As Integer + 3. Do Until intWin > 2000 + 4. intWin = Rnd * 2100 + 5. If intWin < 500 Then + 6. MsgBox "Tough luck. You have been disqualified.", _ + vbOKOnly + vbCritical, "Lottery" + 7. **Exit Do** + 8. End If + 9. MsgBox intWin, , "Lottery" + 10. Loop + 11. End Sub + +The procedure in Listing 12.6 works in the same way as the example in Listing 12.3 except that line 5 introduces a new If condition. If the variable intWin is less than 500, the statements in lines 6 and 7 run. Line 6 displays a message box announcing that the player has been disqualified from the lottery, and line 7 exits the Do loop. + +## Is the _Exit Do_ Statement Bad Practice?
+ +Some programmers consider using an Exit Do statement to exit a Do loop a tactic of last resort, or at least clumsy programming. Others disagree. Many reckon that it's always acceptable to use an Exit Do statement to respond to an error or to the user clicking a cancel button. + +VBA executes Exit Do statements with no problem, so it's there if you want to use it. However, you can often rewrite your code to avoid using an Exit Do statement. + +For example, a condition that you check in the middle of the loop to decide whether to exit the loop can often be built into the main condition of the loop by using an operator such as And, Or, or Not, as shown in Listing 12.7: + +**Listing 12.7**: How to avoid the Exit Do command + + 1. Sub Lottery_3() + 2. + 3. Dim intWin As Integer + 4. + 5. Do + 6. intWin = Rnd * 2100 + 7. MsgBox intWin, , "Lottery" + 8. **Loop Until intWin > 2000 Or intWin < 500** + 9. + 10. + 11. If intWin < 500 Then + 12. MsgBox "Tough luck. You have been disqualified.", _ + 13. vbOKOnly + vbCritical, "Lottery" + 14. End If + 15. + 16. End Sub + +Listing 12.7 is a revision of the example in Listing 12.6. Listing 12.7 shows you how to use the Or operator to specify two conditions for ending the loop. In this way, you can omit the Exit Do command entirely. + +In line 8 of Listing 12.7, we are saying that the loop should end if the variable is greater than 2000 Or less than 500. This makes it somewhat clearer what the loop is doing. + +We must also make two other changes. First, we have to move the condition test from the top of the loop to the bottom. The Do Until command in Listing 12.6 must be changed to the Loop Until command in Listing 12.7. If we leave the condition test at the top of the loop, the condition will _always_ prevent the loop from executing. This is because the intWin variable will always hold zero when this loop first executes. So we move the condition test to the bottom of the loop, allowing the variable to be assigned some value in line 6. + +The final change we need to make is to move the If...Then block down to the bottom of the procedure. + +If the code is simple like this example, you might be better off rewriting it to employ an operator. But if the code is complex and lengthy, there's no good reason to force yourself to use operators when an Exit Do statement will do the trick instead. + +# _While... Wend_ Loops + +In addition to the For...Next loop, the For Each... Next loop, and the four flavors of Do loops examined so far in this chapter, VBA includes the While... Wend loop. While... Wend is VBA's version of the While... Wend looping structure used by earlier programming languages, such as the WordBasic programming language used with versions of Word up to and including Word 95. VBA includes While... Wend more for compatibility with those earlier versions than as a recommended technique. But you can use it if you choose to. The various Do loops have replaced While... Wend, but While... Wend still works fine. + +The syntax of a While... Wend loop is as follows: + + While _condition_ + [ _statements_ ] + Wend + +While the _condition_ is True, VBA executes the _statements_ in the loop. When it reaches the Wend keyword (which is a contraction of While End), it returns to the While statement and evaluates the _condition_ again. When the _condition_ evaluates as False, the statements in the loop are no longer executed and execution moves to the statement after the Wend statement. + +The following statements create a simple While...
Wend loop for Word: + + While Documents.Count < 10 + Documents.Add + Wend + +While the number of documents in the Documents collection (measured here by the Count property of the Documents collection) is smaller than 10, the loop runs. Each time through, the Documents.Add statement in the second line creates a new document based on the Normal template (because no other template is specified). After the new document is created, the Wend statement in the third line returns execution to the first line, where the While condition is evaluated again. + +* * * + +Avoid Branching into the Middle of a While...Wend Loop + +If you do use a While... Wend loop, make sure the only way to enter the loop is by passing through the gate of the While condition. Branching into the middle of a While... Wend loop (for example, by using a label and a GoTo statement) can cause errors. + +* * * + +# Nesting Loops + +You can nest one or more loops within another loop to create the pattern of repetition you need: You can nest one For...loop inside another For...loop, a For...loop inside a Do loop, a Do loop inside a For...loop, or a Do loop inside a Do loop. + +* * * + +VBA Permits up to 16 Levels of Nesting, but Who Could Understand Such Complexity? + +You can nest up to 16 levels of loops in VBA, but you'll be hard-pressed to comprehend even half that number of levels as you read over your code. If you find your code becoming this complicated, consider whether you can take a less tortuous approach to solve the problem. + +* * * + +For example, if you need to create a number of folders, each of which contains a number of subfolders, you could use a variation of the Create_Folders procedure you looked at earlier in the chapter. But such a task cries out for nesting. + +The dialog box for the procedure will need another text box to contain the number of subfolders to create within each folder. The new dialog box is named frmCreateFoldersAndSubFolders and the text box for the number of subfolders is named txtHowManySubFolders. Figure 12.5 shows the dialog box. + +Figure 12.5 The dialog box to create folders and subfolders + +Listing 12.8 shows the code triggered by the Click event on the cmdOK button of the form. + +**Listing 12.8**: Employing a nested loop + + 1. Private Sub cmdOK_Click() + 2. + 3. Dim strStartingFolder As String + 4. Dim strFolderName As String + 5. Dim strSubfolderName As String + 6. Dim intSubfolder As Integer + 7. Dim intLoopCounter As Integer + 8. + 9. frmCreateFoldersAndSubfolders.Hide + 10. Unload frmCreateFoldersAndSubfolders + 11. + 12. strStartingFolder = CurDir + 13. + 14. **For intLoopCounter** = 1 To txtHowManyFolders.Value + 15. strFolderName = txtProjectNumber.Value & "s" & _ + Format(intLoopCounter, "0#") + 16. MkDir strFolderName + 17. ChDir strFolderName + 18. **For intSubfolder** = 1 To txtHowManySubfolders.Value + 19. strSubfolderName = "Subsection" & intSubfolder + 20. MkDir strSubfolderName + 21. **Next intSubfolder** + 22. ChDir strStartingFolder + 23. **Next intLoopCounter** + 24. + 25. End Sub + +Here's what the code in Listing 12.8 does: + + * Line 1 begins the procedure, and line 25 ends it. Line 2 is a spacer. + * Lines 3 through 5 declare three String variables, strStartingFolder, strFolderName, and strSubfolderName, respectively. + * Line 6 declares the Integer variable intSubfolder, and line 7 declares the Integer variable intLoopCounter. Line 8 is a spacer. + * Line 9 hides the user form, and line 10 unloads it. Line 11 is a spacer.
+ * Line 12 stores the name of the current folder in the String variable strStartingFolder. You'll need this variable to make sure everything happens in the appropriate folder later in the procedure. Line 13 is another spacer. + * Lines 14 through 16 and line 23 are essentially the same as in the previous procedure. They build the folder name out of the Value property of the txtProjectNumber text box, the letter _s_ , and a two-digit number formatted from the intLoopCounter variable, and then use the MkDir statement to create the folder. + * Line 17 uses a ChDir statement to change folders to the folder that was just created, strFolderName. + * In line 18, the nested For...Next loop starts. This loop is controlled by the loop counter intSubfolder and will run from intSubfolder = 1 to intSubfolder = txtHowManySubFolders.Value, which is the value entered by the user in the Number Of Subfolders To Create text box in the dialog box. + * Line 19 builds the String variable strSubfolderName out of the word _Subsection_ and the value of the intSubfolder _counter_ variable. For this procedure, you can assume that there will be fewer than 10 subsections for each of the sections, so single-digit numbering is adequate. + * Line 20 creates the subfolder by using a MkDir statement with the strSubfolderName String variable. + * Line 21 uses the Next intSubfolder statement to loop back to the beginning of the nested For...Next loop. VBA reevaluates the condition and repeats the loop as necessary. + * Line 22 changes folders back to strStartingFolder for the next iteration of the outside loop. (Otherwise, the next folder would be created within the current folder, strFolderName.) + * Line 23 then loops back to the beginning of the outer loop. + +* * * + +Use the Counter Variable with Next when Nesting for...loops + +Using counter variables with the Next command is optional (in Listing 12.8, the counter variables are named intLoopCounter and intSubfolder). You could simply use Next by itself and VBA will figure out what you mean. But when nesting For...loops, it's a good idea to include a counter variable to make it easier to see which loop is ending with the Next command (in other words, use Next intLoopCounter, for example, rather than just the shorthand version Next). Using a counter variable makes your procedures much easier to read and may prevent unpleasant surprises (bugs). Your nested loops must end in the exact reverse order of their starting, and the counters need to match. + +* * * + +# Avoiding Infinite Loops + +If you create an infinite (aka endless) loop in a procedure, it will happily run forever, unless the user presses Ctrl+Break, presses Ctrl+Alt+Del to use the Task Manager to shut down the frozen application, restarts the computer, or pulls the plug. + +For example, one type of loop you haven't yet encountered is the Do... Loop. As you can see in the example in Listing 12.9, without a condition attached to it, this structure is an infinite loop. There's no condition that can stop the looping. + +**Listing 12.9**: An example of an endless loop + + 1. Sub InfiniteLoop() + 2. Dim x + 3. x = 1 + 4. Do + 5. Application.StatusBar = _ + "Your computer is stuck in an endless loop: " & x + 6. x = x + 1 + 7. Loop + 8. End Sub + +In Listing 12.9, line 2 declares the variable _x_ , and line 3 assigns it the value 1. Line 4 begins the Do loop, which displays a status-bar message and increases the value of _x_ by 1.
The effect of this loop is to display a message and an ever-increasing number on the status bar until you press Ctrl+Break to stop the procedure or until the value overflows the variable's maximum value. This is all thoroughly pointless (except perhaps as a way to burn in a new computer) and is perhaps a good reason not to use the Do... Loop structure—at least not without a condition attached to one end of it. + +No matter what type of loop you use, to avoid creating an infinite loop, you need to make sure the condition that will terminate the loop can be satisfied at some point. For example, for an editing or cleanup procedure, you'll often want to perform an action until the end of the document is reached and then stop. Or you'll want to include some form of counting mechanism to make sure a Do loop doesn't exceed a certain number of iterations. + +# The Bottom Line + +**Understand when to use loops.** + +Loops come in very handy when you need to perform a repetitive task, such as searching through a document for a particular word. + +Master It + +What is the alternative to looping if you are carrying out repetitive tasks in a macro? + +**Use For...loops for fixed repetitions.** + +For...loops are the most common loop structures in programming. You specify the number of iterations the loop must make, and the loop is exited when that number is reached. + +Master It + +Write a For...Next loop that counts up to 100, but use the Step command to increment by twos. + +**Use Do... loops for variable numbers of repetitions.** + +A Do... loop iterates until or while a condition exists, then exits from the loop when the condition no longer exists. + +Master It + +There are two categories of Do... loops. Do While... Loop and Do Until... Loop loops test a condition before performing any action. What is the other category? + +**Nest one loop within another loop.** + +You can put loops inside other loops. + +Master It + +Think of a programming task where nested loops would be useful. + +**Avoid infinite loops.** + +An infinite (or endless) loop causes your macro to continue execution indefinitely—as if the macro had stopped responding and was "frozen." + +Master It + +How do you avoid creating an infinite loop? +Part 4 + +Using Message Boxes, Input Boxes, and Dialog Boxes + + * **Chapter 13: Getting User Input with Message Boxes and Input Boxes** + * **Chapter 14: Creating Simple Custom Dialog Boxes** + * **Chapter 15: Creating Complex Forms** + +Chapter 13 + +Getting User Input with Message Boxes and Input Boxes + +This chapter shows you how to start adding a user interface to recorded or written code in order to increase the power and functionality of your macros or applications. + +You'll learn the three easiest ways of communicating with the user of your code, the two easiest ways of enabling the user to make decisions in a procedure, and the easiest way of soliciting input from the user. Along the way, you'll see how to decide what is the best way to communicate with the user in any given set of circumstances. This will set the scene for starting an examination of more complex interactions with the user via custom dialog boxes, later in the book. + +In most Office applications, VBA offers you a choice of up to five ways of communicating with the user of a procedure: + + * Displaying a message on the status bar at the bottom of the window (if the application provides a status bar). This is a bit limited, but it can be an effective way of communicating with the user. 
And it's not intrusive—users can easily ignore the status bar if they wish. + * Displaying a message box (usually in the middle of the screen). Message boxes are useful both for providing some information to users and for giving them the means to make a single choice based on the information you give them. You'll spend the bulk of this chapter working with message boxes. + * Displaying an input box (again, usually in the middle of the screen). You can use input boxes the same way you use message boxes—to communicate some information to users. But the primary purpose of an input box is input: to solicit one item of information from the user. Input boxes also provide users with the means of making a single choice to direct the flow of a procedure, although the mechanism for presenting this choice is much more limited than that in a message box. You'll look at input boxes toward the end of this chapter. + * Displaying a dialog box (once again, usually in the middle of a screen). You can use dialog boxes both to display information to the user and to let them make a variety of choices that are communicated back to your code. Dialog boxes are best reserved for those times when other forms of communication won't suffice; in other words, there's no point in using a dialog box when a simple message box or input box will do. You'll look at creating your own custom dialog boxes by using VBA user forms later in the book. + * Displaying an application's built-in dialog box, such as Word's FileOpen dialog box. This approach is explored in Chapter 14, "Creating Simple Custom Dialog Boxes." + +In this chapter you will learn to do the following: + + * Display messages on the status bar + * Display message boxes + * Display input boxes + * Understand the limitations of message boxes and input boxes + +# Opening a Procedure to Work On + +Make sure you're all set for editing in the Code window in the VBA Editor: + +1. Start the application for which you're creating code. + +2. Launch the Visual Basic Editor from the host application by pressing Alt+F11. + +3. Open a procedure for editing in the Code window: Use the Project Explorer to navigate to the module that holds the procedure, and then either scroll to the procedure in the Code window or choose it from the Procedures drop-down list in the Code window. + +* * * + +You Can Locate Procedures Using the Macro Dialog Box + +Alternatively, in the VBA Editor, choose Tools ⇒ Macros to display the Macros dialog box. Or to display this dialog box from an application such as Word, click the Developer tab on the Ribbon, then click the Macros icon. Once the Macros dialog box is open, you can select a procedure you've created from the Macro Name list box and click the Edit button to display the Visual Basic Editor with the procedure open in the Code window. + +If you've opened an existing procedure, test its code by using the F8 key to step through the statements or by pressing F5 (the Run Sub/UserForm command) to run it without stepping. (You can also run it by typing the procedure's name into the Editor's Immediate window and pressing Enter.) + +* * * + +Nevertheless, it's probably best to work in a new procedure rather than in an existing one because that way you won't do any damage to a macro you may want to use in the future. + +Create a new procedure in the Visual Basic Editor Code window by typing the Sub keyword, giving the procedure a name on a blank line in a module, and then pressing Enter. VBA adds the parentheses and End Sub statement.
For example, you could type the following and press the Enter key: + + Sub Experimentation_Zone + +VBA adds the parentheses and End Sub statement, together with a separator line to separate the procedure from any adjacent procedures in the Code window: + + Sub Experimentation_Zone() + End Sub + +# Displaying Status-Bar Messages in Word and Excel + +Word and Excel let you display information on the status bar. This is often a convenient way to tell the user what's happening in a procedure without halting execution of the code (or, more important, without interrupting the user's work and requiring them to click a button to get rid of your message box). + +By displaying status information on the status bar as the procedure works, you can indicate to the user not only what the procedure is doing, but also that it's still, in fact, running. Of course, the user might not _notice_ the status bar. So if you are displaying crucial information, you must use a message box or one of the other types of boxes, like an input box. These force the user to pay attention; no further work can be done within the application until that box is dismissed. + +* * * + +How to Avoid Alarming the User + +A problem you'll sometimes encounter is that the user thinks a procedure has frozen, crashed, gone into an infinite loop, or failed to work because no changes are visible onscreen, whereas in fact your procedure is working properly in the background. If you have a procedure that takes a long time to execute, updates on the status bar let the user see that the procedure is still working. To see example code that illustrates how to update the status bar, take a look at the sidebar entitled "i Is the Traditional Counter Variable Name for For...Next Loops" in Chapter 12, "Using Loops to Repeat Actions." + +* * * + +But remember that the main disadvantage of displaying messages on the status bar is that users may miss them if they're not paying attention, if they've hidden the status bar, or if they're not expecting to see messages there. + +* * * + +How to Hide the Status Bar + +When I mentioned hiding the status bar in the previous paragraph, you might have launched an effort to do just that. You looked all over the Ribbon, paying particular attention to the View tab. Then you clicked the File tab to open the Options dialog box. But you didn't find a way, anywhere, to hide the status bar. Well, this is yet one more reason to learn VBA. As I've mentioned, you can do things with VBA that are not possible any other way. Here's the code that will hide the status bar: + + Sub HideStatusBar() + Application.CommandBars("Status Bar").Visible = False + End Sub + +* * * + +If an application uses the status bar extensively to give the user information (as Word and Excel do), this might not be a problem for attentive users. But if there's any doubt, notify the user that information will be displayed on the status bar. For example, you might display a message box at the beginning of a procedure to tell the user to watch the status bar for updates. + +To display a message on the status bar in Word or Excel, you set the StatusBar property of the Application object to an appropriate string of text. The following example displays the status-bar information shown in Figure 13.1: + + Application.StatusBar = "Word is formatting the report. Please wait..." + +Figure 13.1 In some applications, you can display information on the status bar.
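+ +If the procedure does lengthy work, you can refresh such a message from inside the loop that does the work so that the user can watch the progress. Here's a minimal sketch (the procedure name, the 100-item count, and the message text are placeholders for illustration): + + Sub ShowProgress() + Dim i As Long + For i = 1 To 100 + ' Tell the user how far the procedure has gotten + Application.StatusBar = _ + "Processing item " & i & " of 100..." + ' ... the real work for item i would go here ... + Next i + ' Clear the message when the work is done + Application.StatusBar = "" + End Sub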
+ +Typically, any information you display on the status bar remains displayed there until you change it, until the user clicks something, or until the application displays a message there itself. + +For example, if you display a message on the status bar and then invoke the Copy command in Excel, Excel displays its normal Copy message, "Select destination and press ENTER or choose Paste," on the status bar, wiping out your message. Application messages trump user-created messages. + +If you display a message on the status bar in the course of a procedure, you should update it later in the procedure to avoid leaving a now-obsolete and potentially misleading message on the status bar after the procedure has finished running. For example, you might display another message saying that the procedure has finished or clear the status bar by displaying a blank string on it. + +To clear the status bar, assign an empty string to it, as in the following statement: + + Application.StatusBar = "" + +To see the effect of this statement, run it from the Visual Basic Editor (click the Restore Down button near the upper-right corner to ensure that the Editor window isn't maximized) with the Word or Excel window (or at least its status bar) visible at the same time. You'll see the effect best if you run a statement that displays information on the status bar (such as Application.StatusBar = "Hello, World!") first so that the status bar has information for the Application.StatusBar = "" statement to clear: + + Application.StatusBar = "Hello, World!" + Application.StatusBar = "" + +* * * + +Progress Indicators Can Be Written Various Ways + +It's especially helpful to display a progress indicator on the status bar during longer processes so that the user can tell that they're still running and that they're making progress. Progress indication is usually coded within a loop block. For example, you might display a readout of the progress, such as "Excel is working on sheet 9 out of 150." Even more simply, adding increasing numbers of periods to the end of the status message gives an indication of progress, although it doesn't give an idea of how much longer the task will take. Here's how you can add periods to a string: + + strPeriod = strPeriod & "." + +* * * + +# Message Boxes + +Another way to display information to the user is the message box; you've probably seen examples of it in almost every Windows application you've used. Message boxes are simple and limited, but they play an important role. + +Here are some typical uses of message boxes: + + * Telling users what a procedure is about to do (and giving them the chance to exit the procedure if it isn't what they thought it was). + * Presenting users with an explanation of what a procedure will do next and asking them to make a simple decision (usually, to let it proceed or to send it on a different course). + * Warning users of an error that the procedure encountered and allowing them to take action on it. + * Informing users that a procedure ran successfully and that it has finished. This message is particularly useful for procedures that turn off screen updating or otherwise hide from users what they are doing. Such procedures may leave users unsure of whether they are still running or have finished. You can also use the message box to report what a procedure has done—for example, that it changed particular items, made a certain number of changes, or discovered problems in the document that require attention. A minimal sketch of such a completion report follows this list.
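+ +For instance, that last use—reporting what a procedure has done—might look something like the following sketch (the procedure name and the count are invented for illustration; a real procedure would tally the count as it works): + + Sub ReportCleanupResults() + Dim lngChanges As Long + lngChanges = 14 ' in a real procedure, counted during the work + MsgBox "The cleanup procedure has finished. " & _ + "Changes made: " & lngChanges + End Sub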
+ +This chapter shows you how to create a message box suitable for each of these tasks. In later chapters, you'll create specific message boxes to enhance various procedures. + +## The Pros and Cons of Message Boxes + +These are the advantages of using a message box: + + * Users can't miss seeing the message box. Users are prevented from continuing to use the application until they close the message box. (If you want, you can even display a message box that the user can't escape by pressing Alt+Tab to switch to another application. You'll look at this a little later in the chapter.) + * You can present the user with a simple choice among two or three options. + +These are the disadvantages of using a message box: + + * A message box can present only one, two, or three buttons, which means it can offer only a limited set of options to the user. + * The buttons in message boxes are predefined in sets—you can't put a custom button in a message box. (For that, you have to use a dialog box.) + * You can't use features such as text boxes, group boxes, or list boxes within message boxes. + +## Message-Box Syntax + +The basic syntax for message boxes is as follows: + + MsgBox( _prompt_ [, _buttons_ ] [, _title_ ][, _helpfile, context_ ]) + +Here's what the elements of this syntax mean: + +**MsgBox** + +The function that VBA uses to display a message box. You typically use it with a number of arguments enclosed in parentheses after it. + +**_prompt_** + +A required argument for the MsgBox function that specifies what text is displayed in the message box. _prompt_ is a String argument, meaning you need to type in the text of your choice; it can be up to approximately 1,024 characters long, although it's usually a good idea to be more concise than this. (Any longer prompt is truncated to the limit without warning.) + +**_buttons_** + +An optional argument that controls the type of message box that VBA displays by specifying which buttons it contains. For example, as you'll see in a couple of pages, you can display a message box with just an OK button; with OK and Cancel buttons; with Abort, Retry, and Ignore buttons; and so on. You can also add arguments to the _buttons_ argument that control the icon in the message box and the modality of the message box. You'll also look at these options later in this chapter. + +**_title_** + +An optional argument that controls the title bar of the message box. This too is a String argument. If you don't specify _title_ , VBA uses the application's title—Microsoft Word for Word, Microsoft Excel for Excel, Microsoft PowerPoint for PowerPoint, and so on. Usually, it's best to specify the title because the application name on its own isn't helpful (unless the user has become confused as to which application is running the procedure). + +**_helpfile_** + +An optional argument that controls which Help file VBA displays when the user presses F1 within the message box to get help (or clicks the Help button in a message box that contains a Help button). + +**_context_** + +An optional argument that controls which topic in the Help file VBA jumps to. If you specify the helpfile argument, you must specify the _context_ argument as well. + +In the following sections, you'll first look at how you can build the simplest of message boxes and then explore how to add arguments to it to make it more complex.
+ +## Displaying a Simple Message Box + +You can display the simplest message box by specifying only the prompt as a text string enclosed in double quotation marks: + + MsgBox "This is a simple message box." + +Run from Excel, this statement produces the simple message box shown in Figure 13.2. With _prompt_ as the only argument supplied, VBA produces a message box with only an OK button and with the application's name in the title bar. This message box does nothing except display information. + +Figure 13.2 When you use only the _prompt_ argument to display a simple message box, VBA uses the application's name as the title. + +You can enter this MsgBox statement on any blank line within a procedure. After you type the MsgBox keyword, VBA's Auto Quick Info feature prompts you with the syntax of the function, as shown in Figure 13.3. + +Figure 13.3 VBA's Auto Quick Info feature prompts you with the syntax for the message box. + +Once you've entered the MsgBox statement with its required argument ( _prompt_ ), you can display the message box by stepping through the code (by pressing the F8 key or clicking the Step Into button on the editor's Debug toolbar) or by running the procedure (by pressing the F5 key, by clicking the Run Sub/UserForm button, or by choosing Run ⇒ Run Sub/UserForm). + +Instead of entering a literal text string for the _prompt_ argument, you can use a String variable. The following example uses a String variable named strMsg: + + Dim strMsg As String + strMsg = "This is a simple message box." + MsgBox strMsg + +This approach can be useful when you're working with long strings (you can build a big string by concatenating several shorter strings with the & operator). Using a variable is also useful when you need to display a string that has been defined earlier in the procedure or a string dynamically created by the procedure (for example, after having gotten the user's name via an input box). + +## Displaying a Multiline Message Box + +By default, VBA displays short message strings as a single line in a message box and wraps longer strings onto two or more lines as necessary, up to the limit of 1,024 characters in a string. + +You can deliberately break a string into more than one line by including line-feed and carriage-return characters in the string as follows: + + * Chr(13) or vbCr represents a carriage return. + * Chr(10) or vbLf represents a line feed. + * Chr(13) + Chr(10) or vbCrLf represents a carriage-return/line-feed combination. + +In message boxes, these three characters all have the same effect—moving down one line. Your code is easier to read if you use a built-in constant (vbCr, vbLf, or vbCrLf) rather than the corresponding Chr() construction; it's also quicker to type. Usually, it's clearest to use the vbCr constant. + +You can add a tab to a string by using Chr(9) or vbTab. Again, vbTab is easier to read and to type. + +The following code displays the Word message box shown in Figure 13.4. Note that each part of the text string is enclosed in double quotation marks (to tell VBA that they're part of the string). The Chr(149) characters are bullets, so the text after them starts with a couple of spaces to give the bullets some room: + + Dim strMsg As String + strMsg = "Word has finished formatting the report you requested." _ + & vbCr & vbCr & "You can now run the following procedures:" & vbCr _ + & vbCr & Chr(149) & " Distribute_Report will email the report to " _ + & "the head office." _
& vbCr & vbCr & Chr(149) & _ + " Store_Report will copy the report to the holding directory." _ + & vbCr & vbCr & Chr(149) & " Backup_Report will create a backup " _ + & "of the report on the file server." + MsgBox strMsg + +Figure 13.4 You can display a multiline message box by using line-feed and carriage-return characters within the prompt string. + +* * * + +VBA Automatically Helps You Punctuate Your Code + +You'll notice that in this example, a space appears on either side of each of the ampersands (&) and the equal sign. You can enter these spaces yourself or have VBA enter them for you when you move the insertion point to another line by pressing Enter or clicking the mouse. (Moving the insertion point to another line causes VBA to check the line you've just been working on and make various automatic changes if necessary. For example, some characters may be capitalized, or if you typed EndIf, VBA will make it two words as it's supposed to be.) + +* * * + +## Choosing Buttons for a Message Box + +The _buttons_ argument controls which buttons a message box contains. VBA offers the types of message boxes shown in Table 13.1, controlled by the _buttons_ argument. + +Table 13.1 Message-box types, controlled by the _buttons_ argument + +**Value** | **Constant** | **Buttons** +---|---|--- +0 | vbOKOnly | OK +1 | vbOKCancel | OK, Cancel +2 | vbAbortRetryIgnore | Abort, Retry, Ignore +3 | vbYesNoCancel | Yes, No, Cancel +4 | vbYesNo | Yes, No +5 | vbRetryCancel | Retry, Cancel + +You can specify these message-box types in your code by using either the numeric value or the constant. For example, you can specify either 1 or vbOKCancel to produce a message box with OK and Cancel buttons. The value is easier to type; the constant is easier to read. Either of the following statements produces the message box shown in Figure 13.5 when run from PowerPoint: + + Dim lngR As Long + lngR = MsgBox("Apply standard formatting to the slide?", vbYesNo) + lngR = MsgBox("Apply standard formatting to the slide?", 4) + +Figure 13.5 The vbYesNo constant produces a message box with Yes and No buttons. + +From VBA's point of view, it doesn't matter whether you use values or constants in the message boxes for your procedures. For the human, though, the text constants are far preferable. Even if you're the only person who ever sees your code, the code is much easier to read if you use the constants. + +## Choosing an Icon for a Message Box + +You can also add an icon to a message box by including the appropriate value or constant argument. Table 13.2 shows the options. + +Table 13.2 Arguments for message-box icons + +**Value** | **Constant** | **Displays** +---|---|--- +16 | vbCritical | Stop icon +32 | vbQuestion | Question-mark icon +48 | vbExclamation | Exclamation-point icon +64 | vbInformation | Information icon + +Again, you can refer to these icons by using either the value or the constant: Either 48 or vbExclamation will produce an exclamation-point icon. Again, the constant is much easier to read. + +To link the value or constant for the message box with the value or constant for the icon, use a plus sign ( **+** ). For example, to produce a message box containing Yes and No buttons together with a question-mark icon (see Figure 13.6), you could enter **vbYesNo + vbQuestion** (or **4 + 32, vbYesNo + 32** , or **4 + vbQuestion** ): + + lngR = MsgBox("Apply standard formatting to the slide?", _ + vbYesNo + vbQuestion) + +Figure 13.6 Adding an icon gives a message box greater visual impact. 
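+ +Because the _buttons_ argument is simply a number, the constants and their values are interchangeable once added together. This minimal sketch (the procedure name is invented for illustration) shows that the constant form and the literal sum produce identical message boxes: + + Sub ConfirmFormatting() + Dim lngR As Long + ' vbYesNo (4) + vbQuestion (32) add up to 36, so these two + ' statements display exactly the same message box + lngR = MsgBox("Apply standard formatting to the slide?", _ + vbYesNo + vbQuestion) + lngR = MsgBox("Apply standard formatting to the slide?", 36) + End Sub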
+ +## Setting a Default Button for a Message Box + +As usual in the Windows interface, the user is cued to a default button in a message box. It's the one with a blue border around its outside and a dotted line around its text area. (See the Yes button in Figure 13.6.) The user can move the selection to another button by using Tab or Shift+Tab or the →, ←, ↑, or ↓ key. + +However, you can specify in your code which button you want to be the default. + +* * * + +**The Practical Use of Default Buttons** + +You can set a default button for a message box by specifying a particular button in the MsgBox statement. Specifying a default button can be a wise move when you give procedures that take drastic action to users who may be unfamiliar with what's going to happen. (The user might accidentally hit the Enter key or click the highlighted button—the default button.) + +For example, consider a procedure that deletes the current document without the user having to close it and then switches to a file-management program (such as Windows Explorer) or messes around in one of the common dialog boxes (such as the Open or the Save dialog box). Common dialog boxes are demonstrated in the Real World Scenario sidebar titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12. + +Because such procedures can destroy someone's work if they run it inadvertently, you'd probably want to set a default button of No or Cancel in a confirmation message box so that the user has to actively choose to run the rest of the procedure. The message box halts execution, allows the user to agree or disagree with the action, and then carries out the user's wishes based on which button is clicked in the message box. + +Why does VBA include a default button at all? This makes it easy for the user to choose the ordinary VBA default button (captioned Yes or OK) by simply pressing Enter. Having the appropriate default button on a message box or dialog box can help the user deal with the message box or dialog box more quickly. But you as the programmer should decide if there is a different, more appropriate, default button. VBA automatically sets the first button in a message box to be the default button. But there are times that you will want to specify that the default button be a different button than the first. If you are doing something potentially dangerous in a macro—such as deleting the current document without saving it—it would be a good idea to make the second button (the No button) the default. This way, if the user simply presses Enter, nothing happens; the macro exits without deletion. Using this technique, you force the user to make a deliberate decision to move the mouse and click the Yes button. Table 13.3 shows you how to adjust which button is the default by using various built-in constants. And the short code example that ends the section demonstrates this technique. + +* * * + +Table 13.3 lists the arguments for default buttons. + +Table 13.3 Arguments for default message-box buttons + +**Value** | **Constant** | **Effect** +---|---|--- +0 | vbDefaultButton1 | The first button is the default button. +256 | vbDefaultButton2 | The second button is the default button. +512 | vbDefaultButton3 | The third button is the default button. +768 | vbDefaultButton4 | The fourth button is the default button. 
+ +All the message boxes mentioned so far have only one, two, or three buttons, but you can add a Help button to any of the message boxes, making for a fourth button on those boxes that already have three buttons (such as vbYesNoCancel). You'll see how to add the Help button in the section "Adding a Help Button to a Message Box" later in this chapter. + +In VBA, unless you specify otherwise, the first button on each of the message boxes is automatically the default button: for example, the OK button in a vbOKCancel message box, the Abort button in a vbAbortRetryIgnore message box, the Yes button in a vbYesNoCancel message box, the Yes button in a vbYesNo message box, and the Retry button in a vbRetryCancel message box. VBA counts the buttons in the order they're presented in the constant for the type of message box (which in turn is the left-to-right order in which they appear in the message box onscreen). So in a vbYesNoCancel message box, Yes is the first button, No is the second button, and Cancel is the third button. + +To make a different button the default, specify the value or constant as part of the _buttons_ argument. When run in PowerPoint, this statement produces the message box shown in Figure 13.7: + + Dim lngQuery As Long + lngQuery = MsgBox("Do you want to delete this presentation?", _ + vbYesNo + vbCritical + vbDefaultButton2) + +Figure 13.7 Specify a default button to steer the user toward a particular button in a message box. + +## Controlling the Modality of a Message Box + +VBA can display both application-modal message boxes and system-modal message boxes—at least in theory. _Application-modal_ message boxes stop you from doing anything in the current application until you dismiss them, whereas _system-modal_ message boxes stop you from doing anything _on your entire computer_ until you dismiss them. + +Most message boxes are application modal, allowing the user to switch to another application by pressing Alt+Tab (or switching via the Taskbar). The user can then work in the other application even though they haven't gotten rid of the message box. This gives them freedom and flexibility. In contrast, some message boxes (most often used during an installation process) are system modal, insisting that users concentrate their attention on them and them alone. Windows's critical system errors and "you must restart your computer now" messages are system modal to prevent you from avoiding them. + +You probably know from your own experience how frustrating system-modal message boxes can be. So when designing procedures, use system-modal message boxes only when absolutely necessary—for example, when an action might result in data loss or system instability. For most conventional purposes, application-modal message boxes will do everything you need them to—and won't confuse or vex your users. + +In theory, you can control the modality of a message box by using the two _buttons_ arguments shown in Table 13.4. + +Table 13.4 Arguments for message-box modality + +**Value** | **Constant** | **Result** +---|---|--- +0 | vbApplicationModal | The message box is application modal. +4096 | vbSystemModal | The message box is system modal. + +In practice, even if you use the vbSystemModal argument, the user can switch to another application (provided that one is running) and continue working. However, the message box does stay "on top," remaining displayed—enough to annoy users but not totally prevent them from accessing another application. 
+ +By default, message boxes are application modal, so you need to specify modality only on those rare occasions when you need a system-modal message box. When you do, add the vbSystemModal constant or 4096 value to the _buttons_ argument: + + Response = MsgBox("Do you want to delete this document?", _ + vbYesNo + vbCritical + vbDefaultButton2 + vbSystemModal) + +Please note that system-modal message boxes look the same as application-modal message boxes. + +## Specifying a Title for a Message Box + +The next component of the message box is its title bar, which is controlled by the optional _title_ argument. If you omit _title_ , VBA displays the application's name as the title, but users of your procedures will benefit from your providing a more helpful title. + +_title_ is a string expression and can be up to 1,024 characters in length, in theory (longer strings are truncated with no warning or error message), but in practice, any title longer than about 75 characters gets truncated with an ellipsis. If you want people to read the title bars of your message boxes, 25 characters or so is a reasonable maximum. + +## Title Bars Can Provide Useful Information + +The title bar is usually the first part of a message box that the user notices, so make your title bars as helpful as possible. Conventional etiquette is to put the name of the procedure in the title bar of a message box and then use the prompt argument to explain what actions the buttons in the message box will trigger. + +In addition, if you expect to revise your procedures, you may find it helpful to include their version number in the title so that users can easily check which version of the procedure they're using (and update to a more current version as appropriate). For instance, the Delete Workbook procedure is identified as version 12.39 in the message box shown in Figure 13.8. + +Figure 13.8 Usually, you'll want to specify the title argument for your message boxes. You may also want to include a version number. + +Specify the _title_ argument after the _buttons_ argument like this: + + Dim lngQuery As Long + lngQuery = MsgBox("Do you want to delete this workbook?", vbYesNo _ + + vbCritical + vbDefaultButton2, "Delete Workbook 12.39") + +You can use a string variable as the _title_ argument. For example, you could declare a single string variable and use it to supply the title for each message box that a procedure calls. Or you might need to display in the title of the message box a string created or stored in the procedure. + +* * * + +**Avoid Using Special Characters in Titles** + +Don't try putting line-feed, carriage-return, or tab characters in the title argument. VBA just ignores them. + +* * * + +## Adding a Help Button to a Message Box + +To add a Help button to a message box, use the vbMsgBoxHelpButton constant. You add this argument to whichever buttons you're specifying for the message box: + + lngQuery = MsgBox("Do you want to delete this workbook?", vbYesNo _ + + vbCritical + vbDefaultButton2 + **vbMsgBoxHelpButton** , _ + "Delete Workbook") + +Adding the vbMsgBoxHelpButton argument simply places the Help button in the message box—it doesn't make the Help button display a Help file until you specify which Help file and topic it should use (see the next section for details). Figure 13.9 shows the message box that this statement produces. + +Figure 13.9 Use the vbMsgBoxHelpButton constant to add a Help button to a message box. 
+ +## Specifying a Help File for a Message Box + +The final arguments you can use for a message box are the helpfile and _context_ arguments: + + * The helpfile argument is a string argument specifying the name and location of the Help file that VBA displays when the user summons help from the message box. + * The _context_ argument is a Help context number within the Help file. The Help context number controls which Help-file topic is displayed. + +The helpfile and _context_ arguments are primarily useful if you're writing your own Help files, because otherwise it's difficult to access the Help context numbers, which are buried in the official Help files. + +If you're writing your own Help files, the syntax for specifying the helpfile and _context_ arguments is simple: + + Dim lngQuery As Long + lngQuery = MsgBox("Do you want to delete this workbook?", vbYesNo _ + + vbCritical + vbDefaultButton2 + vbMsgBoxHelpButton, _ + "Delete Workbook", "c:\Windows\Help\My_Help.chm", 1012) + +In this case, the Help file is specified as My_Help.chm in the \Windows\Help\ folder. VBA displays the Help topic numbered 1012. + +When the user clicks the Help button in the message box, VBA displays the specified topic in the Help file. The message box stays onscreen so that when users have finished consulting the Help file, they can make their choice in the message box. + +The Help context number for the opening screen of a Help file is 0. Use 0 when you need to display a Help file for which you don't know the Help context number. Users must then locate the information they need on their own. + +* * * + +**Three Unusual Constants for Special Effects** + +VBA provides three special constants for use with message boxes. You probably won't need to use these often, but if you do, they'll come in handy. Specify them as part of the _buttons_ argument: + +vbMsgBoxSetForeground + +Tells VBA to make the message box the foreground window. You shouldn't need to use this constant often, because message boxes are displayed in the foreground by default (so that you can see them). + +vbMsgBoxRight + +Tells VBA to right-align the text in the message box. + +vbMsgBoxRtlReading + +Tells VBA to arrange the text from right to left on Hebrew and Arabic systems. It has no effect on non-BiDi (bidirectional) systems. + +* * * + +## Using Some Arguments without Others + +When displaying a message box, you can either specify or omit optional arguments. If you want to specify arguments later in the argument list without specifying the ones before them, use a comma to indicate each unused optional argument. (This technique can be used with any argument list.) For example, if you wanted to display the message box shown in the previous example without specifying _buttons_ and _title_ arguments, you could use the following statement: + + Response = MsgBox("Do you want to format the report?",,, _ + "c:\Windows\Help\Procedure Help.chm", 1012) + +Here, the triple comma indicates that the _buttons_ and _title_ arguments are omitted (which will cause VBA to display defaults—a vbOKOnly message box with a title bar containing the application's name), preventing VBA from confusing the helpfile argument with the _buttons_ argument.
Alternatively, you could use named arguments, which makes for less-concise but easier-to-read code:

    Response = MsgBox("Do you want to format the report?", _
        HelpFile:="c:\Windows\Help\Procedure Help.chm", Context:=1012)

## Retrieving a Value from a Message Box

If you display a vbOKOnly message box, you know which button the user clicks because the message box contains only an OK button. But when you use one of the other message-box styles, which can have two, three, or four buttons, you must retrieve a value that tells you which button the user clicked. You can then branch execution to respond appropriately to the user's choice.

To retrieve a value from a message box, declare a variable for it. You can do so quite simply by telling VBA that the variable name is equal to the message box (so to speak), like this:

    Dim lngResponse As Long
    lngResponse = MsgBox("Do you want to create the daily report?", _
        vbYesNo + vbQuestion, "Create Daily Report")

As in the examples throughout this chapter, you first declare a variable of the appropriate type (here, a Long) to contain the user's choice. When you run the code, VBA stores the button the user clicked as a value in the variable. You can then check the value and take action accordingly.

Table 13.5 shows the full list of buttons the user may choose. You can refer to the buttons by either the constant name or the value number. As usual, the constant is easier to read than the value.

Table 13.5 Constants for selected buttons

**Value** | **Constant** | **Button Selected**
---|---|---
1 | vbOK | OK
2 | vbCancel | Cancel
3 | vbAbort | Abort
4 | vbRetry | Retry
5 | vbIgnore | Ignore
6 | vbYes | Yes
7 | vbNo | No

For example, to check a vbYesNo message box to see which button the user chose, you can use a straightforward If... Then... Else statement:

    Dim lngUserChoice As Long
    lngUserChoice = MsgBox("Do you want to create the daily report?", _
        vbYesNo + vbQuestion, "Create Daily Report")
    If lngUserChoice = vbYes Then
        GoTo CreateDailyReport
    Else
        GoTo Bye
    End If

Here, if the user chooses the Yes button, VBA goes to the line of code identified by the CreateDailyReport label and continues running the procedure from there; if not, it terminates the procedure by going to the Bye label at the end. The If condition checks the response generated by the choice the user made in the message box to see if it's a vbYes (generated by clicking the Yes button or pressing Enter with the Yes button selected). The Else statement runs if the response was not vbYes—that is, if the user clicked the No button or pressed Esc.

# Input Boxes

Message boxes tell VBA which button the user clicked. But sometimes you want the user to supply your macro with some text, such as their name or birthday.

When you want to retrieve one simple piece of text information from the user, use an input box. You'll be familiar with input boxes by sight if not by name: they usually look something like the example shown in Figure 13.10.

Figure 13.10 Use an input box to retrieve a single piece of information from the user.

* * *

Create Custom Boxes for Complex Interaction

To retrieve two or more pieces of information from the user, you could use two or more input boxes in succession, but it's usually easier for the user if you create a custom dialog box. You'll start building custom dialog boxes in Chapter 14.
* * *

## Input-Box Syntax

The syntax for displaying an input box is straightforward and similar to the syntax for a message box:

    InputBox( _prompt_ [, _title_ ] [, _default_ ] [, _xpos_ ] [, _ypos_ ] [, _helpfile, context_ ])

Here's what the arguments mean:

**_prompt_**

A required string that specifies the prompt that appears in the input box. As with MsgBox, _prompt_ can be up to about 1,024 characters long, and you can use the carriage-return constant (vbCr) to force separate lines. Like the MsgBox _prompt_ argument, the InputBox _prompt_ automatically wraps if the prompt is longer than about 35 characters.

**_title_**

A string that specifies the text in the title bar of the input box. If you don't specify a _title_ argument, VBA supplies the application's name.

**_default_**

A string that you can use to specify text that will appear in the text box. Entering a _default_ argument can be a good idea both when the default text is likely to be suitable (so the user can just press Enter to accept that default) and when you need to display sample text so that the user can understand what type of response you're looking for.

Here's an example of suitable default text to cue the user: if you display an input box asking for the user's name, you could supply the Name value by fetching it from the BuiltInDocumentProperties collection of the ActiveDocument object, like this:

    Dim strAuthor As String
    strAuthor = _
        ActiveDocument.BuiltInDocumentProperties(wdPropertyLastAuthor)

**_xpos_** and **_ypos_**

These are optional numeric values for specifying the onscreen position of the input box. _xpos_ governs the horizontal position of the left edge of the input box from the left edge of the screen (not of the Word window), whereas _ypos_ governs the vertical position of the top edge of the input box from the top of the screen. Each measurement is in _twips_, described in the sidebar "Input Boxes Are Usually Best Displayed in the Center of the Screen" in this chapter. If you omit these two arguments, VBA displays the input box at the default position of halfway across the screen and one-third of the way down it. (A short sketch of these arguments in use appears at the end of this section.)

**_helpfile_** and **_context_**

Optional arguments for specifying the Help file and the context in the Help file to jump to if the user summons help from the input box. If you use _helpfile_, you must also use _context_.

* * *

Input Boxes Are Usually Best Displayed in the Center of the Screen

A twip is 1/1440 inch. An average computer screen uses 96 dots per inch (dpi), so there are 15 twips per pixel, and a computer screen at 1024 × 768 resolution is 15,360 × 11,520 twips. If you need to position your input boxes and dialog boxes precisely, experiment with twips at different screen resolutions until you achieve satisfactory results. Generally, though, it's most effective to display an input box in the default center position, because your users are likely to have a variety of screen resolutions.

* * *

You can omit any of the optional arguments for an input box. But if you want to use another argument later in the syntax sequence, remember that you need to indicate the omission with a spacer comma (or use named arguments as described earlier in this chapter).

Unlike message boxes, input boxes come with a predefined set of buttons—OK and Cancel, plus a Help button if you specify the _helpfile_ and _context_ arguments—so there's no need to specify the main buttons for an input box.
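Here's the promised minimal sketch of the _xpos_ and _ypos_ arguments in use. The twip values are arbitrary and chosen only for illustration; note the spacer comma standing in for the omitted _default_ argument:

    Dim strName As String
    ' Display the input box 3000 twips from the left edge of the screen
    ' and 2000 twips down from the top of the screen
    strName = InputBox("Type your name:", "Example", , 3000, 2000)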
The following example declares the String variable strWhichOffice and assigns to it the result of the input box shown in Figure 13.11:

    Dim strWhichOffice As String
    strWhichOffice = InputBox( _
        "Enter the name of the office that you visited:", _
        "Expense Assistant", "Madrid", , , _
        "c:\Windows\Help\Procedure Help.chm", 0)

Figure 13.11 The input box comes with a predefined set of buttons.

## Retrieving Input from an Input Box

To retrieve the user's input from an input box, declare the String variable (or, if you expect the user to type a number, the numeric variable) that will contain it. Here, the variable strWhichOffice will contain what the user types into the input box:

    Dim strWhichOffice As String
    strWhichOffice = _
        InputBox("Enter the name of the office that you visited:", _
        "Expense Assistant 2000", "Madrid", , , _
        "c:\Windows\Help\Procedure Help.chm", 0)

Once the user has entered a value or a string and clicked the OK button, your code can then use the returned value as usual in VBA. To make sure the user has clicked the OK button, check that the input box hasn't returned a zero-length string (which it returns if the user clicks Cancel, and also if the user chooses the OK button with the text box empty), and take action accordingly:

    strWhichOffice = InputBox _
        ("Enter the name of the office that you visited:", _
        "Expense Assistant 2000", "Madrid", , , _
        "c:\Windows\Help\Procedure Help.chm", 0)
    If strWhichOffice = "" Then End

# Forms: When Message Boxes and Input Boxes Won't Suffice

As you've seen in this chapter, a message box can greatly enhance a procedure by enabling the user to make a choice at a turning point or by presenting the user with important information. But once you've used message boxes for a while, you're apt to start noticing their shortcomings:

  * You can present only a limited amount of information, and you're constrained in the way you can display it (to whatever layout you can conjure up with new paragraphs, line breaks, tabs, and spaces).
  * You can use only seven sets of buttons, which limits the amount of information that a user can return to your code via message boxes.

While you _can_ get creative and enter complex messages in message boxes to make the most use of the buttons they offer, you'll usually do better to just create a custom dialog box instead. As you'll see in Chapters 14 and 15, custom dialog boxes are relatively simple to create, and they are more powerful and flexible than message boxes.

You'll also want to avoid writing procedures that present the user with a number of choices via a _sequence_ of message boxes. Similarly, input boxes are useful for retrieving a single piece of information from the user, but beyond that, their limitations quickly become apparent too. If you find yourself planning to use two or more input boxes in immediate succession, create a custom dialog box instead. That way you display a single form for the user to fill in all the needed information, instead of several boxes. You'll see how to create forms in Chapter 14.

# The Bottom Line

**Display messages on the status bar.**

The information bar at the bottom of the window in many applications is a useful, unobtrusive way of communicating with the user. The status bar is frequently used by applications to indicate the current page, zoom level, active view (such as _datasheet_ in Access), word count, and so on. However, you, too, can display information on the bar.

Master It

Write a small sub in the Visual Basic Editor that displays the current date and time in the status bar.
**Display message boxes.**

Message boxes are commonly used to inform or warn the user. By default, they appear in the middle of the screen and prevent the user from interacting with the host application until a button on the message box is clicked, thereby closing it.

Master It

Write a small sub in the Visual Basic Editor that displays the current date and time using a message box.

**Display input boxes.**

An input box is similar to a message box, except that an input box can gather more information from the user. An input box allows the user to type in a string, which is more data than the simple information provided by which button the user clicked in a message box.

Master It

Write a small sub in the Visual Basic Editor that asks users to type in their name. Use the InStr function to see if there are any space characters in the returned string. If not, it means either they are Madonna or they have typed in only one name—so display a second input box telling them to provide both their first and last names.

**Understand the limitations of message boxes and input boxes.**

For even moderately complex interaction with the user, message and input boxes are often too limited. They return to the VBA code, for example, only a single user response: a button click or a single piece of text. So you can't conveniently use an input box to ask for multiple data—such as an address _and_ a phone number—without displaying multiple input boxes. That's ugly and disruptive.

Master It

In addition to the limitations on the amount of information you can retrieve from the user, what are the two other major limitations of message boxes and input boxes?

Chapter 14

Creating Simple Custom Dialog Boxes

In this chapter, you'll start looking at Visual Basic for Applications' tools for creating custom dialog boxes that interact with the user. The terms _dialog box_ and _form_ (or _user form_) are generally used interchangeably. Technically, a dialog box is a quite simple, small window, such as a message box or input box. Forms, generally, are larger windows featuring richer and more complex interaction with the user. These terms, though, are equivalent in common usage.

Dialog boxes and forms are among the most powerful and feature-packed elements of VBA. We will spend quite a bit of time exploring their uses as the primary communication path between users and procedures.

This chapter covers the most straightforward form components and how to manipulate them. The next chapter shows you how to create more elaborate forms, such as those with tabbed pages and those that update themselves when the user clicks a control.

In this chapter you will learn to do the following:

  * Understand what you can do with a custom dialog box
  * Create a custom dialog box
  * Add controls to a dialog box
  * Link dialog boxes to procedures
  * Retrieve the user's choices from a dialog box

# When Should You Use a Custom Dialog Box?

You'll often want to use a _form_ (another word for dialog box or window) when simpler methods of interacting with the user fall short. Sometimes, because of the limited selection, the buttons provided in message boxes are insufficient for getting needed information from the user. Similarly, the single text field available in an input box would be inadequate if you need the user to provide multiple data (name, address, phone number, and so on). In other words, sometimes you need the user to fill in a _form_.
You'll also want to use a custom dialog box for specialized input: when you need the user to choose nonexclusive options by selecting or clearing check boxes, to choose from among mutually exclusive choices via option buttons (also called radio buttons), or to select an item within a list displayed in a list box. Or perhaps you need to show users a picture. In other words, simple message boxes or input boxes cannot handle complex user input.

Custom dialog boxes can include the full range of interface elements the user is probably familiar with from working with Windows applications. You can create custom dialog boxes that look and function almost exactly like the dialog boxes built into applications (such as the File Save dialog box). Or you can create even larger constructions that approach the sophistication of typical application windows.

You'll use forms often in your more sophisticated macros. For example, when the user starts a procedure, you can have the procedure display a form presenting options—such as choosing the files for the procedure to manipulate. The user's choices determine what the procedure will then do.

You can also create dialog boxes that VBA triggers in response to events in the computer system: for example, an event that runs at a specific time or runs when the user takes a specific action (such as creating, opening, or closing a document).

Making your own dialog boxes is not that _hard_, but it can be time-consuming if you're building a complicated form. Because creating forms is not the fastest programming job, you might want to consider any practical alternatives to using them.

You've already looked at message boxes and input boxes, which provide a simple alternative for some of the relatively easy tasks for which you might want to create a custom dialog box.

Also, some applications, such as Word and Excel, even let you use their built-in dialog boxes for your own purposes. If users are familiar with the application, they're probably familiar with these built-in dialog boxes and can immediately use them to perform standard actions—for example, to open or save files. These are called _common dialog boxes_. How to use common dialog boxes in your macros is demonstrated briefly in the Real World Scenario titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12, "Using Loops to Repeat Actions," and more fully later in this chapter in the section titled "Using an Application's Built-in Dialog Boxes from VBA."

# Creating a Custom Dialog Box

If you want to employ a custom dialog box or window in VBA, you use a visual object called a _user form_. A user form (also sometimes just referred to as a _form_) is a blank sheet on which you can place _controls_ (such as check boxes, buttons, and text boxes) to create a made-to-order dialog box.

As you'll see, a user form contains its own code page where you, the programmer, write code to manage the various controls in the form. You can attach code to any of the controls, or to the user form itself, and that code is stored in the user form's _code sheet_. You can display the user form's code sheet in the Code window of the Visual Basic Editor and work with it as you would any other code. You can also run and test a user form as you would any other procedure (for example, by pressing F5 with the user form selected), and the VBA Editor will execute the code behind the form.
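For instance, the code behind a form typically consists of event procedures like the following minimal sketch, in which cmdOK is a hypothetical command button placed on the form:

    Private Sub cmdOK_Click()
        ' Runs automatically when the user clicks the OK button;
        ' this procedure is stored in the user form's code sheet
        Me.Hide   ' dismiss the form so code can read the user's choices
    End Sub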
You can display a user form (a dialog box) for the user to interact with, and you can then retrieve information from the user form and manipulate it with VBA code. It's in this sense that code supporting a form is said to be _behind_ a form. The user sees and interacts with a form, but behind the scenes you have written code to intelligently react to whatever the user might input.

Each user form is itself an object and can contain a number of other objects that you can manipulate individually.

* * *

User Forms Aren't Always Dialog Boxes

You can also create user forms that aren't dialog boxes. The distinction between a dialog box and a full window is imprecise, but it's usually easiest to define a resizable form as a window (you can resize it by dragging its borders or by clicking its Maximize button), while a dialog box has a fixed size. Some dialog boxes, such as the Find And Replace dialog box in Word, have an initially hidden part that the user can display (in the case of the Find And Replace dialog box, by clicking a More button).

But apart from this simple resizing, the bounds of the dialog box are fixed—you can't grab the corner of the dialog box with the mouse and drag it to enlarge it. But remember that you, the programmer, can create very large user forms that have the complexity and dimensions of a typical application window.

* * *

For example, you could create a simple dialog box with two option buttons, an OK button, and a Cancel button. Each option button would be an object, the OK button would be a third object, and the Cancel button would be a fourth object. You could set properties individually for each object—such as the action to take when the Cancel button was clicked or the ScreenTip (also called a ToolTip) to display when the user moved the mouse pointer over each of the option buttons. (ToolTips help make the components of your form understandable for the user.) The point is to consider the components of a form—the _controls_ you place on the form—as _objects_. This is another use of the concept of objects. Controls are _visual_ objects, but like purely programmatic objects, controls have members such as _properties_.

You can specify most properties of an object either at design time (when you're creating the user form) or at runtime (while the code is executing, either before or after you display the user form). For example, you can set the Value property of a check-box control to True to display the check box in its selected state or to False to display the check box in its cleared state. You can set the Value property in three different ways:

  * When building the user form, you can use the Editor's Properties window to specify values. For example, you can make a check box that will default to its selected (checked) state each time the user form is displayed.
  * You can write code that sets the check box while the macro is running, before the form is displayed to the user.
  * You can write code that sets the check box while the user is interacting with the form. Note that the user can click the check box to toggle it between its selected and deselected states. But your code can also do this.

The next sections explain the process of creating a dialog box. Later in this chapter, you'll find examples that step through creating a procedure and adding a dialog box to it.

## Designing a Dialog Box

It's possible to whip together a half-decent dialog box without much planning.
Some programmers like to just "sketch" the user interface in a dialog box by dragging and dropping controls onto it, then positioning them so they look good and modifying their properties.

Other programmers prefer to adopt a more methodical approach and plan what they need to include in the dialog box before they start creating it. If you fall into this latter category, consider the intended purpose of the dialog box and list the elements it will need in order to achieve this goal. Then sketch on paper a rough diagram of the dialog box to get an approximate idea of where you'll want to locate each of the elements (the controls you place on the form).

* * *

Try Basing Custom Dialog-Box Designs on Existing Dialog Boxes

Another option is to base the design for your custom dialog box on an existing dialog box—either a dialog box built into an application (called a common dialog box) or a custom dialog box that your company or organization has already implemented. Leveraging previous development efforts can not only help you avoid reinventing the wheel, but also produce a custom dialog box that users find familiar and intuitive.

* * *

## Inserting a User Form

Once you have a design in mind, the first step in creating a custom dialog box is to insert a user form in the appropriate template or document:

1. Press Alt+F11 to display the Visual Basic Editor if it's not already open.

2. In the Project Explorer window, right-click the appropriate project and choose Insert ⇒ UserForm from the context menu.

* * *

Other Ways to Add a User Form

You can also insert a user form by clicking the Insert UserForm button on the far left of the Editor's Standard toolbar.

* * *

The Visual Basic Editor opens a new user form like that shown in Figure 14.1, named UserForm1 (or the next available number if the project already contains other user forms).

Figure 14.1 The first step in creating a new dialog box is to start a new user form. The Visual Basic Editor displays the Toolbox when a user form is the active window.

The Visual Basic Editor also displays the _Toolbox_. (If you've previously hidden the Toolbox while working on a user form, the Visual Basic Editor doesn't display it. Choose View ⇒ Toolbox or click the Toolbox button on the far right of the Standard toolbar.)

VBA automatically inserts the user form in the Forms object (the collection of forms) for the project. If the project you chose didn't already contain a Forms collection, VBA adds one to contain the new user form. You'll see the Forms object displayed in the Project Explorer.

### Choosing User-Form Grid Settings

The Visual Basic Editor displays a grid in each user form to help you place controls relative to the dialog box and to align controls relative to each other so they look neat instead of random.

I don't know why you would want to do this, but to switch off the display of this grid or to switch off the Visual Basic Editor's automatic alignment of controls to the grid, follow these steps:

1. Choose Tools ⇒ Options to display the Options dialog box.

2. Click the General tab to display the General page (see Figure 14.2).

Figure 14.2 The General page of the Options dialog box includes options for toggling the display of the grid, resizing the grid, and toggling whether VBA aligns the controls to the grid.

3. Choose the settings you want:

a. Clear the Show Grid check box if you want to turn off the display of the grid.
(The grid continues to function, but the dots are not displayed.)

b. Clear the Align Controls To Grid check box if you want to stop using the grid whether it's visible or not. This feature is usually a timesaver, but if the grid is too coarse for the layout you're trying to achieve, just reduce the sizing of the grid from the default 6 to perhaps 3 or 4.

c. Change the number of points in the Width and Height text boxes to adjust the sizing of the grid's units.

4. Click the OK button to close the Options dialog box and apply your choices.

* * *

Naming Conventions in Visual Basic for Applications

Naming controls in VBA is similar to naming variables. Names for controls can be up to 40 characters long, must begin with a letter, and after that can be any combination of letters, numbers, and underscores. You can't use spaces or symbols in the names, and each name must be unique in its context—for example, each user form must have a unique name within a project, but within any user form or dialog box, a control can have the same name as another control in a different form.

Those are the rules; you can also use conventions to make the names of your VBA objects as consistent and easy to understand as possible. Recall the conventions you've used in previous chapters for identifying the variable type with a prefix: str, lng, int, and so on. The prefixes widely used when naming controls identify the control. For example, by using the convention of prefixing a text box control's name with txt, you can be sure that anyone else reading your code will immediately identify the name as belonging to a text box—and that you yourself will easily identify the name when you revisit your old code.

Here's an example showing conventional prefixes for several controls:

    Private Sub cmbSelectEmployee_Change()
        lblEmployeeName = cmbSelectEmployee.Text
        fraStep2.Enabled = True
        lblInstructions = "Enter text in the Step 2 text box. " & _
            "For example, you might include brief biographical " & _
            "information on the employee, details of their position, " & _
            "or your hopes for their contribution to the company."
        cmdClearEmployeeName.Enabled = True
    End Sub

Some popular naming conventions for the most-used VBA objects are shown in the following list. You'll encounter the naming conventions for other VBA objects later in the book. This list includes the control's name, the standard prefix, and finally an example showing how the control can be named in code:

**Check box**

The standard prefix is chk, as in chkReturnToPreviousPosition.

**Command button**

The standard prefix is cmd, as in cmdOK.

**Form (user form)**

The standard prefix is frm, as in frmMoveParagraph.

**Frame**

The standard prefix is fra, as in fraMovement.

**List box**

The standard prefix is lst, as in lstConferenceAttendees.

**Combo box**

The standard prefix is cmb, as in cmbColor.

**Menu**

The standard prefix is mnu, as in mnuProcedures.

**Option button**

The standard prefix is opt, as in optSpecialDelivery.

**Label**

The standard prefix is lbl, as in lblUserName.

**Text box**

The standard prefix is txt, as in txtUserDescription.

Just as with variable names, the naming convention for controls begins with three lowercase letters and then starts the rest of the object's name with an uppercase letter to make it a little easier to read. For example, a text box in which the users are to type their last names might be named txtLastName.
Naming conventions tend to seem awkwardly formal at first, and there's a strong temptation to avoid them. But if you plan to distribute your macros or expect others to work with them, it's usually worth the trouble to follow the naming conventions. Plus they help you when debugging. It's just another way to make reading code easier for everybody.

* * *

## Renaming a User Form

Next, change the user form's Name property from the default (UserForm1) to a more descriptive name. The following steps show how to do this. (For advice on choosing names, refer to the sidebar "Naming Conventions in Visual Basic for Applications" in this chapter.)

1. If the Properties window isn't displayed, press F4 to display it. Figure 14.3 shows the two pages of the Properties window: Alphabetic and Categorized. Alphabetic displays an alphabetical listing of the properties of the currently selected object; Categorized displays the same properties but separated into categories, such as Appearance, Behavior, Font, Misc., Picture, and Position. (Some controls have more categories than those listed here.) You can expand a category by clicking the plus (+) sign beside it to display the properties it contains, and collapse it by clicking the resulting minus (–) sign. If the Alphabetic tab isn't selected, click it to select it.

Figure 14.3 You can choose either an alphabetized or a categorized list in the Properties window.

The Categorized option is not, in my view, very helpful because many of the properties are simply too difficult to fit into categories that make any sense. The Caption property, for example, is assigned to the Appearance category, but the (Name) property is contained in the Misc. category. The very existence of a "miscellaneous" category demonstrates that the categorization effort has broken down. I suggest you stick with the default Alphabetic option instead.

2. Make sure the drop-down list (at the top of the Properties window) is displaying the default name of the user form. If it isn't, select the user form from the drop-down list.

3. Select the user form's default name (such as UserForm1 or UserForm2) in the cell to the right of the Name cell (you can double-click the name to select it quickly). Now type a new, more descriptive name for the user form. This name can be anything you want, with the standard VBA limitations:

  * It must start with a letter.
  * It can contain letters, numbers, and underscores but no spaces or symbols.
  * It can be up to 40 characters long.

4. Click the Caption cell to select the user form's default name and type the caption for the user form—that is, the text label that you want the user to see in the title bar of the dialog box. This name has no restrictions beyond the constraints imposed by the length of the title bar. You can enter a name longer than will fit in the title bar, but VBA truncates it with an ellipsis at its maximum displayable length. As you type, the name appears in the user-form title bar as well, so it's easy to see what's an appropriate length—at least, for the current size of the user form.

5. Press Enter or click elsewhere in the Properties window (or elsewhere in the Visual Basic Editor) to set (make official) the user form's name. (Naming controls works the same way as naming forms.)
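The name you set here is how your code will refer to the form. As a minimal sketch (the form name and caption are invented for illustration), a procedure in an ordinary module can display the renamed form like this:

    Sub ShowDeleteWorkbookForm()
        ' frmDeleteWorkbook is the (Name) you typed in the Properties window
        frmDeleteWorkbook.Caption = "Delete Workbook"   ' title-bar text
        frmDeleteWorkbook.Show                          ' display the dialog box
    End Sub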
* * *

Dealing with the "Name Conflicts with Existing Module" Error

If you run into the "Name _name_ conflicts with existing module, project, or object library" error (shown here), chances are you've just tried to give a user form the same name already assigned to something else: for example, you've tried to reuse the name of a VBA project or object library.

* * *

## Adding Controls to the User Form

Now that you've renamed the user form, you're ready to add controls to it from the Toolbox, shown in Figure 14.4. VBA automatically displays the Toolbox when a user form is active, but you can also display the Toolbox when no user form is active by choosing View ⇒ Toolbox.

Figure 14.4 Use the Toolbox to add controls to the user form.

* * *

Removing the "Roaming Office" Control

In what is obviously an oversight on Microsoft's part, the VBA 2013 Toolbox includes an obscure and—even in MSDN—essentially ignored control called the RoamingOffice control (not shown in Figure 14.4). Its use is beyond the scope of this book, not to mention beyond the scope of the VBA Help system and even Google. Perhaps Microsoft intends to make it useful in the future. For now, though, it clearly doesn't belong among the default controls on the Toolbox. It's the small gray crosshatched square icon next to the Image control. If you wish, you can remove the RoamingOffice control from your Toolbox by right-clicking its crosshatched icon, then choosing Delete RoamingOffice from the context menu.

* * *

Here's what the buttons on the Toolbox do:

**Select Objects**

This first control has a very specialized purpose, and you might never need to use it. It's not an ordinary control (it doesn't appear on a form; you can't drag and drop it onto a form). Its job is to restore the mouse pointer to _selection mode_. However, the mouse pointer _automatically_ returns to selection mode after you've dropped a control onto a form, so usually you'll need to click the Select Objects button only when you've selected another control, then changed your mind and decided not to use it, and therefore need to restore the pointer to its normal state. Alternatively, if you double-click a control (such as the check box), you trigger a technique that allows you to quickly add multiple versions of the same control repeatedly. (Every time you click in the form, a new check box is added to it, for example, while the Editor is in this state. To stop this repetitive behavior, you click the Select Objects button.)

**Label**

Creates a _label_, which is text used to identify a part of the dialog box or to explain information the user needs to know in order to use the dialog box effectively.

**TextBox**

Creates a text box (also sometimes called an _edit box_), a field into which the user can type text. You can also use a text box to display text to the user or to provide text for the user to copy and paste elsewhere. A text box can contain either one line (the default) or multiple lines and can display a horizontal scroll bar, a vertical scroll bar, or both.

**ComboBox**

Creates a combo box, a control that combines a text box with a list box. The user can either choose a value from the list box or enter a new value in the text box.

**ListBox**

Creates a list box, a control that lists a number of values. Users can pick one value from the list but can't enter a new value of their own (unlike with a combo box). The list box is good for presenting closed sets of data.
**CheckBox**

Creates a check box and an accompanying label. The user can select or clear the check box to turn the associated action on or off.

**OptionButton**

Creates an option button (also known as a _radio button_) and an accompanying label to identify the purpose of the button. This button is usually a circle that contains a black dot when selected. The user can select only one option button out of any group of option buttons. (The name radio button comes from radios with push buttons for stations; you can select only one button at a time. Push one, and the others pop out.)

**ToggleButton**

Creates a toggle button, a button that shows whether or not an item is selected. A toggle button can be defined with any two settings, such as On/Off or Yes/No. You can add a picture to a toggle button, which provides a graphical way of letting a user choose between options.

**Frame**

Creates a frame, an area of a user form or dialog box surrounded by a thin line and an accompanying label. You can use frames (also known as _group boxes_) to group related elements in your forms. As well as cordoning off elements visually, frames can separate elements logically. For example, VBA treats a group of option buttons contained within a frame as separate from option buttons in other frames or option buttons loose in the dialog box. This separation makes it easier to use multiple sets of option buttons in a form.

**CommandButton**

Creates a command button. This is the typical, ordinary Windows button that users click to communicate their wishes. Most dialog boxes contain command buttons such as OK and Cancel, or Open and Cancel, or Save, or Apply and Close.

**TabStrip**

Creates a tab strip for displaying multiple sets of data in the same set of controls. Tab strips are especially useful for presenting records in a database for review or modification: Each record in the database contains the same fields for information, so they can be displayed in the same group of controls. The tab strip provides an easy way of navigating between records.

**MultiPage**

Creates a multipage control for displaying multipage dialog boxes that have different layouts on each of their tabs. An example of a multipage dialog box is the Options dialog box (Tools ⇒ Options), which has multiple pages (often referred to incorrectly as tabs) in most of the Office applications.

**ScrollBar**

Creates a stand-alone scroll bar. Stand-alone scroll bars are of relatively little use in dialog boxes. Combo boxes and list boxes have built-in scroll bars.

**SpinButton**

Creates a spin-button control for attaching to another control. Spin buttons (also known as _spinners_) are typically small, rectangular buttons with one arrow pointing up and one down (or one arrow pointing left and the other pointing right). Spin buttons are useful for presenting sequential values with consistent intervals within an understood range, such as times or dates. For example, if you want the user to increment or decrement a price in a text box in 25-cent steps, you could use a spinner to adjust the price rather than letting the user type directly into the text box.

**Image**

Creates an image control for displaying a picture within a form. For example, you might use an image control to show a corporate logo or a picture of some sort. (If you want to display a photo, texture, or other graphic on the background of the form itself, set the form's Picture property.)
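To give a sense of how several of these controls are driven from code, here's a minimal sketch of a UserForm_Initialize procedure; the control names are hypothetical, following the naming conventions discussed earlier. List boxes, in particular, usually receive their items from code like this rather than at design time:

    Private Sub UserForm_Initialize()
        ' Populate the list box before the form appears
        lstOffices.AddItem "Madrid"
        lstOffices.AddItem "London"
        lstOffices.AddItem "Tokyo"
        lstOffices.ListIndex = 0            ' select the first item by default
        chkConfirmDeletions.Value = True    ' display the check box selected
    End Sub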
* * *

Adding Controls to the Visual Basic Editor Toolbox

The Toolbox shown in Figure 14.4 contains the basic set of tools provided by VBA. As discussed in "Customizing the Toolbox" in Chapter 2, "Getting Started with the Visual Basic Editor," you can customize the Toolbox in various ways: by adding other controls to it, creating additional pages for the controls, moving controls from page to page, and creating customized controls of your own making so that you can avoid having to repeatedly adjust properties each time you add those controls.

* * *

Click one of the controls in the Toolbox to select it. Then click in the user form to insert the control on the form, as illustrated in Figure 14.5. VBA places the top-left corner of the control where you click. As you place a control, it snaps to the grid on the user form (unless you've turned off the Align Controls To Grid feature as described in "Choosing User-Form Grid Settings," earlier in this chapter).

Figure 14.5 When you click in the user form, VBA places a standard-size control of the type you chose. If the Align Controls To Grid feature is switched on (as it is by default), VBA automatically aligns the control with the grid on the user form.

You can resize the control as desired by selecting it and then clicking and dragging one of the selection handles (the white squares) that appear around it, as shown in Figure 14.6. The mouse pointer changes to a double-arrow icon when you've correctly positioned it to drag. When you drag a corner handle, VBA resizes the control on both sides of the corner; when you drag the handle at the midpoint of one of the control's sides, VBA resizes the control only in that dimension. In either case, VBA displays a dotted outline indicating the size that the control will be when you release the mouse button.

Figure 14.6 Once you've placed a control, you can resize it as necessary by dragging one of its selection handles.

To resize the user form itself, click its title bar, or click in any blank space in the form (anywhere outside a control). This selects the user form. Then click and drag one of the selection handles that appear around the form.

To delete a control, right-click it in the user form and choose Delete from the context menu. Alternatively, click it to select it and then press the Delete key or choose Edit ⇒ Delete. Restore it by pressing Ctrl+Z.

* * *

Random Additional Default Toolbox Controls

Now and then Microsoft adds application-specific or novel controls to the default Toolbox. This not only causes confusion, but it also means that the VBA Editor's Toolboxes are not standardized across the Office applications. This is a recent development, and unwelcome. Word 2013 arbitrarily includes a "Roaming Office" control. For more on this peculiar feature, see the sidebar "Removing the 'Roaming Office' Control" earlier in this chapter.

Excel's VBA Editor includes a RefEdit control that mimics Excel's reference-edit boxes.

Nobody objects to Microsoft providing additional controls to us programmers. (You can easily add controls to the Toolbox by right-clicking within the Toolbox and choosing Additional Controls from the context menu.) What's problematic is the randomness of what's now being included in the default Toolboxes.

* * *

## Grouping Controls

Sometimes it's quite efficient to temporarily select several controls as a group in the Editor. This allows you to manipulate all the grouped controls as a unit.
For example, if you want to change the font size of three text boxes, two option buttons, and four labels, just group them and change the font-size property in the Properties window only _once_. The whole group will have all their font sizes changed automatically. (This trick is not related to grouping controls within a Frame control as described earlier in this chapter.) + +We'll explore this useful grouping technique later in this chapter in the section titled "Working with Groups of Controls." For now, I'll just briefly introduce the concept. + +To delete, move, resize, or change the properties of multiple controls at once, first select them into a group. You can then delete them all at once by using the methods just described. Or you can move, resize, or modify the properties of the group as a whole. + +Here's how to group controls: + + * To select multiple contiguous controls, click the first control, hold down Shift, and then click the last control in the sequence. + * To select multiple noncontiguous controls—or to add additional controls to a group after you've selected multiple contiguous controls by using the Shift key—hold down the Ctrl key as you click each additional control. (With the Ctrl key pressed, you can deselect any control in a group by clicking it a second time.) + * To select multiple controls in the same area of the user form, click in the form's background outside the controls and drag the resulting selection box until it encompasses at least part of each control. When you release the mouse button, the Visual Basic Editor selects the controls as a group. + +## Renaming Controls + +As with user forms, VBA automatically gives each control that you add to a form a default name consisting of the type of control plus a sequential number. When you add the first text box in a user form, VBA names it TextBox1; when you add another text box, VBA names it TextBox2; and so on. (Each control in a dialog box must have a unique name so that you can refer to it specifically in code.) + +You'll usually want to change the controls' default names to names that describe their purpose so you can remember what they do for the macro. + +For example, if TextBox2 is used for entering the user's organization name, you might want to rename it txtOrganizationName, txtOrgName, txtO_Name, or something similar. + +To rename a control, follow these steps: + +1. Click the control in the user form to select it and thereby display its properties in the Properties window. + + * If the Properties window is already displayed, you can, if you prefer, select the control from the drop-down list at the top of the Properties window instead of selecting it in the user form. VBA then visually highlights (selects) the control in the user form, which helps you make sure that you've selected the control you want to affect. + * If the Properties window isn't displayed, you can quickly display it with the properties for the appropriate control by right-clicking the control in the user form and choosing Properties from the context menu. + +2. In the Properties window, double-click to select the default name in the cell to the right of the Name property. + +3. Type the new name for the control. + +4. Press Enter to set the control name, or click elsewhere in the Properties window or in the user form. + +* * * + +If You Rename a Control, You May Have to Modify Your Code + +You can rename a control anytime. But if you do, you must also change any existing references to it in the code that drives the user form. 
This gives you a strong incentive to choose suitable names for your controls before you write the code.

* * *

## Moving a Control

To move a control, click anywhere in it to select it, and then drag it to where you want it to appear, as shown in Figure 14.7.

Figure 14.7 If a control isn't currently selected, you can move it by clicking it and dragging it.

To move a selected control, move the mouse pointer over the selection border around it so that the mouse pointer turns into a four-headed arrow (as shown in Figure 14.8), and then click and drag the control to where you want it to appear.

Figure 14.8 If a control is selected, move the mouse pointer over its selection border, and then click and drag the control.

* * *

Useful Copy-and-Paste Techniques with Controls

You can use the Cut, Copy, and Paste commands (from the Standard toolbar, the Edit menu, or the context menu, or by using the easiest approach, the keyboard, such as pressing Ctrl+X and Ctrl+V) to move a control.

Copy and Paste isn't that efficient when moving a _single_ control; the Paste command places the control right in the middle of the user form, so you have to drag it to its new position anyway.

However, when creating multiple, similar control sets—such as a group of text boxes with accompanying labels—copying and pasting can be quite useful. It's a quick way to build a whole set of fields for the user to fill in, for example. This way, you don't have to position and align each label/text box pair. Nor do you have to adjust each control's properties, because they are copied too. Align the first label/text pair, set the Font property the way you want it (usually larger, changing it from the default 8 pt. size to 11), resize the controls as you want them, change any other properties to suit yourself, and then copy and paste (clone) the pair as often as necessary by repeatedly pressing Ctrl+V.

Be aware, though, that the VBA Editor unfortunately places each new clone directly on the center of the form, thereby hiding any other clones you've just added. In other words, when you paste, you can't actually see the new clone—it's in a pile on the center of the form. So you have to drag the clones away from the center to reveal the others beneath.

Here's a related technique: Sometimes you want to copy the entire set of controls from one form to another. Select all the controls on Form1, then press Ctrl+C to copy them, then click Form2 to select it, and press Ctrl+V to paste the entire set of controls into the new form.

The advantage of using Copy and Paste for creating new controls is that the new controls inherit all the characteristics of the original controls, so you can save time by creating a control, setting its properties, and then cloning it.

You don't even need to change the names of the copies you paste to another user form—they just need to be named suitably for the code with which they work.

As an alternative to using the Copy and Paste commands, you can also copy a control by holding down the Ctrl key as you click and drag the control. VBA displays a + sign attached to the mouse pointer to indicate that you're copying the control rather than moving it. Drop the copy where you want it to appear on the user form.

* * *

## Changing the Caption on a Control

Some controls—such as option buttons and check boxes—have built-in text captions to let the user understand their purpose. You can change these captions like this:

1. Click the control to select it.

2.
Click the caption itself to select it. VBA displays the blinking insertion cursor and a faint dotted border around the text, as shown in Figure 14.9.

Figure 14.9 To change the caption on a control, select the control, and then click in the text so that it displays this faint dotted border.

* * *

Double-Clicking Opens the Code Window Rather Than Selecting a Control

When you click a label to select it and click again to position the insertion point to change the caption, make sure you click slowly enough that Windows doesn't interpret this as a double-click. A double-click displays the code sheet for the user form and automatically adds a procedure for the Click event of the control. If this happens, you can easily get back to viewing the form (it's called Design view, as opposed to Code view). Just press Shift+F7, double-click the user form's name in the Project Explorer, or choose View ⇒ Object to view the form again.

* * *

3. Now click in the label to position the insertion point for editing it, or drag through the label to select all of it.

4. Edit the text of the label as desired.

5. Press Enter or click elsewhere in the user form to effect the change to the label. (You can alternatively change the label by changing its Caption property in the Properties window.)

* * *

When Should You Set Properties of a Control?

You can set (specify) many properties of a control either at design time (while you're creating the user form) or at runtime (while the form's code is executing). There's a time and a place for each approach.

Generally speaking, the more static the property, the more likely you'll want to set it at design time. Some properties, such as the Name property of a user form, _have_ to be set at design time—you can't change such properties at runtime for a user form. You'll also usually want to name your controls at design time, though you can add controls at runtime and set their Name properties during execution.

In most cases, you'll want to set the properties that govern the position and size of the user form itself and its controls at design time. The advantages are clear: you can make sure that the user form looks as you intend it to, that it's legible, and so on.

Occasionally, you may want to change the properties of a user form or the size or position of some of the controls on it at runtime. For example, you might need to add a couple of option buttons to the form to take care of eventualities not included in the basic design of the form. Alternatively, you might create a form that had two groups of option buttons sharing the same space—one group, in effect, positioned on top of the other. At runtime, you could modify their Visible properties in your code and thus display one group and hide the other group, as sketched below. If each group contained the same number of option buttons, you could even make do with only one group of option buttons, assigning the appropriate properties to each at runtime. However, there's no particular advantage in trying to simultaneously make just the one group do double duty like that. It can make your code more confusing.

Given the flexibility that many properties of controls provide, you can often design your user forms to handle several circumstances by displaying and hiding different groups of controls at runtime rather than having to add or remove controls at runtime.
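Here's a minimal sketch of the Visible-property technique just described; the frame and option-button names are invented for illustration:

    Private Sub optInternational_Click()
        ' Swap which group of controls the user sees
        fraInternational.Visible = True
        fraDomestic.Visible = False
    End Sub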
Creating the complete set of controls for a user form at design time avoids most of the difficulties that can arise from adding extra controls at runtime. That said, you may sometimes need to create a user form on the fly to present information about the situation in which users have placed themselves. + +As you'll see as you continue to work with controls, you have to set values for _some_ controls at runtime. For example, you sometimes can't assign the list of items to a list box or combo box at design time. If a list displays items from a database, the list can vary depending on which data set the user selects. So you would have to write code that fills the list box during execution. (Often, you'll fill a list box during a UserForm_Initialize procedure that runs as the user form is being initialized for display.) The set of items in some lists can be known in advance and specified in your code during design time, such as a list box displaying all the countries in the world, from which the user selects the country of residence. + +* * * + +## Key Properties of the Toolbox Controls + +The following sections discuss the key properties of the controls in the default Toolbox. + +First, I'll explain the common properties used to manipulate many of the controls effectively. After that, I'll go through the controls one by one, listing the properties particular to each control. + +If you're new to VBA and find this section heavy going, just skip it for the time being and return to it when you're creating code and need to reference information about the properties of the controls. + +### Common Properties + +Table 14.1 lists the properties shared by all or most controls, grouped by category. + +Table 14.1 Properties common to most or all controls + +**Property Information** | **Applies To** | **Explanation** +---|---|--- +**General Properties** | | +BoundValue | All controls except Frame, Image, and Label | Contains the value of the control when the control receives the focus in the user form. +HelpContextID | All controls except Image and Label | Returns the context identifier of the Help file topic associated with the control. +Name | All controls | Contains the name for the control. +Object | All controls | Enables you to assign to a control a custom property or method that uses the same name as a standard property or method. +Parent | All controls | Returns the name of the user form that contains the control. +Tag | All controls | Used for assigning extra information to the control. This is rarely used. +Value | CheckBox, ComboBox, CommandButton, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton | One of the most varied properties, Value specifies the current state or value of the control. A CheckBox, OptionButton, or ToggleButton can have an integer value of −1 (True), indicating that the item is selected, or a value of 0 (False), indicating that the item is cleared. A ScrollBar or SpinButton returns a value containing the current value in the control. A ComboBox or ListBox returns the currently selected row's (or rows') BoundColumn value. A MultiPage returns an integer indicating the active page, and a TextBox returns the text in the text box. +| | The value of a CommandButton is False because choosing the command button triggers a Click event. However, you can set the value of a CommandButton to True, which has the same effect as clicking it. 
In other words, the Value property is similar to the value of a variable, but the property's possible values are highly specific to each control.
**Size and Position** | |
Height | All controls | The height of the control, measured in points.
LayoutEffect | All controls except Image | Indicates whether a control was moved when the layout of the form was changed.
Left | All controls | The distance of the left border of the control in points from the left edge of the form or frame that contains it.
OldHeight | All controls | The previous height of the control, measured in points.
OldLeft | All controls | The previous position of the left border of the control, measured in points.
OldTop | All controls | The previous position of the top border of the control, measured in points.
OldWidth | All controls | The previous width of the control, measured in points.
Top | All controls | The distance of the top border of the control in points from the top edge of the form or frame that contains it.
Width | All controls | The width of the control, measured in points.
**Appearance** | |
Alignment | CheckBox, OptionButton, ToggleButton | Specifies how the caption is aligned to the control.
AutoSize | CheckBox, ComboBox, CommandButton, Image, Label, OptionButton, TextBox, ToggleButton | A Boolean (True or False only) property that controls whether the object resizes itself automatically to accommodate its contents. The default setting is False, which means that the control doesn't automatically resize itself.
BackColor | All controls | The background color of the control. This property contains a number representing the color.
BackStyle | CheckBox, ComboBox, CommandButton, Frame, Image, Label, OptionButton, TextBox, ToggleButton | Specifies whether the background of the object is transparent (fmBackStyleTransparent) or opaque (fmBackStyleOpaque, the default). You can see through a transparent control—anything behind it on the form will show through. You can use transparent controls to achieve interesting effects—for example, by placing a transparent command button on top of an image or another control.
BorderColor | ComboBox, Image, Label, TextBox, ListBox | Specifies the color of the control's border. You can choose a border color from the System drop-down list or the palette or enter BorderColor as an eight-digit integer value (such as 16711680 for mid-blue). VBA stores the BorderColor property as a hexadecimal value (for instance, 00FF0000). For BorderColor to take effect, BorderStyle must be set to fmBorderStyleSingle.
BorderStyle | ComboBox, Frame, Image, Label, ListBox, TextBox, UserForm | Specifies the style of border on the control or user form. Use BorderStyle with the BorderColor property to set the color of a border.
Caption | CheckBox, CommandButton, Label, OptionButton, ToggleButton | A text string containing the description that appears for a control—the text that appears in a label, on a command button or toggle button, or next to a check box or option button.
Font (object) | All controls except Image, SpinButton, and ScrollBar | Font—an object rather than a property—controls the font in which the label for the object is displayed. For TextBox, ComboBox, and ListBox controls, Font controls the font in which the text in the control is displayed.
ForeColor | All controls except Image | The foreground color of the control (often the text on the control). This property contains a number representing the color.
+Locked | CheckBox, ComboBox, CommandButton, ListBox, OptionButton, TextBox, ToggleButton | A Boolean property that specifies whether the user can change the control. When Locked is set to True, the user can't change the control, though the control can still receive the focus (that is, be selected) and trigger events. When Locked is False (the default value), the control is open for editing. +MouseIcon | All controls except MultiPage | Specifies the image to display when the user moves the mouse pointer over the control. To use the MouseIcon property, the MousePointer property must be set to 99, fmMousePointerCustom. +MousePointer | All controls except MultiPage | Specifies the type of mouse pointer to display when the user moves the mouse pointer over the control. +Picture | CheckBox, CommandButton, Frame, Image, Label, OptionButton, Page, ToggleButton, UserForm | Specifies the picture to display on the control. By using the Picture property, you can add a picture to a normally text-based control, such as a command button. +PicturePosition | CheckBox, CommandButton, Label, OptionButton, ToggleButton | Specifies how the picture is aligned with its caption. +SpecialEffect | CheckBox, ComboBox, Frame, Image, Label, ListBox, OptionButton, TextBox, ToggleButton | Specifies the visual effect to use for the control. For a CheckBox, OptionButton, or ToggleButton, the visual effect can be flat (fmButtonEffectFlat) or sunken (fmButtonEffectSunken). For the other controls, the visual effect can be flat (fmSpecialEffectFlat), raised (fmSpecialEffectRaised), sunken (fmSpecialEffectSunken), etched (fmSpecialEffectEtched), or a bump (fmSpecialEffectBump). +Visible | All controls | Indicates whether the control is visible; expressed as a Boolean value. +WordWrap | CheckBox, CommandButton, Label, OptionButton, TextBox, ToggleButton | A Boolean property that specifies whether the text in or on a control wraps at the end of a line. For most controls, WordWrap is set to True by default; you'll often want to change this property to False to prevent the text from wrapping inappropriately. If the control is a TextBox and its MultiLine property is set to True, VBA ignores the WordWrap property. +**Behavior** | | +Accelerator | CheckBox, CommandButton, Label, OptionButton, Page, Tab, ToggleButton | The accelerator key (or _access key_ , or _mnemonic_ ) for the control—the key the user presses (typically in combination with Alt) to access the control. For example, in many dialog boxes, the user can access the Cancel button by pressing Alt+C. The accelerator key for a label applies to the next control in the tab order rather than to the label itself. The accelerator character must be one of the characters in the control's text caption, usually the first (The _C_ in Cancel, for example). Once you specify the accelerator character, VBA automatically underlines that character in the caption to cue the user that they can press, for example, Alt+C to select the Cancel button. For additional information on tab order, see the section titled "Adjusting the Tab Order of a Form" later in this chapter. +ControlSource | CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, ToggleButton | The cell or field used to set or store the Value of the control. The default value is an empty string (""), indicating that there is no control source for the control. +ControlTipText | All controls | The text of the ScreenTip displayed when the user holds the mouse pointer over the control. 
The default value of ControlTipText is a blank string, which means that no ScreenTip is displayed. +Enabled | All controls | A Boolean value that determines whether the control can be accessed (either interactively or programmatically). +TabIndex | All controls except Image | The position of the control in the tab order of the user form, expressed as an integer from 0 (the first position) through the number of controls on the user form. +TabStop | All controls except Image and Label | A Boolean value establishing whether the user can select the control by pressing the Tab key. If TabStop is set to False, the user can select the control only with the mouse. The TabStop setting doesn't change the tab order of the dialog box. + +### Label + +The Label control simply displays text on the screen. It's most often used to identify the purpose of another control, so you frequently see a Label control placed on a form to the left of a textbox whose purpose the label describes. Use the Caption property to type in the text that you want the label to display. Use the TextAlign property as shown in Table 14.2 to align the text of the label with the borders of the Label control. + +Table 14.2 TextAlign property values for the Label control + +**Constant** | **Value** | **Text Alignment** +---|---|--- +fmTextAlignLeft | 1 | With the left border of the control +fmTextAlignCenter | 2 | Centered on the control's area +fmTextAlignRight | 3 | With the right border of the control + +### TextBox + +The TextBox is one of the most common controls. Recall that it can be a single-line control (often employed to display a field the user must fill in) or a multiline control, for displaying lots of text, as in a diary program where the user determines how much they want to write. Adjust this feature with the MultiLine property. Also, the defaults for a TextBox are a size of 8 pt. (too small usually) and a sans-serif font called Tahoma (sans-serif type is generally thought more appropriate for headlines than body text). So you'll usually find yourself employing the Font property to choose a larger font size and more readable font (such as Times New Roman). + +Table 14.3 lists the key properties of the TextBox control. + +Table 14.3 Key properties of the TextBox control + +**Property** | **Description** +---|--- +AutoTab | A Boolean property that determines whether VBA automatically moves to the next field when the user has entered the maximum number of characters in the text box or combo box. +AutoWordSelect | A Boolean property that determines whether VBA automatically selects a whole word when the user drags the mouse through text in a text box or a combo box. +DragBehavior | Enables or disables drag-and-drop for a text box or combo box: fmDragBehaviorDisabled (0) disables drag-and-drop; fmDragBehaviorEnabled (1) enables drag-and-drop. +EnterFieldBehavior | Determines whether VBA selects the contents of the edit area of the text box or combo box when the user moves the focus to the text box or combo box: fmEnterFieldBehaviorSelectAll (0) selects the contents of the text box or current row of the combo box; fmEnterFieldBehaviorRecallSelection (1) doesn't change the previous selection. +EnterKeyBehavior | A Boolean property that determines what VBA does when the user presses Enter with the focus on a text box. If EnterKeyBehavior is True, VBA creates a new line when the user presses Enter; if EnterKeyBehavior is False, VBA moves the focus to the next control on the user form. 
If MultiLine is False, VBA ignores the EnterKeyBehavior setting. +HideSelection | A Boolean property that determines whether VBA displays any selected text in a text box or combo box. If HideSelection is True, VBA displays the text without indicating the selection when the control doesn't have the focus. If HideSelection is False, VBA indicates the selection both when the control has the focus and when it doesn't. +IMEMode | Determines the default runtime mode of the Input Method Editor (IME). This property is used only in Far Eastern applications (for example, those using Japanese hiragana or katakana or Korean hangul). +IntegralHeight | A Boolean property that determines whether a list box or a text box resizes itself vertically to display any rows that are too tall to fit into it at its current height (True) or not (False). +MultiLine | A Boolean property that determines whether the text box can contain multiple lines of text (True) or only one line (False). When MultiLine is True, the text box adds a vertical scroll bar when the content becomes more than will fit within the current dimensions of the text box. VBA defaults to Multiline = False. +PasswordChar | Specifies the placeholder character to display in place of the characters the user types (so somebody peeping won't see the actual password). The common password character is the asterisk *. This property is normally used for entering passwords and other information that needs to be obscured so that it cannot be read. +ScrollBars | Specifies which scroll bars to display on the text box. Usually, you'll do best to set the WordWrap property to True and let VBA add the vertical scroll bar to the text box as needed rather than using the ScrollBars property. +SelectionMargin | A Boolean property that determines whether the user can select a line of text in the text box or combo box by clicking in the selection bar to the left of the line. +ShowDropButtonWhen | Determines when to display the drop-down button for a combo box or a text box. fmShowDropButtonWhenNever (0) never displays the drop-down button and is the default for a text box. fmShowDropButtonWhenFocus (1) displays the drop-down button when the text box or combo box has the focus. fmShowDropButtonWhenAlways (2) always displays the drop-down button and is the default for a combo box. +TabKeyBehavior | A Boolean property that specifies whether the user can enter tabs in the text box. If TabKeyBehavior is True and MultiLine is True, pressing Tab enters a tab in the text box. If MultiLine is False, VBA ignores a TabKeyBehavior setting of True. If TabKeyBehavior is False, pressing Tab moves the focus to the next control in the tab order. + +### ComboBox and ListBox + +From the user's point of view, a key distinction is that a list box simply provides a list of options the user can choose from, whereas a combo box offers that list and also includes a field where the user can type in items. + +Table 14.4 shows the key properties of the ComboBox control and the ListBox control. These two controls are similar and share many properties. They do, however, differ somewhat in behavior and features; these differences are described in the entries marked "List box only" and "Combo box only" in the table. + +Table 14.4 Key properties of the ComboBox and ListBox controls + +**Property** | **Description** +---|--- +AutoTab | See Table 14.3. +AutoWordSelect | See Table 14.3. 
+
BoundColumn | A Variant property that determines the source of data in a combo box or a list box that has multiple columns. The default setting is 1 (the first column). To assign another column, specify the number of the column (columns are numbered from 1, the leftmost column). To assign the value of ListIndex to BoundColumn, use 0. +
ColumnCount | A Long (data type) property that sets or returns the number of columns displayed in the combo box or list box. If the data source is unbound, you can specify up to 10 columns. To display all available columns in the data source, set ColumnCount to –1. +
ColumnHeads | A Boolean property that determines whether the combo box or list box displays headings on the columns (True) or not (False). +
ColumnWidths | A String (data type) property that sets or returns the width of each column in a multicolumn combo box or list box. +
ListRows | (Combo box only.) A Long (data type) property that sets or returns the number of rows displayed in the combo box. If the number of items in the list is greater than the value of ListRows, the combo box displays a scroll bar so that the user can scroll to the unseen items. +
ListStyle | Determines the visual effect the list uses. For both a combo box and a list box, fmListStylePlain displays a regular, unadorned list. For a combo box, fmListStyleOption displays an option button to the left of each entry, allowing the user to select one item from the list. For a list box, fmListStyleOption displays option buttons for a single-select list and check boxes for a multiselect list. +
ListWidth | (Combo box only.) A Variant property that sets or returns the width of the list in a combo box. The default value is 0, which makes the list the same width as the text area of the combo box. +
MatchEntry | Determines which type of matching the combo box or list box uses when the user types characters with the focus on the combo box or list box. fmMatchEntryFirstLetter (0) matches the next entry that starts with the letter or character typed: if the user types _t_ twice, VBA selects the first entry beginning with _t_ and then the second entry beginning with _t_. fmMatchEntryComplete (1) matches each letter the user types: if the user types _te_, VBA selects the entry that starts with _te_. fmMatchEntryNone (2) specifies no matching: the user can't select an item by typing in the list box or combo box but must use the mouse or the arrow keys instead. The default MatchEntry setting for a combo box is fmMatchEntryComplete. The default setting for a list box is fmMatchEntryFirstLetter. +
MatchRequired | (Combo box only.) A Boolean property determining whether the user must select an entry from the combo box before leaving the control (True) or not (False). This property is useful for making sure that if the user types a partial entry into the text-box area of the combo box, they don't forget to complete the selection in the drop-down list area. If MatchRequired is True and the user tries to leave the combo box without making a selection, VBA displays an "Invalid Property Value" message box. +
MultiSelect | (List box only.) Controls whether the user can make a single selection in the list or multiple selections. fmMultiSelectSingle (0) lets the user select only one item. fmMultiSelectMulti (1) lets the user select multiple items by clicking with the mouse or by pressing the spacebar. fmMultiSelectExtended (2) lets the user use Shift+click, Ctrl+click, and Shift with the arrow keys to extend or reduce the selection.
+
RowSource | A String property that specifies the source of a list to be displayed in a combo box or a list box. +
SelectionMargin | See Table 14.3. +
ShowDropButtonWhen | See Table 14.3. +
### CheckBox +
Check boxes are similar to option buttons—a set of choices presented to the user. However, option buttons permit the user to select only one from among the displayed options (like a set of radio pushbuttons). By contrast, users can select as many check boxes as they wish. +
Most of the properties of the CheckBox control have been discussed already. The key property of the CheckBox that you haven't come across yet is TripleState, which is a feature of the OptionButton and ToggleButton controls as well. +
TripleState is a Boolean property that determines whether the check box, option button, or toggle button can have a null state as well as True and False states. When a check box or other control is in the null state, it appears with a small black square in its box. (You'll find a short code sketch at the end of this section that shows how to read all three states.) +
You can see the null state in the Font dialog box in Word when one of the check-box-controlled properties—such as the Strikethrough check box in Figure 14.10—is true for _some_ but not all of the current selection. For example, select a word (or any amount of selected text) that is only partly struck through, and you trigger the null state for the Strikethrough check box, as shown in Figure 14.10. Normally, a check box is either checked or not, but when in a null state, it contains a small black square, indicating it's neither true nor false. (In earlier versions of Office, the null state in a check box was indicated by filling the box with gray or black.) +
Figure 14.10 By setting the TripleState property of a check box to True, you can display a check box in a null state. Here Word's Font dialog box shows the Strikethrough check box in a null state (containing a small black square, but not checked). +
A couple of properties described briefly in the context of other controls deserve more detail here: +
 * The SpecialEffect property controls the visual appearance of the check box. The default value is fmButtonEffectSunken (2), which displays a sunken box—the norm for 3D Windows dialog boxes. You can also choose fmButtonEffectFlat (0) to display a box with a flat effect, but why? To me, it doesn't look as good as the default 3D, shadowed box. The flat version is less subtle; crude, actually. But it fits in well with the new "Modern" aesthetic promoted by Microsoft in Windows 8—no gradients, opacity, dimensional effects, shadows, subtle colors, highlights, reflections, serif typefaces, and so on. In other words, flatland. +
Figure 14.11 shows a sunken check box and a flat check box. The Value property, which indicates whether the check box is selected (True) or cleared (False), is the default property of the check box. Recall that the default property need not be specified in code; it's assumed. Thus, you can write either CheckBox1.Value or just CheckBox1. The following three statements have the same effect: +
    If CheckBox1.Value = True Then
    If CheckBox1 = True Then
    If CheckBox1 Then +
Figure 14.11 Use the SpecialEffect property to display a flat check box (bottom) rather than the traditional sunken check box. +
 * The Accelerator property provides quick access to the check box. Assign a unique accelerator key to each check box so that the user can swiftly toggle it on and off from the keyboard.
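Before moving on, here's a minimal sketch of how the three states of a TripleState check box come back to your code (the chkStrike and btnApply names are just examples, and the sketch assumes the check box's TripleState property has been set to True). Because a comparison with Null never returns True, the sketch uses VBA's built-in IsNull function to detect the null state before testing the two definite states: +
    Private Sub btnApply_Click()
        'chkStrike has TripleState set to True, so its Value can be
        'True, False, or Null. Test for Null first with IsNull;
        'comparing Null to True with = won't work.
        If IsNull(chkStrike.Value) Then
            MsgBox "The check box is in the null (mixed) state."
        ElseIf chkStrike.Value = True Then
            MsgBox "The check box is selected."
        Else
            MsgBox "The check box is cleared."
        End If
    End Sub +
You'll see more about testing for the null state in the sidebar "You Can't Directly Test for a Null Value" later in this chapter.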
+ +### OptionButton + +A _group_ of OptionButtons provides a set of mutually exclusive options from which the user can choose. Only one of the buttons in a group can be selected. For instance, you could have two OptionButtons under the heading Sex: Male and Female. (Recall that a set of CheckBoxes permits multiple options to be chosen simultaneously. CheckBoxes are useful for choosing more complex options. For example, under the heading Typeface, you could have Italic, Bold, and Underlined options, all of which could be selected simultaneously.) + +Like the CheckBox, the OptionButton control has a straightforward set of properties, almost all of which you've seen already in this chapter. This section shows you the GroupName property, which is unique to the OptionButton, and some of the key properties for working with option buttons. + +The GroupName property is a String data type that assigns the option button to a group of option buttons. Alternatively, you can create a group by placing a set of option buttons on a Frame control. The key idea here is that, once grouped, the buttons become mutually exclusive. However, there can be more than one group (or set) on a form—as long as you employ a Frame control or the GroupName property to isolate the various groups of buttons. + +The default setting for GroupName is a blank string (""), which means that an option button isn't assigned to a group until you explicitly assign it. When you enter the group name, the group is created. By using the GroupName property, you can have multiple groups of option buttons on the same form without using frames to specify groups, but you must somehow distinguish the logical groups of option buttons from each other so that the user can tell which option buttons constitute a group. Using a Frame control is the easiest way of segregating groups of option buttons both visually and logically—but it's useful to have the flexibility that GroupName provides when you need it. Also, a Frame has a built-in Caption property you can use to describe the group's purpose. + +These are the other key properties of the OptionButton control: + + * The Value property, which indicates whether the option button is selected (True) or cleared (False), is the default property of the option button. So you can set or return the state of the option button by setting either the OptionButton object or its Value to True or False, as appropriate. Setting the Value of one OptionButton to True sets the Value of all other OptionButton controls in the same group or frame to False. + * The Accelerator property provides quick access to the option button. Assign a unique accelerator key to each option button so that the user can toggle it on and off from the keyboard. + * The SpecialEffect property controls the visual appearance of the option button. The default value of fmButtonEffectSunken (2) displays a sunken button, while fmButtonEffectFlat (0) displays a flattened button. Figure 14.11 shows a sunken option button and a flat option button. + * The TripleState property (discussed in the previous section, "CheckBox") lets you create an option button that has three states: selected (True), cleared (False), and null (which appears selected but grayed out). The TripleState property is disabled so that the user can't set the null state interactively, but you can set it programmatically as needed. + +### ToggleButton + +When it's not selected, the ToggleButton control appears raised, but it looks pushed in when it's selected. 
The key properties for the ToggleButton control are the same as those for the CheckBox and CommandButton: + + * The Value property is the default property of the ToggleButton. + * The TripleState property lets you create a ToggleButton that has three states: selected (True), cleared (False), and null. The user can set a triple-state ToggleButton to its null state by clicking it. In its null state, a ToggleButton appears selected, but gray. + * The Accelerator property provides quick access to the toggle button. + +### Frame + +The Frame control is relatively straightforward, but it has several properties worth mentioning; they're shown in Table 14.5. The Frame control shares a couple of these properties with the Page object. + +Table 14.5 Properties of the Frame control + +**Property** | **Description** +---|--- +Cycle | Determines the action taken when the user leaves the last control in the frame or on the page. fmCycleAllForms (0) moves the focus to the next control in the tab order for the user form or page, whereas fmCycleCurrentForm (2) keeps the focus within the frame or on the page until the focus is explicitly moved to a control in a different frame or on a different page. This property applies to the Page object as well. +InsideHeight | A read-only property that returns the height (measured in points) of the area inside the frame, not including the height of any horizontal scroll bar displayed. This property applies to the Page object as well. +InsideWidth | A read-only property that returns the width (in points) of the area inside the frame, not including the width of any vertical scroll bar displayed. This property applies to the Page object as well. +KeepScrollBarsVisible | A property that determines whether the frame or page displays horizontal and vertical scroll bars when they aren't required for the user to be able to navigate the frame or the page. fmScrollBarsNone (0) displays no scroll bars unless they're required. fmScrollBarsHorizontal (1) displays a horizontal scroll bar all the time. fmScrollBarsVertical (2) displays a vertical scroll bar all the time. fmScrollBarsBoth (3) displays a horizontal scroll bar and a vertical scroll bar all the time. fmScrollBarsNone is the default for the Frame object, and fmScrollBarsBoth is the default for the Page object. This property applies to the Page object as well. +PictureTiling | A Boolean property that determines whether a picture displayed on the control is tiled (True) so that it takes up the whole area covered by the control or not (False). To set the tiling pattern, you use the PictureAlignment and PictureSizeMode properties. This property applies to the Page object and the Image control as well. +PictureSizeMode | Determines how to display the background picture. fmPictureSizeModeClip (0), the default setting, crops (removes) any part of the picture too big to fit in the page, frame, or image control. Use this setting to show the picture at its original dimensions and in its original proportions. fmPictureSizeModeStretch (1) stretches the picture horizontally or vertically to fill the page, frame, or image control. This setting is good for colored backgrounds and decorative effects but tends to be disastrous for pictures that need to be recognizable; it also overrides the PictureAlignment property setting. 
fmPictureSizeModeZoom (3) zooms the picture proportionately until the horizontal dimension or the vertical dimension reaches the edge of the control but doesn't stretch the picture so that the other dimension is maximized as well. This is good for maximizing the size of a picture while retaining its proportions, but you'll need to resize the nonmaximized dimension to remove blank spaces. This property applies to the Page object and the Image control as well. +PictureAlignment | Determines where a picture is located. fmPictureAlignmentTopLeft (0) aligns the picture with the upper-left corner of the control. fmPictureAlignmentTopRight (1) aligns the picture with the upper-right corner of the control. fmPictureAlignmentCenter (2), the default setting, centers the picture in the control (both horizontally and vertically). fmPictureAlignmentBottomLeft (3) aligns the picture with the lower-left corner of the control. fmPictureAlignmentBottomRight (4) aligns the picture with the lower-right corner of the control. This property applies to the Page object and the Image control as well. + +### CommandButton + +The CommandButton is used quite often. This control has three unique properties, listed in Table 14.6. + +Table 14.6 Unique properties of the CommandButton control + +**Property** | **Description** +---|--- +Cancel | A Boolean property that determines whether the command button is the Cancel button for the user form (True) or not (False). The Cancel button for a user form can bear any name; what distinguishes it is that its Cancel property is set to True. The Cancel button is activated by the user's pressing Esc, or clicking the button, or putting the focus on the button and pressing Enter. Only one command button on a form can be the Cancel button at any given time. Setting the Cancel property for a command button to True causes VBA to set the Cancel property to False for any button for which it was previously set to True. +Default | A Boolean property that determines whether the command button is the default button for the user form (True) or not (False). Only one command button on a form can be the default button at any given time. Setting the Default property for a command button to True causes VBA to set the Default property to False for any button for which it was previously set to True. The default button is activated by the user pressing Enter when the focus isn't on any other command button. +TakeFocusOnClick | A Boolean property that determines whether the command button takes the focus when the user clicks it (True) or not (False). The default setting for this property is True, but you may want to set it to False when you need the focus to remain on another control in the user form even when the user clicks the command button. However, if the user uses the Tab key or the arrow keys to move to the command button, the command button will take the focus even if the TakeFocusOnClick property is set to False. + +Note that it's useful to set the Accelerator property for each command button on a form. This way, the user can quickly access it from the keyboard. + +* * * + +Sometimes the Cancel Button Should Be the Default Button + +Sometimes you'll be tempted to make the Cancel button the default on a form. This offers an obvious benefit for forms that offer irreversible actions, such as deleting text or deleting a file, but it can confuse accessibility aids (such as screen readers) and make it difficult for users with cognitive difficulties to work with the form. 
For these reasons, it's usually best to make the default button on a form a different button than the Cancel button. +
* * * +
### TabStrip and MultiPage +
The TabStrip and MultiPage controls allow you to create a tabbed, multipage dialog box. To see what tabbed pages look like, click the Home tab in Word and then click the small arrow icon in the lower-right corner of the Font area on the Ribbon. Word's Font dialog box will open and you'll see a two-tab dialog box. One tab is labeled _Font_ and the other tab is labeled _Advanced_. This is a good way to organize a dialog box when you have quite a few options to present to the user. +
The TabStrip control has several unique properties and a number of properties that it shares with the MultiPage control. Table 14.7 lists these properties. +
Table 14.7 Properties of the TabStrip and MultiPage controls +
**Property** | **Description** +
---|--- +
ClientHeight | (Tab strip only.) A Single (data type) property that sets or returns the height of the display area of the tab strip, measured in points. +
ClientLeft | (Tab strip only.) A Single property that returns the distance, measured in points, between the left border of the tab strip and the left border of the control inside it. +
ClientTop | (Tab strip only.) A Single property that returns the distance, measured in points, between the top border of the tab strip and the top border of the control inside it. +
ClientWidth | (Tab strip only.) A Single property that sets or returns the width of the display area of the tab strip, measured in points. +
SelectedItem | Sets or returns the tab currently selected in a tab strip or the page currently selected in a MultiPage control. +
TabFixedHeight | A Single property that sets or returns the fixed height of the tabs, measured in points. Set TabFixedHeight to 0 to have the tabs automatically size themselves to fit their contents. +
TabFixedWidth | A Single property that sets or returns the fixed width of the tabs, measured in points. Set TabFixedWidth to 0 to have the tabs automatically size themselves to fit their contents. +
TabOrientation | Determines the location of the tabs in the tab strip or multipage. fmTabOrientationTop (0), the default, displays the tabs at the top of the tab strip or multipage. fmTabOrientationBottom (1) displays the tabs at the bottom of the tab strip or multipage. fmTabOrientationLeft (2) displays the tabs at the left of the tab strip or multipage, and fmTabOrientationRight (3) displays the tabs at the right of the tab strip or multipage. +
### ScrollBar and SpinButton +
A SpinButton allows the user to easily increment or decrement numbers, dates, and so on. The ScrollBar and SpinButton share a number of properties that you haven't yet encountered. Table 14.8 lists these properties. +
Table 14.8 Properties of the ScrollBar and SpinButton controls +
**Property** | **Description** +
---|--- +
Delay | A Long (data type) property that sets the delay in milliseconds between clicks registered on the control when the user clicks and holds down the mouse button. The default delay is 50 milliseconds. The control registers the first click immediately, the second click after Delay x 5 (the extra delay is to assist the user in clicking only once), and the third and subsequent clicks after Delay. +
LargeChange | (Scroll bar only.) A Long property that determines how much the item is scrolled when the user clicks in the scroll bar between the thumb (the small square within the scroll bar) and the scroll bar's arrow. Set the LargeChange property after setting the Max and Min properties of the scroll bar.
+SmallChange | A Long property that determines how much movement occurs when the user clicks a scroll arrow in a scroll bar or spin button. SmallChange needs to be an integer value; the default value is 1. +Max | A Long property that specifies the maximum value for the Value property of the scroll bar or spin button. Max must be an integer. The default value is 1. +Min | A Long property that specifies the minimum value for the Value property of the scroll bar or spin button. Min must be an integer. The default value is 1. +ProportionalThumb | (Scroll bar only.) A Boolean property that determines whether the thumb is a fixed size (False) or is proportional to the size of the scrolling region (True), thereby giving the user an approximate idea of how much of the scrolling region is currently visible. The default setting is True. + +### Image + +By now, you've seen all the properties of the Image control. Most of the time when you use an Image control, you'll want to adjust the following properties: + + * Use the Picture property to assign the picture file you want to appear in the Image control. Click in the Picture row in the Properties window, and then click the ellipsis button (...) that the text box displays. In the Load Picture dialog box, select the picture and click the OK button to add it. The Image control can display .BMP, .CUR (cursor), .GIF, .ICO (icon), .JPG, and .WMF files, but not other graphics files, such as .TIF. Most graphics applications, however, can easily convert one graphics file type into another. + +* * * + +An Easy Way to Capture a Graphic Image + +The easiest way to display part of a Windows screen in an Image control is to capture it by pressing the Print Screen key (to capture the entire screen) or the Alt+Print Screen key combination (to capture the currently active window). Then paste it into an application such as the Windows Paint accessory, trim (crop) it there as necessary, and save it as a .BMP file. Windows 8 provides a third option: Press the Windows key plus the Print Screen key to capture and automatically save the screen to disk. The captured image will be saved in your Libraries folder in a subfolder named Screenshots. The image is saved as a .PNG graphics filetype—widely considered to be the best way to grab screen images. + +* * * + + * Use the PictureAlignment property to set the alignment of the picture. + * Use the PictureSizeMode property to set whether the picture is clipped, stretched, or zoomed to fill the Image control. Adjust the height and width of the Image control as necessary. + * Use the PictureTiling property if you need to tile the image to take up the full space in the control. + +### Page + +The Page object is one of the pages contained within a MultiPage object. You've already seen all its properties (in the context of other controls) except for the Index property, which it shares with the Tab object. + +The Index property is an Integer data type that determines the position of the Page object in the Pages collection in a MultiPage control or the position of a Tab object in the Tabs collection in a TabStrip. The first Page object or Tab object is numbered 0 (zero); the second Page or Tab object is numbered 1; and so on. You can change the Index property of a tab or page to change the position in which the tab or page appears in the collection. + +### Tab + +The Tab object is one of the tabs contained within a TabStrip object. You've already learned about all its properties in the context of other controls. 
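To make the Index property concrete, here's a minimal sketch that lists each page's position and caption and then moves one page to the front by assigning it a new Index (the MultiPage1, pgLast, and btnListPages names are just examples): +
    Private Sub btnListPages_Click()
        'Walk the Pages collection; Index numbering starts at 0
        Dim pg As MSForms.Page
        Dim strMsg As String
        For Each pg In MultiPage1.Pages
            strMsg = strMsg & pg.Index & ": " & pg.Caption & vbCr
        Next pg
        MsgBox strMsg
        'Assigning a new Index repositions the page in the collection
        MultiPage1.Pages("pgLast").Index = 0  'pgLast becomes the first page
    End Sub +
The same numbering applies to the Tabs collection of a TabStrip, so you can reorder tabs in the same way.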
+ +## Working with Groups of Controls + +As mentioned briefly earlier in this chapter, when you are designing a form, it's often handy to group controls. By grouping two or more controls, you can work with them as a single unit to size, reposition, format, or delete them. (Recall that this form-design grouping technique has nothing to do with creating a set of option buttons within a Frame control. That creates a mutually exclusive collection of radio buttons to display to the user during runtime.) + +### Grouping Controls + +To group controls, select them by Shift+clicking, Ctrl+clicking, or dragging around them, and then right-click and choose Group from the context menu. Alternatively, select the controls, and then click the Group button on the UserForm toolbar (you'll need to display this toolbar—it's not displayed by default) or choose Format ⇒ Group. VBA creates a new group containing the controls and places a shaded border with handles around the whole group, as shown on the right in Figure 14.12. + +Figure 14.12 You can work with multiple controls simultaneously by grouping them. VBA indicates a group of controls by placing a border around the entire group, as shown on the right. + +When you merely select a set of controls (by Shift+clicking, Ctrl+clicking, or dragging around them), you have only temporarily grouped them. You can still manipulate them as a group, but as soon as you deselect them—by, for example, clicking the background of the form itself—the grouping disappears. However, when you right-click and choose Group from the context menu, they will remain grouped until you right-click and choose Ungroup. + +### Ungrouping Controls + +To ungroup controls, right-click any of the controls contained in the group and then choose Ungroup from the context menu. Alternatively, select the group of controls by clicking in any control in the group and then click the Ungroup button on the UserForm toolbar, or choose Format ⇒ Ungroup. VBA removes the shaded border with handles from around the group and displays the normal border and handles around each individual control. + +### Sizing Grouped Controls + +You can quickly size all controls in a group by selecting the group and then dragging the sizing handles on the surrounding border. For example, you could select the middle handle on the right side and drag it inward to shorten the controls, as shown in Figure 14.13. The controls will be resized proportionately to the change in the group outline. + +Figure 14.13 You can resize all the controls in a group by dragging a sizing handle on the surrounding border. + +When the controls are grouped, you can then use the Properties window to quickly modify any properties they have in common (such as Font). But resizing a group can present problems—the results can be ugly. Generally speaking, resizing works fine when you've grouped a number of controls of the same type, as in Figure 14.13. For example, sizing a group that consists of several command buttons or option buttons works well, whereas sizing a group that consists of a text box, a command button, and a combo box is seldom a good idea. + +### Deleting Grouped Controls + +You can quickly delete a whole group of controls by right-clicking any of them and choosing Delete from the context menu or by selecting the group and pressing the Delete key. + +### Working with One Control in a Group + +Even after you've grouped a number of controls, you can still work with them individually if necessary. 
To do this, first click any control in the group to select the group as a whole, as shown on the left in Figure 14.14. Then click the control you want to work with. As shown on the right in Figure 14.14, VBA displays a dark shaded border around the group (indicating that the group still exists) and displays the lighter shaded border around the individual control, indicating that that control is selected. + +Figure 14.14 To work with one control in a group, start by selecting the group (as shown on the left) and then select the control (as shown on the right). + +You can then modify the selected individual control as if it were not grouped. Change its ForeColor property to blue, for instance, and only the caption in that particular control will turn blue. When you've finished working with it, click another control in the group to individually select it, or click elsewhere in the user form to deselect all individual controls and restore the group. + +## Aligning Controls + +Even if you use the Snap To Grid feature, you'll often need to align controls manually. They must be ungrouped for this feature to work. The easiest way to align controls is to select two or more, then right-click in any one of them and choose an option from the Align submenu: Lefts, Centers, Rights, Tops, Middles, Bottoms, or To Grid. These options work as follows: + +**Lefts** aligns the left borders of the controls. + +**Centers** aligns the horizontal midpoints of the controls. + +**Rights** aligns the right borders of the controls. + +**Tops** aligns the tops of the controls. + +**Middles** aligns the vertical midpoints of the controls. + +**Bottoms** aligns the bottoms of the controls. + +**To Grid** aligns the controls to the grid. + +VBA aligns the borders or midpoints to the current position of that border or midpoint on the dominant control—the control that has white sizing handles around it rather than black sizing handles. After selecting the controls you want to align, make dominant the one that is already in the correct position by clicking it so that it takes on the white sizing handles. Then choose the alignment option you want. + +* * * + +Ensure That You Choose Appropriate Alignment Options + +Make sure the alignment option you choose makes sense for the controls you've selected. VBA will happily align controls in an inappropriate way if you tell it to. For example, if you select a number of option buttons or text boxes and choose Tops from the Align submenu, VBA will obligingly stack all the controls on top of each other, rendering them unusable. (To recover from such minor mishaps, press Ctrl+Z.) + +* * * + +## Placing Controls + +The VBA Editor offers several placement commands on the Format menu: + + * On the Format ⇒ Make Same Size submenu, use the Width, Height, and Both commands to make two or more controls the same size in one or both dimensions. + * Use the Format ⇒ Size To Fit command to have VBA decide on a suitable size for an element based on the size of its label. This works well for, say, a toggle button with a medium-length label, but VBA will shrink an OK button to a size so small as to be unusable. + * Use the Format ⇒ Size To Grid command to increase or decrease the size of a control to the nearest gridpoints. + * On the Format ⇒ Horizontal Spacing and Format ⇒ Vertical Spacing submenus, use the Make Equal, Increase, Decrease, and Remove commands to set the horizontal spacing and vertical spacing of two or more controls. 
The Remove option removes extra space from between controls, which works well for, say, a vertical series of option buttons (which look good close together) but isn't a good idea for command buttons (which need a little space between them). + * On the Format ⇒ Center In Form submenu, use the Horizontally and Vertically commands to center a control or a group of controls in the form. Centering controls vertically is seldom a good idea, but you'll often want to center a frame or a group of command buttons horizontally. + * On the Format ⇒ Arrange Buttons submenu, use the Bottom and Right commands to reposition command buttons in a form quickly. + +## Adjusting the Tab Order of a Form + +The _tab order_ of a user form (or of a frame control within a form) is the order in which VBA selects controls in the form or frame when the user moves through them by pressing the Tab key (to move forward) or the Shift+Tab key combination (to move backward). + +Put another way, it's a Windows convention that when the user presses the Tab key, the _focus_ moves to the next control in a window. + +Only one control at a time can have the focus. For example, if a form has five text boxes, only one of these text boxes, the one that currently has the focus, will display characters as the user types. In addition, a button in a set of buttons can also have the focus, and when the user presses the Enter key, the button with the focus will be triggered. Or the user can click a different button to move the focus to that button. + +VBA displays a visual cue to indicate which control currently has the focus. You'll see a dotted frame around a button or option button and a blinking insertion cursor in a text box. + +Each frame you add to a user form has a separate tab order for the controls it contains: The frame itself appears in the tab order for the form, and the controls within the frame appear in the tab order for the frame. + +Set the tab order for a form or a frame to make it as easy as possible for the user to work with your form. Generally, for English-speaking users, it's best to arrange the tab order from left to right and from top to bottom in the dialog box or frame. For international users, you may want to arrange the tab order from right to left. You may also need to arrange the tab order to move from one control to a related control that would not normally be next in the tab order. + +The whole point of managing the tab order is that you simplify things for your user. Employing the Tab key in this way allows the user to fill in a whole form without once having to move their hand off the keyboard to keep selecting, with a mouse click, each next text box. + +This kind of tabbing is particularly useful when the user is asked to fill in several fields by typing into multiple text boxes (such as Name, Address, Phone, and so on). As soon as users finish filling in one field, they can press Tab to move on to the next. (Even easier, pressing the Enter key while in a text box moves users to the next control in the tab order.) At the end, after they've filled in the last field, they can quickly close the dialog box if you make the OK button the next control in the tab order. + +VBA assigns the tab order to the controls in a dialog box or frame on a first-come, first-served basis as you add the controls. 
Unless you add all the controls in perfect order, this default order will seldom produce the optimal tab order for a dialog box, so usually you'll want to adjust the tab order—or at least check to ensure that it's right. You're likely to place fewer controls on a frame than on a form, so you have a better chance of adding them in a suitable order, but you should check these controls too before unleashing the dialog box on users. + +Just press F5 and then repeatedly press the Tab key to examine your current tab order. Alternatively, you can open a Tab Order dialog box (shown in Figure 14.15) by right-clicking in the open space in the background of the form or frame and choosing Tab Order from the context menu. Or select the user form or frame and then choose View ⇒ Tab Order. + +Figure 14.15 Use the Tab Order dialog box to arrange the controls in your user form or frame into a logical order for the user. + +The time to adjust the tab order is after you've finished creating your form (adding a control later will require that you go back and modify the tab order). Here's how to change the tab order in a dialog box or frame: + +1. Rearrange the controls into the order in which you want them to appear by selecting them in the Tab Order list box and clicking the Move Up button or Move Down button as appropriate. You can Shift+click or drag to select a range of controls, or Ctrl+click to select two or more noncontiguous controls. (Or just change the controls' TabIndex properties in the Properties window.) + +2. Click the OK button to close the Tab Order dialog box. + +# Linking a Form to a Procedure + +Designing a custom form is only the first step in getting it to work in a procedure. The other step is writing the code to display the form to the user and make it perform its tasks. + +Typically, the code for a form consists of the following: + + * A macro procedure that displays the dialog box by loading it and using the Show method. Usually, this procedure can be assigned to a Quick Access Toolbar button or to a shortcut key combination so that the user can conveniently invoke it. However, a procedure can also be designed to run automatically in response to a system event (such as running at a specified time or when a worksheet is opened). + * The user form that represents the form and its controls. + * The code attached to the user form. This code consists of procedures for designated controls. For example, for a simple dialog box containing two option buttons and two command buttons (an OK button and a Cancel button), you'd typically write one procedure for the OK button and one for the Cancel button. The procedure for the OK button is executed when the user either clicks the button with the mouse or presses the Enter key while the focus is on that button. Either of these user actions triggers the button's Click event, and whatever code you, the programmer, have written within this event is then executed. Remember that the easiest way to create an event (procedure) for a control is to just double-click the control on the form. The editor then switches to Code view and writes the necessary Sub...End Sub envelope for that event, like this: + + Private Sub btnOK_Click() + + End Sub + +Notice that the Editor automatically combines the Name property of the control with the name of the event as the procedure's name, separated by an underscore character: btnOK_Click. + +* * * + +In Static Dialog Boxes, Click Events Are Usually Employed with Command Buttons + +Most controls have quite a few events. 
Some of them might seem inappropriate or useless at first. For example, option buttons have a Click event. But why? It makes sense to _trap_ (to respond in code to an event such as a user's mouse click) using command buttons in a static dialog box. (A static dialog box is the most common type. The controls don't change or move.) However, as you'll see in the next chapter, in a dynamic dialog box, you may want to trap the click on an option button and display further controls to get additional input from the user. + +* * * + +Once the code attached to a button has run, execution returns to the form (if it's still displayed) or to the procedure that called the form. + +Note that code that runs directly in response to an event is called an _event procedure_ or _event handler_. An event procedure can call other procedures as necessary, so multiple procedures can be run indirectly when a single event handler Sub is triggered. + +## Loading and Unloading a Form + +You load a form by using the Load statement, and unload it by using the Unload statement. The Load statement loads the form into memory so that it's available to the program but doesn't display the form; for that you use the Show method (discussed in the next section). The Unload statement unloads the form from memory and releases any memory associated with that object. If the form is displayed when the Unload statement runs, VBA removes the form from the screen. + +The syntax for the Load and Unload statements is straightforward: + + Load _UserForm1_ + Unload _UserForm1_ + +Here, _UserForm1_ is the name of the user form or dialog box. For example, the following statement loads the dialog box named frmMyDialog: + + Load frmMyDialog + +## Displaying and Hiding a Form + +To display a form, you use the Show method; to hide a form, you use the Hide method. For example, the following statement displays the form named frmMyDialog: + + frmMyDialog.Show + +If you execute a procedure containing this line, the frmMyDialog form appears onscreen so the user can interact with it: enter text in its text boxes, select or clear its check boxes, use its drop-down lists, click its buttons, and so on. + +When the user closes the form (by clicking the Close button on its title bar or by clicking a command button that dismisses it), the form disappears from the screen and the procedure continues to run. But until you retrieve settings from the form and take action on them, the form has no effect beyond its graphical display. + +You can display a form by using the Show method without explicitly loading the form with a Load command first; VBA takes care of the implied Load command for you. There's no particular advantage to including the Load command, but it might make your code easier to read and to debug. For example, the two procedures shown here have the same effect: + + Sub Display_Dialog() + Load frmMyDialog 'loads the form into memory + frmMyDialog.Show 'displays the form + End Sub + + Sub Display_Dialog() + frmMyDialog.Show 'loads the form into memory and displays it + End Sub + +If you run a Hide method without having loaded the form into memory by using the Load statement or the Show method, VBA loads the form but does not display it onscreen. + +Once you've displayed the form, take a moment to check its tab order by pressing F5 and then moving through it using the Tab key. When you first open the form, is the focus on the appropriate control, the control the user is most likely to want to interact with first? 
When you move forward from that control, is the next control that is selected the next control that the user will typically need to use? Adjust the tab order as necessary, as described in "Adjusting the Tab Order of a Form" earlier in this chapter. +
## Setting a Default Command Button +
To specify a default command button in a form, set that command button's Default property to True. VBA selects the default button when it displays the form so that if the user simply presses the Enter key to dismiss the dialog box, this button receives the keystroke. +
Only one button can be the default button at any given time. If you set the Default property of any button to True, VBA automatically changes to False the Default property of any other button previously set to True. +
# Retrieving the User's Choices from a Dialog Box +
To make a form do something, your code will usually respond to the user's input. The following sections first cover the VBA commands for retrieving information from a dialog box. Then you'll see how to retrieve the user's choices, first from a relatively simple dialog box and then from a more complex form. +
## Returning a String from a Text Box +
To _return_ (retrieve) a string from a text box, your code can check its Value property or Text property after the user has clicked an OK or Cancel button or otherwise dismissed the dialog box. +
For example, if you have a text box named txtMyText, you could return its value and display it in a message box by using the following line: +
    MsgBox txtMyText.Value +
* * * +
The Text Property of a Text Box Is Unique +
For a text box, the Value property and the Text property return the same information; for most other VBA objects, the Value property and the Text property return different information. +
* * * +
Recall that VBA supports both one-line and multiline text boxes. To create a multiline text box, select the text box in the user form or in the drop-down list in the Properties window and set its MultiLine property to True. The user can then enter multiple lines in the text box and start new lines by pressing Shift+Enter. +
* * * +
Quick Changes for Two-State Properties +
Here's a tip: If you're changing a Boolean (two-state, True versus False) property—like Enabled, Visible, or MultiLine—just double-click the value in the Properties window. For example, to change the default False setting for MultiLine, double-click False in the Properties window. It changes to True. (This doesn't work with the Value property.) +
* * * +
To add a horizontal or vertical scroll bar to a text box, set its ScrollBars property to 1 - fmScrollBarsHorizontal (for a horizontal scroll bar), 2 - fmScrollBarsVertical (for a vertical scroll bar, which is usually more useful), or 3 - fmScrollBarsBoth (for both). +
## Returning a Value from an Option Button +
A regular option button is a binary control, so it can have only two values: True and False. True indicates that the button is selected, False that it's unselected. You can check an option button's value with a simple If... Then structure.
For example, if you have two option buttons, named optSearchForFile and optUseThisFile, you can check their values and find out which was selected by using the following code: +
    If optSearchForFile = True Then
        'optSearchForFile was selected; take action on this
    Else 'optSearchForFile was not selected, so optUseThisFile was
        'take action for optUseThisFile
    End If +
Remember that Value is the default property of the OptionButton control. The previous code checks the value of the default property of the control, so you need not specify the property in your code. Default properties can be omitted as a kind of shorthand programming. The first line of code could be written out more fully as If optSearchForFile.Value = True Then. In the code example, I chose the shorter form, leaving the Value property implied. You could be more succinct still and leave = True implied as well: If optSearchForFile Then. +
With more than two option buttons, use an If... Then... ElseIf condition or a Select Case statement to determine which option button is selected. +
* * * +
You Can't Directly Test for a Null Value +
This is a bit esoteric, but as you saw earlier in this chapter, an option button or a check box can also have a null value if its TripleState property is set to True. Null means basically "neither completely true nor false"—the selected paragraph is _partially_ boldface, so its FontStyle is both bold and regular. If you allow your option buttons or check boxes to have a null state, you'll need to check for that as well in your procedures. You can't directly check for the control's value being Null (a comparison such as opt1.Value = Null yields Null, never True), so use the IsNull function, or use an If statement or Select Case statement to test True and False first. If the Value of the control is neither True nor False, it must be Null. +
* * * +
## Returning a Value from a Check Box +
Like an option button, a regular check box can only be either True or False, so you can use an If... Then structure to check its value. Here's an example: +
    If chkDisplayProgress = True Then
        'take actions for chkDisplayProgress
    End If +
Again, you're checking the default property of the control here—the Value property. The first line of code could also be written as If chkDisplayProgress.Value = True Then. +
Sometimes you'll need to take an action if the check box was cleared (deselected) rather than selected. For example, if the user clears the check box, you may need to turn off a configuration option. +
## Returning a Value from a List Box +
List boxes start out empty. So, before you can ask the user to choose an item in a list box, you must first fill the box with items from which the user can choose—you must tell VBA which items to display. To do so, you create a procedure to _initialize_ (prepare) the user form and add the items to the list box before displaying it: +
1. Right-click the name of the user form in the Project Explorer and choose View Code from the context menu to display (in the Code window) the code for the controls assigned to the dialog box. Or you can just double-click somewhere in the background on the user form to go to Code view. Recall that you can toggle between the Code window (press F7) and the form-design window (Shift+F7). +
2. In the Object drop-down list (on the top left of the Code window), make sure UserForm is selected. +
3. Choose Initialize from the Procedure drop-down list (on the top right of the Code window).
The Visual Basic Editor creates a new procedure named Private Sub UserForm_Initialize for you, inserting it at the end of the procedures currently displayed in the Code window:

    Private Sub UserForm_Initialize()
    End Sub

Here's a tip: VBA runs a UserForm_Initialize procedure every time the user form is brought to life. This procedure is a good place to add items to a list box or combo box or to set properties of other controls on the user form. In other words, this Initialize event is where you write code to do any necessary preliminary housekeeping before displaying the form to the user.

4. To add items to a list box, you can use the AddItem method for the list box object (here the box is named lstBatteries) with a text string in quotation marks to display the ID number of each battery in the list box:

    lstBatteries.AddItem "Battery #A4601"
    lstBatteries.AddItem "Battery #A4602"
    lstBatteries.AddItem "Battery #A4603"
    lstBatteries.AddItem "Battery #A4604"

* * *

The Initialize Event Is Flexible

By adding items when you initialize the form, you can add different numbers of items as appropriate. For example, if you wanted the user to pick a document from a particular folder, you could create a list of the documents in that folder on the fly in your code during runtime and fill the list box with the documents' names.

* * *

To retrieve the user's choice from a single-select-style list box, check the Value property in your code, as in this example:

    MsgBox "You chose this entry from the list box: " & lstBatteries.Value

Single-select list boxes are like a set of option buttons—the user is allowed to select only one of them.

When you use the MultiSelect property to create a list box capable of multiple selections, you can no longer use the Value property to return the items selected in the list box. When MultiSelect is set to True, Value always returns a null value. Instead, you use the Selected property to determine which rows in the list box are selected and the List property (it's an array) to return the contents (the values) of each selected row.

The following statements use a For... Next loop to build a string named strMsg containing the entries selected from a multiselect list box:

    strMsg = "You chose the following entries from the list box: " & vbCr
    For i = 1 To lstBatteries.ListCount
        If lstBatteries.Selected(i - 1) = True Then
            strMsg = strMsg & lstBatteries.List(i - 1) & vbCr
        End If
    Next i
    MsgBox strMsg

## Returning a Value from a Combo Box

To return a value from a combo box (a control that is, in effect, a combination list box and text box), you add items to the combo box list in an Initialize procedure and then check the Value of the combo box after the user has dismissed the dialog box. (The combo box control doesn't offer multiple-selection capabilities, so Value is the property to check.)

For example, you would use the following code to add items to a combo box named cmbColor:

    Private Sub UserForm_Initialize()
        cmbColor.AddItem "Red"
        cmbColor.AddItem "Blue"
        cmbColor.AddItem "Yellow"
    End Sub

To return the item the user chose in the combo box, retrieve the Value property:

    Result = cmbColor.Value

The item retrieved from a combo box can be either one of the items assigned in the Initialize procedure or one that the user has typed into the text-box portion of the combo box.
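If you want the combo box to start out with a default choice already selected rather than empty, you can also set its ListIndex property in the Initialize procedure. Here's a minimal sketch that extends the cmbColor example above (choosing the first item as the default is just an assumption for illustration):

    Private Sub UserForm_Initialize()
        With cmbColor
            .AddItem "Red"
            .AddItem "Blue"
            .AddItem "Yellow"
            'Preselect the first item ("Red") so Value isn't empty
            .ListIndex = 0
        End With
    End Sub

Because ListIndex is zero-based, 0 selects the first item in the list; the default setting of -1 leaves the combo box empty.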
+ +# Examples of Connecting Forms to Procedures + +The following sections show you two examples of how you can create a procedure and then design a form that works with it to make the procedure more useful and powerful. In the first example, you'll record a macro in Word and then link a form to that code. In the second example, which will work with any VBA-enabled application, you'll create a user form and its associated code from scratch. + +## Word Example: The Move-Paragraph Procedure + +This first example moves the current paragraph up or down within the document by one or two paragraphs in Word. + +### Recording the Procedure + +Start by recording a procedure in Word to move the current paragraph. In the procedure, you need to record the commands for the following actions: + + * Selecting the current paragraph + * Cutting the selection and then pasting it + * Moving the insertion point up and down the document + * Inserting a bookmark, moving the insertion point to it, and then deleting the bookmark + +We want our finished procedure to display a dialog box with option buttons for moving the current paragraph up one paragraph, up two paragraphs, down one paragraph, or down two paragraphs. The dialog box should also include a check box that indicates the user wants the insertion point returned to its original position at the end of the procedure. Because this is presumably desirable default behavior for the procedure, this check box is selected by default. Users can clear the check box if they don't want to return the insertion point to its original position. + +First, start Word and create a new, blank, scratch document (press Ctrl+N), and enter three or four paragraphs of text—just about anything will do, but it'll be easier to have recognizable text so that you can make sure the procedure is moving paragraphs as it should. Then place the insertion point in one of the paragraphs you've just entered and start recording a macro as discussed in Chapter 1, "Recording and Running Macros in the Office Applications": + +1. Click the Record Macro icon on the status bar or the Record Macro icon in the code section of the Ribbon's Developer tab. Either way, you see the Record Macro dialog box. + +2. Type the name for the macro, **Move_Paragraph** , in the Macro Name text box and a description in the Description text box. + +3. Choose a template or document, if necessary, in the Store Macro In drop-down list. (You probably don't want to add this to the global NewMacros module in the Normal.dotm file. Why clutter it up with practice macros?) + +4. If you want, use the Button or Keyboard button to create a Quick Access Toolbar button or keyboard shortcut for the macro. + +5. Click the OK button to start recording the macro. + +Record the following actions in the macro: + +1. Insert a bookmark at the current position of the insertion point by clicking the Bookmark icon in the Links section of the Ribbon's Insert tab. This displays the Bookmarks dialog box. Enter a name for the bookmark, and click the Add button. In this example, the bookmark is named Move_Paragraph_Temp to indicate that it's a temporary bookmark used for the Move_Paragraph procedure. + +2. Select the current paragraph by pressing F8 four times. The first press of F8 activates Extend mode, the second selects the current word, the third selects the current sentence, and the fourth selects the current paragraph. Press the Esc key to turn off Extend mode once the paragraph is selected. + +3. 
Cut the selected paragraph by using one of the variations of the Cut command (for example, press either Ctrl+X or Shift+Delete, or click the Cut icon in the Ribbon's Clipboard section).

4. Move the insertion point up one paragraph by pressing Ctrl+ ↑.

5. Paste the cut paragraph back in by using a Paste command (for example, press Ctrl+V or Shift+Insert, or click the Paste button on the Home tab of the Ribbon).

6. Move the insertion point down one paragraph by pressing Ctrl+ ↓.

7. Move the insertion point up two paragraphs by pressing Ctrl+ ↑ twice.

Note that if you started with the insertion point at the beginning of the first paragraph in the document, you'll only be able to move the insertion point up one paragraph. This doesn't matter—press the keystroke anyway to record it. If Word beeps at you, ignore it.

8. Move the insertion point down two paragraphs by pressing Ctrl+ ↓ twice. (If in doing so you hit the end of the document after the first keystroke, don't worry—perform the second keystroke anyway to record it. Word may sound a beep.)

9. Open the Bookmarks dialog box again (click the Bookmark icon in the Links section of the Ribbon's Insert tab), select the Move_Paragraph_Temp bookmark, and click the Go To button to go to it. Then click the Delete button to delete the Move_Paragraph_Temp bookmark. Click the Close button to close the Bookmarks dialog box.

10. Stop the Macro Recorder by clicking the Stop Recording icon on the status bar or the Stop Recording icon in the code section of the Ribbon's Developer tab.

Open the recorded macro in the Visual Basic Editor by pressing Alt+F8, selecting the macro's name in the Macros dialog box, and clicking the Edit button.

You should see a macro that looks something like this:

    1. Sub Move_Paragraph()
    2. '
    3. ' Move_Paragraph Macro
    4. ' Move a paragraph up or down
    5. '
    6. With ActiveDocument.Bookmarks
    7. .Add Range:=Selection.Range, Name:="Move_Paragraph_Temp"
    8. .DefaultSorting = wdSortByName
    9. .ShowHidden = False
    10. End With
    11. Selection.Extend
    12. Selection.Extend
    13. Selection.Extend
    14. Selection.Extend
    15. Selection.EscapeKey
    16. Selection.Cut
    17. Selection.MoveUp Unit:=wdParagraph, Count:=1
    18. Selection.Paste
    19. Selection.MoveDown Unit:=wdParagraph, Count:=1
    20. Selection.MoveUp Unit:=wdParagraph, Count:=2
    21. Selection.MoveDown Unit:=wdParagraph, Count:=2
    22. Selection.GoTo What:=wdGoToBookmark, Name:="Move_Paragraph_Temp"
    23. ActiveDocument.Bookmarks("Move_Paragraph_Temp").Delete
    24. With ActiveDocument.Bookmarks
    25. .DefaultSorting = wdSortByName
    26. .ShowHidden = False
    27. End With
    28. End Sub

You can probably read this macro code easily enough by now:

 * Line 1 starts the macro, and line 28 ends it. Lines 2 and 5 are blank comment lines around the comment lines showing the macro's name (line 3) and description (line 4).
 * Lines 6 through 10 contain a With statement that adds the Move_Paragraph_Temp bookmark. Lines 8 and 9 are unnecessary here, but the Macro Recorder records all the settings in the Bookmarks dialog box, including the setting for the Sort By option button and the Hidden Bookmarks check box.
 * Lines 11 through 14 use the Extend Selection feature to select the current paragraph, and line 15 records the Esc keypress that turns off Extend mode.
 * Lines 17, 19, 20, and 21 record the syntax for moving the insertion point up and down one paragraph and two paragraphs, respectively.
 * Line 16 records the Cut command and line 18 the Paste command.
 * Line 22 moves the insertion point to the Move_Paragraph_Temp bookmark, and line 23 deletes the bookmark. Lines 24 through 27 again record the settings in the Bookmarks dialog box, which you don't need here either.

If you wish, you can quickly delete unnecessary lines of code, and collapse the first With structure, to create a more succinct, more easily understood version of the code:

    1. Sub Move_Paragraph()
    2. ActiveDocument.Bookmarks.Add Range:=Selection.Range, _
       Name:="Move_Paragraph_Temp"
    3. Selection.Extend
    4. Selection.Extend
    5. Selection.Extend
    6. Selection.Extend
    7. Selection.EscapeKey
    8. Selection.Cut
    9. Selection.MoveUp Unit:=wdParagraph, Count:=1
    10. Selection.Paste
    11. Selection.MoveDown Unit:=wdParagraph, Count:=1
    12. Selection.MoveUp Unit:=wdParagraph, Count:=2
    13. Selection.MoveDown Unit:=wdParagraph, Count:=2
    14. Selection.GoTo What:=wdGoToBookmark, _
        Name:="Move_Paragraph_Temp"
    15. End Sub

### Creating the Dialog Box

Next, create the dialog box for the procedure (see Figure 14.16):

Figure 14.16 The Move Current Paragraph dialog box that you will connect to the Move_Paragraph macro

1. Start a user form in the Visual Basic Editor by clicking the Insert button's drop-down list and choosing UserForm (or just click the Insert button if it's already showing the UserForm icon) or by choosing Insert ⇒ UserForm.

2. Use the Properties window for the user form to set its Name and Caption properties. Click in the cell next to the Name cell and enter the Name property there, and then click in the cell next to the Caption cell and enter the Caption property. The example user form is named frmMoveParagraph and has the caption Move Current Paragraph so that the name of the form is closely related to the text the user will see in the title bar of the dialog box but different from the procedure name (Move_Paragraph).

3. Place two frames in the user form, as shown in Figure 14.17, to act as group containers in the dialog box:

a. Double-click the Frame tool in the Toolbox, and then click and drag in the user form to place each frame.

b. Align the frames by selecting them both and choosing Format ⇒ Align ⇒ Lefts.

c. With the frames still selected, verify that they are the same width by choosing Format ⇒ Make Same Size ⇒ Width. (Don't choose Format ⇒ Make Same Size ⇒ Height or Format ⇒ Make Same Size ⇒ Both. The top frame will need to be taller than the bottom frame.)

d. Caption the top frame **Movement** and the bottom frame **Insertion Point** by selecting each in turn and then setting the Caption property in the Properties window. Then name the top frame **fraMovement** and the bottom frame **fraInsertionPoint**.

Figure 14.17 Start by placing two frames in the user form.

4. Place four option buttons in the Movement frame, as shown in Figure 14.18:

Figure 14.18 Place four option buttons in the Movement frame like this.

a. Double-click the OptionButton tool in the Toolbox, and then click in the Movement frame to place each option button. This time, don't click and drag—just click to place a normal-width option button.

b. When you've placed the four option buttons, click the Select Objects button in the Toolbox to restore the selection pointer. Then select the four option buttons and align them with each other by choosing Format ⇒ Align ⇒ Lefts. Even out any disparities in spacing by choosing Format ⇒ Vertical Spacing ⇒ Make Equal.
If necessary, use the other items on the Format ⇒ Vertical Spacing submenu—Increase, Decrease, and Remove—to adjust the amount of space between the option buttons. (You can do all these things freehand if you prefer by just eyeballing. Drag them around until you have them neatly positioned and sized.) + +c. Change the caption for each option button by setting the Caption property in the Properties window. Caption them as illustrated in Figure 14.18: **Up one paragraph** , **Up two paragraphs** , **Down one paragraph** , and **Down two paragraphs**. These option buttons will control the number of paragraphs the procedure moves the current paragraph. + +d. If you need to resize the option buttons to make all the text in the captions visible, select them and group them by right-clicking and choosing Group from the context menu, by choosing Format ⇒ Group, or by clicking the Group button on the UserForm toolbar. Then select the group and drag one of the handles to resize all the option buttons evenly. For example, to reveal hidden text that's cut off on the right side, drag the handle at the right midpoint of the group outward. + +e. Name the option buttons **optUpOne** , **optUpTwo** , **optDownOne** , and **optDownTwo** , respectively, by changing the Name property of each in turn in the Properties window. + +* * * + +Option Buttons Are Mutually Exclusive + +By default, all the option buttons on a user form (if they're not contained within a frame) are part of the same option group. This means that only one of these option buttons can be selected at any given time. If you want to provide more than one group of option buttons on a user form, you need to specify the separate groups. The easiest way to do this is to position each group within a separate Frame control as you did in this example. Alternatively, you can specify a different GroupName property for each option button. + +* * * + +f. Next, set the first option button's Value property to True by selecting the default False value in the Properties window and entering **True** instead. Doing so will select the option button in the user form you're designing, and when the dialog box is displayed, that option button will be selected as the default choice for the option group. Set its accelerator key to _U_ by entering **U** as its Accelerator property. Set the Accelerator property of the second option button to _t_ , the third to _D_ , and the fourth to _w_. The Accelerator property is case sensitive only when the caption for the control contains both the uppercase and lowercase versions of the same letter. + +5. Place a check box in the Insertion Point frame, as shown in Figure 14.19: + +Figure 14.19 Place a check box in the Insertion Point frame. + +a. Click the CheckBox tool in the Toolbox and then click in the Insertion Point frame in the user form to place a check box of the default size. + +b. In the Properties window, set the name of the check box to **chkReturnToPreviousPosition** (a long name but a descriptive one). Then set its Caption property to **Return to previous position**. Set its accelerator key to _R_ by entering **R** as its Accelerator property. Finally, set the check box to be selected by default by entering **True** as its Value property. + +6. Next, insert the command buttons for the form (see Figure 14.20): + +Figure 14.20 Add two command buttons and set their properties. + +a. Double-click the CommandButton tool on the Toolbox and click to place the first command button at the bottom of the user form. 
Click to place the second command button, and then click the Select Objects button to restore the selection mouse pointer.

b. Size and place the command buttons by using the commands on the Format menu. For example, group the buttons, and then use the Format ⇒ Center In Form ⇒ Horizontally command to center the pair horizontally. You must group the buttons before doing this—if you simply select both of them, VBA centers one button on top of the other so that only the uppermost button is visible.

c. Set properties of the command buttons as follows: For the left-hand button (which will become the OK button), set the Name property to **cmdOK** , the Caption property to **OK** , the Accelerator property to **O** (that's _O_ as in _OK_ , not a zero), and the Default property to **True**. For the right-hand button (which will become the Cancel button), set the Name property to **cmdCancel** , the Accelerator property to **A** , the Caption property to **Cancel** , and the Cancel property to **True**. Leave the Default property set to False.

7. Now we attach our code to this form. Dive down into the Code window by double-clicking the Cancel button to display a procedure associated with it:

    Private Sub cmdCancel_Click()

    End Sub

Recall that the Editor creates a procedure for the most common event of whatever control (or the form) you double-click to get down into the Code window. For most controls, this will be the Click event, as it is for the CommandButton control.

Type an End statement between the lines:

    Private Sub cmdCancel_Click()

    End
    End Sub

This End statement removes the form from the screen and ends the current procedure—in this case, the Move_Paragraph procedure.

Now you'll attach code to the OK button, which is where things get interesting. When the user clicks the OK button, the procedure needs to continue executing and do all of the following:

 * Remove the dialog box from display by hiding it or by unloading it (or, preferably, both). As discussed earlier in the chapter, the choice is yours, but using both commands is usually clearest.
 * Check the Value property of the check box to see whether it was selected or cleared.
 * Check the Value property of each option button in turn to see which of them was selected when the OK button was clicked.

Now continue creating the Move Current Paragraph dialog box:

8. Double-click the OK button to display the code attached to it. (If you're still working in the Code window, select cmdOK in the Object drop-down list on the top left of the Code window.) The editor automatically creates the Click event procedure for this button.

First, enter the following two lines between the Private Sub and End Sub lines:

    frmMoveParagraph.Hide
    Unload frmMoveParagraph

The frmMoveParagraph.Hide line activates the Hide method for the frmMoveParagraph user form, hiding it from display on the screen. The Unload frmMoveParagraph line unloads the dialog box from memory.

* * *

Removing a Form Can Prevent Confusion

It isn't necessary to hide or unload a form to continue execution of a procedure, but if you don't, users may become confused. For example, if you click the OK button on a Print dialog box in a Windows application, you expect the dialog box to disappear and the Print command to be executed.
If the dialog box didn't disappear (but it launched the printing job in the background), you'd probably think it hadn't registered your click, so you'd click again and again until it went away. Then you'd end up printing multiple copies.

* * *

9. Next, the procedure needs to check the Value property of the chkReturnToPreviousPosition check box to find out whether to insert a bookmark in the document to mark the current position of the insertion point. To do this, enter a straightforward If... Then statement:

    If chkReturnToPreviousPosition = True Then

    End If

If the chkReturnToPreviousPosition check box is set to True—that is, if the check box is selected—the code in the lines following Then runs. That code consists of the lines for inserting a bookmark that you recorded earlier. Cut these lines from the recorded procedure and paste them into the If... Then statement like this:

    If chkReturnToPreviousPosition = True Then

        With ActiveDocument.Bookmarks
            .Add Range:=Selection.Range, Name:="Move_Paragraph_Temp"
        End With
    End If

If the check box is selected, the procedure inserts a bookmark; if the check box is cleared, the procedure passes over these lines.

**10.** Next, right after the End If, paste in the code for selecting the current paragraph and cutting it to the Clipboard:

    Selection.Extend
    Selection.Extend
    Selection.Extend
    Selection.Extend
    Selection.Cut

11. After this, you need to retrieve the Value properties from the option buttons to see which one was selected when the user chose the OK button in the dialog box. For this, you can again use an If condition—this time, an If... Then ElseIf... Else condition, with the relevant insertion-point-movement lines from the recorded procedure pasted in:

    If optUpOne = True Then

        Selection.MoveUp Unit:=wdParagraph, Count:=1
    ElseIf optUpTwo = True Then
        Selection.MoveUp Unit:=wdParagraph, Count:=2
    ElseIf optDownOne = True Then
        Selection.MoveDown Unit:=wdParagraph, Count:=1
    Else
        Selection.MoveDown Unit:=wdParagraph, Count:=2
    End If
    Selection.Paste

Here, optUpOne, optUpTwo, optDownOne, and optDownTwo (which uses the Else statement here and therefore isn't specified by name in the listing) are the four option buttons from the dialog box, representing the choice to move the current paragraph up one paragraph, up two paragraphs, down one paragraph, or down two paragraphs, respectively.

The condition is straightforward: If optUpOne is True (that is, if the option button is selected), the first Then condition runs, moving the insertion point up one paragraph from its current position (after the current paragraph is cut, the insertion point will be at the beginning of the paragraph that was after the current one). If optUpOne is False, the first ElseIf condition is evaluated; if the condition evaluates to True, the second Then condition runs; and if the condition evaluates to False, the next ElseIf condition is evaluated. If that condition, too, turns out to be False, the Else code is run. In this case, the Else statement means that the optDownTwo option button was selected in the dialog box, so the Else code moves the insertion point down two paragraphs.

Wherever the insertion point ends up, based on which option button the user chose, the next line of code (Selection.Paste) pastes in the cut paragraph from the Clipboard.
+ +**12.** Finally, the procedure must return the insertion point to where it was originally if the chkReturnToPreviousPosition check box is selected. Again, you can test for this with a simple If... Then condition that incorporates the go-to-bookmark and delete-bookmark lines from the recorded procedure: + + If chkReturnToPreviousPosition = True Then + + Selection.GoTo What:=wdGoToBookmark, _ + Name:=" Move_Paragraph_Temp" + ActiveDocument.Bookmarks("Move_Paragraph_Temp").Delete + End If + +If the chkReturnToPreviousPosition check box is selected, VBA moves the insertion point to the temporary bookmark and then deletes that bookmark. + +Listing 14.1 shows the full listing for the cmdOK button. + +**Listing 14.1**: The full listing + + 1. Private Sub cmdOK_Click() + 2. frmMoveParagraph.Hide + 3. Unload frmMoveParagraph + 4. If chkReturnToPreviousPosition = True Then + 5. With ActiveDocument.Bookmarks + 6. .Add Range:=Selection.Range, _ + Name:="Move_Paragraph_Temp" + 7. End With + 8. End If + 9. Selection.Extend + 10. Selection.Extend + 11. Selection.Extend + 12. Selection.Extend + 13. Selection.Cut + 14. If optUpOne = True Then + 15. Selection.MoveUp Unit:=wdParagraph, Count:=1 + 16. ElseIf optUpTwo = True Then + 17. Selection.MoveUp Unit:=wdParagraph, Count:=2 + 18. ElseIf optDownOne = True Then + 19. Selection.MoveDown Unit:=wdParagraph, Count:=1 + 20. Else + 21. Selection.MoveDown Unit:=wdParagraph, Count:=2 + 22. End If + 23. Selection.Paste + 24. If chkReturnToPreviousPosition = True Then + 25. Selection.GoTo What:=wdGoToBookmark, _ + Name:="Move_Paragraph_Temp" + 26. ActiveDocument.Bookmarks("Move_Paragraph_Temp").Delete + 27. End If + 28. End Sub + +Go ahead and try it. To test this example properly, you should remove the bookmark you inserted while recording the macro earlier in this chapter. To remove it, click the Bookmark item in the Links section in the Insert tab on Word's Ribbon. In the Bookmarks dialog box that opens, select Move_Paragraph_Temp and click the Delete button. + +Now open the scratch document in Word that you created earlier in this chapter and filled with several paragraphs of text. Press Alt+F11 to open the Visual Basic Editor. Double-click frmMoveParagraph in the Project Explorer to display the user form. Press F5 to run this procedure. Click the OK button in your user form and observe that the paragraphs were rearranged in the document. + +## General Example: Opening a File from a List Box + +This next example displays a user form that employs a list box from which the user can select a file to open. The user form is simple, as is its code. The macro includes a loop and an array to gather the names of the files in a folder and then displays the filenames in the list box. The user gets to select a file and click the Open button to open it. Figure 14.21 shows the user form in action, displaying Excel files. + +Figure 14.21 The user form you'll build in this example contains a list box that gives the user quick access to all current files. + +You can adapt this example to any of the Office 2013 applications discussed in this book by changing the filename to an appropriate type for that application and also modifying a couple of the key statements. The version of this example we'll look at now shows you how to create the procedure in Excel. + +### Building the User Form + +Follow these steps to build the user form: + +1. Start the application you want to work in. The example uses Excel. + +2. 
Display the Visual Basic Editor by pressing the Alt+F11 key or by clicking the Visual Basic icon in the Ribbon's Developer tab.

3. In the Project Explorer, right-click the project to which you want to add the user form and choose Insert ⇒ UserForm from the context menu to insert a default-size user form in the project.

4. Drag the handle at the lower-right corner of the user form to the right to make the user form a bit wider.

5. Set the Name property of the form to **frmOpen_a_Current_File** and its Caption to **Open a Current File**. Check the Width property. You want it to be about 350 points wide.

6. Click the Label button in the Toolbox, and then click in the upper-left corner of the user form to place a default-size label there. Activate the Properties window and set the properties of the label as shown in Table 14.9.

Table 14.9 Set these properties of your label

**Property** | **Value**
---|---
(Name) | lblInfo
AutoSize | True
Caption | Choose the file to open and click the Open button.
Left | 10
Top | 6
WordWrap | False

7. Click the ListBox button in the Toolbox, and then click below the label in the user form to place a default-size list box there. Set its properties as shown in Table 14.10.

Table 14.10 Set these properties of the ListBox

**Property** | **Value**
---|---
(Name) | lstFiles
Height | 100
Left | 10
Top | 25
Width | 300

8. Double-click the CommandButton button in the Toolbox, and then click twice at the bottom of the user form to place two default-size command buttons there. Set their properties as shown in Table 14.11.

Table 14.11 Set these properties of the command buttons

**Property** | **First Button Value** | **Second Button Value**
---|---|---
(Name) | cmdOpen | cmdCancel
Cancel | False | True
Caption | Open | Cancel
Default | True | False
Height | 21 | 21
Width | 55 | 55

9. Arrange the command buttons as follows:

a. Click the cmdCancel button to select it, and then drag it close to the cmdOpen button.

b. With the cmdCancel button still selected, Ctrl+click the cmdOpen button to add it to the selection.

c. Choose Format ⇒ Group to group the buttons.

d. Choose Format ⇒ Center In Form ⇒ Horizontally to center the buttons horizontally in the form.

e. Drag the group up or down as necessary.

(Or just drag them around and eyeball them into a pleasing position.)

## Creating the Code for the User Form

Follow these steps to create the code for the user form:

1. With the user form selected, press the F7 key to display the user form's code sheet.

2. In the declarations portion of the code sheet (just keep pressing the up-arrow key until you move to the very top of the Code window), enter an Option Base 1 statement to make the array numbering start at 1 instead of at 0:

    Option Base 1

3. Make sure that UserForm is selected in the Object drop-down list (top left of the code sheet), and then pull down the Procedure drop-down list (top right) and choose Initialize from it. The Visual Basic Editor enters the stub of an Initialize procedure in the code sheet, like this:

    Private Sub UserForm_Initialize()

    End Sub

4. Enter the statements for the Initialize procedure shown in Listing 14.2.

5. In the Object drop-down list, select cmdCancel. The Visual Basic Editor enters the stub of a Click procedure, as shown here. (Click is the default event for the CommandButton control, so the Visual Basic Editor assumes that you want to create a Click procedure.)
    Private Sub cmdCancel_Click()

    End Sub

6. Enter the statements for the cmdCancel_Click procedure shown in Listing 14.2.

7. In the Object drop-down list, select cmdOpen. The Visual Basic Editor enters the stub of a Click procedure.

8. Enter the statements for the cmdOpen_Click procedure shown in Listing 14.2.

9. Customize line 9 (in the Initialize procedure) and line 32 (in the cmdOpen_Click procedure) so that the code will work with the application you're using, as shown in the following list. The procedure as shown is set up to run for Excel, but you'll probably need to change the path to reflect where the target files are on your computer.

 * For Word, change the Workbooks.Open statement to Documents.Open:

    If lstFiles.Value <> "" Then Documents.Open _
        Filename:="c:\transfer\" & lstFiles.Value

 * For PowerPoint, change the Workbooks.Open statement to Presentations.Open:

    If lstFiles.Value <> "" Then Presentations.Open _
        Filename:="c:\transfer\" & lstFiles.Value

Listing 14.2 shows the full version of the code behind the Open a Current File user form.

**Listing 14.2**: Using a ListBox to open a file

    1. Option Base 1
    2.
    3. Private Sub UserForm_Initialize()
    4.
    5. Dim strFileArray() As String
    6. Dim strFFile As String
    7. Dim intCount As Integer
    8.
    9. **strFFile = Dir("c:\transfer\spreads\*.xlsb")**
    10. intCount = 1
    11.
    12. Do While strFFile <> ""
    13. If strFFile <> "." And strFFile <> ".." Then
    14. ReDim Preserve strFileArray(intCount)
    15. strFileArray(intCount) = strFFile
    16. intCount = intCount + 1
    17. strFFile = Dir()
    18. End If
    19. Loop
    20.
    21. lstFiles.List() = strFileArray
    22.
    23. End Sub
    24.
    25. Private Sub cmdCancel_Click()
    26. Me.Hide
    27. Unload Me
    28. End Sub
    29.
    30. Private Sub cmdOpen_Click()
    31. Me.Hide
    32. **If lstFiles.Value <> "" Then Workbooks.Open _**
        **Filename:="c:\transfer\spreads\" & lstFiles.Value**
    33. Unload Me
    34. End Sub

Listing 14.2 contains all the code that appears on the code sheet for the frmOpen_a_Current_File user form: a declarations section and three event procedures.

In the declarations section, line 1 contains the Option Base 1 statement, which makes any array used on the code sheet begin at 1 rather than at 0. Line 2 is a spacer.

Here's what happens in the UserForm_Initialize procedure (lines 3 to 23):

 * Line 3 begins the Initialize procedure for the user form. Line 4 is a spacer.
 * Line 5 declares the String array variable strFileArray. Line 6 declares the String variable strFFile. Line 7 declares the Integer variable intCount. Line 8 is a spacer.
 * Line 9 assigns to strFFile the result of a directory operation on the designated folder (here, c:\transfer\spreads\), but substitute your own path to a folder on your computer that contains files with an .xlsb filename extension. Enter your own path in line 32 as well.
 * Line 10 sets the intCount counter to 1. Note that if you don't use the Option Base 1 declaration for this procedure, you need to set intCount to 0 (or to the corresponding value for a different option base that you use). The first call to Dir, which specifies the pathname in an argument, returns the first file it finds in the folder (assuming it finds at least one file). Each subsequent call without the argument returns the next file in the folder, until Dir finds no more files.
 * Line 11 is a spacer.
 * Lines 12 through 19 contain a Do While... Loop loop that runs while strFFile isn't an empty string (""):
 * Line 13 makes sure that strFFile isn't a folder by comparing it to the single period and double period used to denote folders. If strFFile isn't a folder, line 14 uses a ReDim Preserve statement to increase the dimensions of the strFileArray array to the number in intCount while retaining the current information in the array, thus building the list of files in the folder.
 * Line 15 assigns to the intCount index of the strFileArray array the current contents of strFFile.
 * Line 16 then adds 1 to intCount, and line 17 sets strFFile to the result of the Dir function (the next filename matching the *.xlsb pattern in the designated folder, or an empty string when no more files match).
 * Line 18 ends the If condition. Line 19 contains the Loop keyword that will continue the loop as long as the Do While statement is True.
 * When the loop ends, line 21 sets the List property of the lstFiles list box in the dialog box to the contents of strFileArray, which now contains a list of all the files in the folder.
 * Line 22 is a spacer, line 23 ends the procedure, and line 24 is another spacer.

Here's what happens in the cmdCancel_Click procedure (lines 25 through 28):

 * Line 25 starts the cmdCancel_Click procedure, and line 28 ends it.
 * Line 26 hides the user form, using the Me keyword to reference it.
 * Line 27 unloads the user form from memory.

Here's what happens in the cmdOpen_Click procedure (lines 30 through 34):

 * Line 30 starts the cmdOpen_Click procedure, and line 34 ends it.
 * Line 31 hides the user form, again by using the Me keyword.
 * Line 32 checks to make sure the Value property of the lstFiles list box is not an empty string ("") and, if it is not, uses the Open method of the Workbooks collection to open the file selected in the list box. The statement appends the Value property of the list box to the path (c:\transfer\spreads\) to produce the full filename. Substitute your own path for c:\transfer\spreads\.
 * Line 33 unloads the user form from memory.

Remember that to test this example, you'll need to adjust lines 9 and 32 to include a file path on your machine where some XLSB files are stored. For Excel 2013, try this location: C:\Users\ _YourName_ \AppData\Roaming\Microsoft\Excel\XLSTART.

# Using an Application's Built-in Dialog Boxes from VBA

Some applications, such as Word and Excel, let you use their built-in dialog boxes via VBA. If a built-in dialog box offers the functionality you need, using it can be a great solution: you don't have to build a custom dialog box, just reference the built-in dialog box in your code.

You shouldn't even need to debug the dialog box, and users of your procedures will probably be familiar with the dialog box from their work in the application. These built-in dialog boxes are called _common dialog boxes_ , and we explored them briefly in the sidebar titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12.

## Displaying a Built-in Dialog Box

To display a built-in dialog box, you need to know its name and constant. You also must decide which method to use to display the dialog box.

### Finding the Dialog Box Name and Constant

Although Office 2013 no longer uses menus (with some exceptions, such as the Visual Basic Editor), built-in dialog boxes (in Word and other applications) are still identified by constants derived from the older, pre-Ribbon menu-style interface.
These constants start with the letters wdDialog (as in Word Dialog), followed by the name of the dialog box.

The name of the dialog box is derived from the menu commands that displayed the dialog box prior to the introduction of the Ribbon interface (with Office 2007). For example, to refer to the Open dialog box, you use the constant wdDialogFileOpen, because in previous versions of Word, you would have chosen File ⇒ Open to display that dialog box.

Or to display the Print dialog box (the old File ⇒ Print options), you use the constant wdDialogFilePrint, and to display the Options dialog box (Tools ⇒ Options), you use the constant wdDialogToolsOptions.

So, although the user interface has evolved beyond classic menus, the menu structure itself remains as part of the classification system for internal objects—such as these constants used to identify various dialog boxes.

Excel follows a similar but less rigid taxonomic convention. Built-in Excel dialog boxes are (for backward compatibility with older macro code) still identified by constants starting with the letters xlDialog followed by the name of the dialog box. The name of the dialog box is derived either from the classic menu commands that were required to display it or from the dialog box's title. For example, to refer to the Open dialog box, you use the constant xlDialogOpen (rather than xlDialogFileOpen).

The easiest way to find the name for the built-in dialog box you need is to search the Visual Basic Editor's Help system for "Built-in Dialog Box Argument Lists" in Word or Excel. (Access employs a different system for common dialog boxes, requiring you to import object libraries via its Visual Basic Editor's Tools ⇒ References menu and to use specialized objects.)

You can also view a list of Word or Excel built-in dialog boxes by displaying the Object Browser (press F2 in the Editor) and typing **wddialog** (for Word) or **xldialog** (for Excel) in the Search text box.

You use these constants with the Dialogs property, which returns the Dialogs collection object, which in turn contains all the built-in dialog boxes in the host application.

For example, to display Word's Save As dialog box, you use the Show method, as illustrated in the following statement:

    Dialogs(wdDialogFileSaveAs).Show

It's as simple as that. To display Word's Replace dialog box, just substitute wdDialogEditReplace for wdDialogFileSaveAs.

* * *

The Dialogs Collection Is Creatable in Word, but Not in Excel

In Word, the Dialogs collection is a "creatable object," meaning you can access it directly without going through the Application object. In Excel, however, the Dialogs collection is not creatable, so you must always include the Application object in this code, like this:

    **Application.**Dialogs(xlDialogOptionsGeneral).Show

* * *

### Choosing between the Show Method and the Display Method

VBA provides two methods of displaying built-in dialog boxes onscreen: Show and Display:

 * The Show method shows the specified Dialog object and then uses functions built into the Dialog object to carry out the user's requests. You don't need to write any code of your own. For example, if you use the Show method to display the wdDialogFileSaveAs dialog box and the user enters a name for the file in the File Name box and clicks the Save button, VBA itself automatically saves the file with the given name in the specified folder (and with any other options the user chose).
You didn't write any programming to save this file. + * The Display method merely displays the dialog box onscreen, but it does _not_ execute the actions the user requests in the dialog box. Instead, it allows you to fetch the settings (the user's requests and selections) from the dialog box once the user dismisses it, but then you must write your own code to carry out what the user requested. + +* * * + +Displaying a Particular Tab of a Word Dialog Box + +If the dialog box you want to display has tabs, you can display the tab of your choice by specifying the DefaultTab property. You refer to a tab by the name of the dialog box plus the word Tab and the name of the tab. For example, the constant for the Bullets And Numbering dialog box is wdDialogFormatBulletsAndNumbering, and the constant for its Outline Numbered tab is wdDialogFormatBulletsAndNumberingTabOutlineNumbered. Likewise, the Font dialog box is referred to as wdDialogFormatFont, and its Character Spacing tab is referred to as wdDialogFormatFontTabCharacterSpacing. You could display this tab by using the following statements: + + With Dialogs(wdDialogFormatFont) + .DefaultTab = wdDialogFormatFontTabCharacterSpacing + .Show + End With + +To get a list of all the tab constants, search for wdWordDialogTab in the Object Browser. + +* * * + +### Using the Show Method to Display and Execute a Dialog Box + +The Show method displays the specified dialog box and automatically responds to whatever actions the user takes in it. Show is useful when your user is merely going to perform a conventional interactive action. As a simple example, in a procedure that's supposed to perform certain formatting tasks on the current document, you could check to make sure a document was open before attempting to perform the formatting; then, if no document was open, you could display the built-in Open dialog box so that the user could open a file. (You might precede the Open dialog box with a message box explaining the problem.) Listing 14.3 shows the code for this part of the procedure. + +**Listing 14.3**: Using a common dialog box + + 1. If Documents.Count = 0 Then + 2. Proceed = MsgBox("There is no document open." _ + & vbCr & vbCr & _ + "Please open a document for the procedure to work on.", _ + vbOKCancel + vbExclamation, "Format Report") + 3. If Proceed = vbOK Then + 4. **Dialogs(wdDialogFileOpen).Show** + 5. If Documents.Count = 0 Then End + 6. Else + 7. End + 8. End If + 9. End If + 10. 'rest of procedure here + +Here's how the code works: + + * Line 1 checks the Count property of the Documents collection to see if no documents are open; if that's the case, the statements in lines 2 through 8 run. + * Line 2 displays a message box informing users that no document is open and asking them to open one for the procedure to work on. The message box has OK and Cancel buttons and stores the button chosen in the variable Proceed. + * Line 3 checks to see if the OK button was chosen; if it was, line 4 displays the Open dialog box so that users can select the file, which VBA will open when they click the Open button in the Open dialog box. + * Users can cancel the procedure at this point by clicking the Cancel button in the Open dialog box, so line 5 checks the Count property of the Documents collection again and uses an End statement to terminate execution of the procedure if there is still no document open. 
+ * If the OK button was not chosen, execution moves from line 3 to the Else statement in line 6, and the End statement in line 7 ends execution of the procedure. + * Line 8 contains the End If statement for the nested If statement, and line 9 contains the End If statement for the outer If statement. + * Line 10 contains a comment to indicate that you'd write more code here—the rest of the procedure would run from this point, which is reached only if a document is open. + +### Using the _Display_ Method to Display a Dialog Box + +Remember that unlike the Show method, the Display method displays a built-in dialog box but doesn't respond to any actions the user takes in the dialog box. Instead, you must write code that checks the settings that the user chose in the dialog box and then write more code to carry out the user's wishes. When you use the Display method, the user gets to work with familiar dialog boxes, but you totally control the behavior that results from that interaction. + +For example, you'll often need to find out which folder a procedure should be working in, such as when you need the location of a number of documents that the user wants to manipulate. To get the folder, you _could_ display a straightforward input box and prompt the user to type in the correct path to the folder—if the user knows the path and can type it in correctly. + +Perhaps a better solution is to display a list box containing the tree of drives, folders, and files on the user's hard drive, but to do this you need to dimension an array and fill it with the folders and filenames, _and_ you need to refresh the display every time the user moves up or down the tree—quite a lot of programming work. + +So why not just borrow all this functionality from a built-in common dialog box? It's already part of the Office applications. You can achieve the same result much more easily by using a built-in dialog box that has the tree built in (for example, the Open dialog box) and then retrieving the user's responses for your own purposes. + +If you need to execute the settings (user choices) in a built-in dialog box, you can use the Execute method. But you might want to check the user's selections in the dialog box before implementing them. If you find a problem, you could then, for example, display a dialog box of your own, such as an Input Box, asking for clarification. + +## Setting and Restoring Options in a Built-in Dialog Box + +Most of the built-in Word and Excel dialog boxes have arguments that you can use for retrieving or setting values in the dialog box. For example, the Open dialog box in Word has arguments for Name, ConfirmConversions, ReadOnly, LinkToSource, AddToMru (adding the document to the Most Recently Used document list on the Recent section of the File tab on the Ribbon), PasswordDoc, and more. Some of these are options that you'll see in the Open dialog box itself; others are associated options that you'll find on the various tabs of the Options dialog box. You can guess some argument names from the names of the corresponding controls in the dialog box, but other names aren't directly related. To learn the names, search for "Built-in Dialog Box Argument Lists" in the VBA Editor's Help system (choose MSDN on the Web, then search with Bing). 
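As a sketch of that fetch-then-act approach, the following statements (for Word) use the Display method with the Save As dialog box, retrieve the Name argument the user typed, and then call the Execute method to carry out the save. The return value of -1 means the user clicked OK; return values are listed in Table 14.12 later in this chapter.

    With Dialogs(wdDialogFileSaveAs)
        If .Display = -1 Then    'the user clicked OK rather than Cancel
            MsgBox "You entered this filename: " & .Name
            .Execute             'now carry out the save the user requested
        End If
    End With

Because Display doesn't carry out the save itself, you get the chance to inspect (or veto) the user's choices before calling Execute.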
+ +For example, the following statements set the contents of the File Name text box in the Save As dialog box in Word and then display the dialog box: + + With Dialogs(wdDialogFileSaveAs) + .Name = "Yellow Paint Primer" + .Show + End With + +Be aware that some arguments that applied to dialog boxes displayed by Office 2003 no longer apply to Office 2007, 2010, or 2013 dialog boxes. So you may need to experiment a bit to see if a particular legacy argument is still useful in the Office 2013 interface. + +If you change the settings in a dialog box that uses sticky (persistent) settings, it's a good idea to change them back at the end of your procedure so that users don't get unexpected results the next time they open the dialog box. + +## Which Button Did the User Choose in a Dialog Box? + +To find out which button the user clicked in a dialog box, check the return value of the Show method or the Display method. The return values are shown in Table 14.12. + +Table 14.12 Click return values + +**Return Value** | **Button Clicked** +---|--- +–2 | Close +–1 | OK +0 | Cancel +1 | The first command button +2 | The second command button +>2 (greater than 2) | Subsequent command buttons + +For example, you might want to cancel your whole procedure if the user clicks the Cancel button in a dialog box, like this: + + If Dialogs(wdDialogFileOpen).Show = 0 Then End + +## Specifying a Time-Out for a Dialog Box + +In some applications, including Word, you can display some built-in dialog boxes for a specified time rather than having them stay open until the user dismisses them by clicking OK or Cancel or some other button. To do so, you use the TimeOut Variant argument with the Show method or the Display method. You specify TimeOut as a number of units, each of which is approximately a thousandth of a second. (If the system is busy with many other tasks, the actual result might be a slightly longer delay.) So you could display the General page of the Word Options dialog box for about 10 seconds—long enough for the user to check the Name setting and change it if necessary—by using the following statements: + + With Dialogs(wdDialogToolsOptions) + .DefaultTab = wdDialogToolsOptionsTabUserInfo + .Show (10000) + End With + +* * * + +The TIMEOUT Argument Doesn't Work with Custom Dialog Boxes + +TimeOut doesn't work with custom dialog boxes you create, only with the built-in Word dialog boxes. Also, some built-in Word dialog boxes—such as the New dialog box (wdDialogFileNew) and the Customize dialog box (wdDialogToolsCustomize)—don't recognize the TimeOut option either. + +* * * + +Timing out a dialog box is especially useful for noncritical information like the username in this example because it allows the procedure to continue even if the user has left the computer. Likewise, you might want to time out a Save As dialog box in which the procedure suggested a viable filename but allowed users to override it if they were present. However, for a procedure in which the user's input is essential, you won't want to use the TimeOut argument. You want to compel the user to respond by at least clicking a button; in this context, the dialog box should not disappear all by itself via this timeout technique. + +# The Bottom Line + +**Understand what you can do with a custom dialog box.** + +Custom dialog boxes—user interfaces you design as forms in the Visual Basic Editor—are often needed in macros and other kinds of Office automation. 
You might, for example, want to display a dialog box that allows the user to specify whether to let a macro continue beyond a certain point in its code or cease execution. Perhaps your macro is searching through a document for a particular phrase; then when it finds that phrase, it displays a dialog box to users asking if they want to continue further. + +**Master It** + +Which VBA statement would you use to stop a macro from continuing execution? + +**Create a custom dialog box.** + +You use the Visual Basic Editor to both design a custom dialog box (form) and write code for macros. You can attach the various controls to a form and then enter code _behind_ the dialog box. + +**Master It** + +How do you switch between the form-design window (sometimes called the object window) and the Code window in the Visual Basic Editor? + +**Add controls to a dialog box.** + +It's easy in the Visual Basic Editor to add various controls—such as command buttons and text boxes—to a user form (a custom dialog box). + +**Master It** + +How do you add a command button to a custom dialog box? + +**Link dialog boxes to procedures.** + +Buttons, check boxes, option buttons—displaying various controls to the user is fine, but unless you write some code _behind_ these various user-interface objects, what's the point? Your macro's user shouldn't discover that clicking a button _does nothing_. + +Dialog boxes often display objects with which users can communicate their wishes to your code. Therefore, you write code that explores the values the user enters into controls and responds to whatever buttons the user might click. + +**Master It** + +Create a small custom dialog box that displays a message in a label control saying, "Would you like to know the current date and time?" Put an OK button and a Cancel button on this form. Write code that simply ends the procedure if the user presses the Cancel button but that displays the date and time in the label if the user clicks the OK button. If the user clicks OK a second time, end the procedure. + +**Retrieve the user's choices from a dialog box.** + +A major task of most dialog boxes is retrieving values that the user has specified in various controls by selecting check boxes and so on. Then you write code to carry out the user's wishes based on these retrieved values. This interaction via dialog box is the typical way that a user communicates with your procedures, and vice versa. + +**Master It** + +Create a new dialog box that contains three option buttons captioned Small, Medium, and Large and named optSmall, optMedium, and optLarge. Write code in each option button's Click procedure to change the button's caption to boldface when the button is clicked. +Chapter 15 + +Creating Complex Forms + +While simple dialog boxes tend to be static, more complex dialog boxes can be _dynamic_ : They can change when the user clicks certain elements in them. Such changes can include the following: + + * The application changes the information in the dialog box to reflect choices that the user has made. For example, if a user selects a particular check box, the application may make other check boxes unavailable (hidden or disabled) because the options offered by the other check boxes cannot be simultaneously chosen along with the first check box. + * The dialog box displays a hidden section of secondary, less frequently used options when the user clicks a button in the primary area of the dialog box. 
 * The application uses the dialog box to keep track of a procedure and to guide the user to the next step by displaying appropriate instructions and by activating relevant controls. In this chapter, you'll look at an example of this technique.

In this chapter, you'll start by investigating how to create dynamic forms. Such dialog boxes cost you a little more work than static dialog boxes, but they're a great way to both present information and allow the user to make choices. (Note that the terms _form_ and _dialog box_ can be used interchangeably, though dialog boxes tend to be smaller and simpler than forms.)

From dynamic dialog boxes you'll move on to multipage dialog boxes, which you use to present more information or options to the user than the eye and mind can comfortably encompass at once.

You'll then look at how to create a _modeless_ dialog box (one that users can leave onscreen while they continue to work in their application, much as Word's Research pane displays results from the thesaurus while you continue to edit the document).

The chapter ends by showing you how to work with the many events supported by the UserForm object and the controls you use on it. By using events, you can monitor what the user does and take action accordingly, or even prevent the user from doing something that doesn't seem like a good idea.

In this chapter you will learn to do the following:

 * Understand what a complex dialog box is
 * Reveal and hide parts of a dialog box
 * Create multipage dialog boxes
 * Create modeless dialog boxes
 * Explore all the form and control events

# Creating and Working with Complex Dialog Boxes

You should never use a complex dialog box when a simple one will do the trick and be easier for users to work with. If all a procedure needs is a pair of check boxes and a group of option buttons, there's no need to employ multiple pages of dynamically updating controls. But often, you will want to create complex dialog boxes (like the examples given at the beginning of this chapter) to provide users with the flexibility that your procedures demand.

## Updating a Dialog Box to Reflect the User's Choices

You'll find it relatively easy to change a form to reflect the options the user chooses. Your primary tool for doing this is the Click event, to which most controls placed on a form react and for which you can write code in the Code window that's "behind" (associated with) your form.

When you double-click a control on a form, the Code window for that form opens and a default Sub procedure is displayed. This procedure is associated with the clicked control. The procedure is automatically named after the control and the control's default event. If you double-click a command button, for example, the Code window opens with this button's default Click event:

    Private Sub CommandButton1_Click()

    End Sub

Whatever code you put into this procedure will be executed when the user clicks this particular command button.

Some controls have default events other than Click; you'll learn about the Change event as you work with complex dialog boxes, and you'll see the full slew of other events in the second half of the chapter.

Listing 15.1 in the next section shows you an example of code that updates a dialog box should the user click a button captioned More.

## Revealing a Hidden Part of a Form

Hiding part of a complex form is a great way to simplify the user's initial interaction with the dialog box.
Consider the Find And Replace dialog box in Word: When you first see it (by pressing Ctrl+H, or by clicking the Replace icon in the Editing section of the Ribbon's Home tab), you're shown only the part of the dialog box (see the top box in Figure 15.1) for the most common type of search and replace—just the target and the replacement, along with the option to replace them one by one, or _en masse_. + +Figure 15.1 Word's Find And Replace dialog box hides some of its options (top) until you click the More button to display its lower half (bottom). + +But, should you want to use the less common or more advanced options that the abbreviated version of the Find And Replace dialog box doesn't display by default, you can click the More button to reveal the bottom part of the dialog box, as shown at the bottom in Figure 15.1. Here are more rarely used options, such as matching prefix or case. + +You may want to take a similar approach with your own dialog boxes, hiding a subset of actions that most users won't need most of the time. To do so, you can use two techniques, either separately or in tandem: + + * Set the Visible property to False to hide controls that are located in a displayed part of the dialog box. Set the Visible property to True when you want to display these controls (after the user presses a More button or some such trigger). + * Increase the height or width (or both) of the dialog box to reveal an area containing further controls. The Find And Replace dialog shown in Figure 15.1 uses the technique of increasing the Height property of the box. + +As a simple example of the latter technique, consider the dialog box shown in Figure 15.2. When you display the dialog box, only the top part is visible; when you click the More button, the bottom part is displayed. Listing 15.1 contains the code behind the dialog box that makes all this happen. + +Figure 15.2 The top part of this Inventories form offers the most frequently used options. Clicking the More button reveals the rest of the dialog box (shown on the bottom), which contains less-often-used controls. + +**Listing 15.1**: Revealing part of a dialog box + + 1. Private Sub UserForm_Initialize() + 2. frmInventories.Height = 120 + 3. End Sub + 4. + 5. Private Sub cmdMore_Click() + 6. If cmdMore.Caption = "< < Less" Then + 7. cmdMore.Caption = "More > >" + 8. cmdMore.Accelerator = "M" + 9. frmInventories.Height = 120 + 10. Else + 11. frmInventories.Height = 240 + 12. cmdMore.Caption = "< < Less" + 13. cmdMore.Accelerator = "L" + 14. fraOptions.Enabled = True + 15. End If + 16. End Sub + 17. + 18. Private Sub chkArtNames_Click() + 19. If chkArtNames = True Then + 20. optFromDocument.Enabled = True + 21. optFromDocument = True + 22. optAutoNames.Enabled = True + 23. Else + 24. optFromDocument.Enabled = False + 25. optFromDocument = False + 26. optAutoNames.Enabled = False + 27. optAutoNames = False + 28. End If + 29. End Sub + 30. + 31. Private Sub cmdOK_Click() + 32. frmInventories.Hide + 33. Unload frmInventories + 34. 'create inventories here + 35. End Sub + 36. + 37. Private Sub cmdCancel_Click() + 38. End + 39. End Sub + +Listing 15.1 contains five short procedures that control the behavior of the dialog box: + +**UserForm_Initialize** + +Initializes the dialog box before it's displayed. + +**cmdMore_Click** + +Runs when the cmdMore button is chosen. This button bears the caption More when only the top half of the dialog box is displayed, and the caption Less when the full dialog box is displayed. 
+
+**chkArtNames_Click**
+
+Runs when the Enter Art Filenames check box is chosen.
+
+**cmdOK_Click**
+
+Runs when the OK button is chosen.
+
+**cmdCancel_Click**
+
+Runs when the Cancel button is chosen.
+
+Here's what happens in the code.
+
+ * The UserForm_Initialize procedure sets the Height property of the frmInventories user form to 120, which is enough to display only the top part of the dialog box. (To find the appropriate height for your dialog box, drag it to the height that looks right and note the Height property in the Properties window.) This procedure is necessary only if the user form is set to its full height at design time. By setting the user form to a height of 120 at design time, you could avoid having to use a UserForm_Initialize procedure. However, for a user form that has three or more different sizes—or for a user form with two different sizes, one of which needs to be chosen at runtime depending on environmental conditions—you'll need to use a UserForm_Initialize procedure.
+ * The cmdMore_Click procedure starts by checking in line 6 whether the Caption property of the cmdMore command button is < < Less, which means the full dialog box is currently displayed. If it is, line 7 sets the caption to More > >, the button text that will be used to display the bottom part of the dialog box again if necessary. Line 8 sets the Accelerator property of the cmdMore command button to M (to make the _M_ in _More_ the accelerator key for the button). Line 9 sets the Height property of frmInventories to 120, which is the depth required to show only the top part of the dialog box.
+
+* * *
+
+The Caption Property Works, But Using a State Variable Is Considered More Elegant
+
+Checking the Caption property of the cmdMore button is an effective way of determining the current state of this form (whether it's expanded or not), but this isn't the most elegant of techniques. It's a form of _hard coding_, considered by many to be a sleazy way of programming. Instead, you could maintain an internal state variable (a Static toggle) in which you store information about whether the dialog box is displayed in its full state or its partial state. Using an internal state variable avoids assuming that this caption will always remain the same. The code would fail to work correctly, for example, if the form were at some point _localized_ (adapted for a different language locale, where the words more and less are not used).
+
+* * *
+
+If the condition in line 6 is False, execution shifts from line 6 to the Else statement in line 10. This must mean that the Caption property of the cmdMore button is already set to More > >, so the dialog box is displayed in its smaller version and the More > > button is being clicked to expand the dialog box again. Line 11 sets the Height property of the user form to 240, thus displaying the lower part of the dialog box. Line 12 changes the Caption property of the cmdMore command button to < < Less. Line 13 sets the Accelerator property of the cmdMore command button to L.
+
+Line 14 enables the fraOptions frame (identified as Options in the dialog box and disabled in the user form, as are the optFromDocument option button and the optAutoNames option button), making it and the controls it contains available to the user. Line 16 ends the cmdMore_Click procedure.
+
+ * The chkArtNames_Click procedure (lines 18 to 29) runs when the Enter Art Filenames check box is clicked. This procedure enables and disables the option buttons below it, as appropriate. Line 19 checks to see if the chkArtNames check box is selected. If it is, the statements in lines 20 through 22 run. 
Line 20 sets the Enabled property of the optFromDocument option button (identified as From Document in the dialog box) to True, thus making it available, and line 21 selects this option button as the default choice. Line 22 enables optAutoNames, the option button identified as Automatic Naming in the dialog box.
+
+If the chkArtNames check box isn't selected, execution shifts to the Else statement in line 23, which directs execution to line 24. This line sets the Enabled property of the optFromDocument option button to False, disabling it. Line 25 then deselects this option button (whether it's selected or not). Line 26 disables the optAutoNames option button, and line 27 deselects it (again, whether it's selected or not). The End If statement in line 28 ends this If statement, and line 29 ends this procedure.
+
+ * The cmdOK_Click procedure (lines 31 to 35) shows the beginning of the code that runs once the OK button is clicked. Line 32 hides the Inventories dialog box, and line 33 unloads it from memory. Line 34 contains a comment indicating that the instructions for creating the inventories appear here.
+ * The cmdCancel_Click procedure contains only an End statement to end execution of the procedure if the user chooses the Cancel button.
+
+## Tracking a Procedure in a Form
+
+The next level of complexity in working with forms is using them to track the different stages of a procedure and to guide the user as to how to continue.
+
+Take a look at the Create New Employee Web Page dialog box shown in Figure 15.3. This dialog guides the user through a four-stage procedure to create a web page for a new employee. The first step is to identify the employee deserving of this honor by using either the drop-down list or the Select Other Employee command button in the step 1 frame. The second step is to enter suitable introductory, critical, or laudatory text about the employee. The third step is to select the most (or perhaps least) flattering photo of the employee to include in the web page. The fourth step is to save the web page to a folder on the company's intranet.
+
+Figure 15.3 The Create New Employee Web Page form provides users with instructions that are dynamically updated as they work their way through the procedure.
+
+When the user first displays the Create New Employee Web Page dialog box, they will see the version of the dialog box shown in Figure 15.3, with steps 2, 3, and 4 disabled and instructions for step 1 shown in the Instructions box at the top.
+
+When the user follows the instructions and selects the employee by using either the combo box drop-down list or the Select Other Employee command button, the code attached to the combo box drop-down list or the command button enables the step 2 frame, making its text box available to the user, as shown in Figure 15.4. Here is the code for the Change event of the cmbSelectEmployee combo box; the code for the Click event of the cmdSelectOtherEmployee command button is similar, although a little more complex.
+
+Figure 15.4 The second stage of the Create New Employee Web Page dialog box. Notice the changes from the first stage: the instructions in the Instructions frame have changed, and the use of the step 1 combo box drop-down list has enabled the step 2 frame.
+
+    Private Sub cmbSelectEmployee_Change()
+        lblEmployeeName = cmbSelectEmployee.Text
+        fraStep2.Enabled = True
+        lblInstructions = "Enter text in the Step 2 text box. 
" & _ + "For example, you might include brief biographical " & _ + "information on the employee, details of their position, " & _ + "or your hopes for their contribution to the company." + cmdClearEmployeeName.Enabled = True + End Sub + +* * * + +An Ellipsis Signals That a Dialog Box Can Be Displayed + +The Select Other Employee button in the Create New Employee Web Page dialog box ends with an ellipsis (...), as do some of the other command buttons. This ellipsis is the Windows convention for indicating that the choice (here a command button, but also other contexts) results in a dialog box being displayed rather than an action being taken immediately. + +* * * + +These are the changes that occur when the user completes step 1 of the dialog box: + + * The text of the label in the Instructions box at the top of the dialog box is changed to contain information about step 2 of the procedure. + * The name of the employee selected by the user is listed above the Employee label in the step 1 frame. + * The frame for step 2 is enabled (the text box it contains is enabled along with the frame). + +## Using Multipage Dialog Boxes and Tab Strip Controls + +VBA includes a MultiPage control, which enables you to create multipage dialog boxes, and a TabStrip control, which lets you create dialog boxes driven by tab strips (similar to the tabs on the Office applications' Ribbon). You've almost certainly used multipage dialog boxes (if you're not sure what they are, press Ctrl+D in Word to open the Font dialog box and see an example of one). You can access any page (one at a time) by clicking the tab at the top of the page. Each page contains a different set of controls and can have a different layout appropriate to the page's purpose. + +* * * + +A Tab Is Not a Page + +The tab is the little thing that sticks out from the top of the page, not the whole page itself. Many people refer to the pages as "tabs" because the tab is the part you click to access the page. It's perfectly okay to use these terms interchangeably, but this discussion uses _tab_ to mean only the tab component and _page_ to refer to the page qua page. + +* * * + +Multipage dialog boxes are great for packing a lot of information into a single form without having it take up the whole screen with a bewildering embarrassment of options. You'll need to divide the information into discrete sets of related information to fit it onto the pages. Each page can (and should) have a different layout of controls that govern the behavior of discrete items; the pages are normally separate in theme or purpose. Again, the Font dialog boxes in the Office applications have a Font tab and an Advanced tab. Look at the Tools ⇒ Options dialog box in the VBA Editor for another example. + +A dialog box that uses a tab strip differs from a multipage dialog box in that it contains a tab strip control containing multiple _tabs_ but not multiple _pages_. To the user, it looks as if different pages are being displayed, but the actual layout of the controls in the dialog box doesn't change. No matter which tab on the tab strip is selected, the set of controls remains the same, although the data displayed in the controls does change. This approach is useful for displaying records from a database. The tabs merely switch to a different record. + +Tab strips are useful when you need to display consistent sets of information, such as the records you might maintain on your company's customers. 
Each customer record has the same set of fields (analogous to the columns in a database): an account number, a name (perhaps several), an address, phone numbers, email addresses, URLs, an order history, an account balance, and so on. Therefore, you can use the same set of controls (text boxes and labels, for example) to display the information for each record. The tab strip control governs which customer's set of information is displayed in them. Because few databases have a small and fixed number of records, you'll need to populate the tab strip on the fly (during execution) with tabs and captions, but it works fine. + +Table 14.7 in Chapter 14, "Creating Simple Custom Dialog Boxes," explains the properties unique to the TabStrip control and MultiPage control. + +* * * + +**Limit the Number of Pages in Your Multipage Dialog Boxes** + +You can create dialog boxes containing dozens of tabs or dozens of pages. And if you run out of horizontal space to display the tabs, the VBA Editor adds a scroll bar to enable the user to scroll through the tabs. However, gigantic tab dialog boxes are impractical in the real world. As you doubtless know, not everything that's possible is also desirable. + +You'll probably want to avoid creating multipage dialog boxes with more than 10 or 12 pages because the wealth of information such a dialog box will contain is likely to overwhelm the user. + +If you need more than a dozen pages to organize the information in a dialog box, you're probably trying to present the user with too much data at once. Consider an alternative way of displaying it. Most likely, you should subdivide the information into smaller, easier-to-manage categories. For example, Microsoft spends countless hours spread over several years testing focus groups, quizzing users, and observing people's behavior when using Word. One result is that Microsoft's designers subdivide tasks and user interaction into various subcategories. Click the Page Layout tab in the Word Ribbon. Notice that the many tasks within this category are subdivided into logical areas: Page Setup, Paragraph, and Arrange. What's more, two of these subcategories—Page Setup and Paragraph—have small box icons you can click in the lower-right corner. Clicking these icons opens a separate dialog box with additional, less frequently used, options. + +Tabs are a different matter. If you use a tab strip to move through the records in a database recordset, you may need to use quite a few tabs in a given tab strip. Unless the number of tabs is absurdly large, this shouldn't normally be a problem. However, a better solution if you're attempting to manage a database might be to switch to one of the more robust, specialized database-related user interface controls available in Access, Visual Basic Express, or Visual Basic .NET. For more information, see + +www.microsoft.com/visualstudio/eng/products/visual-studio-express-products + +* * * + +### Multipage Dialog Boxes + +To create a multipage dialog box, click the MultiPage icon in the Toolbox, and then click in the user form where you want the control to appear. The VBA Editor places a MultiPage control with two pages, whose tabs have the labels Page 1 and Page 2. You can then move and size the control as usual. In typical usage, you'll want to create a MultiPage control that's only a little smaller than the user form it inhabits (like most of the multipage dialog boxes you'll see in Windows applications). 
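+
+You can also manage pages in code at runtime through the MultiPage control's Pages collection rather than through the design-time context menu described next. Here's a minimal sketch of the runtime approach, assuming a control named MultiPage1; the page name, caption, and control-tip text are hypothetical:
+
+    Private Sub UserForm_Initialize()
+        'add a third page at runtime and give it a caption and a control tip
+        Dim pg As MSForms.Page
+        Set pg = MultiPage1.Pages.Add("pgOptions", "Options")
+        pg.ControlTipText = "Less frequently used settings"
+    End Sub
+
+The Pages collection also provides Remove and Clear methods, so a form can grow or shrink to suit the data it displays.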
+ +Once you've created a MultiPage control, you work with a page on it by right-clicking its tab and using the resulting context menu: + + * To add a page, right-click the label and choose New Page from the context menu. VBA will add a new page of the default size and will name it Page _n_ , where _n_ is the next number after the current number of pages (even if the other pages have names other than Page1, Page2, and so on). + * To rename a page in a MultiPage control, right-click the label and choose Rename from the context menu. In the Rename dialog box (see Figure 15.5), enter the caption (the label text) for the page in the Caption text box, the accelerator key in the Accelerator Key text box, and any control-tip text (the tip the user sees when they move the mouse pointer over the tab for the page) in the Control Tip Text text box. Click the OK button to close the Rename dialog box. + +Figure 15.5 Use the Rename dialog box to set the caption, accelerator key, and control-tip text for a page. + + * To delete a page from a MultiPage control, right-click the label and choose Delete Page from the context menu. The VBA Editor will remove the page without prompting for confirmation. + * To move a page to a different place in the MultiPage control, right-click the label and choose Move from the context menu to display the Page Order dialog box (see Figure 15.6). In the Page Order list box, select the page or pages that you want to move (Shift+click to select multiple contiguous pages, Ctrl+click to select multiple noncontiguous pages), and then use the Move Up and Move Down buttons to rearrange the page or pages as desired. When you've finished, click the OK button to close the Page Order dialog box. + +Figure 15.6 Use the Move Up and Move Down buttons in the Page Order dialog box to change the order of pages in a MultiPage control. + + * To specify which page of a multipage dialog box to display by default, use the Value property of the MultiPage control. You can set this property either at design time or at runtime. For example, you could use an initialization procedure such as the one shown here to display the third page (identified by the value 2, because the page numbering starts at 0) of a dialog box with a MultiPage control called MyMulti at runtime: + + Sub UserForm_Initialize() + MyMulti.Value = 2 + End Sub + +Once you've created a multipage dialog box, you can populate its pages with controls using the techniques you learned in Chapter 14. Each control must have a unique name in the entire form (not just within the page on which it appears). + +When designing a multipage dialog box, keep the following issues in mind: + + * What's the best way to divide the information or options in the dialog box? What belongs on which page? Which information or options will the user expect to find grouped together? + * Which controls should appear on each page? Most dialog boxes need at least a pair of command buttons—such as OK and Cancel or OK and Close—available from each page to allow the user to dismiss the dialog box from whichever page they happen to end up on. In rare instances, you may want to force the user to return to a particular page in order to close a dialog box. In these cases, make sure each page that doesn't contain a command button to dismiss the dialog box tells the user where they will find such a command button. + * For settings, do you need to have an Apply button (as well as an OK button) to apply the changes on a particular page without closing the dialog box? 
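+
+If you do decide that an Apply button is warranted, its Click procedure typically carries out the same work as the OK button without dismissing the form. Here's a minimal sketch of the pattern; cmdApply, cmdOK, and the ApplySettings helper are hypothetical names rather than controls from this chapter's examples:
+
+    Private Sub cmdApply_Click()
+        'carry out the user's settings but leave the form onscreen
+        ApplySettings
+    End Sub
+
+    Private Sub cmdOK_Click()
+        'carry out the settings, then dismiss the form
+        ApplySettings
+        Me.Hide
+    End Sub
+
+    Private Sub ApplySettings()
+        'hypothetical helper: read the controls and act on their values
+    End Sub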
+
+Because each control in a multipage form has a unique name, when returning information from a multipage dialog box you need specify only the relevant object—you don't need to specify which page it's on.
+
+Figure 15.7 shows an example of a multipage dialog box. The first page contains the customer's personal contact information; the second, the customer's professional information; the third, the associations the customer belongs to; and the fourth, the certifications the customer holds.
+
+Figure 15.7 By using multiple pages in a dialog box, you can achieve a clean and uncluttered look that's also easily navigable.
+
+Most of the properties of the MultiPage control are straightforward, but a few deserve special mention:
+
+ * The Style property offers fmStyleTabs (the default setting, showing tabs for navigating between the pages), fmStyleButtons (which gives each page a rectangular button, with the button for the current page appearing pushed in), or fmStyleNone (which provides no means of navigating between the pages and no indication of the borders of the multipage dialog box). fmStyleNone can be useful for creating user forms that have two or more alternate layouts of which the user will only ever need to see one at a time. By including one set of controls on one page of the multipage dialog box and another set of controls on the other page, you can present two seemingly different dialog boxes by doing nothing more than changing which page of the MultiPage control is displayed. For example, you can use this approach to create a wizard that guides the user through a multistep process.
+ * The TabOrientation property controls where the tabs (or buttons) for the pages appear on the control. Your choices are fmTabOrientationTop (the default setting, placing the tabs at the top of the control), fmTabOrientationBottom, fmTabOrientationLeft, and fmTabOrientationRight. Experiment with the effects that the bottom, left, and right orientations offer, but unless they provide significant advantages over the more normal top orientation, use them sparingly if at all. Users won't thank you for deviating from the traditional, familiar interface unnecessarily.
+ * The MultiRow property controls whether a MultiPage control has one row of tabs for its pages (False) or multiple rows (True). When you have MultiRow set to True, the VBA Editor adds the second or subsequent rows of tabs when you run out of space on the first or current row.
+
+The MultiPage control doesn't have to take up the whole dialog box—in fact, most dialog boxes keep the key command buttons like OK and Cancel outside the multipage area so that they're available to the user no matter which page the user is on.
+
+That said, it is usually a good idea to make a MultiPage control the dominant part of a dialog box. In a complex and busy dialog box, a small MultiPage control can appear to be little more than a group box, and the user may miss the tabs, particularly if they're just skimming the controls looking for a particular option.
+
+### Using the Tab Strip Control
+
+Forms that use a tab strip are substantially different from multipage dialog boxes. A TabStrip control is used not to rearrange other controls but to change the data that appears in them as the user moves from one set of data to another. In other words, the layout of the controls remains static; just the values displayed in the controls change from page to page on the strip.
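+
+In skeleton form, the technique comes down to a Change event procedure that reloads the same controls for whichever tab the user selects. Here's a minimal sketch; the control names and the GetRecordName lookup are hypothetical, and Listing 15.2, later in this section, shows a complete working version:
+
+    Private Sub tabRecords_Change()
+        'same text box, different record: reload it for the selected tab
+        txtName.Text = GetRecordName(tabRecords.Value)
+    End Sub
+
+    Private Function GetRecordName(ByVal lngIndex As Long) As String
+        'hypothetical lookup: fetch this tab's record from your data source
+        GetRecordName = "Record " & (lngIndex + 1)
+    End Function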
+
+For instance, you might use a dialog box driven by a tab strip to view and update the records in a data source such as a Word table, an Excel spreadsheet, or an Access database. This example uses an Excel workbook in which information is stored on a number of worksheets. Figure 15.8 shows the DataSurfer dialog box, which is driven by a tab strip.
+
+The actual strip of tabs in a TabStrip control can appear above, below, or beside the controls that it contains. Above is the conventional—and default—position, just as it is in real-world recipe-card boxes and file drawers. But vertical and bottom tabs have shown up in eccentric Windows applications from time to time. As with the MultiPage control, use the TabOrientation property of the TabStrip control to specify whether the tab strip should appear at the top, bottom, left, or right of its control. But be sure to have some pretty good reason if you're departing from convention.
+
+Figure 15.8 Using a TabStrip control to create a multitab dialog box. The tab strip is used to control which set of information is displayed in the other controls in the dialog box.
+
+The tab strip can contain zero, one, or more tabs. For most purposes, there's little point in having only one tab on a tab strip, and even less in having no tab at all. But if you dynamically populate the tab strip with tabs in your procedures (as you're about to do in this next example) and create one tab for each record found, you may run into situations with only one record and thus a dialog box with only one tab—or even a tab strip without any tabs at all.
+
+Click the TabStrip button on the Toolbox, click in the user form to place the tab strip, and then drag it to an appropriate size. Bear in mind that a tab strip is only a visual display for the user's benefit. Unlike the MultiPage control, you establish the logical connection between the tab strip and the other controls through code. You can then add, rename, move, and delete tabs in the same way as you can pages in a MultiPage control.
+
+If you haven't placed the other controls for the dialog box, do so now.
+
+Once everything's in place, you write the code that will enable the tab strip to control the contents of the other controls. Listing 15.2 shows the code for the tab strip in the DataSurfer dialog box. This tab strip is named tabSurfer, and the code works with its Change event—the event procedure that _fires_ (is triggered and executes its code) when the user clicks a new tab on the strip.
+
+**Listing 15.2**: Programming a tab strip
+
+    1. Private Sub tabSurfer_Change()
+    2. If blnInitializing = False Then
+    3. With ActiveWorkbook.Sheets(tabSurfer.Value + 1)
+    4. 'load the contents of the worksheet that corresponds to the tab chosen
+    5. .Activate
+    6. txtFirstName.Text = .Cells(1, 2).Text
+    7. txtInitial.Text = .Cells(2, 2).Text
+    8. txtLastName.Text = .Cells(3, 2).Text
+    9. txtAddress1.Text = .Cells(4, 2).Text
+    10. txtAddress2.Text = .Cells(5, 2).Text
+    11. txtCity.Text = .Cells(6, 2).Text
+    12. txtState.Text = .Cells(7, 2).Text
+    13. txtZip.Text = .Cells(8, 2).Text
+    14. txtHomeArea.Text = .Cells(9, 2).Text
+    15. txtHomePhone.Text = .Cells(10, 2).Text
+    16. txtWorkArea.Text = .Cells(11, 2).Text
+    17. txtWorkPhone.Text = .Cells(12, 2).Text
+    18. txtWorkExtension.Text = .Cells(13, 2).Text
+    19. txtEmail.Text = .Cells(14, 2).Text
+    20. End With
+    21. End If
+    22. 
End Sub
+
+After specifying the worksheet, the code in Listing 15.2 essentially repeats itself for each of the text boxes that appear in the DataSurfer dialog box. This dialog box works with a data source implemented as Excel spreadsheets in the active workbook.
+
+Each worksheet in the workbook is one customer's record, with the name of the customer appearing on the worksheet's tab and the customer's data appearing in the second column: the first name in the first cell of the second column, the middle initial in the second cell, the last name in the third cell, and so on for the address, phone numbers (both home and work), and email address. So to get at any piece of information, you need to know the sheet of the record in question and the appropriate cell in the second column.
+
+Here's how the code works:
+
+ * Line 1 declares the procedure tabSurfer_Change, which executes automatically whenever the Change event of the tabSurfer tab strip fires. The Change event fires each time the user clicks a new tab, so you use this event to control the information displayed in the text boxes.
+ * The Change event also fires when a tab is added to (or removed from) the tab strip. Because the DataSurfer user form uses the Initialize event procedure to populate the tab strip with tabs (one per worksheet in the workbook), you do need to prevent the Change event procedure from running unnecessarily during the initialization phase of your program. So the user form declares a private Boolean variable named blnInitializing that the Initialize procedure sets to True while it's running and to False just before it ends. Line 2 of the Change event procedure checks to make sure that blnInitializing is False. If it's not, the Initialize procedure has fired the event, and the Change procedure does not need to load the information into the text boxes—so execution continues at line 21, just before the end of the procedure. But once the Initialize procedure has finished running, blnInitializing will be set to False, and the Change event procedure will run each time the user changes tabs in the tab strip.
+ * Line 3 begins a With statement that works with the appropriate worksheet in the active workbook: ActiveWorkbook.Sheets(tabSurfer.Value + 1). The Value property of the tabSurfer tab strip tells us which tab in the tab strip is selected. Because the first tab in the tab strip is numbered 0 and the first worksheet in the workbook is numbered 1, you need to add 1 to the Value of the tab strip to make the numbers match up.
+ * Line 4 is a comment. Line 5 uses the Activate method to activate the worksheet in question.
+ * Lines 6 through 19 then set the Text property of each text box in the user form to the contents of the corresponding cell in the second column on the worksheet. For example, line 6 sets the Text property of the txtFirstName text box (which appears under the First Name label in the dialog box) to the contents of the first cell in the second column: .Cells(1, 2).Text.
+ * Line 20 ends the With statement, line 21 ends the If statement, and line 22 ends the procedure.
+
+### Using Pictures in Forms
+
+VBA includes extensive graphics capabilities that allow you to make your forms look pretty much any way you want them to. This book doesn't go into design aesthetics in any detail, but there's much you can do to make your forms look good. You can fiddle with Format ⇒ Order to pile controls on top of each other. Controls like the command button have their own Picture properties, as do forms themselves. 
Take a look at Figure 15.9. It shows a photo inside an image control, a background texture in the form's picture property, and a command button that blends into the background because its BackStyle property is set to Transparent.
+
+Figure 15.9 VBA includes extensive graphics features—you can make your forms look any way you want them to.
+
+You can add a picture to a form by using an Image control. Click the Image button in the Toolbox, and then click in the user form where you want the Image control to appear. Once you've placed the Image control, you can size and move the picture just as you would any other control.
+
+* * *
+
+Ensure That You Include Any Necessary Graphics Files When You Deploy a Macro
+
+Make sure the picture you choose for an Image control or a user form's background is available to all computers that will display the dialog box. If the picture isn't available, it fails to appear in the dialog box, which spoils the effect.
+
+* * *
+
+To choose the picture that will appear in the Image control, select the Picture property in the Properties window and click the ellipsis button that then appears to the right of the entry. The VBA Editor displays the Load Picture dialog box. Select the picture file and choose the Open button. The Picture property in the Properties window registers the type of picture you selected—such as Bitmap—but not its filename, and the picture appears in the Image control so that you can see if it's an appropriate size.
+
+* * *
+
+Loading a Picture into an Image Control Programmatically
+
+When specifying the picture for an Image control _programmatically_ (the picture is loaded while the macro is executing, during runtime), you need to use a LoadPicture statement. Compare that to design time, when you can simply use the Properties window to assign a picture to the Picture property of the Image control. LoadPicture has the following syntax:
+
+    LoadPicture _filename_, [_WidthDesired_], [_HeightDesired_]
+
+_filename_ is a String argument specifying the name of the picture file to be loaded into the Image control. _WidthDesired_ is an optional Long argument specifying the width of the picture in twips, and _HeightDesired_ is an optional Long argument specifying the height of the picture, also in twips.
+
+For example, the following statement loads the picture named Rose.jpg that's located in the root directory of drive C:
+
+    Image1.Picture = LoadPicture("C:\Rose.jpg")
+
+* * *
+
+Once you've chosen the picture, you have various options for positioning it and formatting it:
+
+ * If necessary, set the alignment of the picture by using the PictureAlignment property. (If the picture fully fills the Image control—neither overlapping it nor leaving parts of it empty—you may not need to set the alignment for it.) Table 15.1 shows the constants and values for the PictureAlignment property.
+ * If necessary, clip, stretch, or zoom the picture by using the PictureSizeMode property: fmPictureSizeModeClip (0) clips the picture to fit the Image control; fmPictureSizeModeStretch (1) stretches or squeezes the picture so that it fits the Image control (this option often makes for strange effects); and fmPictureSizeModeZoom (2) enlarges or reduces the picture so that its nearest dimension exactly fits the width or height of the Image control without changing the picture's proportions (this option usually leaves an unfilled gap on the other side). 
+ * If you need to tile the image to take up the remaining space in the control, set the PictureTiling property to True. This option is rarely used with database work.
+ * If you need to adjust the position of the picture relative to its caption, set the PicturePosition property of the check box, command button, label, option button, or toggle button in question. Table 15.2 shows the constants and values for PicturePosition.
+
+Table 15.1 Constants and values for the PictureAlignment property
+
+**Constant** | **Value** | **Picture Alignment in Image Control**
+---|---|---
+fmPictureAlignmentTopLeft | 0 | Top left
+fmPictureAlignmentTopRight | 1 | Top right
+fmPictureAlignmentCenter | 2 | Centered
+fmPictureAlignmentBottomLeft | 3 | Bottom left
+fmPictureAlignmentBottomRight | 4 | Bottom right
+
+Table 15.2 Constants and values for the PicturePosition property
+
+**Constant** | **Value** | **Picture Position Relative to Caption**
+---|---|---
+fmPicturePositionLeftTop | 0 | Picture at the left, aligned with the top of the caption
+fmPicturePositionLeftCenter | 1 | Picture at the left, centered on the caption
+fmPicturePositionLeftBottom | 2 | Picture at the left, aligned with the bottom of the caption
+fmPicturePositionRightTop | 3 | Picture at the right, aligned with the top of the caption
+fmPicturePositionRightCenter | 4 | Picture at the right, centered on the caption
+fmPicturePositionRightBottom | 5 | Picture at the right, aligned with the bottom of the caption
+fmPicturePositionAboveLeft | 6 | Picture above the caption, left-aligned
+fmPicturePositionAboveCenter | 7 | Picture above the caption, centered
+fmPicturePositionAboveRight | 8 | Picture above the caption, right-aligned
+fmPicturePositionBelowLeft | 9 | Picture below the caption, left-aligned
+fmPicturePositionBelowCenter | 10 | Picture below the caption, centered
+fmPicturePositionBelowRight | 11 | Picture below the caption, right-aligned
+fmPicturePositionCenter | 12 | Picture in the center of the control, with the caption centered on the picture
+
+Once you've placed, sized, and formatted a picture, there are various possibilities for what you can do with it, such as using a picture's Click event to trigger an action. For example, you could display two graphics illustrating a choice of two formats for a document. Then the user could click the appropriate picture to signal their choice.
+
+## Creating a Modeless Dialog Box
+
+We're using VBA version 7, and ever since version 6 the language has offered the programmer an option to create a _modeless_ dialog box—one that users can leave onscreen while they continue to work in their application. In other words, they don't have to click an OK or Cancel button or otherwise dismiss the dialog box to regain the ability to interact with their application.
+
+You're doubtless familiar with modeless dialog boxes from working with Office. For example, the Find And Replace dialog box in Access, Word, and Excel is modeless, as is the Replace dialog box in PowerPoint.
+
+When you display a modeless dialog box, it takes the focus just as any modal dialog box does (its frame turns from gray to white and the X close icon in the upper right changes from dark gray to red, the indication in Windows 8's graphical scheme that the window has the focus).
+
+But you can click in the application window to transfer the focus back to that window. For example, you can continue typing in a Word document, even while the Find And Replace dialog box remains visible.
+
+Creating a modeless dialog box is as simple as setting the ShowModal property of the user form to False from its default setting of True.
+
+There are various situations where you might want to use a modeless dialog box rather than a modal one. As a simple example, you might create a procedure and dialog box in Word that collects information from the user for a memo or a report. By making the dialog box modeless, you could allow the user to copy information from an open document (or open other documents and gather information from them) and paste it into the dialog box—saving users from having to copy the information before invoking the dialog box and allowing them to copy multiple separate items easily. Likewise, you could create a modeless user form (perhaps shaped like a toolbar) that users could keep onscreen and use to automatically enter text into predefined sections of three or four other documents without losing their place in the current document.
+
+You can also use modeless dialog boxes to display complex sets of interrelated user forms in which the user needs to copy and paste information from one user form to another or at least to access different areas of two or more displayed user forms at the same time. 
Displaying multiple forms at once can be confusing to the user, but you may sometimes find it necessary.
+
+Most of the time, you'll probably want to use modal dialog boxes in your VBA procedures. With modal dialog boxes, users must deal with the dialog box before they can continue to work in the application, and there's no risk that they'll end up with multiple dialog boxes scattered around the screen in assorted states of disuse.
+
+* * *
+
+You Can Use Serial Modal Dialog Boxes
+
+You can't display both modal and modeless user forms at the same time, but you can display one modal dialog box from another modal dialog box. When users close the second modal dialog box, VBA returns them to the first modal dialog box by default. However, you can write code to make the second modal dialog box automatically close the first dialog box after it closes itself.
+
+* * *
+
+## Specifying a Form's Location Onscreen
+
+By default, VBA centers a dialog box in the middle of the application window as much as possible, which is the normal behavior for Windows applications. If you want to position a form elsewhere on the screen (for example, to avoid obscuring important data onscreen), set the StartUpPosition property for the user form. Table 15.3 explains the settings you can use.
+
+Table 15.3 StartUpPosition property settings
+
+**Setting** | **Value** | **Effect**
+---|---|---
+Manual | 0 | Displays the user form in the upper-left corner of the Windows Desktop.
+CenterOwner | 1 | Centers the user form horizontally and vertically in the _owner_ application—the application to which the user form belongs.
+CenterScreen | 2 | Centers the user form horizontally and vertically on the Desktop. In a multimonitor arrangement, this value centers the user form on the monitor containing the active window.
+WindowsDefault | 3 | Displays the user form in the default position for Windows dialog boxes.
+
+# Using Events to Control Forms
+
+This section discusses the events built into VBA for use with forms and with individual controls to give the programmer fine control over how user forms look and behave.
+
+So far in this chapter, you've used three of the most useful events:
+
+ * You used the Initialize event to add items to list boxes just before a form is displayed and to adjust the number of tabs on a tab strip.
+ * You used the Click event to take action when the user clicks a particular control in a user form. So far you've been using Click mostly for command buttons, but you can use it for just about any control—including the user form itself.
+ * You used the Change event to control what happens when the user changes the tab displayed on a tab strip.
+
+Table 15.4 lists the events that VBA supports and the objects and controls with which each can be used. 
+ +Table 15.4 Events that VBA supports and the objects and controls associated with them + +**Event** | **Occurs** | **Applies to These Controls and Objects** +---|---|--- +Activate | When the user form becomes the active window | UserForm +Deactivate | When the user form ceases to be the active window | UserForm +AddControl | When a control is added at runtime | Frame, MultiPage, UserForm +AfterUpdate | After the user has changed data in a control | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +BeforeDragOver | When the user is performing a drag-and-drop operation | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +BeforeDropOrPaste | When the user is about to release a dragged item or about to paste an item | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +BeforeUpdate | When the user has changed data in the control before the new data appears in the control | CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, ToggleButton +Change | When the Value property of a control changes | CheckBox, ComboBox, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton +Click | When the user clicks a control or object with the primary mouse button | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, ToggleButton, UserForm +DblClick | When the user double-clicks a control or object with the primary mouse button | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, ToggleButton, UserForm +DropButtonClick | When the user displays or hides a drop-down list | ComboBox, TextBox +Enter | Just before one control on a user form receives the focus from another control | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton +Exit | Just before one control on a user form loses the focus to another control | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton +Error | When a control or object encounters an error | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +Initialize | After a user form is loaded but before it's displayed | UserForm +KeyDown | When the user presses a key on the keyboard | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +KeyUp | When the user releases a key they've pressed on the keyboard | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +KeyPress | When the user presses an ANSI key on the keyboard | CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm +Layout | When the size of a frame, multipage, or user form changes | Frame, MultiPage, UserForm +MouseDown | When the user presses the primary mouse button | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, 
OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm
+MouseUp | When the user releases the primary mouse button (after pressing it) | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, ToggleButton, UserForm
+MouseMove | When the user moves the mouse | CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, ToggleButton, UserForm
+QueryClose | When a user form is about to close | UserForm
+RemoveControl | When a control is deleted | Frame, MultiPage, UserForm
+Resize | When a user form is resized | UserForm
+Scroll | When the user moves the scroll box | Frame, MultiPage, ScrollBar, UserForm
+SpinDown | When the user clicks the down button on a SpinButton control | SpinButton
+SpinUp | When the user clicks the up button on a SpinButton control | SpinButton
+Terminate | When a user form has been unloaded from memory | UserForm
+Zoom | When the Zoom property of the control or user form changes | Frame, MultiPage, UserForm
+
+Many of these event procedures receive arguments passed with the ByVal keyword. With forms, such arguments include MSForms objects such as ReturnBoolean, ReturnEffect, ReturnInteger, and ReturnString; setting the Value property of one of these objects passes information back from the event procedure.
+
+As you can see, VBA's events fall into several categories, which are discussed in the following sections in descending order of usefulness:
+
+ * Events that apply only to the UserForm object
+ * Events that apply to the UserForm object and other container objects (such as the Frame control and the MultiPage control)
+ * Events that apply to many or most of the controls, sometimes including the UserForm object as well
+ * Events that apply only to a few controls
+
+## Events Unique to the UserForm Object
+
+This section discusses the events that are unique to the UserForm object. These are the Initialize, QueryClose, Activate, Deactivate, Resize, and Terminate events.
+
+### Initialize Event
+
+An Initialize event occurs when the user form is loaded but before it appears onscreen.
+
+VBA's syntax for the Initialize event is as follows, where _userform_ is a valid UserForm object:
+
+    Private Sub userform_Initialize()
+
+Typical uses for the Initialize event include retrieving information—from a database, a set of worksheets, or whatever—that the user form or application needs and assigning information to the controls on the user form (especially ListBox and ComboBox controls, to which you often need to add the information at runtime rather than at design time).
+
+Depending on the style and complexity of your user forms, you may also want to use the Initialize event to resize the user form, resize controls on the user form, display or hide particular controls, and in general make sure the user form is as closely suited as possible to the user's needs before displaying it.
+
+### QueryClose Event
+
+The QueryClose event applies to the UserForm object only. This event fires just before the user form closes.
+
+The syntax for the QueryClose event is as follows:
+
+    Private Sub UserForm_QueryClose(Cancel As Integer, CloseMode As Integer)
+
+Here, Cancel is an integer, typically 0 (zero). Setting Cancel to a nonzero value cancels the close operation, stopping the user form (and the application) from closing.
+
+CloseMode is a value or a constant giving the cause of the QueryClose event. Table 15.5 shows the values and constants for CloseMode. 
+
+Table 15.5 Values and constants for the CloseMode argument
+
+**Constant** | **Value** | **Cause of the QueryClose Event**
+---|---|---
+vbFormControlMenu | 0 | The user has closed the user form by clicking its close button or by invoking the Close command from the user form's control menu (for example, by right-clicking the title bar of the user form and choosing Close from the context menu).
+vbFormCode | 1 | An Unload statement in code has closed the user form.
+vbAppWindows | 2 | Windows is closing down and is closing the user form.
+vbAppTaskManager | 3 | The Task Manager is closing the application and thus is also closing the user form.
+
+At first glance, QueryClose may appear to have few uses beyond double-checking that users really want to close a user form that they're attempting to close. Say you've established that the user has entered a lot of data in a form they're about to close. You might want to check that they haven't clicked the user form's Close button or Cancel button by mistake, as illustrated in the following code fragment for Word:
+
+    Private Sub UserForm_QueryClose(Cancel As Integer, _
+        CloseMode As Integer)
+        'make sure the user wants to close the user form
+        'if they have entered information in it
+        Select Case CloseMode
+            Case 0
+                'user has clicked the close button or used the control menu
+                'if text box contains more than 5 characters, ask to save it
+                If Len(txtDescription.Text) > 5 Then
+                    If MsgBox("The Description text box contains " & _
+                        "a significant amount of text." & vbCr & _
+                        "Do you want to save this text?", vbYesNo + _
+                        vbQuestion, "Close Form") = vbYes Then
+                        Documents.Add
+                        Selection.TypeText txtDescription.Text
+                        ActiveDocument.SaveAs _
+                            "c:\temp\Temporary Description.docm"
+                        MsgBox "The contents of the Description text " & _
+                            "box have been saved in " & _
+                            "c:\temp\Temporary Description.docm.", _
+                            vbOKOnly + vbInformation, _
+                            "Form Information Saved"
+                    End If
+                End If
+        End Select
+    End Sub
+
+However, QueryClose comes into its own when the whole application, rather than just the user form, is closing. If the user form is modeless, users may not be aware that it's still open and that they're about to lose data they've typed into it or options they've selected in it.
+
+Sometimes you may be able to use QueryClose to save information from a user form when the application has stopped responding and is being closed by Windows or the Task Manager. Be warned that QueryClose's record isn't perfect on this—the code sometimes won't run.
+
+To stop an application from closing, set the Cancel argument of the QueryClose event procedure to True (or any other nonzero value).
+
+### Activate Event
+
+The Activate event fires when the user form becomes the active window. Typically, this means the event fires when the user form is displayed, occurring just after the Initialize event if the user form is loaded by a Show statement rather than a Load statement.
+
+Note that if the user form is loaded by using a Load statement before being displayed with the Show statement, the Initialize event fires after the Load statement. The Activate event, firing after the Show statement, fires later.
+
+However, the Activate event also fires when the user form is reactivated after having been deactivated. For example, if you create a modeless user form with an Activate event procedure, the code is executed each time the user reactivates the user form after having deactivated it (for example, by working in the application window). 
Likewise, if you display one user form from another and then close the second user form, returning the focus to the first user form and reactivating it, the Activate event fires again. + +The syntax for the Activate event is as follows: + + Private Sub UserForm_Activate() + +* * * + +Bug Alert: You May Face Problems Using Deactivate and Activate in Immediate Succession + +VBA can't always execute the event procedures for the Deactivate event of one user form and the Activate event of another user form in immediate succession. Sometimes things work as they should; more often, they don't. + +For example, say you have two user forms, named One and Two, each with an Activate event procedure and a Deactivate event procedure. If you display Two from One, the Deactivate event code from One should run, followed by the Activate event code from Two. This doesn't usually happen: Often, the Deactivate code of One will run, but the Activate code of Two won't. Run it again, and you may get the Activate code of Two to run but not the Deactivate code of One. However, if you remove or comment out the Deactivate event procedure from One and try again, Two's Activate code will run consistently each time One displays Two, indicating that the Activate event is firing but the Activate event procedure's code isn't running when the Deactivate event procedure is present. + +* * * + +### Deactivate Event + +The Deactivate event fires when the user form loses the focus after having been the active window, but it doesn't fire when the user form is hidden or unloaded. For example, if you display a user form that contains a Deactivate event procedure and then close the user form, the Deactivate event doesn't fire. However, if you display one user form from another, the Deactivate event for the first user form fires as the focus is transferred to the second user form. With modeless user forms, the Deactivate event is triggered each time the user leaves one user form by clicking on another. + +The syntax for the Deactivate event is as follows: + + Private Sub UserForm_Deactivate() + +See the previous sidebar for details on a bug in using the Deactivate and Activate events in immediate succession. + +### Resize Event + +The Resize event fires when a user form is resized either manually by the user or programmatically by you. + +The syntax for the Resize event is as follows: + + Private Sub UserForm_Resize() + +The main use for the Resize event is to move, resize, display, or hide controls to respond to a resized form. For example, you might resize a text box so that it occupies most of the width of the user form it lives on (see Figure 15.10) by using code such as that shown in Listing 15.3. + +Figure 15.10 You can use the Resize event of a user form to resize or reposition the controls it contains. + +**Listing 15.3**: Resizing via code + + 1. Private Sub cmdWidenForm_Click() + 2. With frmResize + 3. If .Width < 451 Then + 4. .Width = .Width + 50 + 5. If cmdNarrowForm.Enabled = False Then _ + cmdNarrowForm.Enabled = True + 6. If .Width > 451 Then _ + cmdWidenForm.Enabled = False + 7. End If + 8. End With + 9. End Sub + 10. + 11. Private Sub cmdNarrowForm_Click() + 12. With frmResize + 13. If .Width > 240 Then + 14. .Width = .Width - 50 + 15. If cmdWidenForm.Enabled = False Then _ + cmdWidenForm.Enabled = True + 16. If .Width < 270 Then _ + cmdNarrowForm.Enabled = False + 17. End If + 18. End With + 19. End Sub + 20. + 21. Private Sub cmdClose_Click() + 22. Unload Me + 23. End Sub + 24. + 25. 
Private Sub UserForm_Resize()
+    26. txt1.Width = frmResize.Width - 30
+    27. End Sub
+
+Listing 15.3 contains four short procedures: one for the Click event of the cmdWidenForm command button, one for the Click event of the cmdNarrowForm command button, one for the Click event of the cmdClose command button, and one for the Resize event of the user form.
+
+The cmdWidenForm_Click procedure shown in lines 1 through 9 increases the width of the user form by 50 points (1 point is 1/72 inch) when the user clicks the Widen Form button, as long as the Width property of the user form is less than 451 points. Line 5 enables the cmdNarrowForm command button if it isn't already enabled. (The cmdNarrowForm command button is disabled when the user form is displayed at its original narrow width.) Line 6 disables the cmdWidenForm command button if the Width property of the user form is more than 451 points.
+
+The cmdNarrowForm_Click procedure shown in lines 11 through 19 narrows the user form by 50 points as long as the Width of the user form is greater than 240 points (its original width), reenabling the cmdWidenForm button if it's disabled and disabling the cmdNarrowForm button if the Width of the user form is less than 270 points.
+
+The cmdClose_Click procedure shown in lines 21 through 23 simply unloads the user form (which it refers to by the Me keyword).
+
+The UserForm_Resize event procedure in lines 25 through 27 sets the Width property of txt1, the text box in the user form, to 30 points less than the Width of the user form. If you step through the code (repeatedly pressing F8) in the user form, you'll notice that the Resize event fires when the size of the user form changes. For example, when line 4 of the cmdWidenForm_Click procedure is executed, execution branches to the Resize event procedure in line 25, and this procedure is executed before the code in line 5.
+
+### Terminate Event
+
+The Terminate event fires when the user form has been unloaded—or, more precisely, when all references to an instance of the user form have been removed from memory or have gone out of scope.
+
+The syntax for the Terminate event is as follows:
+
+    Private Sub UserForm_Terminate()
+
+## Events That Apply to Both UserForms and Container Controls
+
+This section discusses the events that apply to the UserForm object _and_ to the container controls—the MultiPage control and the Frame control. Container controls can have other controls placed inside of them. (The Scroll event applies to the ScrollBar control as well as to MultiPage, Frame, and UserForm.) These events are Scroll, Zoom, Layout, AddControl, and RemoveControl.
+
+### Scroll Event
+
+The Scroll event applies to the Frame control, the MultiPage control, the ScrollBar control, and the UserForm object. This event occurs when the user moves the scroll box (the thumb) on a scroll bar on a frame, MultiPage control, scroll bar, or user form.
+
+The syntax for the Scroll event varies for the three controls and the UserForm object. 
The syntax for the Scroll event with the UserForm object is as follows:

    Private Sub UserForm_Scroll(ByVal ActionX As MSForms.fmScrollAction, ByVal ActionY As MSForms.fmScrollAction, ByVal RequestDx As Single, ByVal RequestDy As Single, ByVal ActualDx As MSForms.ReturnSingle, ByVal ActualDy As MSForms.ReturnSingle)

The syntax for the Scroll event with the ScrollBar control is as follows:

    Private Sub scrollbar_Scroll()

The syntax for the Scroll event with the MultiPage control is as follows:

    Private Sub multipage_Scroll(index As Long, ActionX As fmScrollAction, ActionY As fmScrollAction, ByVal RequestDx As Single, ByVal RequestDy As Single, ByVal ActualDx As MSForms.ReturnSingle, ByVal ActualDy As MSForms.ReturnSingle)

The syntax for the Scroll event with the Frame control is as follows:

    Private Sub frame_Scroll(ActionX As fmScrollAction, ActionY As fmScrollAction, ByVal RequestDx As Single, ByVal RequestDy As Single, ByVal ActualDx As MSForms.ReturnSingle, ByVal ActualDy As MSForms.ReturnSingle)

In these last three syntax statements, _scrollbar_ is a valid ScrollBar object, _multipage_ is a valid MultiPage object, and _frame_ is a valid Frame object.

Here are the arguments for the Scroll event:

**Index**

A required argument specifying the page of the MultiPage control with which the event procedure is to be associated.

**ActionX** and **ActionY**

Required arguments determining the user's horizontal and vertical actions (respectively), as shown in Table 15.6.

**RequestDx**

The distance to move the scroll box horizontally, specified in points.

**RequestDy**

The distance to move the scroll box vertically, specified in points.

**ActualDx**

The distance the scroll box moved horizontally, measured in points.

**ActualDy**

The distance the scroll box moved vertically, measured in points.

Table 15.6 ActionX and ActionY constants and values for the Scroll event

**Constant** | **Value** | **Scroll Box Movement**
---|---|---
fmScrollActionNoChange | 0 | There was no change or movement.
fmScrollActionLineUp | 1 | The user moved the scroll box a short way upward on a vertical scroll bar (equivalent to pressing the ↑ key) or a short way to the left on a horizontal scroll bar (equivalent to pressing the ← key).
fmScrollActionLineDown | 2 | The user moved the scroll box a short way downward on a vertical scroll bar (equivalent to pressing the ↓ key) or a short way to the right on a horizontal scroll bar (equivalent to pressing the → key).
fmScrollActionPageUp | 3 | The user moved the scroll box up one page on a vertical scroll bar (equivalent to pressing the Page Up key) or one page to the left on a horizontal scroll bar (also equivalent to pressing the Page Up key).
fmScrollActionPageDown | 4 | The user moved the scroll box down one page on a vertical scroll bar (equivalent to pressing the Page Down key) or one page to the right on a horizontal scroll bar (also equivalent to pressing the Page Down key).
fmScrollActionBegin | 5 | The user moved the scroll box to the top of a vertical scroll bar or to the left end of a horizontal scroll bar.
fmScrollActionEnd | 6 | The user moved the scroll box to the bottom of a vertical scroll bar or to the right end of a horizontal scroll bar.
fmScrollActionPropertyChange | 8 | The user moved the scroll box, changing the value of either the ScrollTop property or the ScrollLeft property.
+fmScrollActionControlRequest | 9 | The scroll action was requested by a control in the container in question. +fmScrollActionFocusRequest | 10 | The user moved the focus to a different control. This movement scrolls the user form so that the selected control is fully displayed in the available area. + +### Zoom Event + +Changing the Zoom property is like using a magnifying glass. The form's controls all grow larger if the Zoom value is greater than 100, and they grow smaller if the value is less than 100. However, the form itself doesn't change size. To change the size of the form, you must adjust its Height and Width properties. + +The Zoom _event_ fires when the Zoom property of the object changes at runtime. The Zoom property can be changed either automatically through code or by the user's manipulating—dragging a scroll bar's thumb, for example—a control that changes the property because you've written code that responds this way. + +The Zoom property uses this syntax for the control and the UserForm object: + + Private Sub object_Zoom(Percent As Integer) + +Here, _object_ is a Frame control or a UserForm object. Percent is an Integer argument used to specify the percentage (from 10 percent to 400 percent) the user form is to be zoomed to. By default, user forms and controls are displayed at 100 percent zoom—full size. + +The Zoom property uses this syntax for the MultiPage control: + + Private Sub multipage_Zoom(ByVal Index As Long, Percent As Integer) + +Index is the index (name or number) of the Page object in the MultiPage control with which the Zoom event procedure is associated. + +Zooming a user form zooms all the controls that are on it. For example, say a user form named frmEventsDemo includes a combo box named cmbZoom that offers a selection of zoom percentages. When the user selects an item in the combo box, the Change event for cmbZoom applies the combo box's Value property to the Zoom property of the user form, zooming it to the percentage selected. Zooming the user form triggers the Zoom event, whose procedure in this example sets the Width and Height of the user form to new values suited to the new zoom percentage: + + Private Sub cmbZoom_Change() + 'change the size of the controls: + frmEventsDemo.Zoom = cmbZoom.Value + End Sub + Private Sub UserForm_Zoom(Percent As Integer) + ' change the size of the form itself: + frmEventsDemo.Width = 300 * cmbZoom.Value / 100 + frmEventsDemo.Height = 350 * cmbZoom.Value / 100 + End Sub + +### Layout Event + +A Layout event is triggered when the size of the frame, MultiPage control, or user form is changed, either by the user or programmatically (automatically by an autosized control's becoming resized). + +By default, the Layout event automatically calculates the new position for any control that has been moved and repaints the screen accordingly. However, you can also use the Layout event for your own purposes if you need to. + +The syntax for the Layout event with a Frame control or a UserForm object is as follows: + + Private Sub object_Layout() + +Here, _object_ is a Frame control or a UserForm object. + +The syntax for using the Layout event with a MultiPage control is as follows: + + Private Sub multipage_Layout(index As Long) + +Here, _multipage_ is a MultiPage control and index is the Page object in the MultiPage control. 
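For instance, you might use the Layout event to keep a particular control pinned in place whenever the layout changes. The following is a minimal sketch rather than a definitive implementation; the command-button name cmdClose is an assumption, and the Static flag guards against the handler re-entering itself if repositioning the button triggers another layout pass:

    Private Sub UserForm_Layout()
        Static blnBusy As Boolean
        If blnBusy Then Exit Sub    'don't re-enter if moving the button fires Layout again
        blnBusy = True
        'keep cmdClose anchored in the lower-right corner of the form
        cmdClose.Left = Me.InsideWidth - cmdClose.Width - 6
        cmdClose.Top = Me.InsideHeight - cmdClose.Height - 6
        blnBusy = False
    End Sub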
* * *

**VBA Automatically Saves Height and Width Properties**

When a control is resized, VBA automatically stores its previous height and width in the OldHeight and OldWidth properties, while the Height and Width properties take on the new height and width values. This allows you to restore a control to its previous size by retrieving the OldHeight and OldWidth properties and assigning them to the Height and Width properties.

* * *

### AddControl Event

The AddControl event is triggered when a control is added programmatically to a Frame control, a MultiPage control, or the user form at runtime; it isn't triggered when you add a control manually at design time. The event isn't triggered when the user form is initialized unless the Initialize event adds a control to the user form.

The syntax for the AddControl event varies depending on the object or control. The syntax for the UserForm object and the Frame control is as follows:

    Private Sub object_AddControl(ByVal Control As MSForms.Control)

Here, _object_ is a UserForm object or Frame control, and Control is the control that's being added.

The syntax for the MultiPage control is as follows:

    Private Sub multipage_AddControl(ByVal Index As Long, ByVal Control As MSForms.Control)

Here, Index is the index number or name of the Page object that will receive the control.

For example, the following cmdAddControl_Click procedure adds three option buttons (opt1, opt2, and opt3, respectively) to the frame fraOptions and sets properties for the first option button. (A comment indicates where the code would go on to set properties for the second and third option buttons.) The fraOptions_AddControl event procedure displays a message box giving the number of controls the frame now contains. Because the cmdAddControl_Click procedure adds three controls, the AddControl event fires three times, and the fraOptions_AddControl procedure runs three times:

    Private Sub cmdAddControl_Click()
        Dim opt1 As OptionButton
        Dim opt2 As OptionButton
        Dim opt3 As OptionButton
        Set opt1 = fraOptions.Controls.Add("Forms.OptionButton.1")
        Set opt2 = fraOptions.Controls.Add("Forms.OptionButton.1")
        Set opt3 = fraOptions.Controls.Add("Forms.OptionButton.1")
        With opt1
            .Left = 10
            .Top = 10
            .Name = "optDomestic"
            .Caption = "Domestic"
            .AutoSize = True
            .Accelerator = "D"
        End With
        'set properties for opt2 and opt3 here
    End Sub

    Private Sub fraOptions_AddControl(ByVal Control As MSForms.Control)
        MsgBox "The frame now contains " & _
            fraOptions.Controls.Count & " controls."
    End Sub

### RemoveControl Event

The RemoveControl event fires when a control is deleted from a Frame control, a MultiPage control, or a user form, either programmatically or manually at runtime. (To remove a control manually, the user would typically use a control built into the user form for that purpose. There has to be some programming here—users can't simply delete controls all by themselves.)

The syntax for the RemoveControl event is as follows for all controls but the MultiPage control:

    Private Sub object_RemoveControl(ByVal Control As MSForms.Control)

Here, _object_ is a valid object, and Control is a valid control.

The syntax for the RemoveControl event is as follows for the MultiPage control:

    Private Sub multipage_RemoveControl(ByVal Index As Long, ByVal Control As MSForms.Control)

Here, _multipage_ is a valid MultiPage object.
For a MultiPage control, Index specifies the Page object in the MultiPage control that contains the control to be deleted. + +## Events That Apply to Many or Most Controls + +This section discusses the events that apply to many or most controls. Some of these events apply to the UserForm object as well. These events are Click; Change; Enter and Exit; BeforeUpdate and AfterUpdate; KeyDown, KeyUp, and KeyPress; MouseDown, MouseUp, and MouseMove; BeforeDragOver; BeforeDropOrPaste; DblClick; and Error. + +### Click Event + +The most common event of all, the Click event services the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, and ToggleButton controls. It is not available to the TextBox, ScrollBar, or SpinButton controls, but it _is_ a member of the UserForm object. + +A Click event occurs when the user clicks a control with the left mouse button or when the user selects a value for a control that has more than one possible value. For most controls, this means that each time the user clicks the control, the event fires. But there are a few exceptions: + + * Clicking a disabled control fires the Click event of the user form (as if the user were clicking the user form through the control). + * The Click event of an OptionButton control fires when the user clicks the option button to select it. If the option button is already selected, clicking it has no effect. (On the other hand, the Click event of a CheckBox control fires each time the user clicks the check box—either to select it or to clear it.) + * The Click event of a ListBox control or ComboBox control fires when the user clicks to select an item from the list (not when the user clicks on the drop-down arrow or in the undropped portion of the combo box). If the user clicks an already-selected item, the Click event doesn't fire again. + * The Click event of a ToggleButton control occurs whenever the toggle button is clicked and when its Value property is changed. This means that it isn't a good idea to use the Click event of the ToggleButton control to toggle its Value. + * The Click event of a selected CommandButton control fires when you press the spacebar. + * The Click event of the default command button (the button with its Default property set to True) fires when the user presses Enter with no other command button selected. + * The Click event of the command button with its Cancel property set to True fires when the user presses Esc. The Click event for a control with an accelerator key set also fires when the user presses the accelerator key. + +For all controls except the TabStrip control and the MultiPage control, the Click event needs no arguments, as follows: + + Private Sub object_Click() + +For a TabStrip control or a MultiPage control, your code must react to the Index argument, a required Long (data type) argument that VBA passes to indicate the affected tab or page of the control: + + Private Sub object_Click(ByVal Index As Long) + +Here, _object_ is a valid MultiPage control or TabStrip control. + +* * * + +Sequence of Events: What Happens When the User Clicks (and Clicks Again) + +The order in which events trigger can sometimes be important to the programmer. If you don't understand the order in which events take place, you can become baffled and start using events in ways that trigger each other, or conflict with each other. 
When the user clicks a command button, the Enter event for this button occurs before its Click event if the click transfers the focus to the command button. When the Enter event for the command button fires, it usually prevents the Click event from firing.

When the user clicks a control, the first event triggered is the MouseDown event, which fires when the user presses the mouse button. Then the MouseUp event fires when the user releases the mouse button. A Click event occurs after a MouseUp event. If the user clicks again within the double-click timeframe set in Windows, the DblClick event fires, followed by another MouseUp event.

* * *

### Change Event

The Change event applies to the CheckBox, ComboBox, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls. This event fires when the Value property of a control changes. This change can occur either through an action of the user's (such as typing text into a text box, selecting an option button, selecting or clearing a check box, clicking a toggle button, or changing the page displayed on a MultiPage control) or through an action taken programmatically at runtime.

Bear in mind that when the Change event is fired by an action of the user's, that action may also trigger a Click event. (Even when this happens, Change is regarded as a better way of determining the new Value of the control than Click—though for many purposes Click will work satisfactorily as well.) Changing the Value property of a control manually at design time doesn't fire a Change event.

The syntax for the Change event is as follows:

    Private Sub object_Change()

The Change event is useful for updating other controls after the user changes a control. For example, if the user enters the name for a new report into a text box (here, txtReportName), you could use the Change event to automatically insert into another text box (here called txtFileName) the name of the file in which to save the report:

    Private Sub txtReportName_Change()
        txtFileName.Text = txtReportName.Text & ".txt"
    End Sub

### Enter and Exit Events

The Enter and Exit events apply to the CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls.

The Enter event fires when the focus is moved from one control on a user form to another control. The event fires just before the second control receives the focus.

Like the Enter event, the Exit event fires when the focus is moved from one control on a user form to another control. However, the Exit event fires just before the first control loses the focus.

The syntax for the Enter event is as follows:

    Private Sub object_Enter()

The syntax for the Exit event is a little more complex:

    Private Sub object_Exit(ByVal Cancel As MSForms.ReturnBoolean)

Here, Cancel is a required argument specifying event status. The default setting is False, which specifies that the control involved should handle the event and that the focus will pass to the next control; a setting of True specifies that the application handle the event, which keeps the focus on the current control.

By using the Enter and Exit events, you can track the user's progress through the controls on a user form.

The Exit event is useful for checking to see if the user has made an appropriate selection in the control or has entered a suitable value.
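For example, you could check the user's entry in the control and, if you find it inappropriate, display a message box alerting the user to the problem and then return the focus to the control (by setting Cancel to True) so that the user can try again. Here's a minimal sketch of that pattern; the text box name txtAge and the validation rule are assumptions for illustration:

    Private Sub txtAge_Exit(ByVal Cancel As MSForms.ReturnBoolean)
        'reject anything that isn't a number from 1 through 120
        If Not IsNumeric(txtAge.Text) Then
            MsgBox "Please enter a numeric age.", vbExclamation
            Cancel = True    'keep the focus on txtAge
        ElseIf Val(txtAge.Text) < 1 Or Val(txtAge.Text) > 120 Then
            MsgBox "Please enter an age from 1 through 120.", vbExclamation
            Cancel = True
        End If
    End Sub

Setting Cancel to True keeps the focus on the text box, so the user can't move to another control until the entry passes the check.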
* * *

Other Ways to Trap User Input

Other events that you might use for checking the contents of a control after the user has visited it include AfterUpdate and LostFocus. Similarly, you might use the BeforeUpdate and GotFocus events instead of the Enter event. A significant difference between Enter and GotFocus and between Exit and LostFocus is that GotFocus and LostFocus fire when the user form receives or loses the focus, respectively, but Enter and Exit don't fire.

* * *

### BeforeUpdate Event

The BeforeUpdate event applies to the CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, and ToggleButton controls. This event occurs as the value or data in the specified control is changed; you can use the event to evaluate the change and decide whether to implement it.

The syntax for the BeforeUpdate event is as follows:

    Private Sub object_BeforeUpdate(ByVal Cancel As MSForms.ReturnBoolean)

Here, _object_ is a valid object, and Cancel is a required argument indicating the status of the event. The default setting of False makes the control handle the event; True prevents the update from being executed and makes the application handle the event.

Here's the sequence in which events fire as you move focus to a control, update it, and move on:

1. The Enter event for the control fires when you move the focus to the control.

2. The BeforeUpdate event for the control fires after you've entered the information for the update (for example, after you've pressed a key in a text box) but before the update is executed. By setting Cancel to True, you can prevent the update from taking place. (If you don't set Cancel to True, the update occurs, and the AfterUpdate event can't prevent it from occurring.)

3. The AfterUpdate event for the control fires after you've entered the information in the control and the update has been executed. If you set the Cancel argument for BeforeUpdate to True, the AfterUpdate event doesn't fire.

4. The Exit event for the control fires when you move from this control to another control. (After the Exit event fires for the control you've left, the Enter event fires for the control to which you have moved the focus.)

### AfterUpdate Event

The AfterUpdate event applies to the CheckBox, ComboBox, ListBox, OptionButton, ScrollBar, SpinButton, TextBox, and ToggleButton controls. This event fires after the user changes information in a control and after that update has been executed.

The syntax for the AfterUpdate event is the same for all the controls and objects it applies to:

    Private Sub object_AfterUpdate()

### KeyDown and KeyUp Events

The KeyDown and KeyUp events apply to the CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. These events are not available to the Image and Label controls.

The KeyDown event fires when the user presses a key on the keyboard. The KeyUp event fires when the user releases the key. The KeyDown and KeyUp events also occur when a key is sent to the user form or control programmatically by using the SendKeys statement.
These events don't occur when the user presses Enter when the user form contains a CommandButton control with its Default property set to True, nor when the user presses Esc when the user form contains a CommandButton control with its Cancel property set to True.

When the keystroke moves the focus to another control, the KeyDown event fires for the original control, while the KeyPress and KeyUp events fire for the control to which the focus is moved.

The KeyPress event fires after the KeyDown event and before the KeyUp event.

The syntax for the KeyDown event is as follows:

    Private Sub object_KeyDown(ByVal KeyCode As MSForms.ReturnInteger, ByVal Shift As Integer)

The syntax for the KeyUp event is as follows:

    Private Sub object_KeyUp(ByVal KeyCode As MSForms.ReturnInteger, ByVal Shift As Integer)

Here, _object_ is an object name and is required. KeyCode is a required Integer argument specifying the key code of the key pressed. For example, the key code for the letter _t_ is 84. The key code isn't an ANSI value—it's a special number that identifies the key on the keyboard.

Shift is a required argument specifying whether the Shift, Ctrl, or Alt key was pressed. Use the constants or values shown in Table 15.7.

Table 15.7 Shift constants and values

**Constant** | **Value** | **Description**
---|---|---
fmShiftMask | 1 | Shift key pressed
fmCtrlMask | 2 | Ctrl key pressed
fmAltMask | 4 | Alt key pressed

### KeyPress Event

The KeyPress event is a member of the CheckBox, ComboBox, CommandButton, Frame, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls. It also is a member of the UserForm object. The Label control has no KeyPress event.

The KeyPress event fires when the user presses a printable character, Ctrl plus an alphabetic character, Ctrl plus a special character (symbols), the Esc key, or the Backspace key while the control or object in question has the focus. Pressing the Tab key, the Enter key, or an arrow key doesn't cause the KeyPress event to fire, nor does a keystroke that moves the focus to another control from the current control.

Technically, only ANSI keys fire the KeyPress event. The Delete key isn't an ANSI key, so pressing the Delete key to delete, say, text in a text box doesn't fire the KeyPress event. But deleting the same text in the same text box using the Backspace key does, because Backspace is an ANSI key.

The KeyPress event fires after the KeyDown event and before the KeyUp event. It also fires when you use SendKeys to send keystrokes to a user form programmatically.

The syntax for the KeyPress event is as follows:

    Private Sub object_KeyPress(ByVal KeyAscii As MSForms.ReturnInteger)

Here, _object_ is a required argument specifying a valid object, and KeyAscii is a required Integer argument specifying an ANSI key code. To get the ANSI key code, use the Asc function. For example, Asc("t") returns the ANSI key code for the letter _t_ (the code is 116).

By default, the KeyPress event processes the code for the key pressed—in humble terms, what you press is what you get. For example, if you press the _t_ key, you get a _t_; if you press the Delete key, you get a Delete action; and so on. By using a KeyPress event procedure, you can perform checks such as filtering out all nonnumeric keys when the user must enter a numeric value.
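As a minimal sketch of that kind of filter (the text box name txtQuantity is an assumption for illustration), the following procedure discards any keystroke that isn't a digit:

    Private Sub txtQuantity_KeyPress(ByVal KeyAscii As MSForms.ReturnInteger)
        'allow only the digits 0 through 9
        If KeyAscii < Asc("0") Or KeyAscii > Asc("9") Then
            KeyAscii = 0    'setting the code to 0 discards the keystroke
        End If
    End Sub

Note that the filter doesn't affect keys that don't fire KeyPress, such as Delete or the arrow keys, so the user can still edit the entry normally.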
+ +### MouseDown Event and MouseUp Event + +The MouseDown and MouseUp events apply to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. The MouseDown event fires when the user presses a button on the mouse, and a MouseUp event occurs when the user releases that button. A Click event fires after a MouseUp event occurs. + +The syntax for the MouseDown and MouseUp events is as follows for all controls except for MultiPage and TabStrip: + + Private Sub object_MouseDown(ByVal Button As Integer, ByVal Shift As Integer, + ByVal X As Single, ByVal Y As Single) + + Private Sub object_MouseUp(ByVal Button As Integer, ByVal Shift As Integer, + ByVal X As Single, ByVal Y As Single) + +The syntax for the MouseDown and MouseUp events with the MultiPage and TabStrip controls adds an Index argument to specify the index of the page or the tab involved: + + Private Sub object_MouseUp(ByVal Index As Long, ByVal Button As Integer, ByVal + Shift As Integer, ByVal X As Single, ByVal Y As Single) + Private Sub object_MouseDown(ByVal Index As Long, ByVal Button As Integer, ByVal + Shift As Integer, ByVal X As Single, ByVal Y As Single) + +Here, _object_ is a valid object for the statement. + +Index returns –1 if the user clicks outside the page or tab area of the control but still within the control (for example, to the right of the rightmost tab in a top-tab tab strip). + +Button is a required Integer argument specifying the mouse button that triggered the event. Table 15.8 lists the possible values for Button. + +Table 15.8 Button values and constants + +**Constant** | **Value** | **Description** +---|---|--- +fmButtonLeft | 1 | Left (primary) +fmButtonRight | 2 | Right (non-primary) +fmButtonMiddle | 4 | Middle + +Shift is a required argument specifying whether the Shift, Ctrl, or Alt key was pressed. Table 15.9 lists the values for Shift. + +Table 15.9 Shift values + +**Value** | **Key or Keys Pressed** +---|--- +1 | Shift +2 | Ctrl +3 | Shift+Ctrl +4 | Alt +5 | Alt+Shift +6 | Alt+Ctrl +7 | Alt+Shift+Ctrl + +You can also detect a single key by using the key masks listed in Table 15.7. + +X is a required Single argument specifying the horizontal position in points from the left edge of the user form, frame, or page. Y is a required Single argument specifying the vertical position in points from the top edge of the user form, frame, or page. + +### MouseMove Event + +The MouseMove event is available to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. This event fires when the user moves the mouse pointer over the control or object in question. + +The syntax for the MouseMove event is different for the MultiPage control and the TabStrip control than for the other controls and for the UserForm object. The syntax for the other controls is as follows: + + Private Sub object_MouseMove(ByVal Button As Integer, ByVal Shift As Integer, + ByVal X As Single, ByVal Y As Single) + +The syntax for the MultiPage control and the TabStrip control is as follows: + + Private Sub object_MouseMove(ByVal Index As Long, ByVal Button As Integer, + ByVal Shift As Integer, ByVal X As Single, ByVal Y As Single) + +Here, _object_ is a required argument specifying a valid object. 
+ +For the MultiPage and TabStrip controls, Index is a required argument that returns the index of the Page object in the MultiPage control or the Tab object in the TabStrip control associated with the event procedure. + +Button is a required Integer argument that returns which mouse button (if any) the user is pressing. Table 15.10 lists the values for Button. + +Table 15.10 Button values + +**Value** | **Button Pressed** +---|--- +0 | No button +1 | Left +2 | Right +3 | Left and right +4 | Middle +5 | Left and middle +6 | Middle and right +7 | Left, middle, and right + +Shift is a required Integer argument that returns a value indicating whether the user is pressing the Shift, Alt, and/or Ctrl keys. Refer back to Table 15.9 for the list of Shift values. + +X is a required Single argument that returns a value specifying the horizontal position in points from the left edge of the user form, frame, or page. Y is a required Single argument specifying the vertical position in points from the top edge of the user form, frame, or page. + +As with the MouseDown and MouseUp events, you can also detect a single key by using the key masks listed in Table 15.7. + +Like most windows in the Windows operating system, user forms largely experience life as a nonstop sequence of mouse events. MouseMove events monitor where the mouse pointer is on the screen and which control has captured it. MouseMove events fire even if you use the keyboard to move a user form from under the mouse pointer because the mouse pointer ends up in a different place in relation to the user form even though it hasn't moved in the conventional sense. + +One use for the MouseMove event is to display appropriate text or an image for a control at which the user is pointing. For example, suppose a user form provides a list of available products, with each product's title appearing in a label. When the user positions the mouse pointer over a title in the label, you could use the MouseMove event to load a picture of the product into an Image control and a short description into another label. + +* * * + +MouseMove Events May Not Trigger between Close Controls + +The user form traps MouseMove events when the mouse pointer isn't over any control. However, if the user moves the mouse pointer quickly from one control to another very close to it, the user form may fail to trap the movement over the short intervening space. + +* * * + +### BeforeDragOver Event + +The BeforeDragOver event applies to the UserForm object itself and to the following controls: CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton. A BeforeDragOver event is triggered when the user is performing a drag-and-drop operation. + +The syntax for the BeforeDragOver event depends on the object or control in question. The basic syntax for the UserForm object and all controls except the Frame, TabStrip, and MultiPage is as follows, where _object_ is a valid UserForm or control: + + Private Sub object_BeforeDragOver(ByVal Cancel As MSForms.ReturnBoolean, ByVal + Control As MSForms.Control, ByVal Data As MSForms.DataObject, ByVal X As Single, + ByVal Y As Single, ByVal State As MSForms.fmDragState, ByVal Effect As MSForms. 
    ReturnEffect, ByVal Shift As Integer)

The syntax for the BeforeDragOver event with the Frame control is as follows, where _frame_ is a valid Frame control:

    Private Sub frame_BeforeDragOver(ByVal Cancel As MSForms.ReturnBoolean, ByVal Control As MSForms.Control, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal State As MSForms.fmDragState, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)

The syntax for the BeforeDragOver event with the MultiPage control is as follows, where _multipage_ is a valid MultiPage control:

    Private Sub multipage_BeforeDragOver(ByVal Index As Long, ByVal Cancel As MSForms.ReturnBoolean, ByVal Control As MSForms.Control, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal State As MSForms.fmDragState, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)

The syntax for the BeforeDragOver event with the TabStrip control is as follows, where _tabstrip_ is a valid TabStrip control:

    Private Sub tabstrip_BeforeDragOver(ByVal Index As Long, ByVal Cancel As MSForms.ReturnBoolean, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal DragState As MSForms.fmDragState, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)

These are the different parts of the statements:

  * Index is the index of the Page object in a MultiPage control (or the Tab object in a TabStrip control) that is affected by the drag-and-drop.
  * Cancel is a required argument giving the status of the BeforeDragOver event. The default setting is False, which makes the control handle the event. A setting of True makes the application handle the event.
  * Control is a required argument specifying the control that is being dragged over.
  * Data is a required argument specifying the data being dragged.
  * X is a required argument specifying the horizontal distance in points from the left edge of the control. Y is a required argument specifying the vertical distance in points from the top of the control.
  * DragState is a required argument specifying where the mouse pointer is in relation to a target (a location at which the data can be dropped). Table 15.11 lists the constants and values for DragState.
  * Effect is a required argument specifying the operations the source of the drop is to support, as listed in Table 15.12.
  * Shift is a required argument specifying whether the Shift, Ctrl, or Alt key is held down during the drag-and-drop operation, as listed in Table 15.7.

Table 15.11 DragState constants and values

**Constant** | **Value** | **Position of Mouse Pointer**
---|---|---
fmDragStateEnter | 0 | Within range of a target
fmDragStateLeave | 1 | Outside the range of a target
fmDragStateOver | 2 | At a new position, but remains within range of the same target

Table 15.12 Effect constants and values

**Constant** | **Value** | **Drop Effect**
---|---|---
fmDropEffectNone | 0 | Doesn't copy or move the source to the target
fmDropEffectCopy | 1 | Copies the source to the target
fmDropEffectMove | 2 | Moves the source to the target
fmDropEffectCopyOrMove | 3 | Copies or moves the source to the target

You use the BeforeDragOver event to control drag-and-drop actions that the user performs. Use the DragState argument to make sure that the mouse pointer is within range of a target.
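For instance, the following minimal sketch (the form-level behavior shown is an assumption for illustration) follows the UserForm syntax shown above to refuse any drop on the form's own surface: it takes over handling by setting Cancel to True and reports fmDropEffectNone so that the pointer shows the data can't be dropped there:

    Private Sub UserForm_BeforeDragOver(ByVal Cancel As MSForms.ReturnBoolean, _
            ByVal Control As MSForms.Control, ByVal Data As MSForms.DataObject, _
            ByVal X As Single, ByVal Y As Single, _
            ByVal State As MSForms.fmDragState, _
            ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer)
        Cancel = True                 'the application (our code) handles the event
        Effect = fmDropEffectNone     'signal that nothing can be dropped here
    End Sub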
+ +### BeforeDropOrPaste Event + +The BeforeDropOrPaste event applies to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls and to the UserForm object. + +A BeforeDropOrPaste event fires just before the user drops or pastes data onto an object. + +The syntax for the BeforeDropOrPaste event is different for the MultiPage and TabStrip controls than for the UserForm object and for the other controls. The basic syntax is as follows: + + Private Sub object_BeforeDropOrPaste(ByVal Cancel As MSForms.ReturnBoolean, ByVal + Control As MSForms.Control, ByVal Action As MSForms.fmAction, ByVal Data As + MSForms.DataObject, ByVal X As Single, ByVal Y As Single, ByVal Effect As + MSForms.ReturnEffect, ByVal Shift As Integer) + +The syntax for the MultiPage control is as follows, where _multipage_ is a valid MultiPage control: + + Private Sub multipage_BeforeDropOrPaste(ByVal Index As Long, ByVal Cancel As + MSForms.ReturnBoolean, ByVal Control As MSForms.Control, ByVal Action As MSForms. + fmAction, ByVal Data As MSForms.DataObject, ByVal X As Single, ByVal Y As + Single, ByVal Effect As MSForms.ReturnEffect, ByVal Shift As Integer) + +The syntax for the TabStrip control is as follows, where _tabstrip_ is a valid TabStrip control: + + Private Sub tabstrip_BeforeDropOrPaste(ByVal Index As Long, ByVal Cancel As + MSForms.ReturnBoolean, ByVal Action As MSForms.fmAction, ByVal Data As MSForms. + DataObject, ByVal X As Single, ByVal Y As Single, ByVal Effect As MSForms. + ReturnEffect, ByVal Shift As Integer) + +Here are the parts of the syntax: + + * _object_ is a required object specifying a valid object. + * For the MultiPage control, Index is a required argument specifying the Page object involved. + * Cancel is a required argument giving the status of the event. The default setting of False makes the control handle the event; True makes the application handle the event. + * Control is a required argument specifying the target control. + * Action is a required argument specifying the result of the drag-and-drop operation. Table 15.13 shows the constants and values for Action. + * Data is a required argument specifying the data (contained in a DataObject) being dragged and dropped. + * X is a required argument specifying the horizontal distance in points from the left edge of the control for the drop. Y is a required argument specifying the vertical distance in points from the top of the control. + * Effect is a required argument specifying whether the drag-and-drop operation copies the data or moves it, as listed in Table 15.12. + * Shift is a required argument specifying whether the user has pressed the Shift, Ctrl, and/or Alt keys, as listed in Table 15.7. + +Table 15.13 Action constants and values + +**Constant** | **Value** | **Action Taken** +---|---|--- +fmActionPaste | 2 | Pastes the object into the target. +fmActionDragDrop | 3 | The user has dragged the object from its source and dropped it on the target. + +The BeforeDropOrPaste event fires when a data object is transferred to a MultiPage or TabStrip control and just before the drop or paste operation occurs on other controls. + +### DblClick Event + +The DblClick event works with the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, TabStrip, TextBox, and ToggleButton controls, as well as the UserForm object. 
A DblClick event occurs when the user double-clicks a control or object with the primary mouse button. The double-click must be fast enough to register as a double-click in Windows (this speed is controlled by the setting on the Buttons tab in the Mouse Properties dialog box in Control Panel) and occurs after the MouseDown event, the MouseUp event, and the Click event (for controls that support the Click event).

The DblClick event has a different syntax for the MultiPage and TabStrip controls than for the other controls or for the user form.

For the MultiPage and TabStrip controls, the syntax is as follows:

    Private Sub object_DblClick(ByVal Index As Long, ByVal Cancel As MSForms.ReturnBoolean)

The syntax for the DblClick event for other controls is as follows:

    Private Sub object_DblClick(ByVal Cancel As MSForms.ReturnBoolean)

Here, _object_ is a required argument specifying a valid object. For the MultiPage control and the TabStrip control, Index is a required argument specifying the Page object within a MultiPage control or the Tab object within a TabStrip control to be associated with the event procedure.

Cancel is a required argument specifying the status of the event. The default setting of False causes the control to handle the event; True causes the application to handle the event instead and causes the control to ignore the second click.

In controls that support both the Click event and the DblClick event, the Click event occurs before the DblClick event. If you take an interface action (such as displaying a message box) with the Click event procedure, it blocks the DblClick event procedure from running. In the following example, the DblClick event procedure doesn't run:

    Private Sub CommandButton1_Click()
        MsgBox "Click event"
    End Sub

    Private Sub CommandButton1_DblClick _
            (ByVal Cancel As MSForms.ReturnBoolean)
        MsgBox "Double-click event"
    End Sub

However, you can execute non-interface statements in the Click event procedure without blocking the DblClick event procedure. The following example declares a private String variable named strMessage in the declarations portion of the code sheet for the user form. The Click event procedure for the CommandButton1 command button assigns text to strMessage. The DblClick event procedure appends more text to strMessage and then displays a message box containing strMessage so that you can see that both events have fired. Don't step into this code by pressing F8 in the VBA Editor—instead, press F5 to run it, or it won't work:

    Private strMessage As String

    Private Sub CommandButton1_Click()
        strMessage = "Click event" & vbCr
    End Sub

    Private Sub CommandButton1_DblClick _
            (ByVal Cancel As MSForms.ReturnBoolean)
        strMessage = strMessage & "Double-click event"
        MsgBox strMessage
    End Sub

For most controls you won't want to use both a Click event procedure and a DblClick event procedure—you'll choose one or the other as appropriate to the control's purpose.

### Error Event

The Error event applies to the CheckBox, ComboBox, CommandButton, Frame, Image, Label, ListBox, MultiPage, OptionButton, ScrollBar, SpinButton, TabStrip, TextBox, and ToggleButton controls. It also applies to the UserForm object. The Error event fires when a control encounters an error and is unable to return information about the error to the program that called the control. We will explore error handling in depth in Chapter 17, "Debugging Your Code and Handling Errors."
The syntax for the Error event for the UserForm object and for all controls except the MultiPage control is as follows:

    Private Sub object_Error(ByVal Number As Integer, ByVal Description As MSForms.ReturnString, ByVal SCode As Long, ByVal Source As String, ByVal HelpFile As String, ByVal HelpContext As Long, ByVal CancelDisplay As MSForms.ReturnBoolean)

The syntax for the Error event for the MultiPage control is as follows, where _multipage_ is a valid MultiPage control:

    Private Sub multipage_Error(ByVal Index As Long, ByVal Number As Integer, ByVal Description As MSForms.ReturnString, ByVal SCode As Long, ByVal Source As String, ByVal HelpFile As String, ByVal HelpContext As Long, ByVal CancelDisplay As MSForms.ReturnBoolean)

These are the components of the syntax:

  * _object_ is the name of a valid object.
  * For a MultiPage control, Index is the index of the Page object in the MultiPage control associated with the event.
  * Number is a required argument that returns the value used by the control to identify the error.
  * Description is a required String argument describing the error.
  * SCode is a required argument giving the OLE status code for the error.
  * Source is a required String argument containing the string identifying the control involved.
  * HelpFile is a required String argument containing the full path to the Help file that contains the Description.
  * HelpContext is a required Long argument containing the context ID for the Description within the Help file.
  * CancelDisplay is a required Boolean argument that controls whether VBA displays the error message in a message box.

## Events That Apply Only to a Few Controls

This section discusses the three events that apply only to one or two controls. The first of the three is the DropButtonClick event, which applies only to the ComboBox and TextBox controls; the second and third are the SpinUp and SpinDown events, which apply only to the SpinButton control.

### DropButtonClick Event

The DropButtonClick event fires when the user displays or hides a drop-down list on a ComboBox by clicking the drop-down button or by pressing the F4 key when the ComboBox has the focus (is selected). DropButtonClick also fires when the user presses the F4 key with a TextBox control selected, though this manifestation of the event is arcane enough to be singularly useless. It also fires when the DropDown method is executed in VBA to display the drop-down list, and it fires again when the DropDown method is executed again to hide the drop-down list.

The syntax for the DropButtonClick event is as follows:

    Private Sub object_DropButtonClick()

Here, _object_ is a valid ComboBox or TextBox control.

One use for the DropButtonClick event is to add items to a ComboBox control rather than adding them at load time via the Initialize event. By adding these items only on demand (I'm assuming the user might not use the ComboBox control at all or might type information into its text-box area), you can cut down on load time for the user form. You can also load the ComboBox with data relevant to the other choices the user has made in the dialog box, allowing for more targeted information than you could have provided by loading the ComboBox with the Initialize event.

### SpinDown and SpinUp Events

The SpinDown and SpinUp events apply only to the SpinButton control.
SpinDown and SpinUp control what happens when the user clicks the arrow buttons of a SpinButton control: the down-arrow and up-arrow buttons of a vertical SpinButton, or the right-arrow and left-arrow buttons of a horizontal SpinButton. The SpinDown event fires when the user clicks the down-arrow or right-arrow button, and the SpinUp event fires when the user clicks the up-arrow or left-arrow button.

The syntax for the SpinUp event and the SpinDown event is as follows:

    Private Sub spinbutton_SpinDown()
    Private Sub spinbutton_SpinUp()

Here, _spinbutton_ is a SpinButton control.

By default, the SpinDown event decreases and the SpinUp event increases the Value property of the SpinButton by the SmallChange increment.

# The Bottom Line

**Understand what a complex dialog box is.**

Simple dialog boxes tend to be static, but complex dialog boxes are dynamic—they change during execution in response to clicks or other interaction from the user.

Master It

Describe two types of dynamic behavior typical of complex dialog boxes.

**Reveal and hide parts of a dialog box.**

Dialog boxes need not display everything at once. Word's Find And Replace dialog box illustrates how useful it can be to display an abbreviated dialog box containing the most common tasks and expand the box to reveal less-popular options if the user needs access to them.

Master It

Name the two most common techniques you can use to display additional options in a dialog box.

**Create multipage dialog boxes.**

VBA includes the MultiPage control, which enables you to create multipage dialog boxes. Word's Font dialog box is an example of one. You can access any page (one at a time) by clicking its tab at the top of the page.

Master It

How does the TabStrip control differ from the MultiPage control? What are the typical uses for each?

**Create modeless dialog boxes.**

A _modeless_ dialog box can be left visible onscreen while the user continues to work in an application. For example, the Find And Replace dialog box in Access, Word, and Excel is modeless, as is the Replace dialog box in PowerPoint. A _modal_ dialog box, by contrast, must be closed by users before they can continue to interact with the application.

Master It

How do you make a user form modeless?

**Work with form events.**

Events are actions that happen while a program is executing. Many events are supported by the UserForm object and the controls you use on it. By using events, you can monitor what the user does and take action accordingly or even prevent the user from doing something that doesn't seem like a good idea.

Master It

Name two of the three most useful events available in VBA programming.

Part 5

Building Modular Code and Using Classes

  * **Chapter 16: Building Modular Code and Using Classes**
  * **Chapter 17: Debugging Your Code and Handling Errors**
  * **Chapter 18: Building Well-Behaved Code**
  * **Chapter 19: Securing Your Code with VBA's Security Features**

Chapter 16

Building Modular Code and Using Classes

This chapter shows you how to start building modular code—code broken up into individual components rather than all built together into a monolithic mass. You'll also see how to create _reusable code_ that you can use in future procedures.

The second part of this chapter discusses how you can build and use your own classes in VBA to implement custom objects, store information in them, and return information from them.
+ +In this chapter you will learn to do the following: + + * Arrange your code in modules + * Call a procedure + * Pass information from one procedure to another + * Understand what classes are and what they're for + * Create an object class + +# Creating Modular Code + +The code that you've created so far in this book has been effective—it _worked_ —but much of it has been less concise, organized, or elegant than it might be. The following sections show you how to refine your code. + +* * * + +What Is Elegance in Code? + +_Elegance_ in computer programming means not only that your code is bug-free and impeccably put together and that your user interface is well designed, but also that the code contains nothing unnecessary—it has been stripped down to the minimum required to achieve the desired effect. + +* * * + +## What Is Modular Code? + +_Modular code_ is code composed of different procedures that you can use in combination. The name doesn't specifically come from the fact that you store your VBA code in modules. + +For example, suppose you're working in Word. You can take a monolithic approach and create a single giant procedure that does a lot of things: creates a document based on the user's choice of template, inserts text and formats it, saves it in a particular folder under a name of the user's choice, prints it to a specific printer, and then closes it. Whew! + +Or...you can take the more practical _modular_ approach and subdivide this lengthy series of tasks into several separate procedures—one for each task. You can then create a kind of master procedure that runs each of these individual task procedures. In this way you can achieve the same results as executing the large, monolithic procedure. But subdivided code is easier to read, test, and even sometimes reuse. Think of it as using multiple small subs rather than a single large sub. + +You can also later create new master procedures that reuse these individual task procedures in a different way. + +## Advantages of Using Modular Code + +Modular code has several advantages over code that lumps everything together in one long sub or function. For one thing, it's often easier to write modular code because you create a number of short procedures, each of which performs a specific task. You stay focused on the single task at hand. + +You can usually debug these procedures relatively easily too, because their shorter length makes it simpler to identify, locate, and eliminate bugs. + +The procedures will also be more readable because they're less complex and you can more easily follow what they do. + +Modular code is also more efficient, for four reasons: + + * By breaking your code into procedures, you can repeat their tasks at different points in a sequence of procedures without needing to repeat the lines of code. Having less code should make your procedures run faster. + * By reusing whole procedures, you can reduce the amount of code you have to write. And by writing less code, you give yourself less chance to write new errors into your program. + * If you need to change an item in the code, you can make a single change in the appropriate procedure instead of having to make changes at a number of locations in a long procedure (and perhaps missing some of them). This change then also applies to any procedures that call the procedure. + * You can call individual procedures from other procedures without having to assimilate them into the other procedures. 
Just think how tedious it would be if you had to create each of VBA's many built-in functions from scratch instead of being able to invoke them at will. You can do much the same with functions you create—reuse them rather than reinvent the wheel.

## How to Approach Creating Modular Code

The usefulness of modular coding will vary from person to person, from project to project, and from procedure to procedure. For example, if you record a macro to perform a simple, one-time task on a number of presentations, there's no need to worry about breaking it down into its components and formalizing them as procedures. Just go ahead and use a single procedure.

However, if you sit down to plan a large procedure that's going to automate the creation of your company's budget-estimate spreadsheets, you can benefit greatly from dividing the code into a set of several procedures. This automation job is complex and requires a lot of code, and it's also a program that must be reused every time there's a new budget proposal.

You can go about creating modular code in two main ways:

  * Record (if the application you're using supports the VBA Macro Recorder) or write a procedure as usual and then examine it and break it into modules as necessary. This is a great way to start creating modular code, but it's usually less efficient: You'll end up spending a lot of time retrofitting your original, large procedure as you break it into smaller, separate procedures.
  * List the different actions that your project requires, then code each action (or set of actions) as a separate procedure. This method requires a bit more planning but usually proves more efficient in the long run.

## Arranging Your Code in Modules

Once you've created a set of procedures, you can move them to a new module within the same project, or even to a different project. By grouping your procedures in modules, you can easily distribute the procedures to your colleagues without including any they don't need. In addition, you can remove from your immediate working environment any modules of code that you don't need.

* * *

Give Descriptive Names to Your Modules

Give your modules descriptive names so that you can instantly identify them in the VBA Editor Project Explorer and other module-management tools. Avoid leaving modules named the default Module1, Module2, and so on.

* * *

## Calling a Procedure

When one of your procedures needs to use another procedure you wrote, it _calls_ it (by name) in the same way that you learned in Chapter 9, "Using Built-in Functions," to call a built-in function like MsgBox.

To call a procedure in the same project, either enter the name of the procedure to be called as a statement or use a Call statement with the name of the procedure.

The syntax for the Call statement is the same for procedures as for functions:

    [Call] _name_ [ _argumentlist_ ]

Here, _name_ is a required argument giving the name of the procedure to call, and _argumentlist_ is an optional comma-delimited list of the variables, arrays, or expressions to pass to the procedure. You use an argument list only for procedures that require arguments.

Calling involves two procedures, the caller and the called.
For example, the following CreateReceiptLetter procedure (the caller) calls the procedure FormatDocument (the called):

    Sub CreateReceiptLetter()
        'other actions here
        **Call FormatDocument**
        'other actions here
    End Sub

Most programmers omit the Call keyword, using just the name of the procedure. This next code does the same thing as the previous code example:

    Sub CreateReceiptLetter()
        'other actions here
        **FormatDocument**
        'other actions here
    End Sub

However, as with calling built-in functions, some programmers believe that using the Call keyword can make it clearer that your code is calling a procedure, and it enables you to search more easily for your calls. (When debugging, you can see which procedures are calling others by choosing the Call Stack option on the Editor's View menu. This feature is available only in Break mode, however, not during design time.)

In the following example, a procedure named Caller calls a procedure named Called, which takes the String argument strFeedMe. Note that when you use Call, you need to enclose the argument list in parentheses:

    Sub Caller()
        Call Called("Hello")
    End Sub

    Sub Called(ByVal strFeedMe As String)
        MsgBox strFeedMe
    End Sub

Again, you can omit the Call keyword and, if you wish, the parentheses, and yet achieve the same result:

    Sub Caller()
        Called "Hello"
    End Sub

As well as calling a procedure in the same project, you can call a procedure in another open project in the same host application (but usually not in another application). Typically, the syntax used to call a procedure in another project is as follows, although it can vary by application and version:

    Project.Module.Procedure

To call a procedure in another project, you need to add a reference to that project in the VBA Editor's References dialog box. Choose Tools ⇒ References, select the project (click the Browse button if you need to browse to it), and then click the OK button. Once this reference is in place, you can call the procedure.

* * *

Circular References Are Not Allowed

You can't add to the current project a reference to a project that itself contains a reference to the current project. If you attempt a circular reference like that, when you add the reference and close the References dialog box, the VBA Editor displays a message box with the warning "Cyclic reference of projects not allowed" and the Editor refuses to insert the reference. (It does close the References dialog box, though.)

* * *

Let's turn our attention to another benefit of modular code: You can refine your code and make it run faster by making logical improvements and visual improvements.

## Making Logical Improvements to Your Code

Breaking a large procedure into several smaller procedures can improve the logic of your code by forcing you to consider each set of actions the procedure takes as _modular_, which means they're separate from other sets of actions. And you can also improve the logic of your code in other ways: by using explicit variable declarations, by stripping out unnecessary statements to simplify recorded code, and by using With statements to eliminate repetitive object references. The following sections describe ways to improve the quality of your code.

### Declaring Variables Explicitly Instead of Implicitly

This has been mentioned before, but it's important.
Instead of declaring variables implicitly, declare all your variables explicitly:

    Dim strName As String
    strName = "Lola Montez"

Use that approach rather than the implicit declaration approach, which skips declaring the variable and merely assigns a value to it (which implicitly creates it):

    strName = "Lola Montez"

Explicit declaration allows VBA to allocate only as much memory as that variable type needs. What's more, by specifying the data type of a variable, you relieve VBA of the necessity to waste time figuring out the data type each time the variable appears in your code. Better still, you avoid the risk of unintentionally storing the wrong type of data in the variable. Because the variable is explicitly typed, VBA displays an error message rather than storing the data and changing the variable type.

Table 16.1 shows the details on the amounts of memory that the different types of variables require.

Table 16.1 Memory consumed by the different types of variables

**Variable** | **Memory Needed (Bytes)**
---|---
Boolean | 2
Byte | 1
Currency | 8
Date | 8
Decimal | 14
Double | 8
Integer | 2
Long | 4
Object | 4
Single | 4
String | Variable-length strings: 10 bytes plus the storage required for the string, which can be up to about two billion characters; fixed-length strings: the number of bytes required to store the string, which can be from 1 to about 64,000 characters
Variant | Variants that contain numbers: 16 bytes; variants that contain characters: 22 bytes plus the storage required for the characters

How much memory you save by specifying data types, and how much difference choosing variable types makes to your procedures, depends on the type of work you're doing. For example, if you store a million characters in a variable, the 12 bytes you save by declaring it as a String rather than leaving it a Variant make little difference.

But if you use many variables on a computer with limited memory, specifying the appropriate data types for your variables may save enough memory to enable your procedure to run where it otherwise wouldn't have been able to, or at least enable it to run faster. Of course, hardware is continually improving—and memory is hardware. Now that RAM is cheap and plentiful, conserving memory is not much of an issue for programmers.

A second reason for declaring your variables explicitly rather than implicitly is to make your code easier to read and to debug. And a third reason is that you get some runtime range-checking for free. If you _know_ something will be less than 32,768 and therefore declare it as the Integer data type (rather than the Long type), you'll automatically get a helpful overflow error if a Long-size value creeps into it somehow at runtime.

* * *

**Simplify Recorded Code**

Recall that the Macro Recorder (available only in Word and Excel) offers an excellent way to get started writing code for a project. Just turn on the recorder and carry out the actions you want your code to accomplish. The recorder can write code for many tasks. It can't create conditional branches, loops, and a few other code features, but it nevertheless can do quite a bit.

The Macro Recorder provides a great way to kick-start creating code by letting you identify quickly the built-in objects the procedure will need to work with and the methods and properties you'll need to use with them.
But as you've seen, one drawback of the Macro Recorder is that it tends to record a lot of code that you don't actually need in your procedures. It records the _state_ of a context—the status of _all_ the options in the current context. And you're probably interested in only one or two options.

It's like taking a photo. The camera records _everything_ that you point it at. But often you don't want to see everything, just a particular object. You took a picture of the school play. The photo contains all the kids on the stage, but you're only really interested in little Darla's lovely smile and her costume. So you use a graphics program to crop out (cut away) everything but Darla.

Here's an example of "cropping" code: When you record a procedure that changes one setting in a dialog box (such as switching to italic in the Font dialog box in Word), the Macro Recorder nonetheless records _all_ the other settings, not only on that page of the dialog box but also on all its other pages (Character Spacing and so on). Just in case you wanted them. But you don't. You're interested only in the italic feature.

Once you've finished recording the procedure, you'll often want to open it to make minor adjustments; to add loops, decisions, or UI items (message boxes, input boxes, or user forms); or even to lift parts of the code for use in other procedures. When you do this, first examine the code the Macro Recorder has recorded, and where possible, strip out the statements unrelated to your purpose. Leave only the recorded pieces of code that you need. Make the code focus on what you're actually doing—the task you're carrying out. Later, you'll thank yourself if you have to examine or reuse this code: You'll be able to see easily what the code is doing: not superscript, not boldface, nor any of the other settings; just italic.

Take this Word example. Compare the Applying_Arial_Font procedure that follows with the Stripped_Down_Procedure_Applying_Arial_Font procedure that comes after it:

    Sub Applying_Arial_Font()
    '
    ' Applying_Arial_Font Macro
    ' Applies the Arial font to the selected text
    '
        With Selection.Font
            **.Name = "Arial"**
            .Size = 13
            .Bold = False
            .Italic = False
            .Underline = wdUnderlineNone
            .UnderlineColor = wdColorAutomatic
            .StrikeThrough = False
            .DoubleStrikeThrough = False
            .Outline = False
            .Emboss = False
            .Shadow = False
            .Hidden = False
            .SmallCaps = False
            .AllCaps = False
            .Color = wdColorAutomatic
            .Engrave = False
            .Superscript = False
            .Subscript = False
            .Spacing = 0
            .Scaling = 100
            .Position = 0
            .Kerning = 0
            .Animation = wdAnimationNone
        End With
    End Sub

    Sub Stripped_Down_Procedure_Applying_Arial_Font()
        **Selection.Font.Name = "Arial"**
    End Sub

As you can see, the Stripped_Down_Procedure_Applying_Arial_Font code has the same effect as the recorded procedure, but it contains 3 lines instead of the recorded procedure's 31.

* * *

### Using With Statements to Simplify Your Code

When you're performing multiple actions with an object, you can often use With statements to avoid repeating the object reference for each action. This simplifies your code. It becomes easier to read. And it may make it run marginally faster.
For example, the following statements contain multiple references to the first Paragraph object—Paragraphs(1)—in the ActiveDocument object in Word:

    ActiveDocument.Paragraphs(1).Range.Font.Bold = True
    ActiveDocument.Paragraphs(1).Range.Font.Name = "Times New Roman"
    ActiveDocument.Paragraphs(1).LineSpacingRule = wdLineSpaceSingle
    ActiveDocument.Paragraphs(1).Borders(1).LineStyle = wdLineStyleDouble
    ActiveDocument.Paragraphs(1).Borders(1).ColorIndex = wdBlue

You can remove this redundancy by employing a With structure that references the Paragraphs(1) object in the ActiveDocument object, reducing the number of object references involved:

    With ActiveDocument.Paragraphs(1)
        .Range.Font.Bold = True
        .Range.Font.Name = "Times New Roman"
        .LineSpacingRule = wdLineSpaceSingle
        .Borders(1).LineStyle = wdLineStyleDouble
        .Borders(1).ColorIndex = wdBlue
    End With

When you need to work with multiple child objects contained within a single parent object, you can either use separate With statements or pick the lowest common denominator of the objects you want to work with and use an outer With statement along with nested With statements for the child objects.

If you wish, you can further reduce the number of object references in the previous code example by using nested With statements for the Font object in the Range object and for the Borders(1) object, like this:

    With ActiveDocument.Paragraphs(1)
        With .Range.Font
            .Bold = True
            .Name = "Times New Roman"
        End With
        .LineSpacingRule = wdLineSpaceSingle
        With .Borders(1)
            .LineStyle = wdLineStyleDouble
            .ColorIndex = wdBlue
        End With
    End With

### Don't Use With Statements Pointlessly

With statements are great for reducing repetitive object references and making your code easier to read, but don't use them just because you can. If you have only one statement within a With statement, as in the following example (which again uses Word), you're probably wasting your time typing the extra code to set up the With structure:

    With ActiveDocument.Sections(1).Headers(wdHeaderFooterPrimary) _
        .Range.Words(1)
        .Bold = True
    End With

Likewise, don't nest With statements unless you need to—it gets confusing, like this bizarre example:

    With ActiveDocument
        With .Sections(1)
            With .Headers(wdHeaderFooterPrimary)
                With .Range
                    With .Words(1)
                        With .Font
                            .Italic = True
                            .Bold = False
                            .Color = wdColorBlack
                        End With
                    End With
                End With
            End With
        End With
    End With

This code is better when written like this:

    With ActiveDocument.Sections(1).Headers(wdHeaderFooterPrimary).Range. _
        Words(1).Font
        .Italic = True
        .Bold = False
        .Color = wdColorBlack
    End With

### Optimizing Your Select Case Statements

When you use a Select Case statement, arrange the Case statements so that the most likely ones appear first. This saves VBA some work and time—VBA goes down through the list of Case statements until it finds a match, so the earlier in the list it scores a match, the quicker the execution of the statement.
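For example, if most of the documents your code processes are letters, test for letters first. Here's a minimal sketch of this ordering; the document types and the handling are invented for illustration:

    Sub ProcessDocument(ByVal strDocType As String)
        Select Case strDocType
            Case "Letter"     'the most common type, so test it first
                'process letters here
            Case "Invoice"    'less common
                'process invoices here
            Case Else         'rare or unexpected types
                'handle everything else here
        End Select
    End Sub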
### Don't Check Things Senselessly

If you need to implement a setting (especially a Boolean one) every time a particular procedure runs, there's no point in checking the current value.

For example, suppose you wanted to make sure the EnableAutoRecover property (a Boolean property that sets or returns whether the AutoRecover feature is on for the current workbook) of the ActiveWorkbook object in Excel is set to True. You could check the current value of EnableAutoRecover and, if it is False, set it to True like this:

    If ActiveWorkbook.EnableAutoRecover = False Then _
        ActiveWorkbook.EnableAutoRecover = True

But that wastes code. Instead, simply set the property to True:

    ActiveWorkbook.EnableAutoRecover = True

### Removing Unused Elements from Your Code

To improve the efficiency of your code, try to remove all unused elements from it. When creating a complex project with many interrelated procedures, it's easy to end up with some procedures that are almost or entirely useless. Perhaps you were trying out various approaches and sketched in a couple of procedures that ended up never being used.

You'll find it easier to remove superfluous procedures if you've commented your code comprehensively while creating it, so you can be sure that what you're removing is unused rather than used. If you're in doubt as to which procedure is calling which, display the Call Stack dialog box (see Figure 16.1); choose View ⇒ Call Stack or press Ctrl+L to see what's happening. Recall that the Call Stack dialog box is available in Break mode (while you're single-stepping through a procedure, or the Editor has halted execution at a breakpoint, and so on). If one procedure has called another one during execution, they will both be listed.

Figure 16.1 The Call Stack dialog box lets you see which procedure has called which.

Figure 16.1 reveals that the procedure named Identify_Current_User called the procedure named ToggleItal, that ToggleItal then called GetClipboardText, and that GetClipboardText, in turn, called DocumentOpen. Execution is currently halted (is in Break mode) within the DocumentOpen procedure.

Alternatively, try one of these techniques:

 * Set a breakpoint at the beginning of a suspect procedure so that you'll be alerted when it's called.
 * Display message boxes at decisive junctures in your code so you can see what's happening: Is the procedure ever called?
 * Use a Debug.Print statement at an appropriate point (again, perhaps the beginning of a procedure) to temporarily log information in the Immediate window. (A short sketch of this technique appears at the end of this section.)

Before you remove an apparently dead procedure from your code, make sure not only that it's unused in the way the code is currently being run, but also that it wouldn't be used if the code were run under different circumstances. If you think that the procedure might still be used, try moving it to a backup project from which you can easily restore it rather than deleting it altogether.

Once you've removed any unused procedures, examine the variables in the procedures. Even if you're using the Option Explicit declaration and declaring every variable explicitly, check that you haven't declared variables that end up not being used. For simple projects, you'll be able to catch the unused variables by using the Locals window to see which of them never get assigned a value. For more complex projects, you may want to try some of the available third-party tools that help you remove unneeded elements from your code.

If in doubt, just use the Editor's Find feature (Ctrl+F) to see whether the variable name appears only once: where the variable is declared.

Removing unused procedures and variables isn't crucial. They do no real harm; they're just debris. But they do clutter up your code, potentially making it harder to understand and modify if you come back to it later for maintenance or reuse.
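Here is a minimal sketch of the Debug.Print technique mentioned in the list above (the procedure name is invented). Each time the procedure actually runs, a line appears in the Immediate window, so you can tell at a glance whether it is ever called:

    Sub SuspectProcedure()
        Debug.Print "SuspectProcedure was called at " & Now
        'rest of the procedure's statements here
    End Sub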
* * *

Back Up Your Modules, Forms, and Class Modules

Before removing an entire module, use the File ⇒ Export File command to export a copy of the module to a .BAS file in a safe storage location in case the module contains anything you'll subsequently discover to be of value. Similarly, export your user forms to .FRM files and your classes to .CLS files.

* * *

## Making Visual Improvements to Your Code

Another way to improve your code is to format it so it's as easy as possible to read, maintain, and modify.

### Indenting the Different Levels of Code

As you've seen in the examples so far in this book, you can make code much easier to follow by indenting some lines of code with tabs or spaces to show their logical relation to each other or to visually illustrate subordination and structures such as loops.

You can click the Indent and Outdent buttons on the Editor's Edit toolbar or press Tab and Shift+Tab to quickly indent or unindent a selected block of code, with the relative indentation of the lines within the block remaining the same.

* * *

Labels Can't Be Indented, But That's a Good Thing

You can't indent a label—a word ending with a colon (:) and used as the target of a GoTo statement. If you try to indent a label, the VBA Editor won't let you. The Editor removes all spaces to the left of the label as soon as you press Enter or otherwise move the insertion point off the line containing the label. A label is a target and _should_ be on the far left of its code line so you can easily see it.

* * *

### Using Line-Continuation Characters to Break Long Lines

Use the line-continuation character (a space followed by an underscore) to break long lines of code into two or more shorter lines. Breaking lines makes long statements fit within the Code window on an average-size monitor at a readable point size and enables you to break the code into more logical segments.

### Using the Concatenation Character to Break Long Strings

You can't use the line-continuation character to break strings, however. If you want to break a long string, you must divide it into shorter strings and then use the concatenation character (&) to join the parts again. You can then place the parts on separate lines by following each & character with the line-continuation character. For example, consider a long string such as this:

    strMessageText = "The macro has finished running. Please check your presentation to ensure that all blank slides have been removed."

Instead, you could divide the string into shorter pieces and then rejoin them like this:

    strMessageText = "The macro has finished running. " & **_**
        "Please check your presentation to ensure that " & **_**
        "all blank slides have been removed."

* * *

**For Legacy Reasons, You Can Employ the + Character for Concatenation**

Alternatively, you can use the addition character (+) to concatenate one string with another, but not to concatenate a string and a numeric variable (do that, and VBA tries to _add_ them mathematically instead of concatenating them). However, your code is easier to read if you just stick with the concatenation character & when concatenating strings. Leave the + character for math.

* * *
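A quick sketch of the pitfall described in the sidebar (the values are invented for illustration):

    Sub ConcatVersusAdd()
        MsgBox "5" & 5     'displays 55: the values are concatenated
        MsgBox "5" + 5     'displays 10: VBA adds the values mathematically
        'MsgBox "five" + 5 'would raise runtime error 13, "Type mismatch"
    End Sub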
### Using Blank Lines to Break Up Your Code

To make your code more readable, use blank lines to separate statements into logical groups. For example, you might segregate all the variable declarations in a procedure as shown in the following example so that they stand out more clearly:

    Sub Create_Rejection_Letter()

        Dim strApplicantFirst As String, strApplicantInitial As String, _
            strApplicantLast As String, strApplicantTitle As String
        Dim strJobTitle As String
        Dim dteDateApplied As Date, dteDateInterviewed As Date
        Dim blnExperience As Boolean

        strApplicantFirst = "Shirley"
        strApplicantInitial = "P"
        strApplicantLast = "McKorley"

### Using Variables to Simplify Complex Syntax

You can use variables to simplify and shorten complex syntax. For example, you could display a message box by using an awkwardly long statement such as this one:

    If MsgBox("The document contains no text." & vbCr & vbCr _
        & "Click the Yes button to continue formatting the document." & _
        " Click the No button to cancel the procedure.", _
        vbYesNo + vbQuestion, _
        "Error Selecting Document: Cancel Procedure?") = vbYes Then

Alternatively, you could use one String variable for building the message and another String variable for the title:

    Dim strMsg As String
    Dim strTBar As String
    strMsg = "The document contains no text." & vbCr & vbCr
    strMsg = _
        strMsg & "Click the Yes button to continue formatting the document. "
    strMsg = strMsg & "Click the No button to cancel the procedure."
    strTBar = "Error Selecting Document: Cancel Procedure?"
    If MsgBox(strMsg, vbYesNo + vbQuestion, strTBar) = vbYes Then

At first sight, this code looks more complex than the straightforward message-box statement, mostly because of the explicit variable declarations that increase the length of the code segment. But in the long run, this approach is much easier to read and modify.

In the previous example, you could also replace the vbYesNo + vbQuestion part of the MsgBox statement with a variable (preferably a Long rather than a Variant). But doing so makes the code harder to read and is seldom worthwhile.

### Passing Information from One Procedure to Another Using Arguments

Often when you call another procedure, you'll need to pass information to it from the calling procedure. And you sometimes go the other way: When the called procedure has finished executing, it needs to pass information back to the caller.

The best way to pass information from a caller procedure to a called procedure is by using arguments. You declare the arguments a procedure takes in its declaration line; the arguments appear in the parentheses after the procedure's name. A procedure can take either a single argument (as the first of the following declarations does) or multiple arguments separated by commas (as the second does):

    Sub PassOneArgument(MyArg)
    Sub PassTwoArguments(FirstArg, SecondArg)

As with functions (discussed in Chapter 9), you can pass an argument either _by reference_ or _by value_. When a procedure passes an argument to another procedure by reference, the recipient procedure gets access to the memory location where the original variable is stored and can change the original variable. By contrast, when a procedure passes an argument to another procedure by value, the recipient procedure gets only a copy of the information in the variable and can't change the information in the original variable.

Passing an argument by reference is useful when you want to manipulate the variable in the recipient procedure and then return the variable to the procedure from which it originated. Passing an argument by value is useful when you want to use the information stored in the variable in the recipient procedure and at the same time make sure the original information in the variable doesn't change.
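The difference is easy to see in a small sketch (the procedure and variable names here are invented for illustration). The change to the ByRef parameter survives the call; the change to the ByVal parameter doesn't:

    Sub ShowTheDifference()
        Dim lngFirst As Long, lngSecond As Long
        lngFirst = 1
        lngSecond = 1
        Tweak lngFirst, lngSecond
        MsgBox lngFirst & " " & lngSecond    'displays "2 1"
    End Sub

    Sub Tweak(ByRef lngByRef As Long, ByVal lngByVal As Long)
        lngByRef = lngByRef + 1    'changes the caller's variable
        lngByVal = lngByVal + 1    'changes only a local copy
    End Sub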
By reference is the default way to pass an argument, but you can also use the ByRef keyword to state explicitly that you want to pass an argument by reference. Both of the following statements pass the argument MyArg by reference:

    Sub PassByReference(MyArg)
    Sub PassByReference(ByRef MyArg)

To pass an argument by value, you must use the ByVal keyword. The following statement passes the ValArg argument by value:

    Sub PassByValue(ByVal ValArg)

In practice, however, you'll rarely, if ever, need to employ ByVal. Arguments are nearly universally passed by reference, the default.

If necessary, you can pass some arguments for a procedure by reference and others by value. The following statement passes the MyArg argument by reference and the ValArg argument by value:

    Sub PassBoth(ByRef MyArg, ByVal ValArg)

You can explicitly declare the data type of arguments you pass in order to take up less memory and ensure that your procedures are passing the type of information you intend them to. But when passing an argument by reference, you need to make sure that the data type of the argument you're passing matches the data type expected by the called procedure. For example, if you declare a String in the caller procedure and try to pass it as an argument when the called procedure is expecting a Variant, VBA reports a "ByRef argument type mismatch" error.

To declare the data type of an argument, include a data-type declaration in the argument list. The following statement declares MyArg as a String and ValArg as a Variant:

    Sub PassBoth(MyArg As String, ValArg As Variant)

You can specify an optional argument by using the Optional keyword. Place the Optional keyword before the ByRef or ByVal keyword if you need to use ByRef or ByVal:

    Sub PassBoth(ByRef MyArg As String, ByVal ValArg As Variant, _
        Optional ByVal MyOptArg As Variant)
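If you declare an optional argument as a Variant, the called procedure can use the IsMissing function to find out whether the caller actually supplied it. Here's a minimal sketch (the procedure names are invented); note that IsMissing works only with Variant arguments:

    Sub TestGreeting()
        ShowGreeting "Hello"                'displays "Hello"
        ShowGreeting "Hello", "Dr. Smith"   'displays "Hello, Dr. Smith"
    End Sub

    Sub ShowGreeting(ByVal strGreeting As String, Optional varName As Variant)
        If IsMissing(varName) Then
            MsgBox strGreeting
        Else
            MsgBox strGreeting & ", " & varName
        End If
    End Sub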
Listing 16.1 shows a segment of a procedure that uses arguments to pass information from one procedure to another.

**Listing 16.1**: Passing arguments from one procedure to another

     1. Sub GetCustomerInfo()
     2.     Dim strCustName As String, strCustCity As String, _
                strCustPhone As String
     3.     'Get strCustName, strCustCity, strCustPhone from a database
     4.     CreateCustomer strCustName, strCustCity, strCustPhone
     5. End Sub
     6.
     7. Sub CreateCustomer(ByRef strCName As String, _
            ByRef strCCity As String, ByVal strCPhone As String)
     8.     Dim strCustomer As String
     9.     strCustomer = strCName & vbTab & strCCity _
                & vbTab & strCPhone
    10.     'take action with strCustomer string here
    11. End Sub

Listing 16.1 contains two minimalist procedures—GetCustomerInfo and CreateCustomer—that show how to use arguments to pass information between procedures:

 * The first procedure, GetCustomerInfo, explicitly declares three String variables in line 2: strCustName, strCustCity, and strCustPhone.
 * Line 3 contains a comment indicating that you would write additional code here to obtain the data and assign information to the variables.
 * Line 4 calls the CreateCustomer procedure and passes to it the variables strCustName, strCustCity, and strCustPhone as arguments. Because this statement doesn't use the Call keyword, the arguments aren't enclosed in parentheses.
 * Execution then switches to line 7, which starts the CreateCustomer procedure by declaring the three String arguments it uses: strCName and strCCity are to be passed by reference, and strCPhone is to be passed by value.
 * Line 8 declares the String variable strCustomer. Line 9 then assigns to strCustomer the information in strCName, a tab, the information in strCCity, another tab, and the information in strCPhone.
 * Line 10 contains a comment indicating where the procedure would take action with the strCustomer string (for example, dumping it into some kind of primitive database), and line 11 ends the procedure.

### Passing Information Back from a Called Procedure

Just a reminder: Functions, not subs, are used to pass information _back_ to a caller. Both functions and subs are procedures, but functions are specifically designed to send information back to a caller.

This code example calls a function that adds state tax to a purchase price, then passes back the resulting total cost:

     1. Sub FindTotalCost()
     2.
     3.     Dim OriginalCost, TotalCost ' declare two variant types
     4.     OriginalCost = 155 'this sweater is expensive
     5.
     6.     TotalCost = AddTax(OriginalCost) 'call the AddTax function
     7.     MsgBox TotalCost 'show the final cost including 7% tax
     8.
     9.
    10. End Sub
    11.
    12. Function AddTax(SubTotal)
    13.
    14.     AddTax = SubTotal * 1.07 'do the math and assign the result
    15.         'to the function name so it gets passed back
    16.
    17. End Function

Data is passed from the caller to the called in line 6. Data is passed back from the called to the caller by assigning a value to the name of the function in line 14.

### Passing Information from One Procedure to Another Using Private or Public Variables

Another way to pass information from one procedure to another is to use either private variables or public variables. You can use private variables if the procedures that need to share information are located in the same module. If the procedures are located in different modules, you'll need to use public variables to pass the information.

* * *

Avoid Using Global Variables to Pass Data

Using private or public variables to pass information from one procedure to another is widely considered poor programming practice. Doing so makes it harder to track the flow of information between procedures, especially when several procedures are involved. However, you may sometimes find this way of passing information helpful—or you may be required to work with someone else's code that uses this approach.

* * *

Listing 16.2 contains an example of passing information by using private variables.

**Listing 16.2**: Passing data using a private variable

     1. Private strPassMe As String
     2.
     3. Sub PassingInfo()
     4.     strPassMe = "Hello."
     5.     PassingInfoBack
     6.     MsgBox strPassMe
     7. End Sub
     8.
     9. Sub PassingInfoBack()
    10.     strPassMe = strPassMe & " How are you?"
    11. End Sub

Listing 16.2 begins by declaring the private String variable strPassMe at the beginning of the code sheet for the module. strPassMe is then available to all the procedures in the module.

 * The PassingInfo procedure (lines 3 to 7) assigns the text Hello. (with the period) to strPassMe in line 4 and then calls the PassingInfoBack procedure in line 5.
 * Execution then shifts to line 9, which starts the PassingInfoBack procedure.
 * Line 10 adds How are you? with a leading space to the strPassMe String variable.
 * Line 11 ends the PassingInfoBack procedure, at which point execution returns to the PassingInfo procedure at line 6, which displays a message box containing the strPassMe string (now _Hello. How are you?_).
 * Line 7 ends the procedure.
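If PassingInfo and PassingInfoBack lived in different modules, you could declare the shared variable Public instead of Private. A minimal sketch of the idea, assuming two ordinary code modules (the module names are invented for illustration):

    ' In the declarations section of Module1
    Public strPassMe As String

    ' In Module2: this procedure can read and change strPassMe
    Sub PassingInfoBack()
        strPassMe = strPassMe & " How are you?"
    End Sub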
# Creating and Using Classes

A _class_ is the formal definition of an object—typically, a custom object. By defining classes, you can build your own custom objects. A class is essentially a template for an object: Once you've defined the class in your code, VBA will then create objects based on it when the code executes.

The relationship between class and object is sometimes described as similar to a cookie cutter and a cookie or a blueprint and the houses based on that blueprint. The former is a description; the latter is the description brought to life.

Another way to think of the distinction between class and object is to recall the distinction between design time and runtime. You create a _class_ during design time by writing code that describes the _object_ (or multiple objects). The class will come into being during runtime when the class code executes.

The phrase _come into being_ is more formally expressed as follows: an object is _instantiated_. (An _instance_—the object—of the class comes into existence during runtime.) Got it?

## What Can You Do with Class Modules?

Programming means telling the computer how to process some information.

Information to be processed can be stored in various places. For example, you can store it in a database that your code accesses. Or you can type it into your code, such as storing the information _Donald_ in a string variable:

    MyString = "Donald"

The second half of information processing is executing code that manipulates the information (the data). You've been doing this throughout this book. Here we process some data by computing the length of a string:

    MsgBox Len(MyString)

One interesting thing about objects is that they can not only process information; they can also _contain_ it. They can hide their data (properties) or their processing (methods) from outside programming. This hiding is called _encapsulation_.

You can use objects to store information, to process information, and to make information selectively accessible (hide it or not, as the programmer specifies) to the various other objects in an application.

Consider what is to me the most successful application of object-oriented programming (or OOP): the controls you can put on a user form, such as the TextBoxes or Labels that we explored in Chapters 14 and 15—"Creating Simple Custom Dialog Boxes" and "Creating Complex Forms."

A Label is an object that has a set of _properties_, which can be visualized as both data and processing capabilities. When you assign the value 33, say, to a Label's Left property, the Label automatically moves to that location on the form.

_You did no programming to make this move happen_. You merely passed a desired position to the object, and the object's internal capability to move itself took over and carried out the necessary tasks to make this happen. This is encapsulation: The programming that moves a Label is hidden from the outside world. The object has its own capabilities. And it contains its own internal data as well (the kind of line that frames a Label, its default width, the color of its background, and so on).
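For instance, with a Label named lblStatus and a CommandButton named cmdMove on a user form (the control names here are invented), a single assignment is all your code needs:

    Private Sub cmdMove_Click()
        lblStatus.Left = 33    'the Label repositions itself; no further code needed
    End Sub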
When OOP is applied to more abstract concepts such as translating procedural programming (subs and functions) into OOP (objects), the results are mixed. Some programmers swear by OOP; others demur. OOP has become quite popular in many professional programming circles, but even after decades of implementation, OOP still causes controversy. For small jobs like simple macros, OOP is clearly overkill. For large projects, you might like the organizational and security features of OOP. And, if you intend to go into professional programming, you must understand how to use it.

## A Brief Overview of Classes

To create a class in VBA, you insert a _class module_ in a project (Insert ⇒ Class Module) and give the class the name by which you'll access it. You then use the Code window to create the code (constant and variable declarations, subroutines, and functions) that defines the properties and methods that the class will have. When you've finished, the class contains all the information that the custom object needs to perform its tasks and store data.

A major distinction between a class module and a regular code module is that you don't directly execute code in a class module. Instead, in a regular code module you declare an object variable of the class's type. You then use this variable to access the class's members (its properties and methods) in your regular code.

The concept of classes can be difficult to grasp, so the following sections present a simple example of a class that relates to something physical—the book you're holding. The example describes a class named Book that contains the salient information about a book. During runtime, after creating the Book object, the example's code adds this book's information to the Book object.

Entire books endeavor to explain OOP and its uses. But I'll give you a taste of it here. The following example class works in any VBA host application.

## Planning Your Class

Before you start creating a class, decide the following:

 * A class describes an object, so... what does this object _do_?
 * What information does the class need to contain for the object to do what it's supposed to do? You use variables and properties to store this information. You use _variables_ to store information used privately, internally inside the object, and _properties_ to make available pieces of that information that need to be accessed from outside the object. You can create both read-only and read/write properties.
 * What capabilities should this object have? Things a class can do, its behaviors, are called its _methods_. You create subroutines and functions to implement the class's methods—subroutines for the methods that return no value after doing their job and functions for the methods that do return a value after executing.

Objects based on our Book class will contain information about a book project. Note that I said _objects_, plural. A single class can create as many objects during runtime as the programmer wishes, just as a single cookie cutter can stamp out multiple cookies. Or a single blueprint can be used to build many townhouses.

If you're a librarian programmer, you might use the Book class to generate thousands of Book objects.

The class we'll construct will need properties for storing information such as the title, author, and price and will need a method that displays all this book information.

## Creating a Class Module

The first step in creating your class is to insert a class module into your project.
You create a class module in much the same way you create a regular module.

In the Project Explorer, right-click the target project or one of the items it contains and choose Insert ⇒ Class Module from the context menu. Alternatively, choose Insert ⇒ Class Module from the Editor's menu bar, or click the Insert button on the Standard toolbar and choose Class Module from the drop-down list.

The VBA Editor creates a new class module named Class _n_ (where _n_ is the next-higher consecutive number not yet employed to name a class module) and opens a Code window for it. If the project doesn't already contain a Class Modules folder, VBA adds one, and it appears in the Project Explorer.

If you have the Require Variable Declarations option selected (on the Editor page of the Tools ⇒ Options dialog box in the VBA Editor), the VBA Editor automatically places an Option Explicit statement in the declarations area at the top of the code sheet for the class, just as it does for an ordinary module.

If you don't have the Require Variable Declarations option selected, it's still a good idea to type in the Option Explicit statement anyway to force yourself to declare variables explicitly.

## Naming the Class

Now change the name of the class to something more descriptive than Class _n_. Press F4 to display the Properties window (if it's not already displayed) and enter the new name in the (Name) text box. Make the name descriptive, because you'll be using it in your code and you'll want its purpose to be easily grasped. We can name our example class Book. Press Enter or click elsewhere in the Visual Basic Editor window to make the change take effect.

## Setting the Instancing Property

The Instancing property determines whether a class module is visible (can be instantiated—brought into existence) from an outside project.

Recall that an outside project must first reference the project that the class module is in before any access to another project's objects is even possible. Referencing is accomplished by Tools ⇒ References in the Editor.

The default setting, 1 – Private, prevents other projects from seeing the class module and from working with instances (objects) of that class. In other words, the object is encapsulated, hidden.

The other setting is 2 – PublicNonCreatable, and it allows an outside project to see the class. The outside project, even with a reference, however, still can't create instances (create objects) from the class by itself. _The instantiation must take place in the project that hosts the class_.

So, for one project to access an object in another, three conditions must be met:

 * The Instancing property in the project containing the object must be set to PublicNonCreatable.
 * The project containing the object must have instantiated that object.
 * The outside project must have established a reference to the project containing the object.

To permit an outside project access to instances of a class (objects), set the Instancing property to 2 – PublicNonCreatable. Otherwise, leave the default setting of 1 – Private intact. With the default Private setting, only the project that has the class can access objects instantiated from that class.

## Declaring Variables and Constants for the Class

After setting the Instancing property, you should declare the variables and constants that the class will need for its internal operations.
These declarations work just like the declarations you've seen so far in the book, except that you'll probably want to use a naming convention to indicate that the variables and constants belong to a class rather than to a procedure. We'll use the prefix Book on the constants and variables to make it easy for the programmer to see that they're part of the Book class.

The Book class uses the declarations shown in the following code snippet to declare one constant (BookName) and five variables (BookTitle, BookAuthor, BookPages, BookPrice, and BookPublicationDate) of assorted data types:

    Const BookName = "Book Project"
    Dim BookTitle As String
    Dim BookAuthor As String
    Dim BookPages As Integer
    Dim BookPrice As Currency
    Dim BookPublicationDate As Date

## Adding Properties to the Class

Now add the properties to the class. Table 16.2 lists the properties that the Book class uses.

Table 16.2 Properties of the Book class

**Property** | **Description**
---|---
Title | A read/write String property that sets or returns the formal title of the book
Author | A read/write String property that sets or returns the author's name
Pages | A read/write Integer property that sets or returns the page count of the book
Price | A read/write Currency property that sets or returns the price of the book
PublicationDate | A read/write Date property that sets or returns the publication date of the book

You can create properties for a class in either of two ways. The first way is less formal than the second but provides you with less control over the properties.

### Creating a Property by Using a Public Variable

One way to create a property in your code is to declare a Public variable in the class module. Doing this creates a read/write property with the name of the variable. For example, the following statement (when typed into a class module) creates a read/write Boolean property named HardCover:

    Public HardCover As Boolean

Using a Public variable like this is a quick way to create a property, but it's a bit limited: It must be read/write. You can't choose to make the property read-only (or write-only). What's more, you can't execute any other code when the program's code sets or returns the value of the property.

After declaring a Public variable, your code can then set and return the property's value in the usual way. For example, say we've created the Boolean property HardCover in an instance named MastVBA of the Book class. The following statements set (store, write data in) the property and then display a message box returning (reading the value from) the property:

    MastVBA.HardCover = False
    MsgBox MastVBA.HardCover

Something special is illustrated here. The name of the _class_ is Book, but notice that the name of an object instantiated from this class is MastVBA. Objects—there can be many derived from a given class—each have their own individual name. Instantiated objects should not have the same name as the class from which they spring.

### Creating a Property by Using Property Procedures

The second, more formal and flexible way to create a property is to use property procedures. There are three types of property procedures—Property Let, Property Get, and Property Set:

 * A Property Let procedure assigns a value to a property. It _writes_.
 * A Property Get procedure returns the value from a property. It _reads_.
 * A Property Set procedure creates a reference to an object.
(This is similar to how you create an object variable in ordinary, non-object procedures.)

You typically use these procedures in pairs: a Property Let procedure paired with a Property Get procedure, which creates a read/write value property, or a Property Set procedure paired with a Property Get procedure for an object property. If you use a Property Get procedure on its own, that property will be read-only.

#### _Assigning a Value to a Property with a_ Property Let _Procedure_

To permit outside code to assign a value to an object's property, you use a Property Let procedure. The syntax for a Property Let procedure is as follows:

    Property Let _name_ ([ _arglist_ ,] _value_ )
        [ _statements_ ]
    End Property

These are the components of the syntax:

 * The Property keyword starts the procedure, and the End Property keywords end the procedure.
 * _name_ is a required argument specifying the name of the property procedure being created. If you also create a paired Property Get procedure for this property, use the same name as the Property Let procedure.
 * _arglist_ is a required argument listing the arguments that are passed to the procedure. An argument list is required here because a Let procedure is designed to assign a value or values to this property, so the outside code must provide at least one value. If _arglist_ contains multiple arguments, you separate them with commas.

For example, the following Property Let procedure creates the String property Title, taking the argument NewTitle and passing its value to the variable BookTitle:

    Property Let Title(NewTitle As String)
        BookTitle = NewTitle
    End Property

If you don't add a Property Get procedure for this Title data, the property named Title will be write-only. Write-only properties aren't widely useful, so the next step is to write code that reads the value in the property. Then it becomes a read/write property.

#### _Returning a Value from a Property with a_ Property Get _Procedure_

To return a value from a property, you use a Property Get procedure. The syntax for a Property Get procedure is as follows:

    Property Get _name_ [( _arglist_ )] [As _type_ ]
        [ _statements_ ]
    End Property

The components of the syntax are the same as for the Property Let procedure, except for two things:

 * First, Property Get adds the optional _type_ argument, which specifies the data type for the property.
 * Second, for Property Get, the _arglist_ argument is optional. You _can_ have arguments for Property Get procedures, but you won't usually need to. If you do use arguments, their names and data types must match those in the corresponding Property Let procedure.

For example, the following Property Get procedure creates the String property Title, assigning to it the contents of the BookTitle variable:

    Property Get Title() As String
        Title = BookTitle
    End Property

If this Property Get procedure existed alone (without being paired with a corresponding Property Let procedure), it would be a read-only property. Use Property Get alone if you don't want to allow outside code to modify this property in any way.

However, because we've paired it with the Property Let Title procedure shown in the previous section, you now have a read/write property.

#### _Assigning an Object to a Property with a_ Property Set _Procedure_

Instead of assigning a value to a property, you can assign an object to it. To do so, you use a Property Set procedure rather than a Property Let procedure.
The syntax for a Property Set procedure is as follows:

    Property Set _name_ ([ _arglist_ ,] _reference_ )
        [ _statements_ ]
    End Property

The components of the syntax are the same as for the Property Let procedure, except that Property Set uses the _reference_ argument rather than the _value_ argument. _reference_ is a required argument specifying the object to reference.

For example, the following Property Set procedure creates the object property Where that references a range (assuming the class declares a module-level bookRange object variable). Note the Set keyword, which is required when assigning an object reference:

    Property Set Where(rngR As Range)
        Set bookRange = rngR
    End Property

* * *

Both Set and Let Can Be Used with Object Variables

For an object variable, you can use both a Property Set procedure and a Property Let procedure, but in most cases it makes more sense to use only a Property Set procedure.

* * *

### The Properties for the Book Class

Listing 16.3 shows the full listing of properties for the Book class.

**Listing 16.3**: All the properties of the Book class

     1. Option Explicit
     2.
     3. Const BookName = "VBA Book Project"
     4. Dim BookTitle As String
     5. Dim BookAuthor As String
     6. Dim BookPages As Integer
     7. Dim BookPrice As Currency
     8. Dim BookPublicationDate As Date
     9.
    10. Public Property Let Title(strT As String)
    11.     BookTitle = strT
    12. End Property
    13.
    14. Public Property Get Title() As String
    15.     Title = BookTitle
    16. End Property
    17.
    18. Public Property Let Author(strA As String)
    19.     BookAuthor = strA
    20. End Property
    21.
    22. Public Property Get Author() As String
    23.     Author = BookAuthor
    24. End Property
    25.
    26. Public Property Let Pages(intPages As Integer)
    27.     BookPages = intPages
    28. End Property
    29.
    30. Public Property Get Pages() As Integer
    31.     Pages = BookPages
    32. End Property
    33.
    34. Public Property Let Price(curP As Currency)
    35.     BookPrice = curP
    36. End Property
    37.
    38. Public Property Get Price() As Currency
    39.     Price = BookPrice
    40. End Property
    41.
    42. Public Property Let PublicationDate(dtePD As Date)
    43.     BookPublicationDate = dtePD
    44. End Property
    45.
    46. Public Property Get PublicationDate() As Date
    47.     PublicationDate = BookPublicationDate
    48. End Property

In Listing 16.3, each property for the Book class is declared as Public so that it is publicly accessible.

The code illustrates how you should organize your paired procedures by putting each Property Let procedure next to the corresponding Property Get procedure: The Property Let Title procedure in lines 10 through 12 is matched by the Property Get Title procedure in lines 14 through 16, and so on for the Author, Pages, Price, and PublicationDate property procedures.

Pairing the procedures makes it easy to read the code to make sure each procedure that should have a counterpart does have one, and to make sure the arguments match.

## Adding Methods to a Class

Now that we've created properties as places to store data in our object, it's time to add functionality that will process that data. It's time to add the class's methods by adding subroutines and functions as necessary. As you'll see at the end of this chapter, the VBA Editor will display a list of the members—properties and methods—of an object you create, just as it does for objects built into VBA.

Subroutines and functions you create within a class are like the subroutines and functions you use in ordinary, non-object code modules.

Our example Book class uses only one method, ShowInfo, which displays a message box showing the properties of the book.
Listing 16.4 displays the ShowInfo procedure.

**Listing 16.4**: The ShowInfo method of the Book class

     1. Sub ShowInfo()
     2.     Dim strM As String
     3.     strM = "Title:" & vbTab & BookTitle & vbCr
     4.     strM = strM & "Author:" & vbTab & BookAuthor & vbCr
     5.     strM = strM & "Pages:" & vbTab & BookPages & vbCr
     6.     strM = strM & "Price:" & vbTab & "$" & BookPrice & vbCr
     7.     strM = strM & "Date:" & vbTab & Me.PublicationDate & vbCr
     8.     MsgBox strM, vbOKOnly + vbInformation, BookName _
     9.         & " Information"
    10. End Sub

The ShowInfo procedure builds a string containing the information from the class and then displays the string in a message box. Here's what happens:

 * Line 2 declares the String variable strM, which the procedure uses to store the information for the prompt argument in the message box.
 * Line 3 adds to strM the text Title:, a tab, the contents of the BookTitle variable (which contains the title of the book in the object), and a carriage return.
 * Line 4 builds on strM, adding the author information. Likewise, line 5 adds the information on the page count, and line 6 adds the price information (including a dollar sign for completeness).
 * Line 7 also builds on strM, adding the date information. However, instead of using the class's internal variable (BookPublicationDate) to return the stored date, it calls the PublicationDate property of the object (which is identified by the Me keyword). This is by way of an example—returning BookPublicationDate works fine too. But there is a difference in how the information is retrieved: instead of simply reading the variable, VBA runs the Property Get PublicationDate procedure to return the information.
 * Lines 8 and 9 display an OK-style message box containing the string strM. The message-box title is set to BookName (the constant that contains the text VBA Book Project) followed by the word Information, and the message box uses an Information icon.

## Using Your Class

Recall that you can't execute class code directly. You can't put your insertion point inside the ShowInfo procedure and press F5 to run the code or F8 to step through the code.

A class is a description of an object not yet in existence. Again, think blueprint for a house.

So, before you can execute or test a class, you must create an instance of the class. You can't test the plumbing in a house by just looking at the blueprints before the house has been built. In other words, you must create an object based on the class template, then test the object.

To instantiate an object, you write code in an ordinary, non-object code module (like the modules we've been using throughout this book so far, such as the Module1 or NewMacros module).

To use the class you created, you create a new instance of the object by using the New keyword. The New keyword can be employed in either a Dim statement or a Set statement. For example, the following statement creates a new object variable based on the Book class:

    Dim myBook As New Book

The following statements declare an Object variable named bookAnotherBook and then assign to it a new instance of the Book object:

    Dim bookAnotherBook As Object
    Set bookAnotherBook = New Book

You can then access the properties and methods of the Book object as you would any other VBA object's properties and methods (note the syntax: objectVariableName.Property).
For example, the following statement sets the Price property of the bookAnotherBook object:

    bookAnotherBook.Price = 54.99

Listing 16.5 contains a short procedure called Class_Test that shows the Book class in action. Type this procedure into an ordinary code module (not a class module). And be sure the module you type this into is in the _same project_ as the Book class module you created earlier.

**Listing 16.5**: Testing the Book class

     1. Sub Class_Test()
     2.
     3.     Dim myBook As New Book
     4.
     5.     myBook.Title = "Mastering VBA for Microsoft Office 2013"
     6.     myBook.Price = 49.99
     7.     myBook.Author = "Richard Mansfield"
     8.     myBook.Pages = 880
     9.     myBook.PublicationDate = #8/17/2013#
    10.
    11.     myBook.ShowInfo
    12.
    13. End Sub

The listing shows an example of how to use a class in your programming. Here's what happens:

 * Line 1 begins the Class_Test procedure, and line 13 ends it.
 * Line 2 is a spacer. Line 3 declares a new object variable named myBook of the Book class. Line 4 is another spacer.
 * Lines 5 through 9 set the five properties of the myBook object—Title, Price, Author, Pages, and PublicationDate—as you'd set the properties for any other object. Note that the object name (the object variable name) is separated by a period from the properties and methods of that object.
 * Line 10 is a spacer. Line 11 invokes the ShowInfo method of the myBook object—again, as you'd invoke a method for any other object.

You can now test your object by clicking inside this procedure to put the blinking insertion cursor there, then pressing F5 (run) or F8 (single-stepping). Try single-stepping to see how the instantiation takes place and how the inner workings of the object add information to the properties and carry out the ShowInfo method.

Here's another quick experiment. Notice that the VBA Editor's Auto List Members feature works with objects you create, as well as objects built into VBA itself, such as Excel's Workbooks object. Remember that if in Excel's VBA Editor Code window you type Workbooks. (you must type the period), a list drops down showing you all the members—the properties and methods—of the Workbooks object. To then add one of these members to your code, just click it or use the down-arrow key to select it, then press Enter.

Similarly, when you are programming with the myBook object, typing **myBook** followed by a period drops that object's members list down, as shown in Figure 16.2.

Figure 16.2 VBA's helpful Auto List Members feature shows the properties and methods of your objects.

# The Bottom Line

**Arrange your code in modules.**

Rather than use a single lengthy, complex procedure that accomplishes many tasks at once, programmers usually subdivide their code into smaller, self-contained procedures, each dedicated to a single, discrete task.

Master It

Shorter, self-contained, single-task procedures offer the programmer several advantages. Name three.

**Call a procedure.**

You execute a procedure by calling it from within your programming code.

Master It

How do you call a procedure?

**Pass information from one procedure to another.**

Sometimes a procedure requires that you pass it some information. For example, a procedure that searches text and makes some style changes to it will require that you pass the text you want modified.

Sometimes a procedure passes back information to the procedure that called it.
For example, it might pass back a message describing whether the actions taken in the procedure were (or were not) accomplished successfully.

Master It

What kind of procedure can pass back information to the caller?

**Understand what classes are and what they're for.**

Contemporary computer programs employ classes for various reasons—to help organize large programs, to make code more easily reusable, to provide certain kinds of security, or as a superior substitute for public variables. But beginners sometimes have a hard time wrapping their minds around the concept, particularly the relationship between classes and objects.

Master It

What is the difference between a class and an object?

Choose the correct answer (only one answer is correct):

**1.** A class is like a cookie and an object is like a cookie cutter.

**2.** A class is like a programmer and an object is like a module.

**3.** A class is like a blueprint and an object is like a house built from that blueprint.

**Create a class.**

The VBA Editor employs a special kind of module for containing classes.

Master It

How do you create a class module in the VBA Editor?

Chapter 17

Debugging Your Code and Handling Errors

In this chapter, you'll learn some of the things that can go wrong in your VBA code and what you can do about them. You'll examine the types of errors that can occur, from simple typos to infinite loops to errors that occur only once in a while (intermittent bugs are usually the hardest to locate).

The chapter starts by explaining the basics of debugging. Then you'll work with the tools that VBA offers for debugging VBA code and use them to get the bugs out of some examples. The end of the chapter discusses the various ways to have your program itself respond to errors that happen during runtime.

In this chapter you will learn to do the following:

 * Understand the basic principles of debugging
 * Recognize the four different types of errors you'll create
 * Employ VBA's debugging tools
 * Deal with runtime errors

# Principles of Debugging

A _bug_ is an error in hardware or software that causes a program to execute other than as intended. _Debugging_ means removing the bugs from hardware or software.

* * *

Where Did the Term _Bug_ Come From?

There are various explanations of the etymology of the word _bug_ as used in computer programming, ranging from apocryphal stories of moths being found in the circuit boards of malfunctioning computers to musings that the word came from the mythological _bugbear_, an unwelcome beast. But in fact, the term _bug_ has been used to mean something troublesome for centuries. For more information, see the "bug" entry in the _Free On-line Dictionary of Computing_.

* * *

Your goal when debugging should be to remove all bugs from your code. Your order of business will probably go something like this:

1. First, test your code to see whether it works as it should. Put it through its paces. Test it by running the procedure once or twice using suitable files or other appropriate data. Try all the options the macro makes available to the user. Even if it seems to work, continue testing for a reasonable period with various data from various sample documents before unleashing the procedure on the world (or your colleagues).

2. If your code doesn't work as you expected it to, you'll need to debug it.
That means following the techniques described in this chapter to locate the bugs and then remove them. Once you've removed all the bugs that you can find, retest the code as described in the first step. This is important, because sometimes the act of debugging itself introduces new bugs.

3. When testing your code, try to anticipate unusual, perhaps exotic ways that users might employ your code. For example, you might write a sophisticated procedure for manipulating a Word document on the (perfectly reasonable) assumption that the document will be open when the user starts the procedure running. You can test it on sample documents until you're blue in the face and it'll work fine every time. But if a user tries to run the procedure without first opening a document, it crashes.

And don't make fun of this user. It might seem sensible to users that the procedure _should_ be launched before a file is loaded. Users might expect the procedure to display an input box asking them which document they want to manipulate. And more important, users also expect that you will anticipate and handle unexpected errors without crashing your programming. There are ways to _trap_ unanticipated user behavior or other runtime errors and respond to them gracefully. What does your program do if the user attempts to save a file to a disk that's full, for example? Just crash and thereby lose all the information they've spent time typing in?

4. When you're ready to distribute your procedure, you may want to write instructions for its use. In these instructions, you may also need to document any bugs that you can't squash or circumstances under which the procedure shouldn't be run. But it's better to build instructions, responses to unanticipated problems, and other kinds of _error trapping_ into the macro itself. Try to make your code bulletproof.

Debugging a procedure tends to be idiosyncratic. There's no magic wand that you can wave over your code to banish bugs (although the VBA Editor does its best to help you eliminate certain types of errors from your code as you create it). Moreover, such simple things as forgetting to initialize a variable can wreak havoc on your code.

You'll probably develop your own approach to debugging, partly because your programming will inevitably be written in your own style. But when debugging, it helps to focus on understanding what the code is supposed to do. You then correlate this with your observations of what the code actually does. When you reconcile the two, you'll probably have worked out how to debug the procedure.

Also, the longer and more complex your code, the higher the probability that it will contain bugs. Certain kinds of bugs occur because of interactions among the parts of a project. And obviously the larger the project, the more parts with potential side effects. So keep your code as simple as possible by breaking it into separate procedures and modules, as discussed in Chapter 16, "Building Modular Code and Using Classes." Small code sections, with distinct, small tasks to accomplish, are almost always easier to debug than large lumps of code that try to do several things all at once. Remember that most debugging is a matter of locating _where_ in your code the problem occurs. If you're testing a small module of code with a very easily specified objective, locating a bug is that much easier.
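To make this concrete, here's a minimal sketch of the modular approach (the procedure and variable names are hypothetical, not from any listing in this book). Each routine does one small job, so you can test each in isolation—from the Immediate window, for instance—before wiring them together:

    Sub Process_Report()
        Dim strRaw As String
        Dim strClean As String
        strRaw = "  draft   report  "
        'delegate the single task of tidying whitespace to a helper
        strClean = Normalize_Spaces(strRaw)
        MsgBox "[" & strClean & "]"
    End Sub

    Function Normalize_Spaces(ByVal strText As String) As String
        'single task: collapse runs of spaces, then trim the ends
        Do While InStr(strText, "  ") > 0
            strText = Replace(strText, "  ", " ")
        Loop
        Normalize_Spaces = Trim(strText)
    End Function

If Normalize_Spaces misbehaves, you can single-step through it alone—or call it directly from the Immediate window with test data—without touching the rest of the program.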
# The Different Types of Errors

You'll encounter four basic kinds of errors in your programming:

 * Language errors
 * Compile errors
 * Runtime errors
 * Program logic errors

The following sections look at these kinds of errors in turn and discuss how to prevent them. After that, you'll examine the tools VBA provides for debugging.

## Language Errors

The first type of error is a _language error_ (also known as a _syntax error_). When you mistype a word in the Code window, omit a vital piece of punctuation (and in programming, all punctuation is vital), scramble a statement, or leave off the end of a construction, that's a language error. If you've worked your way through the book to this point, you've probably already made dozens of language errors as part of the learning process and through simple typos.

VBA helps you eliminate many language errors as you create them, as you'll see later in this chapter. Those language errors that the VBA Editor doesn't catch as you type them in usually show up as _compile errors_ during runtime testing, so the next section shows you examples of both language errors and compile errors.

## Compile Errors

_Compile errors_ occur when VBA can't compile a statement correctly—that is, when VBA can't turn a statement that you've entered into viable code.

For example, if your programming tells VBA to use a certain property for an object that doesn't have that property, a compile error results. Compilation is the act of turning your source code (the programming you type into the Editor) into the lower-level commands understandable by the computer. For example, when you press F5 to execute your program, VBA starts off by compiling your programming. If it finds a problem during compilation, it displays an error message.

The good news is that the VBA Editor detects many language errors and some compile errors as soon as you move the insertion point from the offending line. You don't even have to press F5 in many cases. For example, try typing the following statement in the Code window and pressing Enter to create a new line (or pressing ↑ or ↓ to move to another line, or clicking the mouse in another line in the macro):

    If X > Y

The VBA Editor displays the compile error "Expected: Then or GoTo" (see Figure 17.1) to tell you that the statement is missing a vital element: it should say If X > Y Then or If X > Y GoTo. (If you don't see the error message, there are two possibilities: Either you have turned off the Auto Syntax Check option [Tools ⇒ Options] or you didn't actually type it in by hand and press Enter.)

Figure 17.1 The VBA Editor helps debug your code by identifying many compile errors as it checks the statements you enter.

Every time you enter a line of code, the Editor examines that line for completeness and accuracy. In this example, VBA knows that when the code contains an If command, there must be a subsequent Then or GoTo command. And so the Editor rejects the line and informs you what the problem is.

This vigilance on the part of the VBA Editor prevents you from running into this type of error deep in the execution of your code.

* * *

**Decide for Yourself If You Like the Auto Syntax Check Feature**

This chapter assumes that you're keeping VBA's Auto Syntax Check feature and other features switched on. If you have Auto Syntax Check turned off (Tools ⇒ Options ⇒ Editor tab), you won't see the error message displayed in Figure 17.1.
Instead, the only warning you get about that incomplete line of code is that the VBA Editor turns the line red. Code turned red is the Editor's way of telling you that it's choking on your inadequate programming. You can either try to fix the error right then or keep on coding—putting off the debugging process until you've sketched in more code in the procedure.

Some developers choose to turn off Auto Syntax Checking because they don't want to be nagged as they type in their code—error-message interruptions about mere typos can interfere with their focus on the larger goals of the program they're writing. For other programmers, though, working without automatic, immediate syntax checking proves a cure worse than the disease.

Ultimately, whether you use the Auto Syntax Check feature is a matter of personal taste. For example, some people like to be told _right away_ if they make a spelling error in a Word document; others consider spelling errors rather tedious issues best left for later during an editing phase. They write, focusing on the main points they're trying to make, then at some later time they turn on the spell checker and fix any typos and punctuation blunders. You find a similar choice when you work at most any task. Consider woodworking: Should you hang each tool back on the wall in its appropriate place when you finish using it, or is it better to just let the saws and screwdrivers pile up around you, putting them away all at once after the coat rack is finished?

* * *

The VBA Editor notices blunders like the previous If X > Y problem easily enough, but you can also make language errors that the VBA Editor _cannot_ identify when you move the insertion point from the line in which the blunder resides. Instead, VBA identifies these errors as compile errors later when you press F5 and it compiles the code. For example, if you enter the following statement in the Code window when working with Word, the VBA Editor doesn't detect anything wrong. But when you run the procedure by pressing F5, VBA compiles the code, discovers the error, and objects to it (see Figure 17.2):

    ActiveDocument.SaveAs **FileMame** :="My File.docm"

Figure 17.2 Other errors appear only when you try to run the code.

This error is a straightforward typo—FileMame instead of FileName—but VBA won't see this particular kind of problem until it runs the code and fails to find any FileName property.

The VBA Editor sometimes indirectly helps you to notice errors of this kind while you're writing code. Say you're trying to enter a Documents.Close statement in Word and mistype Documents as Docments. In this case, the VBA Editor doesn't display the Properties/Methods list (Auto List Members) as it normally does if you have this feature turned on. You haven't entered a valid object, so VBA has no members list to display.

Not seeing the Properties/Methods list should alert you that something is wrong. If you continue anyway and enter the Docments.Close statement, the VBA Editor doesn't spot the mistake—it shows up as a "Run-time error 424: Object required" message (if you don't have Option Explicit on) when you try to run the procedure. (If you do have Option Explicit on, you get a "Variable not defined" compile error instead.)

The Editor gives you yet another clue that Docments.Close is an error. When you press Enter to leave this line of code, you see this:

    docments.Close

Does anything here look odd to you?
VBA will automatically capitalize valid object names. But docments is not capitalized.

Another kind of problem is caused if you specify a property or method for an object to which that property or method doesn't apply. In this situation, VBA displays a compile error. For example, say you forget that the proper method here is Add and you enter Documents.Create instead. VBA highlights the offending word and gives the compile error "Method or data member not found" (see Figure 17.3), which tells you there's no Create method for the Documents collection. This message is displayed only during runtime, not design time (design time means when you're typing in code lines).

Figure 17.3 The "Method or data member not found" error tells you that you've used a method or property that isn't available for the object in question.

## Runtime Errors

The third type of error is the _runtime error_, which occurs while code is executing. You will cause a runtime error if you write code that forces VBA to try to perform an impossible operation, such as opening a document that doesn't exist, closing a file when no file is open, or performing something mathematically impossible, such as dividing by zero.

The diction, punctuation, and syntax of your code are error-free, but you're asking VBA to do something that can't be done. An unhandled runtime error results in a crash that manifests itself as a Microsoft Visual Basic dialog box displaying a runtime error number, such as the one shown in Figure 17.4.

Figure 17.4 An unhandled runtime error causes VBA to display a message box such as this one.

As an example of an impossible operation, consider the archetypal division by zero. The following statements give a "Run-time error '11': Division by zero" message:

    Dim x As Integer
    x = 1 / 0

You're unlikely to enter anything as obviously wrong as this in your code (you're not _nuts_). A line of code like this will inevitably produce a division-by-zero error because the divisor is zero. But it's easy to enter a valid equation, such as MonthlyPay = Salary/Months, and forget to assign any value to Months (if a numeric variable is empty, it counts as a zero value) or to produce a zero value for Months by addition or some other math. Or the user can type zero into a dialog box, and your code later tries to use that entry as a divisor. And so on.

One way to check for runtime errors is to track the values of your variables by using the Watch window (discussed later in this chapter). To avoid possible user-input errors, have your code check users' input after they close a dialog box. You can, for example, display a message explaining that zero isn't an acceptable input for their age, then display the dialog box again, expecting valid input this time around.

## Program Logic Errors

The fourth type of error is the _program logic error_, which is characterized by valid code that nonetheless produces incorrect results. With program logic errors, the code is technically fine. VBA is able to compile and run it without noticing any errors—but you get a different result than you intended.

Program logic errors range in scope from the relatively obvious (such as performing manipulations on the wrong workbook in Excel because your code doesn't check which window is active) to the subtle (such as extending a range to the wrong character or cell). In the first example, the procedure is likely to run perfectly, but the resulting workbook will bear little resemblance to what you were trying to accomplish.
In the second example, you might get a result that is almost correct—or the error might cause you to get perfect results sometimes and slightly wrong results at other times.

Program logic errors tend to be the hardest errors to fix. To nail them down, you need to trace the execution of your code and pinpoint where things start to go wrong. To do that, you almost always need to employ the debugging tools discussed in the next section.

A friend of mine wrote a very nice program to format and print forms. But while he was testing it, he noticed that after working fine about five times, it suddenly sent only one-third of the form to the printer during a trial run. He couldn't get it to repeat this behavior. So he surrounded the code with a loop and let it run continuously (dumping the sample form repeatedly into a log file rather than wasting paper printing it over and over). He discovered that the error occurred only once every 256 times the program ran. He never did locate the bug, but when he gave the program to other people, he just told them that it worked "almost always."

* * *

When Errors Aren't Your Fault

There are two other types of errors that you may run into—even though perhaps you shouldn't. The first type is where Microsoft has documented a VBA item differently than it actually works. This shouldn't happen, but because of the complexity of VBA, it does. If you find that your code absolutely won't work even though it follows the Microsoft documentation to the letter, consider the possibility that the documentation may be incorrect. Search the Web using the VBA keywords involved to find out whether others have encountered this problem and to learn how they've worked around it. The second type of error, a distant relation of the first type, is where one version of VBA behaves differently than another version. For example, you might create a procedure that works perfectly in Word 2010, but you have to change it to make it work with Word 2013. In an ideal world, this shouldn't happen—but as you know, this world is far from ideal. These two errors are blessedly quite rare. For one thing, VBA has been extensively used for decades, so it's a very mature language with few surprises.

* * *

# VBA's Debugging Tools

VBA provides a solid assortment of debugging tools to help you remove the bugs from your procedures. The main windows you'll employ for debugging are the Immediate window, the Locals window, and the Watch window. You can access these tools in various ways, one of which is by using the Debug toolbar (shown in Figure 17.5). Four of the buttons—Design Mode, Run Sub/UserForm, Break, and Reset—are shared with the Standard toolbar. You'll learn about most of the others later in this chapter.

Figure 17.5 The Debug toolbar provides 13 commands for debugging your procedures.

* * *

Heisenbugs, Bohr Bugs, and Other Uncatchable Critters

The more complex and lengthy your code, the more likely you are to create bugs that are exceptionally difficult to catch. Usually, with determination and ingenuity, you can track down even the tougher bugs located in a single procedure. But bugs that depend on several unforeseen and improbable circumstances occurring simultaneously can be tough to isolate.

For example, an error that occurs in a procedure when the user makes a certain choice in a dialog box is relatively easy to catch. But if the error occurs only when the user has made two particular choices in the dialog box, it's much harder to locate.
And if the error is contingent on a particular combination of three choices the user has made in the dialog box, or if it depends on an element in the particular file from which the procedure is getting its data, you'll likely have a much harder time pinpointing it. + +Programmer folklore defines various kinds of rare bugs by assigning them names derived from such disciplines as philosophy and quantum physics. For instance, a _heisenbug_ is defined as "a bug that disappears or alters its behavior when one attempts to probe or isolate it." Heisenbugs are frustrating, as are Bohr bugs and mandelbugs (search online for details if you're curious). But the worst kind of bug is the _schroedinbug_ , which is a design or implementation bug that remains quiescent until someone reads the code and notices that it shouldn't work, whereupon it stops working until the code is made logically consistent. + +These bugs are, of course, ridiculous—until you start to discover bit rot at work on your code and have to explain the problem to your superiors. + +* * * + +## Break Mode + +Break mode is a vital tool for debugging your procedures because it lets you watch your code execute step by step—line by line—in the Code window (by repeatedly pressing F8). This technique is called _single-stepping_. + +For example, if an If...Then...ElseIf...Else statement appears to be executing incorrectly, you can step through it in Break mode and watch exactly which statements are executing, and which are being skipped, to produce the bad result. + +These are the easiest ways to enter Break mode: + + * Click to place the blinking insertion cursor in the procedure you want to run in the Code window and press the F8 key (or click the Step Into button on the Debug toolbar, or choose Debug ⇒ Step Into) to start stepping through it. Repeatedly press F8 to step down through the code. + * Set one or more breakpoints in the procedure to cause VBA to halt execution and enter Break mode when it reaches one of the marked lines. A breakpoint allows you to stop execution at a particular point in your code. The easiest way to set a breakpoint is to click beside the line where you want to stop. You click in the gray margin-indicator bar to the left of the Code window. (You could also right-click in the line of code and choose Toggle ⇒ Breakpoint from the context menu.) You can set any number of breakpoints. They're especially useful when you need to track down a bug that you suspect is located in a particular procedure because a breakpoint allows you to run the parts of a procedure that have no problems at full speed and then stop the procedure where you think there might be problems. From there, you can step through the suspicious statements and watch closely how they execute. + +You can also enter Break mode in a couple of other ways: + + * Interrupt your code by pressing Ctrl+Break and then click the Debug button in the resulting dialog box (see Figure 17.6). Normally, the only reason to enter Break mode this way is if your code gets stuck in an endless loop (which you'll typically recognize when the code appears to be doing nothing for a long time or repeating itself when you think it shouldn't be). VBA highlights the statement that was executing when you pressed Ctrl+Break, but (depending on your timing) it's unlikely to be the statement that's causing the problem in your code—it'll just be one of the statements in the offending loop. You'll then need to step through the loop to identify the aberrant statement. 
Figure 17.6 You can enter Break mode by pressing Ctrl+Break and then clicking the Debug button in this dialog box.

 * Click the Debug button in a runtime-error dialog box such as the one shown in Figure 17.7. In the Code window, VBA highlights the statement that caused the error. (You can also click the Help button in the runtime-error dialog box to get an explanation of the error before clicking the Debug button.)

Figure 17.7 Entering Break mode from a runtime error dialog box like this one takes you straight to the offending statement in your code. The problem code will be highlighted in yellow.

* * *

Access's _SingleStep_ Method

In addition to hosting a full version of VBA, Access includes a unique, legacy macro-design tool called the Macro Builder. This book doesn't spend much time with the Builder feature because Access's VBA offers much more capability and flexibility than its Builder. However, if you want to experiment with the Macro Builder, in Access click the Ribbon's Create tab, then click the Macro icon on the far right. One interesting command (added to the Builder in Access 2007) is the SingleStep method of the DoCmd object. This operates somewhat like a breakpoint, dropping you into Break mode during execution and displaying Access's specialized Macro Single Step dialog box. You can insert DoCmd.SingleStep into a VBA macro as well. VBA recognizes it as a legitimate line of code. However, VBA just ignores this statement during runtime. Only macros created in the Access Builder will respond to this SingleStep method.

* * *

## The Step Over and Step Out Commands

In Chapter 3, "Editing Recorded Macros," you learned how to step through a procedure by repeatedly pressing the F8 key to issue the Step Into command, going down the lines one at a time. (You can also issue this command by clicking the Step Into button on the Debug toolbar or choosing Debug ⇒ Step Into, but F8 is ever so much more efficient.)

Stepping into lets you see exactly what each statement in your code does, but you'll often find that you need to get past sections of code that you're sure are working fine so that you can step through a section that seems suspicious. This situation is particularly true of loop structures, which can have you going round and round—a real time-waster if you know the bug you're tracking down isn't within the loop.

Break mode offers three features to speed up stepping through your code: the Step Over command, the Step Out command, and the Run To Cursor command. The Step Over and Step Out commands aren't available until you enter Break mode (for example, by using the Step Into command).

The Step Over command (which you can trigger by pressing Shift+F8, clicking the Step Over button on the Debug toolbar, or choosing Debug ⇒ Step Over) executes the whole Sub or function called from the current procedure instead of stepping through the called procedure statement by statement as the Step Into command would do. (It "steps over" that procedure or function.) Use the Step Over command when you're debugging a procedure that calls another procedure or function that you know to be error-free and that you don't need to test step by step.

The Step Out command (which you can issue by pressing Ctrl+Shift+F8, clicking the Step Out button on the Debug toolbar, or choosing Debug ⇒ Step Out) runs the rest of the current procedure at full speed.
Use the Step Out command to quickly execute the rest of a procedure once you've gotten through the part that you needed to watch step by step. + +The Run To Cursor command (which you can issue by pressing Ctrl+F8 or choosing Debug ⇒ Run To Cursor) runs the code at full speed until it reaches the statement where the blinking cursor currently is in the Code window, whereupon it enters Break mode. Click to position the cursor in the appropriate statement before invoking this command. + +## The Locals Window + +The Locals window provides a quick readout of the values and types of all variables or expressions in the currently active procedure. It displays a collapsible tree view (see Figure 17.8). + +Figure 17.8 Use the Locals window to see at a glance all the expressions in the active procedure. + +An expression is a combination of keywords, operators, variables, and/or constants. Variables are one kind of expression; but more complex expressions involve more than a single variable: x > y, for example, is an expression stating that x is greater than y. This expression might be True or False, depending on what's happening during runtime. + +The Expression column displays the name of each expression, listed under the name of the procedure in which it appears. The Value column displays the current value of the expression (including Empty if the expression is empty, or Null or Nothing as appropriate). And the Type column displays the data type of the expression, with Variants listed as "Variant" along with their assigned data type (for example, "Variant/String" for a Variant assigned the String data type). + +To display the Locals window, click the Locals Window button on the Debug toolbar or choose View ⇒ Locals Window. To hide the Locals window, click its close button. + +From the Locals window, you can also click the button marked with an ellipsis (...) to display the Call Stack dialog box, discussed later in this chapter. This button is available only in Break mode. + +* * * + +How to Float and Dock Windows + +Remember that you can make panes (interior windows such as the Locals window) float by either dragging them or double-clicking their title bar. Restore them to their default docking location by double-clicking their title bar a second time. + +* * * + +## The Watch Window + +The Watch window (identified as Watches in Figure 17.9) is a separate window that you use to track the values of variables and expressions as your code executes. To display the Watch window, click the Watch Window button on the Debug toolbar or choose View ⇒ Watch Window in the VBA Editor. To hide the Watch window again, click its close button (clicking the Watch Window button or choosing View ⇒ Watch Window again doesn't hide it). + +Figure 17.9 Use the Watch window to track the values of variables and expressions in your code. + +The Watch window displays _watch expressions_ —expressions in your code that you specify ahead of time. You want to view a dynamic display of the values in these variables or expressions. + +Watch-expression information can help you to pinpoint where an unexpected value for a variable or an expression occurs as your code executes. The Watch window lists the names of the watched expressions or variables in the Expression column, their values in the Value column, their type (Integer, Byte, String, Long, and so on) in the Type column, and their context (the module and procedure in which they're operating) in the Context column. 
So to track the value of a given variable, you need only look at the Watch window at any given point while in Break mode.

If a variable or expression listed in the Watch window hasn't been initialized, the Watch window displays "< Out of Context >" in the Value column and "Empty" (for a variable other than a Variant) or "Variant/Empty" (for a Variant) in the Type column.

The VBA Editor updates all watch expressions in the Watch window whenever you enter Break mode and whenever you execute a statement in the Immediate window. So if you step through a procedure in the Code window by pressing the F8 key (which keeps you in Break mode), you can watch the value in a variable, or of an expression, as each statement executes. This is a great way to pinpoint where an error or an unexpected value occurs—and is much easier than moving the mouse over each variable or expression in question to check its value by using the Auto Data Tips feature.

Here's a typical debugging scenario. Let's say your code is producing a preposterous result, such as asserting that your annual salary is $2,200,000. As usual with most debugging, you're trying to figure out _where_ in your code this sudden and massive gain in income is being calculated. Observe the Watch window while single-stepping through your code to see in which line of code the variable MySalary goes from 50,000 to 2,200,000. Now you're right there close to where the bug is and you can examine the preceding lines of code very carefully to see what's impacting the MySalary variable.

Because watch expressions slow down execution of your code, the VBA Editor doesn't save them with the code—you need to redo them for each editing session. However, the Editor _does_ store watch expressions during the current editing session, so you can move from procedure to procedure without losing your watch expressions.

### Setting Watch Expressions

Sometimes referred to as _conditional breakpoints_, watch expressions give you considerable flexibility when debugging. You can ask the VBA Editor to halt execution on almost any kind of situation you can think up: break on any line that causes a variable to exceed a certain value, go below zero, change to a shorter string length, and so on. In other words, you specify a condition, an expression such as MySalary > 50000, and the VBA Editor automatically halts execution and displays the line where your salary increases beyond the expected 50,000. As you can imagine, the conditional breakpoint is one of the best tools a debugger has.

To set a watch expression, add it to the list in the Watch window by following these steps:

**1.** Select the variable or expression in your code, right-click it, and choose Add Watch from the context menu to display the Add Watch dialog box (see Figure 17.10). The variable or expression that you right-clicked appears in the Expression text box.

Figure 17.10 In the Add Watch dialog box, specify the watch expression you want to add.

You can also select the variable or expression you're interested in and choose Debug ⇒ Add Watch to display the Add Watch dialog box. If you choose Debug ⇒ Add Watch _without_ selecting the variable or expression, you must type it in the Expression text box, which is a waste of time.

**2.** If necessary, adjust the settings in the Context group box. The Procedure drop-down list is set to the current procedure, and the Module drop-down list is set to the current module.
+ +**3.** In the Watch Type group box, adjust the option-button setting if necessary: + + * The default setting—Watch Expression—adds the variable or expression in the Expression text box to the list in the Watch window. However, conditional breakpoints are more useful if you do more than merely observe the status of variables or expressions. The following two list items describe the true benefit of these breakpoints. + * Break When Value Is True causes VBA to enter Break mode whenever the value of the variable or expression changes to True. + * Break When Value Changes causes VBA to enter Break mode whenever the value of the watch expression changes. Use this setting when dealing either with a watch expression whose value you don't expect to change but that appears to be changing (such as MySalary in the previous example) or with a watch expression whose every change you need to observe. + +**4.** Click the OK button to add the watch expression to the Watch window. + +* * * + +Use These Two Important Conditional Break Techniques + +The Break When Value Is True option button allows you to run your code without stepping through each statement that doesn't change the value of the watch expression to True. This allows you to specify that Break mode should be entered, for example, when your variable exceeds a certain value (such as X > 10000) or equals another variable (such as x = y). Employing this kind of conditional break can be extremely helpful when tracking down elusive bugs. + +The Break When Value Changes option button allows you to run your code and stop at each location where the value changes in the code. + +* * * + +You can also drag a variable or an expression from the Code window to the Watch window; doing so sets a default watch expression in the current context. To set Break When Value Is True or Break When Value Changes, edit the watch expression after dragging it to the Watch window. + +### Editing Watch Expressions + +To edit a watch expression, right-click it in the Watch window and choose Edit Watch from the context menu, or select it in the Watch window and choose Debug ⇒ Edit Watch. Either action will display the Edit Watch dialog box with the watch expression selected in the Expression box, as shown in Figure 17.11. Change the context or watch type for the watch expression by using the settings in the Context group box and the Watch Type group box, and then click the OK button to apply your changes. + +Figure 17.11 You can edit your watch expressions in the Edit Watch dialog box. + +### Deleting Watch Expressions + +To delete a watch expression, right-click it in the Watch window and choose Delete Watch from the context menu. You can also delete the current watch expression by clicking the Delete button in the Edit Watch dialog box. + +### Using the Quick Watch Feature + +For those times when you don't need to create a watch expression for an expression or a variable, when you merely want to observe the value, you can use the Quick Watch feature, which displays the Quick Watch dialog box (see Figure 17.12) containing the context and value of the selected expression. + +Figure 17.12 Use the Quick Watch dialog box to get quick information on a variable or expression for which you don't want to set a watch expression in the Watch window. + +To use Quick Watch, while in Break mode select the expression or variable in the Code window and then click the Quick Watch button on the Debug toolbar, choose Debug ⇒ Quick Watch, or press Shift+F9. 
(If you're already working in the Quick Watch dialog box, you can click the Add button to add the expression to the Watch window.) + +## The Immediate Window + +One use for the Immediate window is as a virtual scratchpad. In the Immediate window you enter lines of code that you want to test quickly, without having to enter them in a procedure and then test the entire procedure. A second major use of the Immediate window is to display information to help you check the values of variables while a procedure is executing. + +In the first case, you type code into the Immediate window, then press Enter to see the results immediately (get it?). In the second case, you insert in your code Debug.Print statements that display information in the Immediate window, where you can easily view it. We'll explore both of these techniques in the following sections. + +To display the Immediate window, click the Immediate Window button on the Debug toolbar, choose View ⇒ Immediate Window, or press Ctrl+G. To hide the Immediate window again, click its close button. (Clicking the Immediate Window button, choosing View ⇒ Immediate Window, or pressing Ctrl+G when the Immediate window is displayed does not hide the Immediate window.) + +You can execute code in the Immediate window in both Break mode and Design mode. + +### What You Can't Do in the Immediate Window + +There are a number of restrictions on the code you can use in the Immediate window: + + * You can't use declarative statements (such as Dim, Private, Public, Option Explicit, Static, or Type) or control-flow statements (such as GoTo, Sub, or Function). These statements cause VBA to return an "Invalid in Immediate Pane" error. + * You can't use multiline statements (such as block If statements or block For... Next statements) because there's no logical connection between statements on different lines in the Immediate window: Each line is treated in isolation. + * You can't place breakpoints in the Immediate window. + +### Entering Code in the Immediate Window + +The Immediate window supports a number of standard Windows editing keystrokes and key combinations, such as Ctrl+X (Cut), Ctrl+C (Copy), Ctrl+V (Paste), Ctrl+Home (move the insertion point to the start of the window), Ctrl+End (move the insertion point to the end of the window), Delete (delete the current selection), and Shift+F10 (display the context menu). + +The Immediate window also supports the following VBA Editor keystrokes and key combinations: + + * F5 continues running a procedure. + * Alt+F5 runs the error-handler code for the current procedure. + * F8 single-steps through code (executing one statement at a time). + * Shift+F8 procedure-steps through code (executing one procedure at a time). + * Alt+F8 steps into the error handler for the current procedure. + * F2 displays the Object Browser. + +Finally, the Immediate window has a couple of commands of its own: + + * Pressing Enter runs the current line of code. + * Pressing Ctrl+Enter inserts a carriage return. + +### Printing Information to the Immediate Window + +As well as entering statements in the Immediate window for quick testing, you can use this window for a different debugging technique. To include in your procedures statements that print information to the Immediate window, use the Print method of the Debug object. Printing like this allows you to create a log during execution, a log you can later examine for errors or strange behavior. You don't single-step or display message boxes containing the value of a variable. 
Instead, you print data for later study.

The syntax for the Print method is as follows:

    Debug.Print [ _outputlist_ ]

_outputlist_ is an optional argument specifying the expression or expressions to print. You'll almost always want to include _outputlist_—if you don't, the Print method prints a blank line, which is of little use. Construct your _outputlist_ using the following syntax:

    [Spc( _n_ ) | Tab( _n_ )] _expression_

Here, Spc( _n_ ) inserts space characters and Tab( _n_ ) inserts tab characters, with _n_ being the number of spaces or tabs to insert. Both are optional arguments, and for simple output, you'll seldom need to use them.

_expression_ is an optional argument specifying the numeric expression or String expression to print:

 * To specify multiple expressions, separate them with either a space or a semicolon.
 * A Boolean value prints as either True or False (as appropriate).
 * If _outputlist_ is Empty, Print doesn't print anything. If _outputlist_ is Null, Print prints Null.
 * If _outputlist_ is an error, Print prints it as Error _errorcode_, where _errorcode_ is the code specifying the error.

As an example, you could log the contents of the String variables (expressions) CustName, Address1, Address2, City, State, and Zip to the Immediate window in an address format by using the following statements:

    Debug.Print CustName
    Debug.Print Address1 & "," & Address2
    Debug.Print City & "," & State & " " & Zip

As another example, the following procedure prints the names and paths of all open workbooks in Excel to the Immediate window:

    Sub See_All_Workbook_Names()
        Dim oBook As Workbook
        For Each oBook In Workbooks
            Debug.Print oBook.FullName
        Next
    End Sub

In practice, Debug.Print is used by many programmers as a quick, efficient alternative to debugging with the Watch window, message boxes, or breakpoints. You need to see whether something is going wrong with a variable (its value is wrong, but where does it go wrong?). So you insert some Debug.Print statements to display the variable's value while executing a procedure. Then you can see whether the value is wrong in that location or somewhere else in the code.

If your program contains multiple procedures, you might also want to print the name of the procedure. This example identifies both the procedure and the variable name within the Debug.Print statement:

    Debug.Print "In the Sub Add_Tax the variable intLocal is: " & intLocal

This results in the following line in the Immediate window:

    In the Sub Add_Tax the variable intLocal is: 7

## The Call Stack Dialog Box

When working in Break mode, you can summon the Call Stack dialog box (see Figure 16.1 in Chapter 16) to display a list of the active _procedure calls_—the chain of procedures that has led execution to the current procedure. It shows the history of your code's execution path.

When you begin running a procedure, that procedure is added to the call-stack list in the Call Stack dialog box. If that procedure then calls another procedure, the name of the second procedure is added to the call-stack list, but only while the second procedure is executing; when it finishes, its name is removed from the list. By using the Call Stack dialog box in Break mode, you can find out what procedures are being called by another procedure; this can help you establish which parts of your code you need to check for errors.

To display the Call Stack dialog box, click the Call Stack button on the Debug toolbar, press Ctrl+L, or select View ⇒ Call Stack.
To display one of the procedures listed in the Call Stack dialog box, select it in the Project.Module.Function list box and click the Show button.

# Dealing with Infinite Loops

You'll probably find it easy to tell when a procedure gets stuck in an infinite loop: You'll notice that the procedure simply doesn't stop executing. If you open the Windows Task Manager, it will report that your application has "stopped responding." To interrupt an infinite loop, press Ctrl+Break. The VBA Editor then displays a Code Execution Has Been Interrupted dialog box. Infinite loops are also known as _endless loops_.

There are several ways to get stuck in infinite loops, such as using GoTo statements without If conditions or Do loops without While or Until constraints. These are easy enough to avoid, but even if you do, it's still possible for infinite loops to occur in your code because of conditions you haven't been able to anticipate.

The best way to approach detecting and eliminating an infinite loop is to use breakpoints or a watch expression to pinpoint where the procedure enters the infinite loop. Once you've reached it, use the Step Into command to step into the procedure. Then use the Watch window or the Locals window to observe the variables and expressions in the loop, which should indicate when something is going wrong and causing the loop to be endless.

If your code contains a loop that should execute only a set number of times but you suspect it's running endlessly, you can insert a counter variable in the loop in an If...Then structure that triggers either an Exit For statement or an Exit Do statement to exit the loop if it runs more than a certain number of times.

# Dealing with Runtime Errors

Despite the help that VBA provides by checking for language errors and compile errors, runtime errors remain an unpleasant fact of life. Sooner or later, you will get runtime errors in your code, but you don't have to take them lying down. Just add _error handlers_, pieces of code that trap errors, analyze them, and take action if they match given error codes.

An error handler is a preventative measure, allowing your code to manage problems gracefully rather than crashing in front of a user's alarmed or bemused face.

## When Should You Write an Error Handler?

Consider writing an error handler in the following circumstances:

 * When a runtime error can cause your code to fail disastrously. For a procedure that tweaks a couple of objects on a slide in PowerPoint, you're unlikely to need an error handler. By contrast, for a procedure that creates, deletes, or moves files, you'll probably want an error handler (a sketch of such a handler follows this list).
 * When your program accesses peripherals or objects outside the application itself—the status of which is unpredictable during design time. In this situation, you can identify particular errors that are likely to occur and that can be trapped. For example, when the user tries to open a file, certain well-known errors can occur—perhaps the file doesn't exist, or is currently in use by another computer, or is on a network drive, floppy drive, CD-ROM drive, or removable drive that isn't available at the time. You'll also run into errors if the user tries to use a printer or other remote device (say, a scanner or a digital camera) that's not present, not connected, turned off, or not configured correctly. Similarly, any procedure that deals with a particular object in a document (for example, a chart in Excel) will run into trouble if that object is not available.
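Here is a minimal sketch of the kind of handler the first bullet point calls for; the paths are hypothetical, and the On Error and Exit Sub statements it relies on are explained in detail in the sections that follow:

    Sub Archive_Draft()
        On Error GoTo FileTrap
        'moving a file can fail for many reasons: the file may not
        'exist, may be open elsewhere, or may be on an unavailable drive
        Name "C:\Reports\Draft.docm" As "C:\Archive\Draft.docm"
        Exit Sub
    FileTrap:
        MsgBox "The file could not be moved." & vbCr & _
            "VBA reports: " & Err.Description, _
            vbOKOnly + vbExclamation, "Archive Error"
    End Sub

Rather than checking every possible precondition before the Name statement, the procedure simply traps whatever error results—the approach the following sidebar recommends.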
* * *

Consider Trapping Errors Rather than Anticipating Them

In some instances, you may find it simpler to trap a resulting error from a procedure than to anticipate and try to forestall the many and various conditions that might lead to the generation of the error. For example, instead of checking to make sure a file exists before you try to open or manipulate the file, just trap any kind of error that results if the file isn't detected.

* * *

## Trapping an Error

_Trapping_ an error means catching it in your code during runtime so that you can write programming that handles the error.

VBA's On Error statement takes effect when a runtime error occurs, allowing you to write code that responds to the error.

Usually, you'll want to prevent an error from stopping your VBA code, but you can also anticipate particular errors and use them to determine a suitable course of action to follow from the point at which they occur.

To trap an error, you use the On Error statement. The usual syntax for On Error is as follows:

    On Error GoTo _line_

Here, _line_ is a label specifying the line to which execution is to branch when a runtime error occurs. For example, to branch to the label named ErrorHandler, you could use a structure like this:

    Sub ErrorDemo()
        **On Error GoTo ErrorHandler**
        'ordinary code statements here

        Exit Sub
    **ErrorHandler:**
        'error-handling statements here
    End Sub

The label you use to identify the error handler can be named with any valid label name—you don't have to call it ErrorHandler or anything similar. Some people find that a descriptive label (perhaps one that identifies the type or types of error expected, such as HandleErrorNoFileOpen) is clearer in the long run than a generic name; others prefer to go with a generic name such as HandleErr.

Usually, you'll want to place the error trap early, near the top of a procedure, so that it's active and ready to trap errors for all the lines of code below it throughout the whole procedure. If necessary, you can place several different error traps in a procedure by entering multiple On Error statements where they're needed—but only one can be enabled at a time. (_Enabled_ means that an error trap has been switched on by an On Error statement. When an error occurs and execution branches to the error handler, that error handler is _active_.)

Inserting multiple error handlers in a procedure can be useful when you're dealing with statements that can cause different types of errors that may need to be trapped. In the following example, the first On Error statement directs execution to ErrorHandler1, and the second On Error statement directs execution to ErrorHandler2 (note the Exit Sub statement at the end of the first handler, which keeps execution from falling through into the second handler):

    Sub ErrorDemo2()
        On Error GoTo ErrorHandler1
        'statements here
        On Error GoTo ErrorHandler2
        'statements here
        Exit Sub
    ErrorHandler1:
        'statements for first error handler here
        Exit Sub
    ErrorHandler2:
        'statements for second error handler here
    End Sub

Each error handler is limited to the procedure in which it appears, so you can create different error handlers for different procedures and have each enabled in turn as the procedures run.

Because the error handler appears as code in the procedure, you need to make sure that it doesn't run when no error has occurred. You can do this by using either an Exit Sub statement in the line just above the error-handler statement (this ends execution of the procedure) or a GoTo statement that directs execution to a label beyond the error-handling code.
The Exit Sub statement is better if you choose to place your error handler at the end of its procedure, which is standard practice and usually makes sense. The GoTo statement may prove easier to use if you choose to place your error handler elsewhere in the procedure. + +For a function, use an Exit Function statement rather than an Exit Sub statement. For a property in a class module, use an Exit Property statement. + +The following example uses an Exit Sub statement to cause execution to end before the error handler if no error occurs: + + Sub ErrorDemo3() + On Error GoTo ErrorHandler + 'statements that might cause an error + **Exit Sub** + ErrorHandler: + 'statements that handle the error + End Sub + +This next example uses a GoTo statement to skip the error handler—which is placed within the code of the procedure—unless an error occurs. When execution reaches the GoTo SkipErrorHandler statement, it branches to the SkipErrorHandler label, thus bypassing the code in the error handler: + + Sub ErrorDemo4() + On Error GoTo ErrorHandler + 'statements that might cause an error + **GoTo SkipErrorHandler** + ErrorHandler: + 'statements that handle the error + SkipErrorHandler: + 'statements + End Sub + +You read earlier in this book that some people don't like GoTo statements for uses such as the second example here. Given that this GoTo statement makes the flow of the procedure a little harder to follow, you may be inclined to agree with them in this case. (The use of GoTo in the On Error statement itself is, however, unavoidable.) + +## Disabling an Error Trap + +Recall that an error trap works only for the procedure in which it appears, and VBA disables it when the code in the procedure has finished executing. You can also disable an error trap before the end of a procedure in which it appears if you wish by using the following statement: + + On Error GoTo 0 + +Why would you do this? You might want to disable an error trap while testing a procedure to enable yourself to pinpoint errors that occur after a certain point while at the same time retaining error trapping for the first part of the procedure. + +## Resuming after an Error + +You use the Resume statement to resume execution of a procedure after trapping an error or handling an error with an error-handling routine. The Resume statement takes three forms: Resume, Resume Next, and Resume _line_. + +### Using a _Resume_ Statement + +The Resume statement causes execution to resume with the line that caused the error. Use Resume with an error-handling routine that detects and fixes the problem that caused the offending statement to fail. For example, look at the error handler in Listing 17.1, which runs when VBA is unable to apply a specified style in Word. + +**Listing 17.1**: Trapping a style error + + 1. Sub StyleError() + 2. + 3. On Error GoTo Handler + 4. + 5. Selection.Style = "Executive Summary" + 6. + 7. 'the rest of the procedure happens here + 8. + 9. 'exit the procedure once execution gets this far + 10. Exit Sub + 11. + 12. Handler: + 13. + 14. If Err = 5834 Then + 15. ActiveDocument.Styles.Add _ + Name:="Executive Summary", Type:=wdStyleTypeParagraph + 16. **Resume** + 17. End If + 18. + 19. End Sub + +Here's how the StyleError procedure in Listing 17.1 works: + + * Line 1 starts the procedure, and line 19 ends it. Lines 2, 4, 6, 8, 11, 13, and 18 are spacers. + * Line 3 uses an On Error statement to enable the imaginatively named error handler, which is identified by the Handler label in line 12. 
+ * Line 5 applies the style named Executive Summary to the current selection. If this operation succeeds, execution will continue at line 7, which in this example contains only a comment indicating that this is where the rest of the procedure would take place. + * Line 9 is a comment introducing line 10, which holds the Exit Sub statement to end execution of the procedure before the error handler. + * If the Selection.Style statement in line 5 causes an error, execution branches to the Handler label in line 12, and the error handler is activated. Line 14 compares the error value to 5834, the error that occurs if the specified style doesn't exist. If it matches, line 15 then adds the missing style to the document, and the Resume statement in line 16 causes execution to resume where the error occurred, on line 5. Because the specified style is now available, the Selection.Style statement runs without an error. + +* * * + +How to Find VBA Error Numbers and Their Explanations + +To find error numbers, here are three approaches: + + * Go to this Web page: + +. + + * Search the VBA Help system for _trappable errors_. + * Deliberately cause the error yourself and note the number and description in the resulting error-message dialog box that VBA displays. + +* * * + +### Using a _Resume Next_ Statement + +Resume Next causes execution to resume with the next statement after the statement that caused the error. You can use Resume Next in either of the following circumstances: + + * With an error-handling routine that ignores the error and allows execution to continue without executing the offending statement + * As a straightforward On Error Resume Next statement that causes execution to continue at the next statement after the statement that caused an error, without using an error handler to fix the error + +As an example of the first circumstance, if the style specified in the previous example isn't available, you can use a Resume Next statement to skip applying it: + + Sub StyleError2() + On Error GoTo Handler + + Selection.Style = "Executive Summary" + + 'the rest of the procedure happens here + + 'exit the procedure once execution gets this far + Exit Sub + + Handler: + **Resume Next** + + End Sub + +The descriptions of Resume and Resume Next apply if the error occurred in the procedure that contains the error handler. But if the error occurred in a different procedure from the procedure that contains the error handler, Resume causes execution to resume with the last statement that transferred execution (called) out of the procedure where the handler is located; Resume Next causes execution to resume with the statement _after_ the last statement to call out of the procedure that contains the error handler. + +### Using a _Resume Line_ Statement + +Resume _line_ causes execution to resume at the specified line. Use a label to indicate the line, which must be in the same procedure as the error handler. + +For example, if a procedure tried to open a particular file, you could create a simple error handler that uses a Resume _line_ statement, as shown in Listing 17.2. This procedure works with Word. To make it work with other applications, substitute the appropriate error numbers in line 15. + +**Listing 17.2**: Resuming execution at a specified line + + 1. Sub Handle_Error_Opening_File() + 2. + 3. Dim strFName As String + 4. + 5. StartHere: + 6. + 7. On Error GoTo ErrorHandler + 8. strFName = InputBox("Enter the name of the file to open.", _ + "Open File") + 9. If strFName = "" Then End + 10. 
Documents.Open strFName
 11. Exit Sub
 12.
 13. ErrorHandler:
 14.
 15. If Err = 5174 Or Err = 5273 Then MsgBox _
     "The file " & strFName & " does not exist." & vbCr & _
     "Please enter the name again.", _
     vbOKOnly + vbCritical, "File Error"
 16. **Resume StartHere**
 17.
 18. End Sub

Here's how Listing 17.2 works:

 * Line 1 starts the procedure, and line 18 ends it.
 * Line 2 is a spacer. Line 3 declares the String variable strFName. Line 4 is another spacer.
 * Line 5 contains the StartHere label, to which execution will return from the Resume statement in line 16. Line 6 is a spacer.
 * Line 7 uses an On Error statement to enable the error handler ErrorHandler.
 * Line 8 displays an input box prompting users for the name of the file they want to open and stores the name in the variable strFName. Line 9 checks strFName against an empty string and ends execution if it matches. Line 10 then tries to open the file.
 * If the file exists and can be opened, execution passes to line 11, where an Exit Sub statement exits the procedure, ending its execution. Otherwise, an error is generated, and execution branches to the ErrorHandler label in line 13, where the error handler becomes active.
 * Line 14 is a spacer. Line 15 then compares the value of the error to 5174 (the error that occurs if VBA can't find the file) and to 5273 (the error that occurs if the document name or path isn't valid in Word). If either of these comparisons matches, line 15 displays a message box advising users of the error and prompting them to enter the correct filename.
 * The Resume statement in line 16 then returns execution to the StartHere label in line 5. Line 17 is a spacer.

* * *

Try Inserting a Counter Variable to Deal with Repetitious User Errors

For some procedures, you may want to build in a counter mechanism to prevent users from repeating the same error endlessly because they don't grasp what's wrong. By incrementing a counter variable each time the error handler is invoked and checking the resulting number, you can choose to take a different action after a number of unsuccessful attempts to execute a particular action.

* * *

You can't use a Resume statement anywhere other than in an error-handling routine (or an On Error Resume Next statement). If you do, VBA reports an error.

## Getting a Description of an Error

To see the description of the current error, return the Description property of the Err object:

    MsgBox Err.Description

In general, operating-system and programming-language error messages tend to be terse, cryptic, and of less help to the end user than to the people who built the OS or language. Think twice before displaying one of these error messages to an end user. The error message shown in Figure 17.7 says "Run-time error '5941': The requested member of the collection does not exist." As you can imagine, most users would be baffled by this message; some would panic.

Usually, it's more effective, not to mention kinder, to write and display a more verbose error message of your own devising. It should explain in ordinary English what the problem is—and, preferably, what (if anything) the user can do to solve it.

## Raising Your Own Errors

As part of your testing, you may want to deliberately simulate errors so that you can see how well your error handler handles them. (Programming lingo sometimes substitutes the word _raise_ for _cause_ or _trigger_. Nobody knows why.)
+
+ To trigger an error deliberately, use the Raise method of the Err object, specifying only the _number_ argument. _number_ is a Long argument giving the number of the error that you want to cause. For example, the following statement "raises" error 5121:
+
+ Err.Raise 5121
+
+ # Suppressing Alerts
+
+ Many of the procedures you build will use message boxes or dialog boxes to allow the user to choose options for the procedure. In some applications—such as Word, Excel, PowerPoint, and Access—you can use the DisplayAlerts property of the Application object to suppress the display of message boxes and errors while a procedure is running:
+
+ * In Word, DisplayAlerts can be set to wdAlertsNone (0) to suppress alerts and message boxes, wdAlertsMessageBox (-2) to suppress alerts but display message boxes, or wdAlertsAll (-1, the default) to display all alerts and message boxes. DisplayAlerts is a sticky setting: after you set it to wdAlertsNone (or False) or to wdAlertsMessageBox, you need to set it explicitly back to wdAlertsAll (or True) when you want to see alerts and message boxes again. VBA resets the default value only when you restart Word.
+ * In Excel, DisplayAlerts is a read/write Boolean property that can be set to True to display alerts and False to suppress them. The setting sticks until you change it or restart Excel, at which point VBA resets it to True.
+ * In PowerPoint, DisplayAlerts is a read/write property that can be set to ppAlertsAll to display all alerts and ppAlertsNone to suppress all alerts. The setting sticks until you change it or until you restart PowerPoint, at which point VBA resets it to ppAlertsNone.
+ * In Access, you use the pervasive DoCmd object's SetWarnings method, like this:
+
+ DoCmd.SetWarnings False
+
+ # Handling User Interrupts in Word, Excel, and Project
+
+ Errors may seem quite enough of a problem, but you also need to decide what will happen if a user tries to interrupt your code by pressing Ctrl+Break during execution. Some VBA hosts, including Word and Excel, offer you three options:
+
+ * You can allow a user interrupt to stop your code. This is the easy way to proceed (and, as the default condition, needs no effort on your part), but in complex procedures, it may cause problems. For example, the user may have spent five minutes typing in data, only to lose it because the data wasn't saved due to the early termination of the program.
+ * You can prevent user interrupts by disabling user input while the procedure is running. This is simple to do, but you run the risk of creating unstoppable code if a procedure enters an endless loop. The user would have to power down the machine or, at least, invoke Task Manager and kill your task. Any unsaved work in the procedure or even the host application will be lost. The user might have been typing for _hours_ without saving their work. Losing this much...it can send some people _right over the edge_.
+ * As a compromise between the first two options, you can allow user interrupts during certain parts of a procedure and prevent user interrupts during more critical parts of a procedure.
+
+ ## Disabling User Input While a Procedure Is Running
+
+ To disable user input while a procedure is executing, disable the Ctrl+Break key combination by setting the EnableCancelKey property of the Application object to wdCancelDisabled (in Word) or xlDisabled (in Excel):
+
+ Application.EnableCancelKey = wdCancelDisabled 'Word
+ Application.EnableCancelKey = xlDisabled 'Excel
+
+ VBA automatically enables user input again when the procedure stops executing. You can also reenable user input during a procedure by setting the EnableCancelKey property to wdCancelInterrupt (in Word) or xlInterrupt (in Excel):
+
+ Application.EnableCancelKey = wdCancelInterrupt 'Word
+ Application.EnableCancelKey = xlInterrupt 'Excel
+
+ Excel offers a third setting, xlErrorHandler, that traps the Ctrl+Break keystroke as error 18. You can deal with this error as you would any other error. Here's a quick example:
+
+ Sub CancelKey_Example()
+ Dim i As Long
+ On Error GoTo EH
+ Application.EnableCancelKey = xlErrorHandler
+ For i = 1 To 100000000 ' time-consuming loop
+ Application.StatusBar = i
+ Next i
+ EH:
+ If Err.Number = 18 Then
+ If MsgBox("Do you want to stop the procedure?" _
+ & vbCr & vbCr & "If not, stop pressing Ctrl+Break!", _
+ vbYesNo + vbCritical, "User Interrupt Detected") = vbYes Then End
+ End If
+ End Sub
+
+ ## Disabling User Input While Part of a Procedure Is Running
+
+ You may want to temporarily disable user input while a procedure is executing a sensitive task that must not be interrupted. Then when the task is complete, you can reenable user input because it's safe for the user to stop the procedure again.
+
+ For example, say you have a procedure in which a section of code moves a number of files from one folder to another. You want to prevent the code that executes the move operations from being interrupted: if the user stopped the procedure in mid-task, it might leave some files still in the source folder and some in the destination folder.
+
+ Here's an example using Word:
+
+ 'interruptible actions up to this point
+ Application.EnableCancelKey = **wdCancelDisabled**
+ For i = 1 To LastFile
+ SourceFile = Source & "\Section" & i
+ DestFile = Destination & "\Section" & i
+ Name SourceFile As DestFile
+ Next i
+ Application.EnableCancelKey = **wdCancelInterrupt**
+ 'interruptible actions after this point
+
+ # Documenting Your Code
+
+ Some musicians can read a symphonic score and more or less "hear" the music. Likewise, some programmers can read raw code and visualize what it does. But most programmers need comments to help them understand what code is doing, particularly if they wrote the code months before or if it was written by another programmer.
+
+ Many programmers also find it easier to debug their procedures by documenting their code. The best way to document your code is to add comments to it, either as you create the code or after you've finished creating it: This procedure does this. It expects this data as input and provides this as its output. This line does this. And so on.
+
+ Some experts advise that you document your code as you create it in any procedure in which you're exploring your way and trying different methods to reach your goal. Add comments to explain what action each group of statements is trying to achieve.
Once you've gotten the procedure to work, go through the code and delete the statements you didn't use. Use the comments to identify which sections are now useless and which are still worthwhile, leaving only the comments that are relevant to how the remaining code functions.
+
+ Also consider adding comments when you're modifying an existing procedure so that you don't lose track of your changes. Once you have the procedure working to your liking, remove any unnecessary comments and reword any verbose or unclear comments.
+
+ Other experts suggest documenting your code when you've finished writing it. This allows you to enter only the comment lines that you want to be there permanently. This is the way to go when you're fairly sure of the direction of your code from the start and the procedure needs only a few pointers to make its code clear once it's complete.
+
+ To document your code, use comments prefaced by either the single quote (') or the Rem keyword (short for _remark_ ).
+
+ * * *
+
+ Use Block-Commenting as a Debugging Tool
+
+ Remember that commenting can also be employed as a debugging technique–when you want to see how code runs with some lines deactivated. In other words, does the bug disappear when the commented-out lines are not executed? If so, the bug is probably located somewhere in those lines of code. You can "comment out" a group of lines, a whole line, or part of a line: anything to the right of an apostrophe or the Rem keyword is commented out. See the section in Chapter 3 titled "Commenting Out Lines" for details on this tactic.
+
+ * * *
+
+ Few programmers use Rem anymore. When you're trying to comment out only a part of a line, the apostrophe is usually the better choice anyway. If you do choose to use the Rem keyword, you'll need to add a colon before it to make it work consistently (some statements accept a Rem without a colon at their end; others generate a compile error):
+
+ Rem This is a comment line.
+ Documents.Add: Rem create a document based on Normal.dotm
+
+ Generally, apostrophe-commented remarks are separated by a few spaces or tabs from any statement the line contains (as in the second line here). This makes the code and comments easier to read than comments using Rem:
+
+ 'This is a comment line
+ Documents.Add 'create a document based on Normal.dotm
+
+ It's tempting to think that you don't need to document your code because you'll be able to recall what it does. But once you've written a lot of code, you probably won't be able to remember. Coming back to a procedure six months after writing it, you'll find it as unfamiliar as if someone else had written it. And if you've become a VBA whiz, you may even find it hard to visualize the clumsy techniques you were using at that time.
+
+ Most programmers have a distinct aversion to documenting their code; for some, the dislike of documenting is almost pathological. You can see why: When you're writing the code, documenting what each line does slows you down and distracts you from your larger purpose. And documenting after the code is finished and tested is tedious work. Besides, anyone who's competent should be able to read the code and see what it does...shouldn't they?
+
+ Maybe so, but consider this: It's likely that you won't always be the person working with your code—at times, others may work with it too, and they'll appreciate all the help they can get in understanding its purposes and behaviors.
Likewise, the code on which you work won't always be your own—you may at times have to debug code that others have written, and in this case, _you'll_ be the one grateful for comments.
+
+ # The Bottom Line
+
+ **Understand the basic principles of debugging.**
+
+ A major aspect of programming is testing your code. Debugging can be enjoyable if you think of it as a puzzle that you can solve. But whether or not you enjoy it, debugging is essential if you want to preserve a reputation as a professional.
+
+ Master It
+
+ When testing your code, try to imagine ways that the code could fail. Describe a situation that can produce unanticipated results.
+
+ **Recognize the four different types of errors you'll create.**
+
+ Experts have concluded that there are four primary categories of error in programs.
+
+ Master It
+
+ Name two of the four basic types of programming errors.
+
+ **Employ VBA's debugging tools.**
+
+ The VBA Editor and VBA include a generous assortment of debugging tools to help you track down and remove bugs from your procedures. The main windows you'll employ for debugging are the Immediate window, the Locals window, and the Watch window.
+
+ Master It
+
+ The Watch window is especially useful because you can set watch expressions (also known as conditional breakpoints). Describe this debugging tactic.
+
+ **Deal with runtime errors.**
+
+ You can trap some runtime errors (errors that show up while a procedure is executing) while debugging your code. But others show up only while your user is interacting with your program—and you're probably not there to help them. There is a way, though, to soften the blow and, in some cases, even fix a problem by adding error handlers to your programs.
+
+ Master It
+
+ Error handlers are special statements and sections of code that detect and then manage runtime errors. What VBA statement detects a runtime error?
+ Chapter 18
+
+ Building Well-Behaved Code
+
+ This chapter concentrates on the principles of good behavior. Once you've built a procedure that's useful and that works consistently as intended, you'll probably want to distribute it to as many of your coworkers as might use it or even to a wider audience on the Internet. Before you distribute it, though, you should make sure that the procedure is as civilized as possible in its interaction with users and with the settings they may have chosen on their computers. It's all too easy to distribute an apparently solid, useful procedure that runs roughshod over the user's preferences or one that fails unexpectedly under certain circumstances. In this chapter, you'll look at how to avoid such problems and how to construct your procedures so that the user will have no problem interacting with them.
+
+ The specifics of good program behavior vary from application to application, and you will need to adapt these principles to the application with which you're working. This chapter gives some examples.
+
+ In this chapter you will learn to do the following:
+
+ * Understand the characteristics of well-behaved procedures
+ * Retain and restore the user environment
+ * Let the user know what's happening
+ * Check that the procedure is running under suitable conditions
+ * Clean up after a procedure
+
+ # What Is a Well-Behaved Procedure?
+
+ A well-behaved procedure leaves no trace of its actions beyond those that the user expected it to perform.
This means the following:
+
+ * Making no detectable changes to the user environment or, if the procedure does need to make changes (for example, in order to do its job), restoring the previous settings
+ * Presenting the user with relevant choices for the procedure and relevant information once the procedure has finished running
+ * Showing or telling the user what is happening while the procedure is running
+ * Making sure (if possible) that conditions are appropriate for the procedure to run successfully—before the procedure takes any actions
+ * Anticipating or trapping errors wherever possible so that the procedure doesn't crash or, if it does crash under exceptional circumstances, so that it does so as gracefully as possible, minimizing damage to the user's work
+ * Leaving users in the optimal position to continue their work after the procedure finishes executing
+ * Deleting any scratch documents, folders, or other detritus that the procedure created in order to perform its duties but that are no longer needed
+
+ You can probably think of a couple of examples in which applications you use don't exactly do these things. For example, do you use Word? Then you're probably familiar with the less-than-inspiring behavior of the Page Up and Page Down feature. While working in a document, press the Page Down key three times, then press the Page Up key three times. Your blinking insertion point should be back in the exact location where it was before you paged down, then back up, right? Unfortunately, the insertion point doesn't always (let's be honest, it will _rarely_ ) return to the exact point in the document where it started, as it should.
+
+ So if you page through your document to look at some paragraph but then try to return to where you were last, you always need to check that the insertion point is in the right place before you start typing—otherwise, the characters are very likely to land in the wrong place. Word was first released in October 1983, _so Microsoft has had time to fix this_. It would be simple for Word to note the insertion point before the paging, but why that's never done remains a mystery. I'll show you how to do this in your macros in the section titled "Leaving the User in the Best Position to Continue Working" later in this chapter.
+
+ Such weaknesses in commercial applications' interfaces provoke two main reactions among developers. First, if users are accustomed to such niggles as having to reposition the selection or change the view when they shouldn't need to, they're unlikely to get too annoyed with having to perform similar actions after running one of our procedures. This is particularly true if your macro saves them plenty of time and effort, for which they should be grateful rather than picky. Besides, they most likely didn't pay for your macro, did they?
+
+ The second reaction is an impressive (and sometimes overzealous) determination on the part of macro programmers to restore the user environment absolutely perfectly even if major software corporations seem incapable of producing software that does so.
+
+ The first approach tends to be more economical in its code and the second more inventive. To get your work done and retain your sanity, you'll probably want to steer a course between the two extremes.
+
+ # Retaining or Restoring the User Environment
+
+ In many cases, your procedures will run without even needing to change the user environment—but if not, restore it as closely as possible to its previous state.
What this means depends on the host application, but here are some examples of environment changes in Word, Excel, and PowerPoint:
+
+ * In Word: Changing the revision-marking (Track Changes) setting so that you can change the text without the changes being marked as revisions.
+ * In Word or PowerPoint: Changing the view to a different view so that you can perform certain operations that cannot be performed in the original view.
+ * In Excel: Creating a temporary worksheet on which you can manipulate data secure in the knowledge that you don't need to check whether any ranges are already occupied by user data.
+ * In any application that lets you manipulate its Find and Replace feature: Using the Find and Replace feature to identify and/or modify parts of a document, then restoring users' last search (and replace, if necessary) so that they can perform it again seamlessly. The problem here is that most applications have "sticky" Find and Replace settings to allow the user to perform the same search or replacement operation again quickly without reentering the parameters. If you've replaced users' search and replacement parameters, they'll get a rude shock the next time they try to search or replace. This is particularly true if you've turned on some esoteric feature like Match Case. The next time the user tries to search for _florida_ , they will find no matches, even if the document is about Miami and is jam-packed with the word _Florida_. Why? Because your macro left the Match Case filter turned on, and the user didn't capitalize _Florida_ when initiating the search. Fail.
+
+ You'll want to save information about the user's environment so that you can restore it at the end of the procedure. If your procedure will mess around with the Match Case property of Word's Find and Replace feature, at the start of the procedure save the property's current value in a private variable, public variable, or custom object, as appropriate.
+
+ Then at the end of your macro, fetch the saved value and restore it to the property you temporarily modified. Here's an example:
+
+ Dim CaseStatus As Boolean 'match case is either on or off
+
+ CaseStatus = Selection.Find.MatchCase 'save the user's setting
+
+ Selection.Find.MatchCase = True 'our macro needs to be case-sensitive
+
+ ' execute statements in the macro
+
+ Selection.Find.MatchCase = CaseStatus 'restore the user's preference
+
+ # Leaving the User in the Best Position to Continue Working
+
+ After your procedure finishes running, users need to be in the best possible position to continue their work. What exactly this best possible position entails depends on the situation, but here are three simple suggestions:
+
+ * Usually, you'll want to leave users viewing the same document they were working on when they started running your macro. There are some obvious exceptions to this, such as when the procedure creates a new file for the user and the user is expecting to work in that file, but the general principle applies in most situations.
+ * If a file is essentially untouched (at least from the user's point of view) by your macro, the blinking insertion cursor (selection) should probably be placed back where it was when the user started running the procedure. To restore the selection, you may want to define a range at the start of your procedure and then move the selection back to it at the end of the procedure. In some applications, you could also use a bookmark or a named range—but if you do, be sure to remove it afterward.
Remember, leave no debris behind.
+ * Listing 18.1 is an example macro that you can try out. It saves a Word document's current blinking insertion-cursor location in a bookmark. Next it moves the cursor down a few lines and shows you a message box so you can see the new location of the cursor. Finally, it restores the cursor to its original location:
+
+ **Listing 18.1**: Restoring the cursor
+
+ 1. Sub SaveAndRestoreCursor()
+ 2.
+ 3. 'save the current cursor location in a bookmark
+ 4. ActiveDocument.Bookmarks.Add Name:="OriginalInsertionPoint", _
+ Range:=Selection.Range
+ 5.
+ 6. 'move down eight lines
+ 7. Selection.MoveDown Unit:=wdLine, Count:=8
+ 8.
+ 9. MsgBox "Moved to here (look for the insertion line; it's moved down 8 lines from where it was.)"
+ 10.
+ 11. 'fetch the saved bookmark and go to it
+ 12. Selection.GoTo what:=wdGoToBookmark, Name:="OriginalInsertionPoint"
+ 13.
+ 14. MsgBox "Now the insertion line has been restored to where it was when this macro started."
+ 15.
+ 16. 'remove the bookmark to leave no debris behind
+ 17.
+ 18. ActiveDocument.Bookmarks("OriginalInsertionPoint").Delete
+ 19. End Sub
+
+ * Notice in line 18 that we delete our bookmark when we've finished using it. Don't leave rubbish behind.
+ * If the procedure has created a new object in the file, and the user will be expecting to work with it, you may want to have that object selected at the end of the procedure.
+
+ # Keeping the User Informed during the Procedure
+
+ A key component of a well-behaved procedure is keeping the user adequately informed throughout the process. In a macro that performs a basic if tedious task, adequate information may require only a clear description in the macro's Description field, to assure users that they're choosing the right procedure from the Macros dialog box.
+
+ With a more complex procedure, adequate information will probably have to be more extensive: You may need to display a starting message box or dialog box, show information on the status bar during the procedure, display an ending message box, or create a log file of information so that the user has a record of what took place during execution of the procedure.
+
+ You must first decide whether to disable user input during the procedure. In Word and Excel, you can disable user input to protect sensitive sections of your procedures by setting the EnableCancelKey property of the Application object (as discussed in "Disabling User Input While a Procedure Is Running" in Chapter 17, "Debugging Your Code and Handling Errors"). When you do so, it's a good idea to indicate to the user at the beginning of the procedure that input will be disabled and explain why. Otherwise, a user may react to a procedure that seems not to be executing in the same way they would respond to an application that had hung—by trying to close the application forcibly via Task Manager. To keep the user informed about other aspects of the procedure, you have several options, which are discussed in the following sections. But first, the sidebar "Disabling Screen Updating" examines how you can _hide_ information from the user (and the reasons for doing so) by disabling screen updating in Word and Excel.
+
+ * * *
+
+ **Disabling Screen Updating**
+
+ Access, Word, and Excel let you disable screen updating—that is, stop the redrawing of the information in the document area.
The other parts of the application window—the title bar, command bars, status bar, scroll bars, and so on—continue to update, but these items are usually relatively static compared to the document area and so don't take much updating. Still, if the user resizes the application window or the document window, they will see these other parts of the application window change, even with screen updating disabled.
+
+ There are two advantages to disabling screen updating while your procedure is running:
+
+ * You can speed up the execution of your procedures somewhat. This improvement was quite noticeable in the early days of personal computing, and it is still perceptible with underpowered computers that have slow graphics cards. Most computers built since 2000 or so have relatively capable graphics cards, so turning off screen updating makes little visible difference. Any speed improvement from disabling screen updating applies especially to procedures that cause a lot of changes to the onscreen display. For example, suppose a procedure in Word strips a certain type of information out of the current document, pastes it into a new document, creates a table out of it, and applies assorted formatting to the table. The computer will expend a fair amount of effort updating what's appearing on the monitor. This is wasted effort if the user isn't hanging on every operation, so you might as well turn off screen updating.
+ * You can hide from users any parts of the procedure that you don't want them to see. This sounds totalitarian, but it's usually more like a cross between benevolent dictatorship and public television: People shouldn't see certain things that might really upset them, and there's a lot that most people don't _really_ need to know about. It's the same when you write programs: If users don't know about the operations that a procedure will routinely perform to achieve certain effects, they may be surprised or dismayed by what they see onscreen. For example, in a procedure that moves an open file, you might want to hide from the user the fact that the procedure closes the open file, moves it, and then reopens the file from its new location. By disabling screen updating, you can achieve this.
+
+ The major disadvantage to disabling screen updating is that doing so prevents users from seeing information that might be useful to them. In the worst case, users may assume from the lack of activity onscreen that either the procedure has entered an endless loop or the computer has hung, and so they may try to stop the procedure by pressing Ctrl+Break or Ctrl+Alt+Delete to use Task Manager to close the application. (Task Manager typically lists the host application as "Not responding" for much of the time VBA code is running, which doesn't help.)
+
+ To forestall such disruptions, warn users in advance that the procedure will disable screen updating. For instance, you might mention the fact in a message box at the beginning of the procedure, or you might display a dialog box that allows the user to choose whether to disable screen updating and have the procedure run faster or to leave screen updating on and have the procedure run at its normal speed and provide a performance possibly worth watching.
+
+ If you don't display a message box or dialog box at the beginning of a procedure, you may want to display information on the status bar to tell the user what's going on during the procedure.
Word and Excel update the status bar and the title bar of the application even if screen updating is turned off—provided the status bar and the title bar are visible. To display information on the status bar, assign a suitable string to the StatusBar property of the Application object:
+
+ Application.StatusBar = _
+ "Word is creating 308 new documents for you to edit. Please wait..."
+
+ Alternatively, you can disable screen updating for parts of a procedure and turn it back on, or refresh it, for other parts. Consider a procedure that creates and formats a number of documents from an existing document. If you turn off screen updating at the beginning of the procedure and then refresh it once each document has been created and formatted, the user will see each document in turn (which conveys the progress the procedure is making) without seeing the details of the formatting. What's more, the procedure will run faster than if the screen were showing all of the formatting taking place.
+
+ To turn off screen updating, set the ScreenUpdating property of the Application object to False:
+
+ Application.ScreenUpdating = False
+
+ To turn screen updating back on, set ScreenUpdating to True again:
+
+ Application.ScreenUpdating = True
+
+ In Access, use the Echo method of the DoCmd object to turn screen updating on or off:
+
+ DoCmd.Echo True 'turns updating on
+ DoCmd.Echo False 'turns updating off
+
+ In Word, to refresh the screen with the current contents of the video memory buffer, use the ScreenRefresh method of the Application object:
+
+ Application.ScreenRefresh
+
+ * * *
+
+ ## Manipulating the Cursor
+
+ Word and Excel permit you to manipulate the cursor (the mouse pointer). You may need to do this because VBA automatically displays the busy cursor (an hourglass in Windows XP, a rotating ring in Windows versions since then) while a VBA procedure is running and then restores the normal cursor when it has finished. Sometimes, however, you may need or want to specify the cursor's appearance in your code.
+
+ * * *
+
+ Stick with the Familiar Cursor Cues
+
+ After using computers for even a few months, users tend to develop almost Pavlovian reactions to the cursor, with the busy cursor signifying (in ascending order) a momentary breather (or a slow computer), a chance to grab a cup of coffee or chat with a colleague, or the onset of panic that the computer has hung before they've saved the last three hours of work. You usually won't want to mess with these reactions. So it's a mistake to display an I-beam insertion cursor or "normal" arrow cursor when the system is in fact busy—or to display the busy cursor after the procedure has in fact finished running.
+
+ * * *
+
+ ### Manipulating the Cursor in Word
+
+ Word implements the cursor via the System object. To manipulate the cursor, you set the Cursor property. This is a read/write Long property that can be set to the following values: wdCursorIBeam (1) for an I-beam cursor, wdCursorNormal (2) for a normal cursor, wdCursorNorthwestArrow (3) for an arrow pointing up and to the left, and wdCursorWait (0) for the busy cursor. The exact appearance of the cursor depends on the cursor scheme the user has selected.
+
+ For example, the following statement displays a busy cursor:
+
+ System.Cursor = wdCursorWait
+
+ Note that a user can customize the cursors by clicking the Mouse icon in Control Panel to open the Mouse Properties dialog box, then selecting the Pointers tab.
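+
+ Because the Cursor property is read/write, you can save its current value before changing it and restore the saved value afterward. Here's a minimal sketch of that pattern in Word; the loop is just a hypothetical stand-in for whatever time-consuming work your procedure performs:
+
+ Sub ShowBusyCursorDuringWork()
+
+ Dim lngSavedCursor As Long
+ Dim i As Long
+
+ lngSavedCursor = System.Cursor 'save the current cursor
+ System.Cursor = wdCursorWait 'show the busy cursor
+
+ For i = 1 To 100000 'stand-in for the procedure's real work
+ DoEvents
+ Next i
+
+ System.Cursor = lngSavedCursor 'restore the saved cursor
+ End Sub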
+
+ ### Manipulating the Cursor in Excel
+
+ Excel lets you manipulate the cursor through the Cursor property of the Application object. Cursor is a read/write Long property that can be set to the following values: xlIBeam (3) for an I-beam cursor, xlDefault (-4143) for a default cursor, xlNorthwestArrow (1) for the arrow pointing up and to the left, and xlWait (2) for the busy cursor.
+
+ For example, the following statement displays the busy cursor:
+
+ Application.Cursor = xlWait
+
+ When you explicitly set the Cursor property of the Application object in Excel, remember to reset it to something appropriate before your code stops executing. Otherwise, the cursor stays as you left it.
+
+ ## Displaying Information at the Beginning of a Procedure
+
+ At the beginning of many procedures, you'll probably want to display a message box or a dialog box. For this purpose, you'll typically use a Yes/No or OK/Cancel message-box style. The message box tells users what the procedure will do and gives them the chance to cancel the procedure without running it any further.
+
+ Alternatively, a dialog box can present options for the procedure (for example, mutually exclusive options via option buttons or nonexclusive options via check boxes), allowing users to enter information (via text boxes, list boxes, or combo boxes) and of course letting them cancel the procedure if they've launched it by accident. If you have time to create a Help file to accompany the procedures and user forms you create, you might add a Help button to each message box or dialog box, linking it to the relevant topic in the Help file.
+
+ You can also use a message box or dialog box to warn the user that the procedure is going to disable user interrupts for part or all of its duration.
+
+ ## Communicating with the User via a Message Box or Dialog Box at the End of a Procedure
+
+ With some procedures, you'll find it useful to collect information on what the procedure is doing so that you can display that information to the user in a message box or dialog box after the procedure has finished its work. As you saw in Chapter 13, "Getting User Input with Message Boxes and Input Boxes," message boxes are easier to use but are severely limited in their capabilities for laying out text—you're limited to the effects you can achieve with spaces, tabs, carriage returns, and bullets. With dialog boxes, however, you can lay out text however you need to (by using labels or text boxes) and even include images if necessary.
+
+ The easiest way to collect information while running a procedure is to build one or more strings containing the information you want to display. For an example of this, look back to the sidebar titled "Control a For...Next Loop with User Input via a Dialog Box" in Chapter 12, "Using Loops to Repeat Actions," in which a cmdOK_Click procedure collects information while creating a series of folders and then at the end displays a message box telling the user what the procedure has accomplished.
+
+ ## Creating a Log File
+
+ If you need to collect a lot of information during the course of running a procedure and either present it to the user once the procedure has finished or just make it available for reference if needed, consider using a log file rather than a message box or dialog box. Log files are useful for lengthy procedures that manipulate critical data: by writing information periodically to a log file (and by saving it frequently), you create a record of what the procedure achieves in case it crashes.
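+
+ Listing 18.2, later in this section, builds its log in a Word document; if all you need is a plain-text log, VBA's built-in file I/O statements are enough. Here's a minimal sketch of appending a time-stamped entry to a text file (the log path and the entry text are hypothetical examples):
+
+ Sub WriteLogEntry()
+
+ Dim intFile As Integer
+
+ intFile = FreeFile 'get an unused file number
+ Open "c:\temp\procedure log.txt" For Append As #intFile
+ Print #intFile, Format(Now, "yyyy-mm-dd hh:nn:ss") _
+ & vbTab & "Chicago" & vbTab & "OK"
+ Close #intFile 'flush and save the entry
+ End Sub
+
+ Opening the file For Append creates it if it doesn't yet exist, and closing the file after each entry ensures the log survives a crash later in the procedure.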
+
+ * * *
+
+ Make a Log File Useful for Both Average and Sophisticated Users
+
+ If you want a log file to be useful for ordinary users as well as to the technically inclined, make its entries readable and helpful while including any technical information required for advanced troubleshooting. For example, a message such as _The data files for the "Madrid" office (madrid060430.xlsm) and the "Taos" office (taos060430.xlsm) were not found in the expected location, \\server2\data\dayfiles\, so the information could not be included_ is usually more widely helpful than a cryptic _Error code 44E: Required Data Missing._
+
+ * * *
+
+ Say you wrote a procedure for Word that collects information from a variety of sources each day and writes it into a report. You might want to keep a log file that tracks whether information from each source was successfully transferred and at what time. Listing 18.2 provides an example of such a procedure. At the end of the procedure, you could leave the log file open so that the user could check whether the procedure was successful in creating the report or leave the summary file open so that the user could read the report itself.
+
+ **Listing 18.2**: Creating a log file
+
+ 1. Sub Create_Log_File()
+ 2.
+ 3. Dim strDate As String
+ 4. Dim strPath As String
+ 5. Dim strCity(10) As String
+ 6. Dim strLogText As String
+ 7. Dim strLogName As String
+ 8. Dim strSummary As String
+ 9. Dim strFile As String
+ 10. Dim i As Integer
+ 11.
+ 12. On Error GoTo Crash
+ 13.
+ 14. strCity(1) = "Chicago"
+ 15. strCity(2) = "Toronto"
+ 16. strCity(3) = "New York"
+ 17. strCity(4) = "London"
+ 18. strCity(5) = "Lyons"
+ 19. strCity(6) = "Antwerp"
+ 20. strCity(7) = "Copenhagen"
+ 21. strCity(8) = "Krakow"
+ 22. strCity(9) = "Pinsk"
+ 23. strCity(10) = "Belgrade"
+ 24.
+ 25. strDate = Month(Date) & "-" & Day(Date) & "-" _
+ & Year(Date)
+ 26. strPath = "f:\Daily Data\"
+ 27. strLogName = strPath & "Reports\Log for " _
+ & strDate & ".docm"
+ 28. strSummary = strPath & "Reports\Summary for " _
+ & strDate & ".docm"
+ 29. Documents.Add
+ 30. ActiveDocument.SaveAs strSummary
+ 31.
+ 32. For i = 1 To 10
+ 33. strFile = strPath & strCity(i) & " " & strDate & ".docm"
+ 34. If Dir(strFile) <> "" Then
+ 35. Documents.Open strFile
+ 36. Documents(strFile).Paragraphs(1).Range.Copy
+ 37. Documents(strFile).Close _
+ 38. SaveChanges:=wdDoNotSaveChanges
+ 39. With Documents(strSummary)
+ 40. Selection.EndKey Unit:=wdStory
+ 41. Selection.Paste
+ 42. .Save
+ 43. End With
+ 44. strLogText = strLogText & strCity(i) _
+ & vbTab & "OK" & vbCr
+ 45. Else
+ 46. strLogText = strLogText & strCity(i) _
+ & vbTab & "No file" & vbCr
+ 47. End If
+ 48. Next i
+ 49.
+ 50. Crash:
+ 51.
+ 52. Documents.Add
+ 53. Selection.TypeText strLogText
+ 54. ActiveDocument.SaveAs strLogName
+ 55. Documents(strLogName).Close
+ 56. Documents(strSummary).Close
+ 57.
+ 58. End Sub
+
+ The procedure in Listing 18.2 creates a new document that contains a summary, opens a number of files in turn, copies the first paragraph out of each and pastes it into the summary document, and then closes the file. As it does this, the procedure maintains a string of log information from which it creates a log file at the end of the procedure or, if an error occurs, during the procedure. Here's what happens in the code:
+
+ * Lines 3 through 9 declare six String variables—strDate, strPath, strLogText, strLogName, strSummary, and strFile—and one String array, strCity, containing 10 items.
(The procedure uses an Option Base 1 statement that doesn't appear in the listing, so strCity(10) produces 10 items in the array rather than 11.) Line 10 declares the Integer variable i, which the procedure will use as a counter. + * Line 11 is a spacer. Line 12 uses an On Error GoTo statement to start error handling and direct execution to the label Crash: in the event of an error. Line 13 is a spacer. + * Lines 14 through 23 assign the names of the company's 10 offices to the strCity array. Line 24 is a spacer. + * Line 25 assigns to strDate a string created by concatenating the month, the day, and the year for the current date (with a hyphen between each part) by using the Month, Day, and Year functions, respectively. For example, January 21, 2007, will produce a date string of 1-21-2007. (The reason for creating a string like this is that Windows can't handle slashes in filenames—slashes are reserved for indicating folders.) + * Line 26 sets strPath to the f:\Daily Data\ folder. Line 27 then builds a filename for the log file in the \Reports\ subfolder, and line 28 creates a filename for the summary file, also in the \Reports\ subfolder. + * Line 29 creates a new document based on Normal.dotm, and line 30 saves this document under the name stored in the strSummary variable. Line 31 is a spacer. + * Line 32 begins a For... Next loop that runs from i = 1 to i = 10. Line 33 assigns to the String variable strFile the filename for the first of the cities stored in the strCity array: strPath & strCity(i) & " " & strDate & ".docm". + * Line 34 then begins an If statement that checks whether Dir(strFile) returns an empty string. If not, line 35 opens the document specified by strFile, line 36 copies its first paragraph, and line 37 closes it without saving changes. The procedure doesn't make any changes to the document, but if the document contains any dynamic "hot fields" (such as date fields or links that automatically update themselves when the document is opened), it may have become dirty (modified). Including the SaveChanges argument ensures that users don't get an unexpected message box prompting them to save a document they know they haven't changed. (An alternative would be to set the Saved property of the document to True and then close it without using the SaveChanges argument.) + * Lines 39 through 43 contain a With statement that works with the Document object specified by strSummary. Line 40 uses the EndKey method with the Unit argument wdStory to move the selection to the end of the document. Line 41 pastes in the material copied from the document just opened, and line 42 saves the document. Line 43 ends the With statement. + * Line 44 adds to strLogText the contents of strCity(i), a tab, the text OK, and a carriage return, which will produce a simple tabbed list of the cities and the status of their reports. + * If the condition posed in line 34 isn't met, execution branches to the Else statement in line 45, and line 46 adds to strLogText the contents of strCity(i), a tab, No file, and a carriage return. Line 47 ends the If statement, and line 48 ends the For... Next loop, returning execution to line 32. + * Line 49 is a spacer. Line 50 contains the Crash: label and marks the start of the error handler. Unlike in many procedures, you don't want to stop execution before entering the error handler—as it happens, you want to execute these statements (to create the log file) even if an error occurs. Line 51 is a spacer. 
+ * Line 52 creates a new document based on the default template; line 53 types the contents of strLogText into the new document; and line 54 saves it under the name strLogName. Line 55 closes this new document (alternatively, you could leave the document open so that the user could view it). Line 56 closes the summary document (which has remained open since it was created; again, you might want to leave this open so that the user might view it or offer the user the option of keeping it open). Line 57 is a spacer, and line 58 ends the procedure.
+
+ # Making Sure a Procedure Is Running under Suitable Conditions
+
+ Another important consideration when creating a well-behaved procedure is to check that it's running under suitable conditions. This ideal is nearly impossible to achieve under all circumstances, but you should take some basic steps, such as the following:
+
+ * Make sure a file is open in a procedure that needs a file to be open—otherwise, you'll get an error every time. For example, in Excel, you might check the Count property of the Workbooks collection to make sure at least one workbook is open:
+
+ If Workbooks.Count = 0 Then _
+ MsgBox "This procedure will not run without a " _
+ & "workbook open. Open one, then run the procedure again.", _
+ vbOKOnly + vbExclamation, _
+ "No Workbook Is Open"
+
+ * Check that the procedure is referencing an appropriate item, if the procedure has definable requirements. For example, in an Excel procedure that applies intricate formatting to a chart the user has selected, make sure the user has, in fact, actually selected a chart. Trying to manipulate another object with chart-related commands is likely to cause an error or at least unwanted side effects.
+ * Make sure a file contains the element required by the procedure. (If it doesn't, an error will likely result.) Alternatively, trap the error that will result from the element's absence.
+
+ # Cleaning Up after a Procedure
+
+ Like your children or housemates, your procedures should learn to clean up after themselves. Cleaning up involves the following:
+
+ * Undoing any changes that the procedure had to make
+ * Closing any files that no longer need to be open
+ * Removing any scratch files or folders that the procedure has created to achieve its effects
+
+ ## Undoing Changes the Procedure Has Made
+
+ In some cases, you'll need to make changes to a document in order to run a procedure successfully. Here are a couple of examples:
+
+ * In Word, you might need to apply some formatting to half of a table but not to the rest of it. In this case, it may be easier to split the table into two tables so that you can select columns in the relevant part and format or change them without affecting the columns in the other half of the original table. If you do this, you'll want to join the tables together again afterward by removing the break you've inserted between the original table's two halves. The easiest way to do this is to bookmark the break that you insert. You can then go back to the bookmark and delete it and the break at the same time. Alternatively, you could use a Set statement to define a range for the break and then return to the range and remove the break.
+ * In Excel, you may need to define named ranges in a workbook so that you can easily reference them from the code. (Usually, you'll do better to work with Range objects directly in VBA, which won't leave unwanted named ranges in the workbook.) Delete these named ranges when you've finished with them, as in the sketch that follows this list.
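+
+ Here's a minimal sketch of that create-use-delete pattern in Excel. The name kTempData, the worksheet Sheet1, and the formatting step are all hypothetical placeholders for whatever your procedure actually needs:
+
+ Sub UseTemporaryNamedRange()
+
+ 'create a scratch named range for the procedure to work with
+ ThisWorkbook.Names.Add Name:="kTempData", _
+ RefersTo:="=Sheet1!$A$1:$C$20"
+
+ 'work with the named range
+ Range("kTempData").Font.Bold = True
+
+ 'clean up: delete the name so no debris is left in the workbook
+ ThisWorkbook.Names("kTempData").Delete
+ End Sub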
+
+ ## Removing Scratch Files and Folders
+
+ During a complex procedure, you may need to create scratch files in which to temporarily store or manipulate data, or scratch folders in which to store temporary files.
+
+ For example, if you need to perform complex formatting on a few paragraphs of a long document in Word, you may find it easier to copy and paste those paragraphs into a new blank document and manipulate them there than to continue working in the original document and risk unintentionally affecting other paragraphs as well. Likewise, in PowerPoint, you might need to create a new presentation that you could use for temporary or backup storage of intricate objects.
+
+ Creating scratch files, while often necessary for the safe and successful operation of a procedure, can be intrusive. You're cluttering up the user's hard drive with information that's probably of no use to that user. Creating scratch folders in which to save the scratch files is even worse. Always go the extra distance to clean up any temporary items that you've stored on the user's hard drive. If you're thinking that commercial applications don't always do this, not even Microsoft's applications, you're right. But that doesn't mean you should follow their example.
+
+ If your procedure is going to remove any scratch files it creates, you may be tempted to conceal their creation and subsequent deletion from the user. This usually isn't a good idea—in most cases, the best thing is to warn the user that the procedure will create scratch files. You might even let the user specify or create a suitable folder for the scratch files or present the user with a list that logs the files created and whether they were successfully deleted. Doing so will allow users to easily delete any scratch files left on their computer if your procedure goes wrong or is interrupted during execution.
+
+ Another approach is to use the API (application programming interface) functions GetTempPath and GetTempFileName to find out the location of the computer's temporary folder and a temporary filename that you can use. (How to make an API call is illustrated in Chapter 30, "Accessing One Application from Another Application," in the sidebar titled "Using the Sleep Function to Avoid Problems with Shell's Asynchrony.") But even if you use the default temporary folder, you should delete any files that you create in it when your procedure is finished. Again, a disappointing number of commercial software developers fail to do this.
+
+ ### Using Your Own Scratch Folder
+
+ You can use the MkDir command to create a folder. For example, the following statement creates a folder named Scratch Folder on the C: drive:
+
+ **MkDir** "c:\Scratch Folder"
+
+ Before creating a folder, use the Dir command to check to see that the name isn't already in use. (If a folder with that name already exists, an error results.) Here's how:
+
+ Dim s As String
+ s = "c:\TempDir"
+
+ If Len( **Dir** (s, vbDirectory)) = 0 Then
+ MkDir s
+ End If
+
+ For temporary storage, you may want to use a folder name based on the date and time to lessen the chance that a folder with that name already exists. You could also use VBA's Rnd function to generate a random number to use as part of the folder name.
+
+ ### Deleting a Scratch Folder
+
+ You can use the RmDir statement to remove an empty folder. (Make sure that you've deleted all files in the folder first—otherwise RmDir will fail.)
For example, the following statement removes the scratch folder named Scratch Folder on the C: drive: + + RmDir "c:\Scratch Folder" + +# The Bottom Line + +**Understand the characteristics of well-behaved procedures.** + +Well-behaved procedures don't annoy or alarm the user either during or after their execution. + +Master It + +Name two ways programmers can write procedures that don't annoy users. + +**Retain and restore the user environment.** + +Users quite rightly don't appreciate it if your procedure leaves the state of their application's or operating system's environment modified. Find ways to restore the user environment before your procedure finishes execution. + +Master It + +Assume that you are writing a procedure that employs Word's Search and Replace feature. This feature retains its settings between uses so the user can repeatedly trigger the same search or replace actions. How can you temporarily store the status of the user's last search or replace so that you can restore this data after your procedure is finished executing? + +**Let the user know what's happening.** + +Particularly when a procedure is doing a lengthy "batch job" such as updating dozens of files, it's important to let the user know that the computer hasn't frozen. People need to be told that execution is continuing as expected even though nothing appears to be happening. + +Master It + +Describe a way to let the user know that a procedure isn't frozen—that activity is taking place during execution. + +**Check that the procedure is running under suitable conditions.** + +Another important element of creating a well-behaved procedure is to check that it's running under suitable conditions. This ideal is nearly impossible to achieve under all circumstances, but you should take some basic steps. + +Master It + +If a procedure accesses data from a file, name an error that could occur and thus should be trapped. + +**Clean up after a procedure.** + +A well-behaved procedure avoids leaving unneeded files or other temporary items behind. In other words, a procedure should clean up after itself. + +Master It + +Cleaning up involves three major tasks. Name one. +Chapter 19 + +Securing Your Code with VBA's Security Features + +This chapter discusses how to use the security tools that VBA provides for distributing and implementing macros and VBA code. VBA security falls into three categories: securing your applications against rogue VBA code; establishing that your VBA code isn't itself rogue so that it can be run; and securing your code against theft, alteration, or snooping. + +In this chapter you will learn to do the following: + + * Understand how VBA implements security + * Sign a macro project with a digital signature + * Get a digital certificate + * Choose the appropriate security level + * Lock your code + +# Understanding How VBA Implements Security + +Macros, dialog boxes, and user forms that you write are computer programs, albeit usually rather small ones. But because macros, like any other computer program, can access the user's hard drive and exploit other features of a computer, macros can do damage. + +Office and the operating systems Vista, Windows 7, and Windows 8 include a variety of security features designed to protect the user from malicious code—macro, virus, Trojan horse, bot, or whatever. But some security features are specific to Office documents and the macros, dialog boxes, and user forms they can contain. + +Scary but true: An evil macro can do its damage _automatically_. 
It's not necessary for the user to deliberately launch a macro from the Macros dialog box or from within the VBA Editor. Some procedures (with certain special names such as Open) automatically launch themselves. For example, if you name one of your procedures Document_Open, all the code within that Sub executes spontaneously when the user opens its host document:
+
+ Private Sub **Document_Open**()
+
+ This can be handy, of course. Perhaps you'll want to write some code in this procedure that automatically sets up your preferred zoom level or completes some other housekeeping task that you always perform when opening any document. But the fact that the user doesn't need to specifically choose to run this macro means that a virus can be put into this procedure. And whammo—your computer is infected.
+
+ Malicious code can enter a user's Office applications via three primary vehicles: macros, ActiveX controls, and add-ins. Microsoft provides users with various approaches to VBA security, including the following:
+
+ * Certain Office document file types that simply cannot contain any embedded macros at all. That's the difference between, for example, saving a file using the Word .docx option, which cannot contain macros, and the .docm file type, which can.
+ * Documents that are loaded from a trusted area on the hard drive.
+ * Trust Center settings the user can specify, such as completely preventing the execution of any ActiveX controls, macros, or add-ins without even notifying or querying the user. Alternatively, the user can be prompted for permission before potentially dangerous code is allowed to execute.
+ * A list of user-modifiable "trusted publishers"—companies whose documents are considered safe.
+ * The ability to digitally sign your own documents or templates, thereby making you a "trusted publisher."
+
+ Office 2007 introduced the concept of two types of documents. For the first time, the user could save documents that simply cannot contain any macros or other potentially malicious code. By default, any new Word document is of the .docx type, not the .docm (macro-enabled) type. In other words, a document must be deliberately created as a macro-enabled document. And because it also must have a .docm filename extension, everybody else (including Word when opening the document) knows that it contains possibly dangerous code. Administrators can use Group Policy to enforce rules concerning which file types are permitted. But the default .docx file type is free of potentially risky executables (files or procedures that can execute).
+
+ Other Office applications also have pairs of macro-disabled, macro-enabled file types. Excel has .xlsx and .xlsm files, and PowerPoint has .pptx and .pptm files.
+
+ Office includes various security tools and features that ordinary users, administrators, and IT professionals can employ to further safeguard Office applications from attack:
+
+ * An Office ActiveX kill bit that allows administrators to forbid certain ActiveX controls from executing.
+ * File-type blocking that can be implemented via Group Policy settings or individually via the Office Trust Center. The types of files that an application can access can be specifically controlled.
+ * A Trusted Documents feature that allows users to specify individual documents as reliable, thereby overriding whatever macro settings the user has enforced in the Trust Center.
+ * A scanning feature that searches for format exploits before a file can be opened by an Office application.
+ * A sandbox named Protected View. A sandbox isolates an executing program so it can't damage other programs, introduce viruses into the operating system, or store nasty surprises on the hard drive. Figure 19.1 shows the warning you get if you're about to open a document from a potentially dangerous source, and the Protected View options. This is similar to starting Windows in Safe Mode. In Protected View, executables are disabled. The protected document is in effect quarantined, so it theoretically can't do any harm to your computer or its contents. I say _theoretically_ because as we all know, no security is perfect. Note that in its description of Protected View in Figure 19.1, Microsoft carefully states that this mode will "help minimize harm"—no claim of invulnerability. All the Protected View options are turned on by default, so files you get from the Internet and Outlook attachments, for example, are automatically tossed into the sandbox when opened.
+
+ Figure 19.1 Suspect sources trigger this security warning when opened in Office applications.
+
+ Also, the user can deliberately choose to open a file in sandbox mode by selecting the file's name in an Office application's Open dialog box, clicking the Open drop-down box in the lower-right corner of the Open dialog, then choosing Open In Protected View.
+
+ Office also provides various under-the-hood features, including password security and encryption, to protect the privacy of user information.
+
+ Doubtless there are additional hardening tactics that Microsoft is not mentioning. After all, why tell the bad people everything that's being done to prevent their incursions?
+
+ * * *
+
+ **Real Security in an Insecure World**
+
+ All the virus-detection software, firewalls, digital signatures, and other security efforts in the world won't protect you or your colleagues if somebody on your network opens email attachments, downloads dodgy executables, or otherwise invites trouble into your environment.
+
+ Even if everybody is aware of the dangers and follows the best security practices, viruses and other troubles can _still_ get in. After all, antivirus applications are always playing catch-up. A new virus is released, and then the antivirus forces identify it and send out a new update.
+
+ On the plus side, currently it's pretty rare to find macros employed as a vehicle for spreading viruses. And, of course, if you're writing the VBA code yourself—as a reader of this book—you can certainly trust the source of _your_ macros. It's you!
+
+ Because threats are constant, and because it's ultimately impossible to guarantee that you will never get a virus (in spite of taking great pains to prevent them), you should ensure that you are taking additional precautions to at least mitigate damage.
+
+ Malicious software falls into two broad categories:
+
+ * Code that attempts to do damage by, for example, erasing files or slowing your computer down so much that it becomes painful to use. The goal here is to create a mess you have to clean up.
+ * Code that attempts to find out your secrets by, for example, stealing your identity to ruin your credit or to drain your bank account. The issue here is violation of your privacy, a different kind of mess you have to clean up.
+
+ If you're concerned about privacy, you should encrypt any documents containing sensitive information. Fortunately, with Office 2007 the formerly weak Office encryption scheme was replaced with a highly secure one.
And Microsoft continues to toughen built-in encryption schemes and has added integrity-checking technologies for encrypted files. PowerPoint, Word, and Excel all permit you to encrypt files and then decrypt them by providing a password. Click File on the Ribbon, then in the Info page click Protect Document and then Encrypt With Password. + +If you're worried about a virus attack, be sure to back up your documents (you should do this anyway, in case of a drive crash, fire, theft, or other havoc). These days, with three-terabyte external drives selling for around $100, it's practical to store your entire computer system (a "system image")—documents, programs, inbox email, everything—as an image on an external drive. That way, you wouldn't even have to reinstall applications in the event of a serious problem. You can use third-party backup systems. Or if you use Windows 7, you can use Windows's built-in backup system by choosing Start ⇒ Control Panel, then clicking Backup And Restore. + +If you use Windows 8 and just want to back up your data files, press Windows key+W and type **Save Backup** to use the new File History utility. If you want to use the traditional Windows backup, it's possible even in Windows 8. To invoke this utility, press Windows key+W and type **Windows 7 File Recovery**. From there you can create an image, a repair disk, or a traditional Windows-style backup. + +* * * + +To secure an application against rogue VBA code, you can use the Office Trust Center to choose the level of security that you want the application to use when running VBA code. Click the File tab, then choose Options. Click the Trust Center button in the left pane, and click the Trust Center Settings button. + +You can also specify which sources to trust and how much to trust them. A trusted source might be someone who works for the same company as you, or someone who has a digital certificate from a third party you trust, such as the VeriSign certification authority. Because you (in this example) trust VeriSign, you therefore trust the third party to whom VeriSign has issued a digital certificate. Office also has a trusted time-stamping feature with the digital signature technology. + +To establish that your own code is fine for the Office applications to trust, you can sign a document or template project that contains customizations or macro project items (code modules, class modules, or user forms) with a digital signature generated by a digital certificate that uniquely identifies you or your company. We'll look at this technique first because it sets the stage for specifying the level of security to use. + +You can also lock a macro project with a password so that nobody can open the code. This both prevents anyone from tinkering with your code and either stopping it from working or rendering it harmful, and protects your intellectual property: If nobody can see your code, they can't steal your ideas. The section "Locking Your Code" shows you how to do this. + +# Signing Your Macro Projects with Digital Signatures + +VBA provides a security mechanism for securing macro projects with digital signatures. The digital signatures provide a means of establishing the provenance of the projects, which should help you decide whether to trust the code. If you trust the source of the code to produce benevolent programming, you can open the project and run the code. 
If you suspect the source or the information of being malignant, you can either avoid opening the project or open the project with the code disabled. + +The same goes for other people: If others are concerned about your macros, you may need to sign your projects so that other people know where they come from and who created them. Once you've signed the projects, the code is available to any application that has specified you as a trusted source for macro projects. (This assumes users have chosen one of the Disable options in the Macro Settings dialog box. You'll see how to set the security level later, in the section "Specifying a Suitable Security Setting.") + +The following sections discuss what digital certificates are, what they mean in practical terms, how you obtain them, and how you use them to create digital signatures. + +* * * + +Trusting a Publisher Is Global for VBA-Enabled Applications + +VBA's security mechanism, and the list of certificates, is shared across the range of VBA-enabled applications on your computer. So if you designate a trusted publisher in one application, all the other applications that support VBA security will trust that source as well. For example, if you open a document that contains code in Word and choose to trust the source of the code, Excel and Outlook also gain that trust and open projects from that source without having to prompt you. + +* * * + +## What Is a Digital Certificate? + +A _digital certificate_ is an encrypted datum that uniquely identifies its holder. Rather like a driver's license, it provides a level of trust that you are who you say you are and that your code can be trusted. + +You use your digital certificate to create a digital signature for a project. This project can be a document project, a template project, or an add-in. The project doesn't have to contain macros, procedures, user forms, classes, or VBA code for you to sign it, although these contents are the usual reason for signing a project. + +A digital signature applies to a whole macro project, typically a document project or a template project. You can't apply a digital signature to just part of a project—say, just to one module of code or to one user form. Each macro project item in that macro project—each module, user form, class, and reference—is covered by the digital certificate. + +But digital signatures, while usually reliable, have sometimes been compromised. + +## Getting a Digital Certificate + +There are three types of digital certificates: those you create yourself ("self-signed"), those you get from your company or organization, and those you get from a commercial certification authority, or certificate authority (CA). + +A digital certificate you create yourself is the weakest form of identification and is of little use to people beyond you and those who use your machine, whereas a certificate from a commercial certification authority should be good enough for general use in the world. Self-signed code will generate a security warning if someone opens a file containing this code. Office applications will not allow this code to run on any but the machine on which the certificate was created. + +A certificate issued by your company falls in the middle range of trustworthiness: In many cases, the company will have obtained the certificate from a commercial certification authority, which means the commercial certification authority has established to its satisfaction that the company is trustworthy. 
Whom the company chooses to trust with the certificate is another matter and introduces another complicating link into the chain of trust. However, server software such as Windows Server includes independent certification-authority services that do not require a certificate from a commercial certification authority, so you should be careful which certificates you trust. See the section "Whose Certificate Is It, and What Does It Mean?" later in this chapter for a discussion of how to discern a certificate's provenance and meaning. + +### Creating a Digital Certificate of Your Own + +The quickest and easiest way of getting a digital certificate is to create one yourself. It's easy, but its usefulness is very limited. Remember that this kind of certification only works on the computer on which the certificate was created. + +To understand how digital certificates work, you'll probably want to create several of your own and practice with them on sample files. By designating some of your files as originating from trusted publishers and leaving others untrusted, you can get a clear idea of how digital certificates work without having to actually mess around with suspect code on your system. + +To open the Create Digital Certificate dialog box (see Figure 19.2), from the Desktop in Windows 8, press the Windows Key and type **digital certificate**. Press Enter when you see Digital Certificate for VBA projects. You'll see the form you can "sign," as shown in Figure 19.2. + +Figure 19.2 You can self-sign a certificate, but Office only permits such certification to be trusted within the computer where the certificate was created. + +If you're using Windows 7, choose Start ⇒ All Programs ⇒ Microsoft Office ⇒ Microsoft Office 2013 Tools ⇒ Digital Certificate For VBA Projects. + +Type the name for the certificate in the text box, and then click the OK button. The SelfCert application creates the certificate and installs it automatically. + +### Getting a Digital Certificate from Your Company + +Your second option is to get a digital certificate from a digital certificate server that your company has. The details of this procedure vary from company to company. The certificates the company provides via its digital certificate server are generated in the same fashion as the digital certificates distributed by the commercial certification authorities discussed in the next section. However, a company distributes the certificates from a pool that it has allocated, without needing to apply to the certification authority for each certificate as it's needed, or creates the certificates of its own accord without getting them from a certification authority. Clearly this isn't all that safe. A rogue employee can _pose_ as trustworthy, obtain a company certificate, and then run totally wild. Totally. + +### Getting a Digital Certificate from a Commercial Certification Authority + +Your third choice is to get a digital certificate from a commercial certification authority such as these: + + * VeriSign (www.verisign.com). This, the most famous code-signing company, is now owned by Symantec. + * Go Daddy (www.godaddy.com) is the new kid on the block. Offers bargain code certification and other security products. + * Thawte, Inc. (www.thawte.com, a VeriSign company). + * GeoTrust (www.geotrust.com, another VeriSign company). + * DigiCert (www.digicert.com). 
+ +VeriSign's computers handle four trillion lookups per day, but the company plans to spend $300 million over the next several years to increase that capacity to four quadrillion. + +Several types of certificate are available, depending on what you want to do. If you're creating and distributing software, you'll probably want to consider one of the certificates targeted at developers. + +The procedure for proving your identity varies depending on the CA and the type of certificate you want. Generally speaking, the greater the degree of trust that the certificate is intended to inspire, the more proof you'll need to supply. For example, you can get a basic certificate on the strength of nothing more than a verifiable email address, but this type of certificate is unlikely to make people trust you. Other certificate types require you to appear in person before a registration authority with full documentation (such as a passport, driver's license, or other identity documents). Such certificates obviously inspire more trust. + +### Installing a Digital Certificate + +Once you have a digital certificate, you need to install it so that Windows and the applications that will use it know where it's located. + +To install a digital certificate, follow these steps (you must be logged on as Administrator to view the Certificates dialog box): + +1. In Windows 8, from the Desktop, press the Windows Key and type **certmgr.msc**. + +* * * + +Self-Certifications Are Automatically Registered + +The Office SelfCert program automatically registers the certificates it creates on the computer on which it creates them. If you created a digital certificate for yourself, you shouldn't need to install it on the same computer. If you want to practice installing it, you'll need to use a different computer. + +* * * + +In Windows 7, click the Start button. A Search Programs And Files field opens just above the Start button. In the Search Programs And Files field, type **certmgr.msc**. + +**2.** When certmgr.msc appears in the Programs list, click it. You'll possibly be asked if you want to give yourself permission to take this step. Unless you are not you, go ahead and grant the permission by clicking the Continue button. (From this point on, Windows 7 will take a different path and display different dialogs than those shown here.) + +You now see the Certificates dialog box shown in Figure 19.3. + +Figure 19.3 Windows provides the Certificates dialog box to manage digital certificates. + +As you can see in Figure 19.3, I, identifying myself as an entity named _TotallyTrustworthy_ , granted code-signing certification to myself, also TotallyTrustworthy, as described earlier in this chapter in the section "Creating a Digital Certificate of Your Own." + +**3.** Click the Trusted Publishers folder in the left pane of the Certificates dialog box. + +**4.** Choose Action ⇒ All Tasks ⇒ Import from the Certificates dialog box's menu. The Certificate Import Wizard opens, as shown in Figure 19.4. + +Figure 19.4 Windows includes the Certificate Import Wizard to manage digital certificates. + +**5.** Click the Next button in the wizard to locate the file you want to import. You can search your hard drive for filenames ending in .cer or .crt. + +**6.** Click Next to display the Certificate Store page of the wizard, shown in Figure 19.5. + +Figure 19.5 On the Certificate Store page of the Certificate Import Wizard, choose the certificate store in which to store the certificate you're importing. + +7. 
Choose how to store the certificate: + + * To have Windows store each certificate automatically in the default certificate store for the certificate's type, select the Automatically Select The Certificate Store Based On The Type Of Certificate option button. + * To control where Windows stores the certificates, select the Place All Certificates In The Following Store option button. To specify the store, click the Browse button to display the Select Certificate Store dialog box, shown in Figure 19.6. Choose the certificate store (for example, Personal) and click the OK button. To specify a particular location within a certificate store, select the Show Physical Stores check box, and then click the plus (+) sign next to the store in question to display its subfolders. Select the folder you want, and then click the OK button. + +Figure 19.6 Use the Select Certificate Store dialog box to specify the certificate store in which you want to keep the certificate. The screen on the left shows the categories of stores; the screen on the right shows the physical stores. + +8. Click the Next button to finish setting up the import procedure. The Completing The Certificate Import Wizard dialog box is displayed to confirm the choices you've made. + +9. Review your choices, and then click the Finish button. The Certificate Import Wizard imports the certificate and then confirms that the operation was successful. + +Now that you've imported the certificate, it appears in the Certificates dialog box on the appropriate page. + +### Exporting a Digital Certificate + +You may need to export a certificate for backup so that you can keep it safely on removable media away from your computer or so that you can install it on another computer. For security, you should not store the digital certificate on your hard drive after you install it, because storing it there is an unnecessary security risk. + +To export a certificate, right-click it in the Certificates dialog box, then choose All Tasks ⇒ Export. Windows starts the Certificate Export Wizard, which walks you through the process of exporting the certificate. If you choose to export the private key with the certificate, be sure to protect it with a password. + +### Removing a Digital Certificate + +To remove a digital certificate from Windows's digital certificate store, follow these steps: + +1. Display the Certificates dialog box (follow steps 1 and 2 in the section earlier in this chapter on installing a certificate). + +2. Click the folder in the left pane that contains the digital certificate in question, and then select the certificate you want to remove. + +3. Click the red X icon, or choose Action ⇒ Delete. Windows displays a dialog box warning you of the consequences of deleting the digital certificate and asking you to confirm the deletion. Figure 19.7 shows the warning you get when removing a certification authority certificate (top) or a personal certificate (bottom). Click the Yes button to delete the certificate. + +Figure 19.7 Two of the warnings the Certificate Manager displays when you're about to remove a digital certificate + +### Signing a Macro Project with a Digital Signature + +Once you've completed a macro project and have it ready for distribution, you sign it with a digital signature so that applications that use a high level of security can use it. + +To sign a macro project digitally, follow these steps: + +1. In the VBA Editor, navigate to the document or template project that contains the macro project you want to sign. + +2. 
Select the project in the Project Explorer.

3. Choose Tools ⇒ Digital Signature to display the Digital Signature dialog box (see Figure 19.8).

If the Digital Signature dialog box lists the certificate you want in the Sign As area, simply click the OK button to use that certificate.

Figure 19.8 Use the Digital Signature dialog box to specify the digital signature for a macro project.

4. Click the Choose button. If you have more than one certificate, you'll see a Select Certificate dialog box. (If you have only one certificate, you'll see the Windows Security dialog box where you can confirm your choice, as shown in Figure 19.9. You should then skip to step 7.)

Figure 19.9 Use this Windows Security dialog box to confirm your choice of certificate with which to sign the macro project.

5. Click the certificate you want to use for the macro project.

6. Click the OK button to apply the selected certificate and close the Select Certificate dialog box.

7. Click the OK button to close the Digital Signature dialog box.

8. Click the Save button on the Standard toolbar, press Ctrl+S, or choose File ⇒ Save to save the document or template project with the digital signature applied to it.

### Removing a Digital Signature from a Macro Project

To remove a digital signature from a macro project, follow these steps:

1. In the VBA Editor, navigate to the document or template project that contains the macro project.

2. Select the project in the Project Explorer.

3. Choose Tools ⇒ Digital Signature to display the Digital Signature dialog box.

4. Click the Remove button. Both the Certificate Name readout in the area labeled The VBA Project Is Currently Signed As and the Certificate Name in the Sign As area of the Digital Signature dialog box will display [No Certificate] to indicate that the project no longer has a digital certificate assigned to it.

5. Click the OK button to close the Digital Signature dialog box.

You can always reapply the digital signature to the project whenever you wish, as described earlier in this chapter.

### Whose Certificate Is It, and What Does It Mean?

When you receive a digitally signed project, you'll probably want to find out just who has signed it and just what type of digital certificate they used. To view the details of a digital certificate, follow these steps:

1. In the VBA Editor, navigate to the document or template project that contains the macro project.

2. Select the project in the Project Explorer.

3. Choose Tools ⇒ Digital Signature to display the Digital Signature dialog box.

4. For an official (VeriSign or other) certification, click the Details button to see information about the source.

If you want to view the details of one of your own, dodgy, _self-signed_ certificates, click the Choose button in the Digital Signature dialog box, and click the Click Here To View Certificate Properties link to display the Certificate Details dialog box shown in Figure 19.10.

Figure 19.10 Use the Certificate Details dialog box to examine the properties of a certificate.

By examining Figure 19.10 close up, you'll see the Official Certificate icon with the Gold Seal and the Blue Ribbon (they inspire trust), but there is, alas, also a _Red X_ symbol! Chilling. This X means that the project in question _cannot be trusted whatsoever_.
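* * *

Checking for a Signature from VBA

Incidentally, you can also find out from code whether a document's macro project has been signed. Here's a minimal sketch using the Document object's read-only VBASigned property; note that it reports only that a signature is present, not whether the signer should be trusted:

    Sub CheckSignature()
        ' VBASigned returns True if the document's VBA project
        ' carries a digital signature.
        If ActiveDocument.VBASigned Then
            MsgBox "The VBA project in this document is digitally signed."
        Else
            MsgBox "The VBA project in this document is not signed."
        End If
    End Sub

* * *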
The Certificate Details dialog box has three pages:

 * The General page displays basic information about the certificate: for what purpose the certificate is intended, to whom it's issued, by whom it's issued, and the period for which it's valid.
 * The Details page of the Certificate Details dialog box, shown in Figure 19.11, contains specifics about the certificate. Click one of the fields in the list box to display its value in the text box below.
 * The Certification Path page of the Certificate Details dialog box shows the path by which the certificate has been issued from the issuing authority to the current holder. To check one of the links in the chain, select it in the Certification Path list box and click the View Certificate button (if it's available). You'll see the Certificate Details dialog box for the certificate in question. You can then follow the certification path for that certificate if you choose or click the OK button to dismiss the second (or subsequent) Certificate Details dialog box and return to the previous one.

Figure 19.11 The Details page of the Certificate Details dialog box contains a host of details about the certificate.

# Choosing a Suitable Level of Security

To use VBA macros safely, you or a user of your code must open the Office Trust Center and choose a suitable level of security—high enough to avoid the threats posed by malicious or incompetent code but low enough that it doesn't prevent you from running useful, safe code.

## Understanding the Security Threats Posed by VBA

The VBA macro language is formidable. It can accomplish sophisticated and valuable tasks. But its capabilities also pose a threat when misused. Using relatively simple VBA commands, you can create files, delete files, manipulate existing data, and even control other applications.

Also, code developed with the best of intentions can damage a computer when run under unsuitable circumstances. For example, a procedure might delete valuable data or delete critical files, making the computer crash. Such unintentional damage happens frequently enough, but what tends to make the headlines is damage caused intentionally by malicious code in macro viruses and other malicious software (or _malware_).

A _macro virus_ is simply a computer virus written in a macro language such as VBA.

## Protecting against Macro Viruses

Protecting your computer (and computers connected to it in a network) against macro viruses requires three main steps:

1. Install and run antivirus software, such as Malwarebytes (www.malwarebytes.org/) on your computer. And use the Windows Defender that's built into Windows. Update the antivirus software frequently and regularly with the latest virus definitions. (Most antivirus software offers automatic updating.)

2. Configure suitable security settings in the applications you use, especially in those applications that host VBA or other programming languages or scripting languages. For example, configure VBA security settings as described in the next section.

3. Be careful when opening any file that might contain code or an email attachment. Most modern applications warn you when there might be a problem with a file. Many macro viruses attempt to outwit such warnings by _social engineering_—conning the user—rather than by sophisticated programming.

For example, a macro virus may transmit itself as an email attachment to all the addresses in a friend's email application.
The message and attachment suggest that the contents of the attachment are interesting or amusing—for example, jokes or compromising pictures. Because the file comes from a friend, someone known and trusted, and because the contents seem compelling, many users will open the file and ignore any security warnings. The action of simply opening the file can cause the code within the file to execute. Similarly, simply opening a Word .docm file can execute a macro. And by then it could be too late. Creepy code robots could be multiplying exponentially throughout your system.

## Specifying a Suitable Security Setting

First, set a suitable level of security for your purposes. To open the Options dialog box in Access, Word, Excel, or PowerPoint, click the File tab, then choose Options. Click the Trust Center button in the left pane. Then click the Trust Center Settings button, and click Macro Settings (see Figure 19.12).

Figure 19.12 On the Macro Settings page of the Trust Center dialog box, choose the level of security you want to use when running macros.

The various macro security settings are self-explanatory. However, if you are working in documents that you've created yourself and saved as the .docm type, and you've written your own macros, you can temporarily choose the Enable All Macros option. At least while you're practicing with the examples in this book, you can trust your own documents. However, if you are opening macro-enabled document files (.docm or the other files from PowerPoint or Excel with an m appended to the filename extension), you should specify a less risky setting in your Trust Center macro settings.

There's an easier way to deal with this problem, though. You can alternatively (and more safely) employ one of the disable options shown in Figure 19.12, but while doing development work with VBA (such as experimenting with the code in this book), just ensure that you save your .docm documents in one of the trusted locations. You can see the list of trusted locations by clicking the Trusted Locations button shown in the left pane in Figure 19.12.

If you choose the Disable All Macros Except Digitally Signed Macros option, any unsigned macros in your documents won't work. They are blocked from executing. However, you can get them to work again by simply moving the document files to a trusted location.

## Additional Trust Center Settings

Microsoft is currently encouraging (by the pricing structure if nothing else) its Office customers to move from a one-purchase, disk-based Office installation to a downloaded, pay-yearly subscription model called Office 365.

What's more, there are seven versions of Office 365, each with its own variations on security features, such as whether or not it supports Group Policy settings. To see the variations, visit this page:

 

Notice also a security feature listed in the left pane in Figure 19.12 that is new in Office 2013: Trusted App Catalogs.

Open the Trusted Application Catalogs page in the Trust Center dialog box and you'll see the options illustrated in Figure 19.13.

Figure 19.13 On this page of the Trust Center dialog box, choose whether you want to trust app catalogs.

* * *

The New Office Apps

What is an Office app? Microsoft describes the new apps like this: "An app for Office is a region inside an Office application that contains a web page that can interact with the document to augment content and provide new interactive content types and functionality.
apps [sic] for Office can be obtained by users from the new Office marketplace or from a private catalog in the form of stand-alone apps or subcomponents of a document template solution, or a SharePoint application." In other words, an online image-search tool or grammar checker could be embedded in Word as a command-bar pane, like Word's own built-in Navigation or Thesaurus command bars. + +At the time of this writing, the kinks have not yet been ironed out of all apps for Office, but if you're interested, you can try some free apps that are available in the Office Store. Click the Insert tab on the Ribbon, then click Apps For Office. Click See All. Click the _Find more apps at the Office Store_ link. Your browser opens showing various apps you can add to whatever Office application you happen to be working in currently. + +The apps for Office technology is, to be polite, still being refined at this time. For a sad example, try adding the Merriam-Webster dictionary to Word. Word 2013 has no built-in dictionary, presumably to encourage you to use an app instead. But this dictionary may not be the best. It doesn't have an entry for _normally_ , for example. Worse, if you look up _normal_ , the first definition given is _perpendicular_. A superior dictionary app for Office is the Bing dictionary, found here: + +http://office.microsoft.com/en-us/store/results.aspx?vtags=Reference&av=zwd150 + +Alas, the person who wrote the interface for the Bing dictionary seems to think that the adjective _lookup_ is interchangeable with the verb _look up_. But don't blame Bing. On the plus side, the Bing dictionary does have a good definition of the word _naturally_. + +A key feature of the new Office apps is that they cannot be written in VBA. You must use "web technologies like HTML5, XML, CSS3, JavaScript, and REST APIs" instead. + +* * * + +When you trust a catalog of Office apps, you're telling Office that it can stop notifying you or otherwise blocking executable content (such as macros or ActiveX controls) from this source. Thus you can override on a case-by-case basis the macro and other security settings that have been specified (see Figure 19.12 and Figure 19.13). + +### File Block Settings + +The File Block Settings page, shown in Figure 19.14, gives you the ability to block individual file types from opening or to open them in Protected View. Here you can also specify which types of files can be saved. Notice at the bottom of this page that you specify what choosing the Open option means: + + * Do Not Open Selected File Types means documents are totally blocked. + * Open Selected File Types In Protected View means you can open documents in the sandbox for reading only. + * Open Selected File Types In Protected View And Allow Editing means you can open documents in the sandbox for editing. + +Figure 19.14 File Block Settings specify what types of documents you want blocked or sandboxed. + +If you want to delve more deeply into Office 2013 security features, take a look at these web pages and the links therein: + + + + + +* * * + +Can Even a Simple .txt File Harbor a Virus? + +You might wonder why the _Plain Text Files_ option is included in the File Block Settings page shown in Figure 19.14. It would seem that a simple Notepad .txt file couldn't contain any dangerous executable code (any more than a stop sign could fire a bullet at you). After all, text is just words, right? + +Nope. Even opening simple .txt files can install a virus. How? The bad guys use trick filename extensions. 
Even though it says .txt, it might only be masquerading as a text file. Executable files (programs or viruses) usually have a .exe filename extension, but by default Windows _hides_ filename extensions. So you see Word, not Word.exe in Windows Explorer. Also, Windows files can be named with multiple extensions. So, you can have a dangerous file named OpenMe.txt.exe, but thanks to Windows's default extension-hiding, the filename that you actually see in this case is OpenMe.txt. You go ahead and double-click it thinking it will open in Notepad like most .txt files. Your hard drive explodes. Well, maybe not a detonation, but all your files could be wiped or there could be some other nasty virus surprise. OpenMe.txt was merely _posing_ as a .txt file, and inside was a monster. + +* * * + +# Locking Your Code + +To prevent anyone from viewing the contents of a macro project, you can lock it with a password. You'll usually want to do this before distributing a project to your colleagues. If your workplace is particularly volatile, you might even want to lock projects while they are merely under development on your own desktop. The argument against locking a project on which you're still actively working is that the lock adds a step to accessing the modules and forms in the project—but if you need the security, it's well worth the small amount of effort involved. + +Follow these steps to lock a document or template project: + +1. Press Alt+F11 to display the VBA Editor. + +2. In the Project Explorer, right-click the project that you want to lock, and choose Project Properties from the context menu to display the Project Properties dialog box. Alternatively, select the project in the Project Explorer and choose Tools ⇒ Project Properties. + +3. Click the Protection tab to display the Protection page (see Figure 19.15). + +Figure 19.15 Use the Protection page of the Project Properties dialog box to lock the project. + +4. Select the Lock Project For Viewing check box in the Lock Project group box. + +5. In the Password To View Project Properties group box, type a password in the Password text box and the same password in the Confirm Password text box. Setting a password is compulsory: You can't lock a project without specifying a password. Without a password, how could you unlock it? + +6. Click the OK button to apply the locking to the project. The VBA Editor closes the Project Properties dialog box but leaves the contents of the project open for you to view and work with. + +7. Switch back to the application, save your work, and close the application. + +Once you've done that, the project is locked and can't be viewed or edited without the password. When you choose to edit a procedure in the project from the application or try to expand the project in the Project Explorer in the VBA Editor, the Project Password dialog box appears, as shown in Figure 19.16 (unless you have macros disabled in the Trust Center settings). + +Figure 19.16 When you open a locked project, you need to enter the password for the project in this Project Password dialog box. + +Type the password in the Password text box and click the OK button to display the contents of the project. (If you enter the wrong password, the application or the VBA Editor displays a Project Locked message box followed by the Project Password dialog box for you to try again.) 
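* * *

Checking Whether a Project Is Locked from VBA

The VBE object model doesn't expose a supported way to apply or remove the lock itself from code, but you can at least detect it. The following is a minimal sketch (not one of the locking steps above) that reads the Protection property of a document's VBProject object; it assumes you have selected Trust Access To The VBA Project Object Model in the Trust Center, because otherwise the VBProject property raises an error:

    Sub IsProjectLocked()
        ' Protection returns 1 (vbext_pp_locked in the VBA
        ' Extensibility library) for a locked project and
        ' 0 (vbext_pp_none) for an unlocked one.
        If ActiveDocument.VBProject.Protection = 1 Then
            MsgBox "This project is locked for viewing."
        Else
            MsgBox "This project is not locked."
        End If
    End Sub

* * *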
+ +To unlock a project, open it in the VBA Editor (supplying the password), display the VBA Project Properties dialog box (by right-clicking the project's name in the Project Explorer, then choosing the Project Properties option from the context menu), clear the Lock Project For Viewing check box on the Protection page, and click the OK button. Save the file that contains the project. + +# The Bottom Line + +**Understand how VBA implements security.** + +Microsoft takes a multipronged approach to protecting users from malicious VBA code embedded in documents and capable of launching itself when the user simply opens the document. + +Master It + +Name two ways that users are protected from malicious VBA code. + +**Sign a macro project with a digital signature.** + +You can add a digital signature to your projects by creating your own certification, getting it from your company, or getting it from certification authorities such as VeriSign. + +Master It + +Describe the limitations of certifying a VBA macro project for yourself—without obtaining a certificate from your company or a commercial certification authority. + +**Get a digital certificate.** + +Commercial certification authorities provide the greatest level of security, but their certification is also more difficult to attain than self-certification or certification from your company. + +Master It + +Name some of the ways you may be required to prove your identity when obtaining a digital signature from a commercial certification authority. + +**Choose the appropriate security level.** + +When choosing the right security level to use VBA macros safely, you or a user of your code must achieve a balance. The security level must be set high enough to avoid malicious or incompetent code but low enough that it doesn't prevent you from running useful, safe code. + +Master It + +To set a suitable level of security for your purposes, open the Trust Center in Access, Word, Excel, or PowerPoint. You'll see four settings. Which one of the following five settings is _not_ available: + + * Disable All Macros Without Notification + * Disable All Macros With Notification + * Disable All Macros Except Digitally Signed Macros + * Enable All Macros With Notification + * Enable All Macros + +**Lock your code.** + +You can protect your source code in the VBA Editor from others. You add a password to a project (projects are in boldface in the Project Explorer), and others can't open your VBA procedures for reading or modifying. + +Master It + +What is the one drawback to locking your code? 
+Part 6 + +Programming the Office Applications + + * **Chapter 20: Understanding the Word Object Model and Key Objects** + * **Chapter 21: Working with Widely Used Objects in Word** + * **Chapter 22: Understanding the Excel Object Model and Key Objects** + * **Chapter 23: Working with Widely Used Objects in Excel** + * **Chapter 24: Understanding the PowerPoint Object Model and Key Objects** + * **Chapter 25: Working with Shapes and Running Slide Shows** + * **Chapter 26: Understanding the Outlook Object Model and Key Objects** + * **Chapter 27: Working with Events in Outlook** + * **Chapter 28: Understanding the Access Object Model and Key Objects** + * **Chapter 29: Manipulating the Data in an Access Database via VBA** + * **Chapter 30: Accessing One Application from Another Application** + * **Chapter 31: Programming the Office 2013 Ribbon** + +Chapter 20 + +Understanding the Word Object Model and Key Objects + +In this chapter you'll become familiar with the Word object model and the architecture underlying Word. You'll see how to perform common tasks with the most frequently useful Word objects. These objects include the Documents collection and the Document object, the Selection object, Range objects, and the Options object. + +In this chapter you will learn to do the following: + + * Understand the Word object model + * Understand Word's creatable objects + * Work with the Documents collection and the Document object + * Work with the Selection object + * Create and use ranges + * Manipulate options + +# Examining the Word Object Model + +You don't need to understand how the entire Word object model fits together in order to work with VBA in Word, but most people find having a general idea of the components and structure of the object model helpful. Some VBA programming involves managing objects, and for this the Help system's code examples are often invaluable. To see Word's object model reference, follow these steps: + +1. Launch or activate Word, and then press Alt+F11 to launch or activate the VBA Editor. + +2. Choose Help ⇒ Microsoft Visual Basic For Applications Help. (Pressing F1 is not an alternative, alas; it currently takes you to an entirely different page.) You should now see a web page similar to the one shown in Figure 20.1 (this figure shows a part of the web page). If you don't see this web page, type this URL into your browser's address field: + + + +Figure 20.1 A Word Help website (partial view) + +3. Click the _Welcome to the Word 2013 developer reference_ link on the left side of the web page (see the pointing hand icon in Figure 20.1). + +You now see the page shown in Figure 20.2. + +Figure 20.2 Drilling down in the Word Help site (partial view) + +4. Now click the link named _Object model reference_ ( _Word 2013 developer reference_ ), as shown in Figure 20.2 with the pointing hand cursor. + +You now see the Object Model Reference, shown partially in Figure 20.3. + +Figure 20.3 The entries in the Word Object Model Reference will help you write your own VBA code. + +* * * + +Help When Migrating Legacy Code from Earlier Office Projects + +If you've inherited VBA code written in earlier versions of Office, those procedures might contain objects, methods, and properties that have been changed in Office 2013. Though modifications to object models are generally few, some incompatibilities can crop up and "break" the code so it won't run correctly. 
Fortunately, you can download a free utility, the Office Code Compatibility Inspector, that will flag objects and their members that have changed. It does a text comparison of the Office 2013 object model against VBA code written in earlier versions of Office. You can download the Compatibility Inspector from this web page:

www.microsoft.com/en-us/download/details.aspx?id=15001

* * *

* * *

A Shortcut: Understanding Creatable Objects

Like most VBA-enabled applications, Word has a number of _creatable objects_. This merely means that you don't have to employ the full qualification. In other words, you don't need to mention the Application object in your code. For example, the Documents collection object is creatable, so you can omit its parent, the Application object, when using the collection in code, like this:

    Dim x As Integer
    x = Documents.Count
    MsgBox x

The Application object is simply understood, for the same reason that you don't have to add Planet Earth when addressing an envelope. The post office assumes that Mother Earth is the parent—the context—of all addresses.

However, you can, if you wish, use the longer ("fully qualified") version:

    x = Application.Documents.Count

Both versions have the same effect.

The following are typically the most useful of these creatable objects:

 * The ActiveDocument object returns a Document object that represents the active document.
 * The ActiveWindow object returns a Window object that represents the active window.
 * The Documents collection contains the Document objects, each of which represents an open document.
 * The Options object represents Word options and document options, including most of the options that appear in the Options dialog box.
 * The Selection object represents the selection in the active document. Selection represents the selection (containing text or other objects) or collapsed selection (containing nothing—merely the blinking insertion point) in the document.
 * The Windows collection contains the Window objects that represent all open windows.

* * *

The following sections show you how to work with some of the most useful Word objects, starting with the Documents collection and the Document object. You'll see how to use the ActiveWindow object and the Windows collection in the next chapter.

# Working with the Documents Collection and the Document Object

In many of your Word procedures, you'll likely work with documents: creating new documents, saving documents, opening existing documents, closing documents, and printing documents. To do so, you work with the Documents collection, which contains a Document object for each open document in Word.

## Creating a Document

To create a new file, use the Add method of the Documents collection. The syntax is as follows:

    _expression_.Add Template, NewTemplate, DocumentType, Visible

Here, _expression_ is a required expression that returns a Documents collection. Typically, you'll want to use the Documents collection itself (**Documents**.Add).

Template is an optional Variant argument that specifies the template on which to base the new document. If you omit Template, Word uses the Normal template (this process is the same as if you'd clicked the File tab on the Ribbon, then clicked the New button to open a blank document). So you need to specify a Template argument only when you want to base the new document on a template other than the default Normal.dotm.
NewTemplate is an optional Variant argument that you can set to True to create a template file (.dotx) rather than a document. NewTemplate is set to False by default, so you can safely omit this argument unless you're creating a template.

DocumentType is an optional Variant argument that you can use to specify the type of document to create: wdNewBlankDocument (the default), wdNewEmailMessage, wdNewFrameset (for a frameset), or wdNewWebPage.

Visible is an optional Variant argument that you can set to False to have the document created in a window that isn't visible. The default setting is True, making the document window visible.

There are two ways to create a document:

**Creating a document based on Normal.dotm**

The following statement creates a new document based on the **Normal.dotm** global template:

    Documents.Add

**Creating a document based on a template**

The following statement creates a new document based on the template named **Company Report.dotm** stored in the network folder designated **\\\server\public\templates** :

    Documents.Add Template:="\\server\public\templates\Company Report.dotm"

## Creating a Template

The following statements declare a new object variable of the Document class named myTemplate, create a new template based on the template named Fancy.dotx, and assign it to myTemplate:

    Dim myTemplate As Document

    Set myTemplate = Documents.Add(Template:="c:\program files (x86)\Microsoft Office\Office14\1033\Quickstyles\fancy.dotx", _
        NewTemplate:=True, Visible:=True)

In this example, the file path (c:\program files\ _and so on_ ) to the template is specified because this template is not in one of the default template folders. The result is a new .dotx file, based on the Fancy.dotx template.

* * *

Changing the Default File Locations

Word has two templates folders: the user templates folder and the workgroup templates folder. You can change the locations of these folders by clicking the File tab on the Ribbon, then clicking the Options button to open the Word Options dialog box. Then click the Advanced button in the left pane. Scroll all the way down in the General Options section, and click the File Locations button you see at the bottom. Then click to select the default folder you want to change and click the Modify button.

* * *

### Saving a Document

Just as when a user is saving a newly created document via the keyboard and mouse, when executing VBA code you must specify a filename and path the first time you save a new document. After that, you can save it under the same name or specify a different name or format. This is the difference between the Save and Save As options.

### Saving a File for the First Time or as a Different File

To save a file for the first time, or to save a file under a different name or in a different format, use the SaveAs2 method. The syntax is as follows:

    _expression_.SaveAs2(FileName, FileFormat, LockComments, Password, AddToRecentFiles, WritePassword, ReadOnlyRecommended, EmbedTrueTypeFonts, SaveNativePictureFormat, SaveFormsData, SaveAsAOCELetter, Encoding, InsertLineBreaks, AllowSubstitutions, LineEnding, AddBiDiMarks, CompatibilityMode)

With Word 2010, the traditional SaveAs command was replaced by the SaveAs2 command, which is identical except for the addition of a CompatibilityMode argument. Documents can be saved five different ways with respect to their compatibility with earlier versions of Word.
Based on how you set the CompatibilityMode argument, Word saves your document like this:

 * 0 is the default if you don't specify any of the other CompatibilityMode options in this list. The document will be saved using whatever compatibility mode is currently used by this document.
 * wdCurrent is a compatibility mode equivalent to the latest version of Microsoft Word.
 * wdWord2003 is a mode that's compatible with Word 2003. Any features new in Word 2013 are disabled.
 * wdWord2007 is essentially the same as 2003 mode, but features compatible with the 2007 version of Word are enabled.
 * wdWord2010 is the mode where the Word 2010 features are enabled.

The traditional SaveAs command will still work, but the Editor has a tendency to automatically replace it with SaveAs2. Spooky, true, but no real harm done.

In the syntax, _expression_ is an expression that returns a Document object. For example, you might use the ActiveDocument object or an object in the Documents collection.

FileName is an optional Variant argument that specifies the name for the document. If you omit FileName, VBA uses the current folder and the default filename of Doc _n_.docx (or .docm) for a document and Dot _n_.dotx (or .dotm) for a template, where _n_ is the next available number (for example, Doc5.docx for a macro-free document or Dot2.dotm for a macro-enabled template).

* * *

Avoid Accidentally Overwriting a File

When writing code that saves a document, you should first check whether a document with this name and location already exists. If you don't check, VBA overwrites an existing file without warning, potentially causing data loss.

* * *

FileFormat is an optional Variant argument that specifies the format in which to save the document. Table 20.1 lists the wdSaveFormat constants for specifying commonly used formats.
Table 20.1 WdSaveFormat constants

**Constant** | **Saves Document As**
---|---
wdFormatDocument | A Word document
wdFormatDocument97 | The Word version 97 document format
wdFormatDocumentDefault | The Word document default (the docx file type)
wdFormatDOSText | A DOS text file (the pre-Windows OS)
wdFormatDOSTextLineBreaks | A DOS text file with carriage returns
wdFormatEncodedText | A text file with encoding
wdFormatFilteredHTML | A filtered HTML file (Word 2003 and XP only)
wdFormatFlatXML | An unindexed XML document
wdFormatFlatXMLMacroEnabled | An unindexed XML document with macro capability
wdFormatFlatXMLTemplate | An unindexed XML template
wdFormatFlatXMLTemplateMacroEnabled | An unindexed XML template with macro capability
wdFormatHTML | An HTML file
wdFormatOpenDocumentText | An XML file format developed by Sun Microsystems
wdFormatPDF | Adobe's Portable Document Format
wdFormatRTF | A Rich Text format file
wdFormatStrictOpenXMLDocument | An XML document standard promoted for several years by Microsoft
wdFormatTemplate | A Word template
wdFormatTemplate97 | The Word version 97 template format
wdFormatText | A text file (plain ASCII)
wdFormatTextLineBreaks | A text file with carriage returns
wdFormatUnicodeText | A text file with Unicode characters
wdFormatWebArchive | A web archive file
wdFormatXML | An XML file (Word 2003 only)
wdFormatXMLDocument | XML document format
wdFormatXMLDocumentMacroEnabled | XML document format with macros enabled
wdFormatXMLTemplate | XML template format
wdFormatXMLTemplateMacroEnabled | XML template format with macros enabled
wdFormatXPS | XPS format

* * *

A Quick Way to See Objects and Their Constants

If you're writing code and you want to quickly see a list of constants, such as the WdSaveFormat constants shown in Table 20.1, just press F2 to open the Object Browser in the Editor. Then type **wdsaveformat** in the Object Browser's search field and press Enter. You'll see the complete list of constants as shown in the illustration.

* * *

As an example of how to use one of these constants, the following statement saves the active document as a filtered HTML file under the name Example.html in the current folder:

    ActiveDocument.SaveAs2 FileName:="Example.html", _
        FileFormat:=wdFormatFilteredHTML

After you run this example code, use Windows Explorer to locate this new Example.html file and click on it. It will open in Internet Explorer as if it were a web page, because it's stored using the HTML format (if Internet Explorer is the default application in which your machine opens .html files). Or take a look at it in Notepad if you want to see the full horror of HTML markup.

* * *

Save Documents Using File Converters

In addition to the wdSaveFormat constants described in Table 20.1, you can save documents in other formats for which you have file converters installed by specifying the appropriate value for the SaveFormat property of the FileConverter object. For example:

    ActiveDocument.SaveAs2 FileFormat:=FileConverters(15).SaveFormat

See the FileConverters property entry in the VBA Help file for more information.

* * *

AddToRecentFiles is an optional Variant argument that you can set to True to have Word add the document to the list of recently used files displayed when you click the File tab on the Ribbon and then click Recent.
(Often, when experimenting with documents in procedures, you'll want to avoid listing them on the Most Recently Used list, leaving the user's previous list of recent files undisturbed.) + +To protect the document as you save it, you can use four different protection features: + + * LockComments is an optional Variant argument that you can set to True to lock the document so that reviewers can enter comments but can't change the text of the document. + * Password is an optional Variant argument that you can use to set a password required before opening the document. + * WritePassword is an optional Variant argument that you can use to set a password required before saving changes to the document. + * ReadOnlyRecommended is an optional Variant argument that you can set to True to have Word recommend that the user open the document as read-only. + +Finally, there are the following optional arguments you'll use infrequently, if ever: + + * EmbedTrueTypeFonts is an optional Variant argument that you can set to True to save TrueType fonts with the document. (This is a good idea only if you're distributing the document to someone you know doesn't have the TrueType fonts installed to view the document correctly.) + * SaveNativePictureFormat is an optional Variant argument that you can set to True to have graphics imported from another platform saved as Windows graphics. + * SaveFormsData is an optional Variant argument that you can set to True to save the data entered in a form as a data record (as opposed to saving the whole form, including its static text). + * SaveAsAOCELetter is an optional Variant argument that you can set to True to save the document as an AOCE (Apple Open Collaboration Environment) letter (a mailing format for routing documents). + * Encoding is an optional Variant argument for using a different code page than the system code page. For example, you might need to save a document using a Cyrillic code page. + * InsertLineBreaks is an optional Variant argument that you can set to True when saving a document as a text file to make Word insert a line break at the end of each line of text. + * AllowSubstitutions is an optional Variant argument that you can set to True when saving a document as a text file to make Word substitute some symbol characters with similar text. For example, Word substitutes (TM) for a trademark symbol (™). + * LineEnding is an optional Variant argument that you can use when saving a document as a text file to control how Word marks line breaks and paragraph breaks. + * AddBiDiMarks is an optional Variant argument that you can set to True to make Word add control characters to the file to maintain bidirectional layout. + +Usually, when saving a file for the first time, you'll need to specify only its name and path; if you want to save it in a format other than a Word document, specify that too. The following statement saves the active document under the name Beehives.docx in the folder \\\server\Products\Field\: + + ActiveDocument.SaveAs2 _ + "\\server\Products\Field\Beehives.docx" + +### Saving a Document That Has Already Been Saved + +After a document has been first saved, you can save it in the future under the same name by using the Save method. For a Document object, the Save method takes no arguments (all the document's current formats are saved unchanged). 
For example, the following statement saves the document named Guns01.docx:

    Documents("Guns01.docx").Save

### Saving All Open Documents

To save all open documents, use the Save method with the Documents collection. The syntax is as follows:

    _expression_.Save(NoPrompt, OriginalFormat)

Here, _expression_ is an expression that returns a Documents collection. Often, you'll use the Documents collection itself.

NoPrompt is an optional Variant argument that you can set to True to make Word save all open documents containing unsaved changes and any attached templates containing unsaved changes without prompting the user. The default setting is False, which causes Word to prompt the user whether to save each document and template. Even if you set NoPrompt to True, Word will prompt the user to save changes to Normal.dotm if the Prompt Before Saving Normal Template check box is selected in the Save section of the Advanced tab of the Options dialog box.

OriginalFormat is an optional Variant argument that you can set to wdOriginalDocumentFormat to save the documents in their original formats, wdWordDocument to force each document to be saved as a Word document, or wdPromptUser to prompt the user about which format to use.

For example, the following statement saves all open documents and templates without prompting the user:

    Documents.Save NoPrompt:=True

### Checking Whether a Document Contains Unsaved Changes

To find out whether a document contains unsaved changes, check its Saved property. Saved is a read/write Boolean property that returns False if the document contains unsaved changes and True if it does not. A new document contains no unsaved changes, even though it has never been saved.

* * *

The Dangers of Cloud Storage and How to Send Files up into the Cloud

With mobility now the main trend in personal computing, people increasingly expect their files to be available anywhere, not just on their hard drive at home or in the office. They also want them accessible to various devices: the Surface tablet/Ultrabook, the phone, the laptop, whatever.

So, to make files within reach everywhere and on whatever kind of computer, data is being moved to the cloud. Never mind that if you read their EULAs you discover that cloud storage providers nearly universally refuse to guarantee either the safety or security of your data. It could be lost in a fire; it could be captured by snoops. To protect yourself, it is a wise precaution to keep your own backup copies in your own house or office and also encrypt sensitive information. The cloud is useful, but dicey. Who _are_ these people storing your data? And where, exactly, are their servers located?

Nonetheless, if you want to know how to save Word files to the cloud on Microsoft's SkyDrive, it's pretty straightforward. Just save a document to your SkyDrive folder. This example saves the current document to SkyDrive (change my name, _Richard_, to your name in the file path in this example code):

    ActiveDocument.SaveAs ("C:\Users\ _Richard_ \SkyDrive\CloudTest")

Similarly, to save to Dropbox:

    ActiveDocument.SaveAs2 ("C:\Users\ _Richard_ \Dropbox\CloudTest")

* * *

## Opening a Document

To open a document, use the Open method with the appropriate Document object.
+
+* * *
+
+The Dangers of Cloud Storage and How to Send Files up into the Cloud
+
+With mobility now the main trend in personal computing, people increasingly expect their files to be available anywhere, not just on their hard drive at home or in the office. They also want them accessible to various devices: the Surface tablet/Ultrabook, the phone, the laptop, whatever.
+
+So, to make files within reach everywhere and on whatever kind of computer, data is being moved to the cloud. Never mind that if you read their EULAs you discover that cloud storage providers nearly universally refuse to guarantee either the safety or security of your data. It could be lost in a fire; it could be captured by snoops. To protect yourself, it is a wise precaution to keep your own backup copies in your own house or office and also encrypt sensitive information. The cloud is useful, but dicey. Who _are_ these people storing your data? And where, exactly, are their servers located?
+
+Nonetheless, if you want to know how to save Word files to the cloud on Microsoft's SkyDrive, it's pretty straightforward. Just save a document to your SkyDrive folder. This example saves the current document to SkyDrive (change my name, _Richard_, to your name in the file path in this example code):
+
+    ActiveDocument.SaveAs2 ("C:\Users\Richard\SkyDrive\CloudTest")
+
+Similarly, to save to Dropbox:
+
+    ActiveDocument.SaveAs2 ("C:\Users\Richard\Dropbox\CloudTest")
+
+* * *
+
+## Opening a Document
+
+To open a document, use the Open method with the appropriate Document object. The syntax for the Open method is as follows:
+
+    _expression_.Open FileName, ConfirmConversions, ReadOnly,
+        AddToRecentFiles, PasswordDocument, PasswordTemplate,
+        Revert, WritePasswordDocument, WritePasswordTemplate,
+        Format, Encoding, Visible,
+        OpenAndRepair, DocumentDirection, NoEncodingDialog, XMLTransform
+
+The arguments are as follows:
+
+ * _expression_ is a required expression that returns a Documents collection. Usually, you'll want to use the Documents collection itself.
+ * FileName is a required Variant argument specifying the name (and path, if necessary) of the document to open.
+ * ConfirmConversions is an optional Variant argument that you can set to True to have Word display the Convert File dialog box if the file is in a format other than Word.
+ * ReadOnly is an optional Variant argument that you can set to True to open the document as read-only.
+ * AddToRecentFiles is an optional Variant argument that you can set to True to have Word add the filename to the list of recently used files at the foot of the File menu.
+ * PasswordDocument is an optional Variant argument that you can use to set a password for opening the document.
+ * PasswordTemplate is an optional Variant argument that you can use to set a password for opening the template.
+ * Revert is an optional Variant argument that specifies what Word should do if the FileName supplied matches a file that's already open. By default (that is, if you don't include the Revert argument), Revert is set to False, which means that Word activates the open instance of the document and doesn't open the saved instance. You can set Revert to True to have Word open the saved instance of the document and discard any changes to the open instance.
+ * WritePasswordDocument is an optional Variant argument that indicates the password for saving changes to the document.
+ * WritePasswordTemplate is an optional Variant argument that specifies the password for saving changes to the template.
+ * Format is an optional Variant argument that you can use to specify the file converter with which to open the document. Table 20.2 lists the WdOpenFormat constants you can use to specify the file converter.
+ * Encoding is an optional Variant argument specifying the document encoding (the code page or the character set) for Word to use when opening the document.
+ * Visible is an optional Variant argument that you can set to False to have Word open the document in a window that isn't visible. (The default setting is True, specifying a visible window.)
+ * OpenAndRepair is an optional Variant that, when True, repairs the document to prevent corruption.
+ * DocumentDirection is an optional WdDocumentDirection argument indicating the horizontal flow of text in the document. The default is wdLeftToRight.
+ * NoEncodingDialog is an optional Variant that defaults to False. But if it's set to True, the Encoding dialog box is not displayed when Word cannot recognize text encoding.
+ * XMLTransform is mysterious. The only explanation I could find is in MSDN, and it merely says, "Specifies a transform to use." So your guess is as good as mine about what this option accomplishes.
+
+Table 20.2 WdOpenFormat constants for opening a document
+
+**Constant** | **Effect**
+---|---
+wdOpenFormatAllWord | Word opens the document in any recognized Word format as a Word document.
+wdOpenFormatAllWordTemplates | Word opens the document in any recognized Word format as a Word template.
+wdOpenFormatAuto | Word chooses a converter automatically. This is the default setting.
+wdOpenFormatDocument | Word opens the document as a Word document.
+wdOpenFormatDocument97 | Microsoft Word 97 document format.
+wdOpenFormatEncodedText | Word opens the document as a text file with encoding.
+wdOpenFormatOpenDocumentText | Word opens the document in an XML file format developed by Sun Microsystems.
+wdOpenFormatRTF | Word opens the document as a Rich Text Format file.
+wdOpenFormatTemplate | Word opens the document as a template.
+wdOpenFormatTemplate97 | Word 97 template format.
+wdOpenFormatText | Word opens the document as a text file.
+wdOpenFormatUnicodeText | Word opens the document as a Unicode text file.
+wdOpenFormatWebPages | Word opens the document as a web page.
+wdOpenFormatXML | Word opens the document in XML format.
+wdOpenFormatXMLDocument | XML document format.
+wdOpenFormatXMLDocumentMacroEnabled | XML document format with macros enabled.
+wdOpenFormatXMLDocumentMacroEnabledSerialized | Word opens an XML document with macros enabled by reconstructing the original document from a one-dimensional stream of bits.
+wdOpenFormatXMLDocumentSerialized | Word opens an XML document by reconstructing the original document structure from a one-dimensional stream of bits.
+wdOpenFormatXMLTemplate | XML template format.
+wdOpenFormatXMLTemplateMacroEnabled | XML template format with macros enabled.
+wdOpenFormatXMLTemplateMacroEnabledSerialized | Word opens an XML template with macros enabled by reconstructing the original document from a one-dimensional stream of bits.
+wdOpenFormatXMLTemplateSerialized | Word opens an XML template by reconstructing the original document from a one-dimensional stream of bits.
+
+The following statement opens the document Times.docx found in the C:\My Documents\ folder:
+
+    Documents.Open "C:\My Documents\Times.docx"
+
+The following statement opens the file notes.docm in the folder C:\temp as read-only and adds it to the list of most recently used files (the list you see when you click the File tab on the Ribbon, then click Recent):
+
+    Documents.Open "C:\temp\notes.docm", ReadOnly:=True, _
+        AddToRecentFiles:=True
+
+* * *
+
+How to Look Up Office 2013 Members in MSDN
+
+Recall that Microsoft's MSDN online help system can sometimes be difficult to search because it is so huge; it's perhaps _too_ complete. Among other issues, MSDN includes enumerations (lists of properties and methods, or constants, for example) for older versions of Office applications, such as 2007 or 2010, as well as those for the current version, 2013.
+
+Although these lists usually don't change much between versions, they _can_ change. To preserve compatibility—so you don't have to rewrite your macros every time a new version of Office comes out—few enumerations ever _lose_ members. But new capabilities are added. Word 2013, for example, adds wdFormatStrictOpenXMLDocument to the enumeration list for wdSaveFormat shown in Table 20.1.
+
+To search MSDN for the latest enumeration, type something like this in the MSDN online Search Office With Bing field: **wdDefaultFilePath office 2013**. Then in the list of hits displayed by Bing, choose the enumeration. Note that _wd_ specifies Word.
+
+* * *
+
+## Closing a Document
+
+To close a document, use the Close method with the appropriate Document object. The syntax is as follows:
+
+    _expression_.Close(SaveChanges, OriginalFormat, RouteDocument)
+
+Here, _expression_ is a required expression that returns a Document object or a Documents collection.
+Typically you use the ActiveDocument object or, to close all documents, the Documents collection object.
+
+SaveChanges is an optional Variant argument you can use to specify how to handle unsaved changes. Use wdDoNotSaveChanges to discard changes, wdPromptToSaveChanges to have Word prompt the user to save changes, or wdSaveChanges to save changes without prompting.
+
+OriginalFormat is an optional Variant argument you can use to specify the save format for the document. Use wdOriginalDocumentFormat to have Word use the original document format, wdPromptUser to have Word prompt the user to choose a format, or wdWordDocument to use the Word document format.
+
+RouteDocument is an optional Variant argument that you can set to True to route a document that has a routing slip attached.
+
+For example, the following statement closes the active document without saving changes:
+
+    ActiveDocument.Close SaveChanges:=wdDoNotSaveChanges
+
+The following statement closes all open documents (but not the Word application itself) and saves changes automatically:
+
+    Documents.Close SaveChanges:=wdSaveChanges
+
+## Changing a Document's Template
+
+To change the template attached to a document, set the AttachedTemplate property of the Document object you want to affect to the path and name of the appropriate template. For example, the following statement attaches the template named SalesMarket02.dotm to the active document. In this example, the template is assumed to be stored in one of the Word templates folders, so the path need not be specified:
+
+    ActiveDocument.AttachedTemplate = "SalesMarket02.dotm"
+
+## Printing a Document
+
+To print a document, use the PrintOut method for the appropriate Document object. The syntax for the PrintOut method is as follows:
+
+    _expression_.PrintOut(Background, Append, Range, OutputFileName,
+        From, To, Item, Copies, Pages, PageType, PrintToFile, Collate,
+        ActivePrinterMacGX, ManualDuplexPrint, PrintZoomColumn,
+        PrintZoomRow, PrintZoomPaperWidth, PrintZoomPaperHeight)
+
+These are the components of the PrintOut method:
+
+ * _expression_ is a required expression specifying an Application, Document, or Window object. Usually, you'll print a Document object such as ActiveDocument.
+ * Background is an optional Variant argument that you can set to True to have Word print the document in the background, allowing the procedure to continue running.
+ * Append is an optional Variant argument that you can set to True when printing to a file, to append the document to the specified print file rather than overwrite that file.
+ * Range is an optional Variant argument specifying the selection or range of pages to print: wdPrintAllDocument (0, the default), wdPrintCurrentPage (2), wdPrintFromTo (3; use the From and To arguments to specify the pages), wdPrintRangeOfPages (4), or wdPrintSelection (1).
+ * OutputFileName is an optional Variant argument used to specify the name for the output file when printing to file.
+ * From is an optional Variant argument used to specify the starting page number when printing a range of pages.
+ * To is an optional Variant argument used to specify the ending page number when printing a range of pages.
+ * Item is an optional Variant argument used to specify the item to print: wdPrintAutoTextEntries (4), wdPrintComments (2), wdPrintDocumentContent (0, the default), wdPrintKeyAssignments (5, shortcut key assignments for the document or its template), wdPrintProperties (1), or wdPrintStyles (3).
+ * Copies is an optional Variant argument used to specify the number of copies to print. (If you omit Copies, Word prints one copy.) + * Pages is an optional Variant argument used to specify the pages to print—for example, 1, 11-21, 31. + * PageType is an optional Variant argument used to specify whether to print all pages (wdPrintAllPages, 0, the default), odd pages (wdPrintOddPagesOnly, 1), or even pages (wdPrintEvenPagesOnly, 2). + * PrintToFile is an optional Variant argument that you can set to True to direct the output of the print operation to a file. + * Collate is an optional Variant argument used when printing multiple copies of a document to specify whether to collate the pages (True) or not (False). + * ActivePrinterMacGX is an optional Variant argument used on the Macintosh to specify the printer if QuickDraw GX is installed. + * ManualDuplexPrint is an optional Variant argument that you set to True for two-sided printing on a printer that doesn't have duplex capabilities. When ManualDuplexPrint is True, you can use the PrintOddPagesInAscendingOrder property or the PrintEvenPagesInAscendingOrder property of the Options object to print odd or even pages in ascending order to create a manual duplex effect (reloading the odd-page-printed paper into the printer the other way up to print the even pages). The ManualDuplexPrint argument is available only in some languages. + * PrintZoomColumn and PrintZoomRow are optional Variant arguments that you use to specify the number of pages to print on a page horizontally (PrintZoomColumn) and vertically (PrintZoomRow). Each property can be 1, 2, or 4. + * PrintZoomPaperWidth is an optional Variant argument that you can use to specify the width (measured in twips) to which to scale printed pages. + * PrintZoomPaperHeight is an optional Variant argument that you can use to specify the height (measured in twips) to which to scale printed pages. + +For example, the following statement prints three collated copies of the active document in the background: + + ActiveDocument.PrintOut Background:=True, Copies:=3, Collate:=True + +The following statement prints pages 2 through 5 of the active document: + + ActiveDocument.PrintOut Range:=wdPrintFromTo, From:=2, To:=5 + +The following statement prints the active document at two virtual pages per sheet of paper: + + ActiveDocument.PrintOut PrintZoomColumn:=2, PrintZoomRow:=1 + +## Working with the ActiveDocument Object + +The ActiveDocument object returns a Document object that represents the current document you're working with—in other words, whichever document has the focus in the Word window. The ActiveDocument object behaves like a Document object, but watch out for two possible problems when working with it. + +First, you may have problems locating information about the ActiveDocument object in the Help system. It's actually a _property_ of the Application object, so its status as an actual object is somewhat iffy. Object taxonomy is an evolving clerical system and, as you see, remains incomplete. + +To find the ActiveDocument object in the Help system, MSDN system, or VBA Editor Object Browser, you need to first locate the Application object, then look at its properties (or members). Just remember, ActiveDocument is found only _under_ the Application object. It's a clerical error. 
It's as if you were looking for _California_ in a geography book's index, but the index is wacky because you find _most_ states listed under their own names (Hawaii is under _H_, for example), but for some reason, California is not listed under C. You're puzzled. It's a big, important state. Then you stumble upon the solution: in this bizarre index, _California_ is found only under the entry for _United States_.
+
+The second oddity about the ActiveDocument "property" is that it can be evanescent. The first problem is that if there's no document open in Word, there's no ActiveDocument object, and any code that tries to work with the ActiveDocument object returns an error. When writing code that invokes the ActiveDocument object, remember to check the Count property of the Documents collection to make sure there's a document open (Count will be at least 1) before attempting to use ActiveDocument in your code. Here's an example that tests to see if there is an open document:
+
+    If Documents.Count = 0 Then
+        If MsgBox("No document is open." & vbCr & vbCr & _
+            "Do you want to create a new blank document?", _
+            vbYesNo + vbExclamation, "No Document Is Open") = vbYes Then
+            Documents.Add
+        Else
+            End
+        End If
+    End If
+
+A second problem relating to this evanescence is that a different document may be active than your code assumes is active. This problem tends to occur when a procedure starts with the active document and then creates a new document to work in; this new document becomes the active document, and from this point on, confusion may result.
+
+If you know the _name_ of the document that should be active, you can check to see if the name of the active document matches it, to verify that you'll be working with the right document.
+
+If there's any doubt about which document you're working with, declare a Document object variable and employ that object variable in your code rather than the ActiveDocument object.
+
+For example, the following statements declare a Document object and assign the ActiveDocument object to it so that subsequent code can work with the Document object:
+
+    Dim myDocument As Document
+    Set myDocument = ActiveDocument
+    With myDocument
+        'actions here
+    End With
+
+Or if you know the name of the document you want to work with:
+
+    Dim myDocument As Document
+    Set myDocument = ActiveDocument
+    If myDocument.Name = "CorrectFile.docx" Then
+        'actions here
+    End If
+
+# Working with the Selection Object
+
+Up to now in this chapter we've worked with programming that affects an entire document. To write code that works with only part of a document (a word, paragraph, or whatever), you can access these zones in three ways:
+
+ * By using the Selection object
+ * By directly accessing the object that you want to affect
+ * By defining a range that encompasses the object
+
+Using the Selection object is analogous to working interactively with Word and is effective with procedures that require the user to select an object or position the insertion point to denote what content in the document the procedure should access.
+
+Using the Selection object is also effective when you're learning to use VBA with Word, because many actions that you record using the Macro Recorder use the Selection object.
+
+The Selection object represents the current selection in the active document in Word. The selection can be very small (collapsed to the blinking cursor insertion point), in which case nothing is selected.
+Or a Selection object can contain one or more objects—one or more characters, one or more words, one or more paragraphs, a graphic, a table, the entire document. Or the selection can be a combination of these objects. Whatever's selected.
+
+Even if the selection is collapsed to an insertion point, you can use it to refer to objects outside the selection. For example, Selection.Paragraphs(1).Range.Words(10).Text returns the 10th word in the paragraph in which the insertion point is positioned (or, if a paragraph or multiple paragraphs are selected, the 10th word in the first paragraph).
+
+## Checking the Type of Selection
+
+Word recognizes nine different kinds of selections. When you're working in the active document, you'll often need to check what kind of selection is active so that you know whether you're dealing with no selection (just the insertion point), a block of ordinary text, or a special type of text like a table or a graphic.
+
+Depending on the current selection, you may not be able to take certain actions in your procedure, and you may not _want_ to take other actions. You can't, for example, insert a table row into an ordinary text paragraph.
+
+Table 20.3 lists the types of selections that Word differentiates.
+
+Table 20.3 Selection types in Word
+
+**Constant** | **Value** | **Meaning**
+---|---|---
+wdNoSelection | 0 | There's no selection. (This state seems impossible to achieve. You'd think it'd be when no document is open, but then Selection statements return runtime error 91. Stay tuned...)
+wdSelectionIP | 1 | The selection is collapsed to a plain insertion point—nothing is selected. But the insertion cursor is blinking as usual.
+wdSelectionNormal | 2 | A "normal" selection, such as a selected word or sentence.
+wdSelectionFrame | 3 | A frame is selected.
+wdSelectionColumn | 4 | A column or part of a column (two or more cells in a column or one cell in each of two or more columns) is selected.
+wdSelectionRow | 5 | A full row in a table is selected.
+wdSelectionBlock | 6 | A block is selected (a vertical part of one or more paragraphs, selected by holding down the Alt key and dragging with the mouse or by using column-extend mode).
+wdSelectionInlineShape | 7 | An inline shape or graphic (a shape or graphic that's in the text layer rather than floating over it) is selected.
+wdSelectionShape | 8 | A Shape object is selected. (A text box counts as a Shape object.)
+
+To find out what type of selection you currently have, look at the Type property of the Selection object. The following statements check that the current selection is merely an insertion point before inserting a text literal. The text will not be inserted if the user has dragged to select, for example, some characters, a word, or a paragraph:
+
+    If Selection.Type = wdSelectionIP Then
+        Selection.TypeText "This is inserted."
+    End If
+
+## Checking the Story Type of the Selection
+
+Beyond the type of selection, you'll sometimes need to find out which "story" the selection is in—the main text story, the comments story, the primary header story, and so on. Microsoft uses the word _story_ instead of _zone_, _type_, or other terms to mean a distinct type of content.
+
+Checking the story can help you avoid problems, such as trying to perform in a header or footer actions that Word supports only in a main text story.
+
+The story is the zone of the document within which the current selection is located. So, most of the time the story is the _main text story_ (wdMainTextStory).
+That's the document and the items within it. But alternative "stories" are things like footnotes, frames, headers, and footers—as you can see in Table 20.4, which lists the wdStoryType constants and the stories to which they correspond.
+
+You may notice another whimsical, enigmatic feature of Table 20.4: it starts its enumeration values at 1. Compare that to Table 20.3, which starts at 0. Inconsistencies like this make programming more challenging.
+
+Table 20.4 Word story types
+
+**Constant** | **Value** | **Meaning**
+---|---|---
+wdMainTextStory | 1 | Main (body) text of the document
+wdCommentsStory | 4 | Comments section
+wdEndnotesStory | 3 | Endnotes section
+wdFootnotesStory | 2 | Footnotes section
+wdTextFrameStory | 5 | Text in frames
+wdPrimaryFooterStory | 9 | Main footer
+wdEvenPagesFooterStory | 8 | Even-page footer
+wdFirstPageFooterStory | 11 | First-page footer
+wdPrimaryHeaderStory | 7 | Main header
+wdEvenPagesHeaderStory | 6 | Even-page header
+wdFirstPageHeaderStory | 10 | First-page header
+wdFootnoteSeparatorStory | 12 | Footnote separator
+wdFootnoteContinuationSeparatorStory | 13 | Footnote continuation separator
+wdFootnoteContinuationNoticeStory | 14 | Footnote continuation notice
+wdEndnoteSeparatorStory | 15 | Endnote separator
+wdEndnoteContinuationSeparatorStory | 16 | Endnote continuation separator
+wdEndnoteContinuationNoticeStory | 17 | Endnote continuation notice
+
+Here's a code example that displays a message box if the selection isn't in the main text of a document:
+
+    If Selection.StoryType <> wdMainTextStory Then
+        MsgBox "This range is not in the main text."
+    End If
+
+## Getting Other Information about the Current Selection
+
+To work effectively with a selection, you'll often need to know what it contains and where it's positioned. To find out, use the Information property to learn the details you need. Table 20.5 lists examples of useful information available in the Information property.
+
+Here's an example showing how to use the Information property:
+
+    If Selection.Information(wdCapsLock) = True Then
+        MsgBox "The caps lock is ON."
+    End If
+
+Sharp-eyed readers will notice a capricious inconsistency in this code. In the other code examples in this section, no parentheses were used around a constant, and the operator (= or <> or whatever) is placed between the property and the constant, as shown in this example:
+
+    Selection.Type = wdSelectionIP
+
+But with the Information property, you _do_ use parentheses, and you move the operator to the right of the constant:
+
+    Selection.Information(wdCapsLock) =
+
+This syntax and punctuation irregularity is yet _another_ of those exceptions to the rule. You should therefore remember that if the usual syntax produces an error message from the Editor, try the other (parenthetical) version.
+
+To see the complete list of all members, open the Object Browser (F2) and scroll down in the Classes list until you see WdInformation. Double-click it, and its members will be listed in the Members of "WdInformation" list on the right.
+
+Table 20.5 Information available in the Information property
+
+**Constant** | **Returns This Information**
+---|---
+**Environment Information** |
+wdCapsLock | True if Caps Lock is on.
+wdNumLock | True if Num Lock is on.
+wdOverType | True if Overtype mode is on. (You can turn Overtype mode on and off by changing the Overtype property.)
+wdRevisionMarking | True if Track Changes is on.
+wdSelectionMode | A value that specifies the current selection mode: 0 indicates a normal selection, 1 indicates an extended selection (Extend mode is on), and 2 indicates a column selection.
+wdZoomPercentage | The current zoom percentage.
+**Selection and Insertion Point Information** |
+wdActiveEndAdjustedPageNumber | The number of the page containing the active end of the selection or range. This number reflects any change you make to the starting page number; wdActiveEndPageNumber, the alternative, doesn't.
+wdActiveEndPageNumber | The number of the page containing the active end of the selection or range.
+wdActiveEndSectionNumber | The number of the section containing the active end of the selection or range.
+wdFirstCharacterColumnNumber | The character position of the first character in the selection or range. If the selection or range is collapsed to an insertion point, this constant returns the character number immediately to the right of the insertion point. (Note that this "column" is relative to the currently active left margin and doesn't have to be inside a table.)
+wdFirstCharacterLineNumber | In Print Layout view and Print Preview, this constant returns the line number of the first character in the selection. In nonlayout views (e.g., Normal view), it returns -1.
+wdFrameIsSelected | True if the selection or range is a whole frame or text box.
+wdHeaderFooterType | A value that specifies the type of header or footer containing the selection or range: -1 indicates that the selection or range isn't in a header or footer; 0 indicates an even page header; 1 indicates an odd page header in a document that has odd and even headers and the only header in a document that doesn't have odd and even headers; 2 indicates an even page footer; 3 indicates an odd page footer in a document that has odd and even footers and the only footer in a document that doesn't have odd and even footers; 4 indicates a first-page header; and 5 indicates a first-page footer.
+wdHorizontalPositionRelativeToPage | The horizontal position of the selection or range—the distance from the left edge of the selection or range to the left edge of the page, measured in twips.
+wdHorizontalPositionRelativeToTextBoundary | The horizontal position of the selection or range—the distance from the left edge of the selection or range to the text boundary enclosing it, measured in twips.
+wdInCommentPane | True if the selection or range is in a comment pane.
+wdInEndnote | True if the selection or range is in an endnote (defined as appearing in the endnote pane in Normal view or in the endnote area in Print Layout view).
+wdInFootnote | True if the selection or range is in a footnote (defined as appearing in the footnote pane in Normal view or in the footnote area in Print Layout view).
+wdInFootnoteEndnotePane | True if the selection or range is in a footnote or endnote.
+wdInHeaderFooter | True if the selection or range is in a header or footer (defined as appearing in the header or footer pane in Normal view or in the header or footer area in Print Layout view).
+wdInMasterDocument | True if the selection or range is in a master document (a document containing at least one subdocument).
+wdInWordMail | A value that specifies the WordMail location of the selection or range: 0 indicates that the selection or range isn't in a WordMail message; 1 indicates that it's in a WordMail message you're sending; 2 indicates that it's in a WordMail you've received.
+wdNumberOfPagesInDocument | The number of pages in the document in which the selection or range appears. +wdReferenceOfType | A value that specifies where the selection is in relation to a footnote reference, endnote reference, or comment reference. -1 indicates the selection or range includes a reference. 0 indicates the selection or range isn't before a reference. 1 indicates the selection or range is before a footnote reference, 2 that it's before an endnote reference, and 3 that it's before a comment reference. +wdVerticalPositionRelativeToPage | The vertical position of the selection or range—the distance from the top edge of the selection to the top edge of the page, measured in twips. +wdVerticalPositionRelativeToTextBoundary | The vertical position of the selection or range—the distance from the top edge of the selection to the text boundary enclosing it, measured in twips. +**Table Information** | +wdWithInTable | True if the selection is in a table. +wdStartOfRangeColumnNumber | The number of the table column containing the beginning of the selection or range. +wdEndOfRangeColumnNumber | The number of the table column containing the end of the selection or range. +wdStartOfRangeRowNumber | The number of the table row containing the beginning of the selection or range. +wdEndOfRangeRowNumber | The number of the table row containing the end of the selection or range. +wdAtEndOfRowMarker | True if the selection or range is at the end-of-row marker in a table (not the end-of-cell marker). +wdMaximumNumberOfColumns | The largest number of table columns in any row in the selection or range. +wdMaximumNumberOfRows | The largest number of table rows in the table in the selection or range. +**Macintosh** | +wdInClipboard | Used with Microsoft Office Macintosh Edition + +## Inserting Text at, after, or before the Selection + +You can insert text at the selection by using the TypeText method of the Selection object, insert text before the selection by using the InsertBefore method, or insert text after the selection by using the InsertAfter method. + +The TypeText method merely inserts a text string into the document if the selection is collapsed (merely the blinking insertion cursor with nothing actually selected). But if something _is_ selected, such as a word or phrase, that selection is _replaced_ by the string when you execute the TypeText method. However, the InsertBefore and InsertAfter methods do not replace a selection. They merely insert the new string. + +The syntax is as follows: + + Selection.TypeText _string_ + Selection.InsertAfter _string_ + Selection.InsertBefore _string_ + +Here, _string_ is a required String expression containing the text you want to insert in double quotation marks, as in this example: + + Selection.TypeText "Please come to the meeting next Friday at 9:00 A.M." + Selection.InsertBefore "Dr. " + Selection.InsertAfter vbCr & Address + +When you use the InsertAfter or the InsertBefore method, VBA extends the selection to include the text you inserted. (You can see selected text, cells, or other items in a document because Word changes the background from the default white to the document frame color.) When you use the TypeText method, the result is a collapsed selection—whether you are replacing a selection or a collapsed selection. (Recall that a collapsed selection means nothing is selected—merely the blinking insertion point.) 
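+
+To see these behaviors side by side, here's a minimal sketch (the literal strings are just placeholders) that assumes the user currently has a word selected in the active document:
+
+    Selection.InsertBefore "Dear "   'the selection now also covers "Dear "
+    Selection.InsertAfter ","        'the selection now also covers ","
+    'TypeText replaces the whole (extended) selection and
+    'leaves behind a collapsed insertion point
+    Selection.TypeText "Ms. Smith,"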
+
+* * *
+
+A Selected Paragraph Includes the Ending Paragraph Mark
+
+When you have a whole paragraph selected, the selection includes the paragraph mark at the end of the paragraph. So any text you add to the end of the selection appears at the beginning of the next paragraph rather than at the end of the selected paragraph.
+
+* * *
+
+## Inserting a Paragraph in a Selection
+
+You can insert paragraphs:
+
+ * To insert a paragraph at the current selection, use the InsertParagraph method.
+ * To insert a paragraph before the current selection, use the InsertParagraphBefore method.
+ * To insert a paragraph after the current selection, use the InsertParagraphAfter method.
+
+You can also have VBA type a paragraph by using the Selection.TypeParagraph command.
+
+## Applying a Style
+
+To apply a style to a paragraph, set the Style property of the Paragraph object:
+
+    Selection.Style = "Heading 3"
+
+View the styles in the current document by pressing Ctrl+Shift+S, or click the Home tab on the Ribbon.
+
+Similarly, you can apply a character style to the current selection or (as in the following example) to a specific range of words or characters. This example changes the fifth word in the second paragraph of the current document to boldface:
+
+    ActiveDocument.Paragraphs(2).Range.Words(5).Style = "Bold"
+
+Note that a character style must always be applied to a range rather than directly to a paragraph.
+
+## Extending a Selection
+
+To extend a selection programmatically (through programming rather than by the user), use the EndOf method for a Range or Selection object. The syntax for the EndOf method is as follows:
+
+    _expression_.EndOf(Unit, Extend)
+
+Here, _expression_ is a required expression that returns a Range or Selection object, such as an object in the Characters, Words, Sentences, or Paragraphs collection. Unit is an optional Variant specifying the unit of movement (see Table 20.6).
+
+Table 20.6 Units of movement for the EndOf method
+
+**Unit** | **Meaning**
+---|---
+wdCharacter | A character.
+wdWord | A word. (This is the default setting if you omit the argument.)
+wdSentence | A sentence.
+wdLine | A line. (This unit can be used only with Selection objects, not with ranges.)
+wdParagraph | A paragraph.
+wdSection | A section of a document.
+wdStory | The current story—for example, the document story or the header and footer story.
+wdCell | A cell in a table.
+wdColumn | A column in a table.
+wdRow | A row in a table.
+wdTable | A whole table.
+
+Extend is an optional Variant specifying whether to move or extend the selection or range. wdMove moves the selection or range and is the default setting; wdExtend extends the selection or range.
+
+For example, the following statement extends the current selection to the end of the paragraph:
+
+    Selection.EndOf Unit:=wdParagraph, Extend:=wdExtend
+
+The following statement moves the selection to the end of the paragraph:
+
+    Selection.EndOf Unit:=wdParagraph, Extend:=wdMove
+
+The following statement selects from the current selection to the end of the current Word story:
+
+    Selection.EndOf Unit:=wdStory, Extend:=wdExtend
+
+To select the whole active document, use ActiveDocument.Content.Select. This command has the same effect as pressing Ctrl+A when working interactively.
+
+## Collapsing a Selection
+
+When you've finished working with a selection larger than a blinking cursor insertion point, you often want to deselect it.
+In other words, you may want to force the selection into a collapsed state (just the blinking cursor) when your procedure ends. (If you don't do this and the user just starts typing, whatever is selected will be _replaced_ by the user's typing.)
+
+The easiest way to do so is to use the Collapse method of the Selection object to collapse the selection to its start or its end:
+
+    Selection.Collapse Direction:=wdCollapseStart
+    Selection.Collapse Direction:=wdCollapseEnd
+
+Alternatively, you can reduce the selection to just one point by setting the selection's end equal to its start (collapsing the selection to its start) or by setting the selection's start equal to its end (collapsing the selection to its end):
+
+    Selection.End = Selection.Start
+    Selection.Start = Selection.End
+
+# Creating and Using Ranges
+
+In Word, a _range_ is a contiguous area of a document with a defined starting point and ending point. For example, if you define a range that consists of the first two paragraphs in a specified document, the range's starting point is at the beginning of the first paragraph, and its ending point is at the end of the second paragraph (after the paragraph mark).
+
+Although similar to a selection, a range is more flexible. And it's important to note that a range is _named_ in your code, so you can refer to it by name at any time. There can be multiple ranges, but there can be only one selection at a time, and it has no name.
+
+The typical use of ranges in Word VBA is similar to how you use bookmarks when working interactively with Word: to mark a location in a document that you want to be able to access quickly or manipulate easily.
+
+Like a bookmark, a range can contain any amount of text in a document, from a single character to the entire contents of the document. A range can even have the same starting point and ending point, which gives it no contents and makes it, in effect, an invisible mark in the document that you can use to insert text. (This is similar to a collapsed selection.)
+
+Once you've created a range, you can refer to it, access its contents or insert new contents in it, or format it—all by using the methods and properties of the Range object.
+
+* * *
+
+How a Range Differs from a Bookmark
+
+The main difference between a range and a bookmark is that the lifetime of a range is limited to the VBA procedure that defines it. Once the procedure finishes executing, the range vanishes. By contrast, a bookmark persists. It is saved with the document or template that contains it and can be accessed at any time (whether or not a procedure is running).
+
+* * *
+
+## Defining a Named Range
+
+To create a Range object, you use a Set statement and either the Range method on a Document object or the Range property for an object—for example, the Selection object, the Paragraphs collection, or a Paragraph object. The syntax for using the Range method is as follows:
+
+    Set RangeName = Document.Range(Start, End)
+
+Here, RangeName is the name you are assigning to the range, and Start and End are optional arguments specifying the starting and ending points of the range.
+
+The syntax for using the Range property on an object is as follows:
+
+    Set RangeName = _object_.Range
+
+For example, the following statement uses the Range property of the Paragraphs collection to define a range named FirstPara that consists of the first paragraph of the active document.
+This statement doesn't use Start and End arguments because the starting point and ending point of the paragraph are clearly understood:
+
+    Set FirstPara = ActiveDocument.Paragraphs(1).Range
+
+The following statements change to uppercase the first three words at the start of a document:
+
+    Dim InitialCaps As Range
+    Set InitialCaps = ActiveDocument.Range _
+        (Start:=ActiveDocument.Words(1).Start, _
+        End:=ActiveDocument.Words(3).End)
+    InitialCaps.Case = wdUpperCase
+
+The first statement defines a Range object named InitialCaps. The second statement assigns InitialCaps to a range in the active document, from the beginning of the first word to the end of the third word. The third statement changes the case of the InitialCaps Range object to uppercase.
+
+Because InitialCaps is now defined as a Range object for the duration of the procedure that declares it, you can return to InitialCaps and manipulate it later in the procedure if you want to.
+
+## Redefining a Range
+
+To redefine a range to make it refer to another part of a document, use the SetRange method. The syntax is as follows:
+
+    _expression_.SetRange(Start, End)
+
+Here, _expression_ is a required expression that returns a Range or Selection object, and Start and End are optional arguments specifying the starting and ending points of the range.
+
+For example, the following statement redefines the range named InitialCaps so it now refers to the first two characters of the document:
+
+    InitialCaps.SetRange Start:=0, End:=2
+
+You can also redefine a range by reusing the Set method, creating the range again from scratch.
+
+## Using the Duplicate Property to Store or Copy Formatting
+
+You can use the Duplicate property to store or copy a range so that you can apply it to another range. For example, the following statements declare two ranges, Range1 and Range2; store a duplicate of the current selection's range in Range1; assign to Range2 the range of the first bookmark in the active document; and then apply the formatted contents of Range1 to Range2:
+
+    Dim Range1 As Range, Range2 As Range
+    Set Range1 = Selection.Range.Duplicate
+    Set Range2 = ActiveDocument.Bookmarks(1).Range
+    'apply Range1's formatted contents to Range2
+    Range2.FormattedText = Range1.FormattedText
+
+# Manipulating Options
+
+In your procedures, you'll often need to check the status of options in the Word application or in a particular document. In VBA, many of the options are controlled by the Options object, which has dozens of properties but no methods.
+
+Let's look now at four brief examples that show how to set options. Three of them use the Options object and one uses a property of the Document object. To see the full list of properties available for the Options object, look in the Help system.
+
+## Making Sure Hyperlinks Require Ctrl+Clicking
+
+Hyperlinks in Word documents have proved a mixed blessing—especially since Microsoft's changes to the way Word handles hyperlinks have left users unsure whether to just click or to Ctrl+click the hyperlink to follow it. You can set the CtrlClickHyperlinkToOpen property of the Options object to True to ensure that hyperlinks require Ctrl+clicking:
+
+    Options.CtrlClickHyperlinkToOpen = True
+
+Setting this option to False means you can trigger links by merely clicking them—no Ctrl key required.
+
+## Turning Off Overtype
+
+To make sure your procedures behave as expected, you may need to check that Word is using Insert mode rather than Overtype mode. (In Insert mode, Word inserts the characters you type at the insertion point, moving right any existing text to make room.
+In Overtype mode, each character you type replaces the character to the right of the insertion point.)
+
+Overtype mode is controlled by the Overtype property of the Options object. When Overtype is True, Overtype mode is on; when Overtype is False, Insert mode is on. The following statements store the user's current Overtype setting in a Boolean variable named blnOvertypeOn, set Overtype to False, perform the procedure's actions, and then restore the user's Overtype setting:
+
+    Dim blnOvertypeOn As Boolean
+    blnOvertypeOn = Options.Overtype
+    Options.Overtype = False
+    'write more code here to perform actions
+    Options.Overtype = blnOvertypeOn
+
+## Setting a Default File Path
+
+When configuring Word on a computer, you may need to make sure that its default file paths are set to the correct folders. You can do so by working with the DefaultFilePath property of the Options object. The syntax is as follows:
+
+    _expression_.DefaultFilePath(Path)
+
+Here, _expression_ is a required expression that returns an Options object. Often, it's easiest to use the Options object itself. Path is one of the self-explanatory enumerated constants shown in the following list:
+
+wdAutoRecoverPath | wdStyleGalleryPath
+---|---
+wdBorderArtPath | wdTempFilePath
+wdCurrentFolderPath | wdTextConvertersPath
+wdDocumentsPath | wdToolsPath
+wdGraphicsFiltersPath | wdTutorialPath
+wdPicturesPath | wdUserOptionsPath
+wdProgramPath | wdUserTemplatesPath
+wdProofingToolsPath | wdWorkgroupTemplatesPath
+wdStartupPath |
+
+For example, the following statements set the user templates path and the workgroup templates path:
+
+    Options.DefaultFilePath(wdUserTemplatesPath) = _
+        "c:\users\richard\appdata\roaming\microsoft\templates"
+    Options.DefaultFilePath(wdWorkgroupTemplatesPath) = _
+        "\\server\users\templates"
+
+## Turning Off Track Changes
+
+Before running a procedure that adds, deletes, or formats text, you may need to turn off the Track Changes feature so that the changes the procedure makes are not marked up in the text. If the user had Track Changes on, you should turn it back on at the end of the procedure so that changes the user makes are tracked again. Remember that it's usually a good practice when changing options to first store the user's current setting in a variable, carry out your procedure's task, and then restore the user's original setting.
+
+The following example saves the user's setting for the TrackRevisions option in the ActiveDocument object in a Boolean variable named blnTrackChangesOn, sets TrackRevisions to False, performs its actions, and then restores the user's TrackRevisions setting:
+
+    Dim blnTrackChangesOn As Boolean
+    blnTrackChangesOn = ActiveDocument.TrackRevisions
+    ActiveDocument.TrackRevisions = False
+    'write more code here to perform actions
+    ActiveDocument.TrackRevisions = blnTrackChangesOn
+
+## Accessing OneNote
+
+Earlier in this chapter you saw how to access the cloud: SkyDrive, Dropbox, and so on. It's uncomplicated. Dealing with OneNote is another matter because its contents are stored in the tricky XML format. _Tricky_ because reading and manipulating XML data is somewhat surreal. XML tries to be all things to all people, and the usual consequences of that effort ensue. XML comes in many, many versions; it mixes a "self-describing" metalanguage into its data; you'll find no standards for parsing (breaking the data down for reading and managing); and so on.
+
+When you write code to manage an ordinary text document, it's pretty simple because of VBA's string-manipulation features. Even managing a Word document is easy enough because VBA has so many built-in functions involving the Range, Selection, and other objects. Word effectively hides its internal formatting and other complexities, allowing you the option of handling the text simply as text.
+
+Not so with XML. As you'll see, just getting the metadata (information such as the name of a notebook) is heavy going.
+
+Why bother to explore this topic, then? Although VBA is not built into OneNote, VBA code in other Office 2013 applications can directly manipulate OneNote. And, because Microsoft is currently promoting OneNote, attempting to make it popular after all these years, I'm including this example demonstrating how to access it from the other Office applications.
+
+OneNote _is_ useful; it's actually quite rich in features and well integrated into the Windows and Office platforms. And now that versions of OneNote are available on everything from iOS to Android devices, perhaps Microsoft's dream will come true. So, if you should ever need to know how to contact OneNote from Word or some other Office application, read on.
+
+The following example fetches metadata from the user's OneNote. Before you try this code, choose Tools ⇒ References in the Editor and ensure that both Microsoft OneNote 15.0 Object Library and Microsoft XML, v6.0 are checked in the References dialog box.
+
+     1. Sub GetMetaData()
+     2.
+     3.    'If it's not currently running, OneNote will be launched
+     4.    Dim ONote As OneNote.Application
+     5.    Set ONote = New OneNote.Application
+     6.
+     7.    Dim strXML As String
+     8.
+     9.    ONote.GetHierarchy "", hsNotebooks, strXML, xs2010 'this fails if you use xs2013
+    10.
+    11.    MsgBox strXML
+    12. End Sub
+
+Lines 4 and 5 create an instance of OneNote and assign it to the ONote object variable. Next, we create a string variable in line 7 to hold the metadata. Line 9 uses the GetHierarchy method to fill strXML with the metadata; hsNotebooks represents the collection of notebooks in OneNote. The message box displays the results, as illustrated in Figure 20.4.
+
+Figure 20.4 Metadata fetched from OneNote
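+
+The Microsoft XML, v6.0 reference you just checked comes into play when you want to pull individual values out of strXML instead of just displaying it raw. What follows is a minimal sketch, not a definitive recipe: it assumes the hierarchy XML has the shape shown in Figure 20.4 (a root element whose child elements represent notebooks, each carrying a name attribute) and simply walks those children with MSXML:
+
+    Sub ListNotebookNames()
+        Dim ONote As OneNote.Application
+        Set ONote = New OneNote.Application
+
+        'Fetch the notebooks hierarchy, as in GetMetaData
+        Dim strXML As String
+        ONote.GetHierarchy "", hsNotebooks, strXML, xs2010
+
+        'Load the returned string into an XML parser
+        Dim xmlDoc As MSXML2.DOMDocument60
+        Set xmlDoc = New MSXML2.DOMDocument60
+        If Not xmlDoc.LoadXML(strXML) Then Exit Sub
+
+        'Walk the child elements and collect each name attribute
+        Dim node As MSXML2.IXMLDOMNode
+        Dim strNames As String
+        For Each node In xmlDoc.DocumentElement.ChildNodes
+            strNames = strNames & _
+                node.Attributes.getNamedItem("name").Text & vbCr
+        Next node
+
+        MsgBox strNames
+    End Sub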
+
+# The Bottom Line
+
+**Understand the Word object model.**
+
+Some people find viewing a schematic of the Word object model useful as a way of visualizing the interrelationships of the various objects and collections.
+
+Master It
+
+When you look at the Word Object Model Map, what is the highest object in the hierarchy—the object that contains all other objects?
+
+**Understand Word's creatable objects.**
+
+Word contains a set of creatable objects that VBA programmers will frequently employ in their code.
+
+Master It
+
+What is a creatable object?
+
+**Work with the Documents collection and the Document object.**
+
+The Documents collection represents all the currently open documents. Using VBA, you can manipulate this collection in a variety of ways.
+
+Master It
+
+Here is the syntax for creating a new document in the Documents collection:
+
+    Documents.Add Template, NewTemplate, DocumentType, Visible
+
+If you merely want to add a new, empty document (based on the default Normal.dotm template) to the documents currently open in Word, the code is quite simple. What is the code that you would write in VBA to accomplish this?
+
+**Work with the Selection object.**
+
+The Selection object represents the current selection in the active document in Word. A zone can be selected by the user by dragging the mouse or by using various key combinations (such as pressing Shift and an arrow key). A selection can include one or more objects—one or more characters, one or more words, one or more paragraphs, a graphic, a table, and so on. Or a combination of these objects.
+
+Master It
+
+One kind of selection is described as a _collapsed selection_. What is that?
+
+**Create and use ranges.**
+
+In Word, a _range_ is a named contiguous area of a document with a defined starting and ending point. The typical use of ranges in Word VBA is similar to how you use bookmarks when working interactively with Word: to mark a location in a document that you want to be able to access quickly or manipulate easily.
+
+Master It
+
+Although a range is similar to a bookmark, what is the significant difference between them?
+
+**Manipulate options.**
+
+Word contains many options that can be manipulated from within VBA.
+
+Master It
+
+In Word, one object controls many of the options. This object has dozens of properties but no methods. Name this object.
+
+Chapter 21
+
+Working with Widely Used Objects in Word
+
+In the previous chapter, you learned how to work with some of the main objects in the Word object model, such as Document objects, the Selection object, Range objects, and the Options object. This chapter shows you how to go further with VBA in Word by working with Find and Replace; with headers, footers, and page numbers; with sections, page setup, windows, and views; and with tables.
+
+In this chapter you will learn to do the following:
+
+ * Use Find and Replace via VBA
+ * Work with headers, footers, and page numbers
+ * Manage sections, page setup, windows, and views
+ * Manipulate tables
+
+# Using Find and Replace via VBA
+
+Word's Find and Replace tool can be very useful in your procedures. You can, for example, quickly adjust multiple styles throughout an entire document. Or you could automate the process of finalizing documents (spell-checking, revising corporate information, looking for out-of-date references, or whatever routinely needs to be done before publication).
+
+To access Word's Find and Replace features via VBA, you use the Find and Replacement objects. This section illustrates how to work with the Find object's Execute method, usually the best method to employ when working with Find. You usually specify the parameters for the Find operation as arguments in the Execute statement, but you can also specify them beforehand using properties if you prefer that approach.
+
+Table 21.1 describes the Find properties that are most useful for common search operations.
+
+Table 21.1 Properties of the Find object
+
+**Property** | **Meaning**
+---|---
+Font | Font formatting you're searching for (on either specified text or an empty string).
+Forward | A Boolean variable-type argument specifying whether to search forward (True) or backward (False) through the document.
+Found | A Boolean property that's True if the search finds a match and False if it doesn't.
+Highlight | A Long variable-type argument controlling whether highlighting is included in the formatting for the replacement text (True) or not (False).
+MatchAllWordForms | A Boolean property—True or False—corresponding to the Find All Word Forms check box.
+MatchCase | A Boolean property corresponding to the Match Case check box. If the user has this option deselected, be sure your code deselects it after you're finished with any case-sensitive searching in your procedure. See the sidebar "Practical Searching: Remember to Clear Formatting" later in this chapter.
+MatchSoundsLike | A Boolean property corresponding to the Sounds Like check box.
+MatchWholeWord | A Boolean property corresponding to the Find Whole Words Only check box.
+MatchWildcards | A Boolean property corresponding to the Use Wildcards check box.
+ParagraphFormat | Paragraph formatting you're searching for (on either specified text or an empty string).
+Replacement | Returns a Replacement object containing the criteria for a replace operation.
+Style | The style for the search text. Usually, you'll want to use the name of a style in the current template, but you can also use one of the built-in Word constant style names, such as wdStyleHeading1 (Heading 1 style).
+Text | The text you're searching for (what you'd enter in the Find What box in the Find And Replace dialog box). Use an empty string ("") to search only for formatting.
+Wrap | A Long property that governs whether a search that starts anywhere other than the beginning of a document (for a forward search) or the end of a document (for a backward search), or a search that takes place in a range, _wraps_ (continues) when it reaches the end or beginning of the document or the end or beginning of the selection.
+
+You use the Replacement object to specify the replace criteria in a replacement operation. The Replacement object has the following properties, which correspond to the properties of the Find object (but pertain to the replacement operation instead): Font, Highlight, ParagraphFormat, Style, and Text.
+
+## Understanding the Syntax for the _Execute_ Method
+
+The syntax for the Execute method is as follows:
+
+    expression.Execute(FindText, MatchCase, MatchWholeWord, MatchWildcards,
+        MatchSoundsLike, MatchAllWordForms, Forward, Wrap, Format,
+        ReplaceWith, Replace, MatchKashida, MatchDiacritics, MatchAlefHamza,
+        MatchControl, MatchPrefix, MatchSuffix, MatchPhrase, IgnoreSpace,
+        IgnorePunct)
+
+The final five arguments, starting with MatchPrefix, are not displayed in the Auto List Members tool in the Editor, but they can be used in code, as in, for example, IgnoreSpace:=True.
+
+The most commonly used arguments for this method are explained here:
+
+ * _expression_ is a required expression that returns a Find object. Usually, it's easiest to use the Find object itself.
+ * FindText is an optional Variant specifying the text for which to search. Although this argument is optional, you'll almost always want to specify it, even if you specify only an empty string ("") to allow you to search for formatting rather than text. (If you don't specify "" for FindText, you will inadvertently search for the previous text searched for, and the style you want to locate will never be found unless that text is also present.)
+ * You can search for special characters by using the same codes you use when working interactively (for example, ^p for a paragraph mark or ^t for a tab) and for wildcards by using the traditional Windows wildcards. For wildcards to work, you need to set MatchWildcards to True. You can search for a symbol by entering a caret and a zero, followed by its character code. For example, to search for a smart double closing quote, you'd specify **^0148** because its character code is 148.
+ * MatchCase is an optional Variant that you can set to True to make the search case sensitive.
+ * MatchWholeWord is an optional Variant that you can set to True to restrict the search to finding whole words rather than words contained in other words.
+ * MatchWildcards is an optional Variant that you can set to True to use wildcards in the search.
+ * MatchSoundsLike is an optional Variant that you can set to True to have Word find words that it thinks sound similar to the Find item specified.
+ * MatchAllWordForms is an optional Variant that you can set to True to have Word find all forms of the Find item specified (for example, different forms of the same verb or noun).
+ * Forward is an optional Variant that you can set to True to have Word search forward (from the beginning of the document toward the end) or False to have Word search backward.
+ * Wrap is an optional Variant that governs whether a search that begins anywhere other than the beginning of a document (for a forward search) or the end of a document (for a backward search), or that takes place in a range, _wraps_ (continues) when it reaches the end or beginning of the document. Word offers various options for Wrap, as detailed in Table 21.2.
+
+Table 21.2 Options for Wrap offered by Word
+
+**Constant** | **Value** | **Meaning**
+---|---|---
+wdFindAsk | 2 | Word searches the selection or range—or from the insertion point to the end or beginning of the document—and then displays a message box prompting the user to decide whether to search the rest of the document.
+wdFindContinue | 1 | Word continues to search after reaching the end or beginning of the search range or the end or beginning of the document.
+wdFindStop | 0 | Word stops the Find operation upon reaching the end or beginning of the search range or the end or beginning of the document.
+
+ * Format is an optional Variant that you can set to True to have the search operation find formatting as well as (or instead of) any Find text you've specified.
+ * ReplaceWith is an optional Variant specifying the replacement text. You can use an empty string for ReplaceWith to simply remove the FindText text; you can also use special characters for ReplaceWith as you can for the FindText argument. To use a graphic object, copy it to the Clipboard and then specify **^c** (which stands for the contents of the Clipboard).
+
+* * *
+
+How to Use Graphic Objects with ReplaceWith
+
+To use a graphic object as described in the bulleted item that explains the ReplaceWith argument, the graphic needs to be in the text layer (not floating over text). If the graphic is floating over text, ^c pastes in the previous text contents of the Clipboard.
+
+* * *
+
+ * Replace is an optional Variant that controls how many replacements the Find operation makes: one (wdReplaceOne), all (wdReplaceAll), or none (wdReplaceNone).
+ * MatchPrefix is an optional Variant that allows you to search for a string of characters at the start of words, but not if any other character(s) precede the string. Here's how it works: If you leave MatchPrefix and MatchWholeWord set to False and then search for **real**, you'll get hits on any word that contains that string, such as _real_, _surreal_, _realtime_, _boreal_, and so on. Any word with _real_ in it will be a hit. But set MatchWholeWord to True, and only the string itself, **real**, will produce a hit. Leave MatchWholeWord set to False and set MatchPrefix to True, and only words that begin with _real_ will hit, such as _real_ and _realtime_. Words like _surreal_ fail to qualify because they don't begin with the target string.
  * MatchSuffix is an optional Variant that works the same way as MatchPrefix, except MatchSuffix allows you to search for a string of characters at the end of a word but not if any other characters follow the string. Using the example in the previous bullet, with MatchSuffix set to True, you would get hits on surreal and boreal but not realtime.
  * MatchPhrase is an optional Variant that, when set to True, ignores any control characters (such as paragraph or tab characters) or white space (one or more space characters) between words. With MatchPhrase set to True, **This** followed by **phrase** on the next line becomes equivalent to **this phrase**.
  * IgnoreSpace is an optional Variant that ignores any white space between words but that does not ignore control characters.
  * IgnorePunct is an optional Variant that ignores all punctuation characters between words in a search phrase.
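To see how several of these arguments combine in practice, here's a minimal sketch (the search string and the messages are illustrative, not from this chapter's examples) that runs a case-sensitive, whole-word search and stops at the end of the document instead of wrapping:

    Dim fnd As Find
    Set fnd = ActiveDocument.Content.Find
    fnd.ClearFormatting    'clear leftover criteria; see the sidebar below
    If fnd.Execute(FindText:="real", MatchCase:=True, _
            MatchWholeWord:=True, Forward:=True, _
            Wrap:=wdFindStop) Then
        'Execute returns True when a match is found; the Find object's
        'parent range is redefined to cover the found text
        MsgBox "Found a match ending at position " & fnd.Parent.End
    Else
        MsgBox "No match found."
    End If

Because Find criteria persist, as the following sidebar explains, the ClearFormatting call at the start is cheap insurance.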
* * *

**Practical Searching: Remember to Clear Formatting**

One behavior in Word that can puzzle even experienced users and developers results from the fact that Find settings _persist_. For example, say that you search for a style such as _Heading 1_. All goes well, you find the headings, and you close the Find And Replace dialog box (or, if you're searching using VBA code, your macro finishes execution).

Then somewhat later you run another macro that searches or replaces—or you use the Find And Replace dialog box in Word to look for a word such as _program_. You know that the word _program_ appears many times in your document, but the Find utility displays a message stating that "The search item was not found." What's wrong?

The problem is that your original search for the heading style persists during your session with Word. Even switching to a different document during the current session will not clear the search criteria—including any style, font, or other special search criteria, such as MatchCase, that may have been employed.

In other words, you're now searching for the word _program_ but _also_ for the style _Heading 1_. So all the instances of _program_ in regular body text do not trigger hits. They are not in the specified style.

If you search for a style but fail to click the No Formatting button in the Find And Replace dialog box when you've finished, that style search remains active.

Likewise, when you use the Find object and the Replacement object in a procedure, you'll often need to use the ClearFormatting method, which clears any formatting specified under the Find What box or the Replace With box. Using the ClearFormatting method has the same effect as clicking the No Formatting button with the focus on the Find What box or the Replace With box. The following statements (here used within a With structure) clear formatting from the Find and Replacement objects, respectively:

    With ActiveDocument.Content.Find
        .ClearFormatting
        .Replacement.ClearFormatting
    End With

It's a good idea to get into the habit of using the ClearFormatting method at the end of any macro that searches for styles or other special formatting. And it doesn't hurt to use the ClearFormatting method at the _start_ of any macro that searches for anything, as well. Sometimes unnecessary? Potentially redundant? Sure. But it's good insurance against this common and puzzling bug.

A similar situation occurs when you employ the Execute method, as described earlier in this chapter. Remember that when using Execute, you should almost always specify the FindText argument—even if you specify only an empty string ("") to allow you to search for formatting. If you don't specify FindText, you run the risk of searching inadvertently for the string searched for previously.

* * *

## Putting Find and Replace to Work

The simplest way to use Find and Replace is to specify only as many parameters as you need in an Execute statement, leaving out any optional parameters that are irrelevant to your search. With long argument lists, it's always better to use the named-argument approach, like this:

    FindText:="National Velvet"

For example, to replace all pairs of paragraph marks in the active document with single paragraph marks (removing empty lines), you could search for **^p^p** and replace it with **^p** with the following statement:

    ActiveDocument.Content.Find.Execute FindText:="^p^p", _
        ReplaceWith:="^p", Replace:=wdReplaceAll

By running this statement in a loop, you could replace all extra paragraph marks in the document. You would have to employ a loop here because the wdReplaceAll constant specifies that the find-and-replace activity should go through the entire document only once.

It's necessary to loop because you might have multiple paragraph marks in clusters, such as four in a row: ^p^p^p^p. The first pass through the document would replace those four with two (^p^p), so you'd need to go through again to reduce these to the desired single ^p. In other words, in this case you must search and replace more than once. (A sketch of such a loop appears at the end of this section.)

You can also use a With statement to specify the properties for a Find and Replace operation. Listing 21.1 shows an example of this. The code changes all bold formatting in the open document named Example.docm to italic formatting.

**Listing 21.1**: Using With to specify properties

     1. With Documents("Example.docm").Content.Find
     2.     .ClearFormatting
     3.     .Font.Bold = True
     4.     With .Replacement
     5.         .ClearFormatting
     6.         .Font.Bold = False
     7.         .Font.Italic = True
     8.     End With
     9.     .Execute FindText:="", ReplaceWith:="", _
                Format:=True, Replace:=wdReplaceAll
    10. End With

  * Here, line 1 identifies the Document object (Example.docm in the Documents collection) with which to work and begins a With statement with its Find object.
  * Line 2 uses the ClearFormatting method to clear any formatting from the Find object, and
  * Line 3 then sets the Bold property of its Font object to True.
  * Lines 4 through 8 contain a nested With statement for the Replacement object.
  * Line 5 uses the ClearFormatting method to clear formatting from the Replacement object,
  * Line 6 sets its Bold property to False, and
  * Line 7 sets its Italic property to True.
  * Line 9 then uses the Execute method to execute the replacement operation. Both FindText and ReplaceWith here are specified as empty strings to cause Word to work with formatting only; Format is set to True to activate the formatting set in the Find and Replacement objects, and Replace is set to wdReplaceAll to replace all instances of the bold formatting with the italic formatting.
  * Line 10 ends the outer With statement.
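Before moving on, here's the paragraph-marks loop promised earlier in this section, sketched on the assumption that you want to keep making replace-all passes until no doubled paragraph marks remain. Execute returns True whenever it found (and here, replaced) something, so the loop ends after the first pass that makes no replacements:

    'Keep replacing ^p^p with ^p until a pass finds nothing to replace
    Do While ActiveDocument.Content.Find.Execute( _
            FindText:="^p^p", ReplaceWith:="^p", _
            Replace:=wdReplaceAll, Wrap:=wdFindStop)
    Loop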
# Working with Headers, Footers, and Page Numbers

The following sections show you how to work with headers and footers in Word documents. You'll also learn how to use VBA to manipulate page numbers, which are often included in headers and footers.

## Understanding How VBA Implements Headers and Footers

You can create several types of headers and footers in a Word document: the primary header and footer, unique first-page-only headers and footers, special headers and footers that appear only on the even pages—even different sets of headers and footers for each of the sections in a document if need be.

Every document automatically gets a primary header and a primary footer, even if you don't put anything in them. You can then create different first-page and even-page headers by changing the Page Setup options for the section. (Click the Page Layout tab on the Ribbon, then click the small arrow in the lower-right corner of the Page Setup zone. This opens the Page Setup dialog box; click the Layout tab. Note, however, that the primary header and footer features are accessed from the Insert tab on the Ribbon.)

VBA uses the following objects for headers and footers:

  * Both headers and footers are contained in HeaderFooter objects. You access headers through the Headers property and footers through the Footers property.
  * The HeadersFooters collection contains all the HeaderFooter objects in a given section of a document. Because each section of a document can have different headers and footers than the other sections have, you reach any given header or footer by going through the section.
  * To return the HeadersFooters collection, you use the Headers property or the Footers property of the appropriate Section object in the appropriate Document object. Alternatively, you can use the HeaderFooter property of the Selection object to return a single HeaderFooter object, but this approach tends to be more limited in its use.
  * The HeaderFooter object gives access to the Range object, the Shapes collection, and the PageNumbers collection.

## Getting to a Header or Footer

You access a header or footer through the appropriate _section_ within the document. For example, the following statement displays a message box containing the text in the first-page footer that's in the second section of the open document Transfer.docm:

    MsgBox Documents("Transfer.docm").Sections(2). _
        Footers(wdHeaderFooterFirstPage).Range.Text

The following statements declare the HeaderFooter object variable myHeader and assign to it the primary header in the first section in the active document:

    Dim myHeader As HeaderFooter
    Set myHeader = ActiveDocument.Sections(1).Headers _
        (wdHeaderFooterPrimary)

## Checking to See If a Header or Footer Exists

Recall that Word automatically creates a primary header and primary footer for each document, so these objects always exist. To find out whether other types of headers or footers exist, check the Exists property of the appropriate HeaderFooter object. The following statements check to see whether the even-pages header exists in each section in turn in the active document and, where it doesn't, create a generic even-pages header (containing the section number and the full name of the document) formatted with the style named Header (which exists by default in Word documents):

    Dim cSection As Section
    With ActiveDocument
        For Each cSection In .Sections
            If Not cSection.Headers(wdHeaderFooterEvenPages).Exists Then
                cSection.PageSetup.OddAndEvenPagesHeaderFooter = True
                cSection.Headers(wdHeaderFooterEvenPages).Range.Text _
                    = "Section " & cSection.Index & " of " & .FullName
                cSection.Headers(wdHeaderFooterEvenPages).Range.Style _
                    = "Header"
            End If
        Next cSection
    End With
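As a quick complement, here's a minimal sketch (the section number and messages are illustrative) that reads a first-page footer's text only when that footer actually exists:

    'Check Exists before touching a header or footer that may not be there
    With ActiveDocument.Sections(1).Footers(wdHeaderFooterFirstPage)
        If .Exists Then
            MsgBox .Range.Text
        Else
            MsgBox "Section 1 has no separate first-page footer."
        End If
    End With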
## Linking to the Header or Footer in the Previous Section

By default, Word links the header and footer in each section after the first to the header and footer in the previous section. To break the link, set the LinkToPrevious property of the header or footer to False; to create the link, set this property to True. The following statement unlinks the primary footer in the third section of the active document from the corresponding footer in the second section:

    ActiveDocument.Sections(3).Footers _
        (wdHeaderFooterPrimary).LinkToPrevious = False

## Creating a Different First-Page Header

To create a different header on the first page of a section, set the DifferentFirstPageHeaderFooter property of the PageSetup object for the section to True. The following statements check to see if the 10th section of the active document contains a first-page header and create one if it doesn't:

    With ActiveDocument.Sections(10)
        If .Headers(wdHeaderFooterFirstPage).Exists = False Then _
            .PageSetup.DifferentFirstPageHeaderFooter = True
    End With

## Creating Different Odd- and Even-Page Headers

To produce different headers for odd and even pages of your document (other than the first page), create an even-page header. The primary header by default appears on both odd and even pages until you create an even-page header, at which point the primary header becomes the odd-page header.

As with the first-page header, you work through the PageSetup object to create a different even-page header, setting the OddAndEvenPagesHeaderFooter property to True, as in the following statement:

    ActiveDocument.Sections(1).PageSetup.OddAndEvenPagesHeaderFooter = True

* * *

Use Nested Loops to Modify Headers and Footers

If you write procedures to format documents, you may need to check or change all the headers and footers in a document. The easiest way to do so is to use two For Each... Next loops, the outer loop working through each Section object in the Sections collection and the inner loop working through each HeaderFooter object in the HeadersFooters collection within that section. (A sketch of this pattern appears at the end of this page-numbers discussion.)

* * *

## Adding Page Numbers to Your Headers and Footers

A header or footer of a document often contains a page number: either a simple number in a straightforward format (1, 2, 3, and so on) or a more complex number denoting the chapter and page within it, separated by a separator character.

VBA implements page numbers through a PageNumbers collection that you return by using the PageNumbers property of the appropriate HeaderFooter object within the appropriate section of the document.

### Adding Page Numbers to One or More Sections of a Document

To add page numbers to a document, use the Add method with the PageNumbers collection for the appropriate section of the document.

The syntax for the Add method is as follows:

    _expression_.Add PageNumberAlignment, FirstPage

Here, _expression_ is a required expression that returns a PageNumbers collection. Usually, you'll use the PageNumbers collection itself.

PageNumberAlignment is an optional Variant argument specifying the alignment for the page numbers being added. Table 21.3 lists the constants and values you can use.
Table 21.3 PageNumberAlignment constants and values

**Constant** | **Value** | **Resulting Alignment**
---|---|---
wdAlignPageNumberLeft | 0 | Left
wdAlignPageNumberCenter | 1 | Centered
wdAlignPageNumberRight | 2 | Right (default)
wdAlignPageNumberInside | 3 | Inside margin (right on left-hand pages, left on right-hand pages)
wdAlignPageNumberOutside | 4 | Outside margin (left on left-hand pages, right on right-hand pages)

FirstPage is an optional Variant argument that you can set to False to make the header and footer on the first page suppress the page number. If you omit the FirstPage argument, the DifferentFirstPageHeaderFooter property of the PageSetup object controls whether the header and footer on the first page are the same as or different from those on the other pages in the section.

Both the PageNumberAlignment argument and the FirstPage argument are optional, but you'll usually want to specify at least the PageNumberAlignment argument.

The following subprocedure adds page numbers to all the headers in each section of a document by using two For Each... Next loops:

    Sub AddPageNumbersToAllHeadersAndSections()
        Dim cHeader As HeaderFooter, cSection As Section
        With Documents("Headers and Footers.docm")
            For Each cSection In .Sections
                For Each cHeader In cSection.Headers
                    cHeader.PageNumbers.Add _
                        PageNumberAlignment:=wdAlignPageNumberRight, _
                        FirstPage:=True
                Next cHeader
            Next cSection
        End With
    End Sub

### Removing Page Numbers from One or More Sections of a Document

To remove a page number from a page, specify the PageNumber object and use the Delete method. The following subprocedure removes each PageNumber object from the current section of the active document:

    Sub RemovePageNumbersFromCurrentSection()
        Dim ThisHeader As HeaderFooter
        Dim ThisPageNumber As PageNumber
        With Selection.Sections(1)
            For Each ThisHeader In .Headers
                For Each ThisPageNumber In ThisHeader.PageNumbers
                    ThisPageNumber.Delete
                Next ThisPageNumber
            Next ThisHeader
        End With
    End Sub

### Finding Out If a Section of a Document Has Page Numbers

The easiest way to find out if any given page number exists is to check the Count property of the PageNumbers collection for the appropriate section. For example, the following statement adds centered page numbers to the even-pages header in the current section if the header doesn't already have them:

    If Selection.Sections(1).Headers(wdHeaderFooterEvenPages) _
        .PageNumbers.Count = 0 Then Selection.Sections(1) _
        .Headers(wdHeaderFooterEvenPages).PageNumbers.Add _
        PageNumberAlignment:=wdAlignPageNumberCenter
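If you want a quick inventory rather than a single check, a small sketch like this reports the page-number count for each section's primary header to the Immediate window (the choice of the primary header is illustrative):

    'Report how many PageNumber objects each section's primary header holds
    Dim cSection As Section
    For Each cSection In ActiveDocument.Sections
        Debug.Print "Section " & cSection.Index & ": " & _
            cSection.Headers(wdHeaderFooterPrimary).PageNumbers.Count & _
            " page number(s)"
    Next cSection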
### Changing the Page Numbering for a Section

To change the page numbering for a section, you work with the StartingNumber property, using the RestartNumberingAtSection property, the IncludeChapterNumber property, and the ChapterPageSeparator property as necessary.

The StartingNumber property is a Long property that contains the starting page number for the section when the RestartNumberingAtSection property is set to True. When the RestartNumberingAtSection property is set to False, StartingNumber returns 0 (zero). The following statements set the page numbering for the primary header in the fourth section of the active document to start at 55 if it doesn't currently have a starting number assigned:

    With ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary)
        If .PageNumbers.StartingNumber = 0 Then
            .PageNumbers.RestartNumberingAtSection = True
            .PageNumbers.StartingNumber = 55
        End If
    End With

To add the chapter number to the page numbers, use heading numbering in your document. Set the IncludeChapterNumber property to True, and specify the separator to use (for example, wdSeparatorEnDash for an en dash):

    With ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary) _
        .PageNumbers
        .IncludeChapterNumber = True
        .ChapterPageSeparator = wdSeparatorEnDash
    End With

### Suppressing the Page Number for the First Page

To suppress the page number for the first page in a section, set the ShowFirstPageNumber property for the appropriate HeaderFooter object in the appropriate section to False:

    ActiveDocument.Sections(1).Footers(wdHeaderFooterPrimary).PageNumbers _
        .ShowFirstPageNumber = False

### Formatting Page Numbers

You can format page numbers in two ways: by setting the format in which they're displayed (for instance, as regular Arabic numbers or as lowercase Roman numerals) and by formatting the font in which that format is displayed.

To choose the format in which the page numbers are displayed, set the NumberStyle property of the PageNumbers collection in question. For example, the following statement formats the page numbers in the primary header in the fourth section of the active document as lowercase letters:

    ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary) _
        .PageNumbers.NumberStyle = wdPageNumberStyleLowercaseLetter

Once the page numbers are in the header or footer, you can format them in any of several ways. One easy way to set the font in which a given page number is formatted is to use the Select method to select the PageNumber object and then apply formatting to it as you would any other selection, as in the following statements:

    ActiveDocument.Sections(4).Headers(wdHeaderFooterPrimary) _
        .PageNumbers(1).Select
    With Selection.Font
        .Name = "Impact"
        .Size = 22
        .Bold = True
    End With

### Creating "Page X of Y"–Type Page Numbers

You can also implement page numbering by using Word's field codes in the header or footer. This technique is especially useful when you want to number the pages with an "X of Y" numbering scheme—"Page 168 of 192" and so on. The following statements select the primary header for the final section of the active document, apply center alignment, and enter the text and fields to produce this type of numbering:

    ActiveDocument.Sections(ActiveDocument.Sections.Count) _
        .Headers(wdHeaderFooterPrimary).Range.Select
    With Selection
        .Paragraphs(1).Alignment = wdAlignParagraphCenter
        .TypeText Text:="Page "
        .Fields.Add Range:=Selection.Range, Type:=wdFieldEmpty, Text:= _
            "PAGE ", PreserveFormatting:=True
        .TypeText Text:=" of "
        .Fields.Add Range:=Selection.Range, Type:=wdFieldEmpty, Text:= _
            "NUMPAGES ", PreserveFormatting:=True
    End With

If you insert a page number by using a field in this way, you can still access the page number by using the appropriate PageNumber object. (In this case, the PageNumber object consists of the PAGE field, not of the NUMPAGES field.)
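To close out this topic, here's a sketch of the nested-loop pattern recommended in the "Use Nested Loops to Modify Headers and Footers" sidebar earlier; the formatting applied inside the loops (a 9-point font) is purely illustrative:

    'Visit every header and footer in every section of the active document
    Dim cSection As Section, cHF As HeaderFooter
    For Each cSection In ActiveDocument.Sections
        For Each cHF In cSection.Headers
            If cHF.Exists Then cHF.Range.Font.Size = 9
        Next cHF
        For Each cHF In cSection.Footers
            If cHF.Exists Then cHF.Range.Font.Size = 9
        Next cHF
    Next cSection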
# Working with Sections, Page Setup, Windows, and Views

Each Word document contains at least one _section_ by default and can contain multiple sections as needed for its contents and layout. The section of the document controls the page layout, so different sections of a document can use different page layouts if necessary.

## Adding a Section to a Document

You can add a section to a document either by using the Add method with the Sections collection or by using the InsertBreak method with a Range or Selection object.

The Add method has the following syntax:

    _expression_.Add Range, Start

Here, _expression_ is a required expression that returns a Sections collection. Range is an optional Variant argument specifying the range at the beginning of which to insert the break. (If you omit Range, VBA inserts the break at the end of the document.) Start is an optional Variant argument used to specify the type of section break to insert:

  * wdSectionContinuous (0) for a continuous break
  * wdSectionEvenPage (3) for an even-page break
  * wdSectionOddPage (4) for an odd-page break
  * wdSectionNewColumn (1) for a new-column break
  * wdSectionNewPage (2, the default) for a new-page break

The following statements add a new-page section to the active document, placing it before the second paragraph:

    With ActiveDocument
        .Sections.Add _
            Range:=.Range(Start:=.Paragraphs(2).Range.Start, _
            End:=.Paragraphs(2).Range.Start), _
            Start:=wdSectionNewPage
    End With

The InsertBreak method takes the following syntax:

    _expression_.InsertBreak Type

Here, _expression_ is a required expression that returns a Selection or Range object. Type is an optional Variant argument specifying the type of section break to be inserted:

  * wdSectionBreakNextPage (2) for a new-page break
  * wdSectionBreakContinuous (3) for a continuous break
  * wdSectionBreakEvenPage (4) for an even-page break
  * wdSectionBreakOddPage (5) for an odd-page break
  * wdColumnBreak (8) for a new-column break

The following statement inserts a continuous section break before the second paragraph in the active document:

    ActiveDocument.Paragraphs(2).Range.InsertBreak _
        Type:=wdSectionBreakContinuous

## Changing the Page Setup

To change the page setup of a document or a section, you work with the PageSetup object of the appropriate Document object or Section object. For example, the following statements work with the PageSetup object of the document named Planning.docm, setting letter-size paper, portrait orientation, and mirror margins. The margin properties take measurements in points, so the inch values here are converted with the InchesToPoints function:

    With Documents("Planning.docm").PageSetup
        .PaperSize = wdPaperLetter
        .Orientation = wdOrientPortrait
        .TopMargin = InchesToPoints(1)
        .BottomMargin = InchesToPoints(1)
        .LeftMargin = InchesToPoints(1)
        .RightMargin = InchesToPoints(1.5)
        .MirrorMargins = True
    End With

## Opening a New Window Containing an Open Document

To open a new window containing an open document, use the Add method. Its syntax is straightforward:

    _expression_.Add _window_

Here, _expression_ is an expression that returns a Windows collection, and _window_ is an optional Variant argument specifying the window containing the document for which you want to open a new window. If you omit _window_, VBA opens a new window for the active document.

* * *

Understanding the Two Windows Collections

There are two Windows collections: one for the application and one for the windows displaying the document with which you're working.
The Windows collection for the Document object can be useful if you have multiple windows open for the same document (as you can do by clicking the Ribbon's View tab, then clicking the New Window button in the Window section of the Ribbon), but usually you'll want to use the Windows collection for the Application object. Windows is a creatable object, so you don't need to specify the Application object.

* * *

For example, the following statements open a new window for the first window open for the active document, assigning the window to the variable myWindow:

    Dim myWindow As Window
    Set myWindow = Windows.Add(Window:=ActiveDocument.Windows(1))

## Closing All Windows Except the First for a Document

Occasionally, it's useful to open one or more new windows for a document. If you do so, sooner or later you'll need to close all the secondary windows to give yourself more room to maneuver. The following statements close all windows except the first for the active document:

    Dim myWin As Window, myDoc As String
    myDoc = ActiveDocument.Name
    For Each myWin In Windows
        If myWin.Document.Name = myDoc Then _
            If myWin.WindowNumber <> 1 Then myWin.Close
    Next myWin

## Splitting a Window

To split a window into two parts horizontally, set its Split property to True. To specify the split percentage (which controls how far down the window, measuring vertically, the split is placed), set the SplitVertical property. The following statements split the active window 70 percent of the way down the window:

    With ActiveWindow
        .Split = True
        .SplitVertical = 70
    End With

To remove the split from the window, set the Split property to False:

    ActiveWindow.Split = False

* * *

Try Snapping Windows

Windows 7 and 8 have a nice feature that you might want to use instead of Word's internal split window. Drag a window by its title bar to the left side of the screen. Drag another window to the right side. They snap and automatically take up half the screen each.

* * *

## Displaying the Document Map for a Window

To display the Document Map for a window at the Document Map's previous width percentage (of the entire window), set the DocumentMap property to True:

    ActiveWindow.DocumentMap = True

To display the Document Map at a different width, or to change the width of the Document Map, set the DocumentMapPercentWidth property to a suitable percentage of the window's width:

    ActiveWindow.DocumentMapPercentWidth = 25

To hide the Document Map again, set the DocumentMap property to False or set the DocumentMapPercentWidth property to 0.

## Scrolling a Window

To scroll a window up, down, left, or right, use either the LargeScroll method or the SmallScroll method.

The LargeScroll method is analogous to clicking within the empty part of the scroll bar (not on the scroll box or on one of the scroll arrows at the ends of the bar); it scrolls the contents of the window by one entire "screen." The SmallScroll method is analogous to clicking a scroll arrow; it scrolls the contents of the window up or down by one line. If you're working with a horizontal scroll bar, the contents move left or right by a small scroll increment.

The syntax for the LargeScroll method is as follows:

    _expression_.LargeScroll( _Down, Up, ToRight, ToLeft_ )

The syntax for the SmallScroll method is almost identical:

    _expression_.SmallScroll( _Down, Up, ToRight, ToLeft_ )

Here, _expression_ is a required expression that returns a Window object.
Down, Up, ToRight, and ToLeft are optional Variant arguments that specify the number of screens (for LargeScroll) or lines or horizontal movement units (for SmallScroll) to scroll the contents of the window in the directions their names indicate.

The following statement scrolls the active window up two screens:

    ActiveWindow.LargeScroll Up:=2

## Arranging Windows

To arrange a number of windows, use the Arrange method. The syntax for the Arrange method is as follows:

    _expression_.Arrange ArrangeStyle

Here, _expression_ is an expression that returns a Windows collection, and ArrangeStyle is an optional Variant argument that specifies how to arrange the windows: as icons (wdIcons, 1) or tiled (wdTiled, 0). The default is wdTiled.

For example, the following statement tiles the open windows:

    Windows.Arrange ArrangeStyle:=wdTiled

## Positioning and Sizing a Window

To position a window on the monitor, set its Left and Top properties, as in this example:

    ActiveWindow.Left = 100
    ActiveWindow.Top = 200

To size a window, set its Height and Width properties:

    With ActiveWindow
        .Height = 300
        .Width = 400
    End With

To maximize, minimize, or "restore" a window, set its WindowState property to wdWindowStateMaximize, wdWindowStateMinimize, or wdWindowStateNormal, respectively. The following statements maximize the window containing the document named Example.docm if the window is minimized:

    With Documents("Example.docm").Windows(1)
        If .WindowState = wdWindowStateMinimize Then _
            .WindowState = wdWindowStateMaximize
    End With

## Making Sure an Item Is Displayed in the Window

After opening or arranging windows, you'll often need to make sure an item you want the user to see—a range, some text, a graphic or other shape, or a field—is displayed in the window. The easiest way to do so is to use the ScrollIntoView method of the Window object. This method moves the view but not the selection, so if you need the selection to move as well, you'll need to write additional code to move it there.

The ScrollIntoView method takes the following syntax:

    _expression_.ScrollIntoView(Obj, Start)

Here, _expression_ is a required expression that returns a Window object. Obj is a required argument specifying a Range or Shape object. Start is an optional Boolean argument that you can set to True (the default) to have the upper-left corner of the range or shape displayed, or False to have the lower-right corner displayed. Specify False for Start when you need to make sure the end of a range or shape that may be larger than the window is displayed.

The following statements position the selection at the end of the last paragraph in the first list in the active document, ready to add a new paragraph to the list:

    Dim rngFirstList As Range
    Set rngFirstList = ActiveDocument.Lists(1).Range
    ActiveDocument.Windows(1).ScrollIntoView Obj:=rngFirstList, _
        Start:=False
    rngFirstList.Select
    Selection.Collapse Direction:=wdCollapseEnd
    Selection.MoveLeft Unit:=wdCharacter, Count:=1, Extend:=wdMove

## Changing a Document's View

To change a document's view, set the Type property of the View object for the appropriate window to wdConflictView, wdMasterView, wdNormalView, wdOutlineView, wdPrintPreview, wdPrintView, wdReadingView, or wdWebView.
For example, the following statement changes the view for Sample.docm to Print Layout view:

    Documents("Sample.docm").Windows(1).View.Type = wdPrintView

## Switching to Read Mode

Read mode hides the Ribbon, any markup, and nearly everything else except the text itself. Panes, however, such as Navigation and Thesaurus, do remain visible. The text itself is usually displayed as two or three pages (depending on your zoom level) side by side as in a book. You cannot edit in this view. Here's how to switch to read mode:

    ActiveDocument.ActiveWindow.View.Type = wdReadingView

Read mode is thoughtfully designed to make the content as easy to read and remember as possible. For example, the zoom feature (lower right) adjusts the font size but repaginates (reflows) so you never have to struggle with moving a horizontal scroll bar to show hidden text. There _is_ a scroll bar, but it's never needed to display text that's out of view because of the zoom level. The scroll bar is strictly for global document navigation and serves as an indicator of your current position.

Read mode also gives you some control over column width. Most people find it easier to read shorter lines of text, so you can adjust line length in the View menu. The Esc key exits read mode.

## Zooming the View to Display Multiple Pages

To zoom Print Layout view or Print Preview to display multiple pages, set the PageColumns and PageRows properties of the appropriate View object. (Change the view first if necessary.) The following statement displays Sample.docm in Print Layout view with six pages displayed (three across by two deep):

    With Documents("Sample.docm").Windows(1).View
        .Type = wdPrintView
        With .Zoom
            .PageColumns = 3
            .PageRows = 2
        End With
    End With

# Working with Tables

Many people need to work with tables in their Word documents, either creating them from scratch or manipulating existing tables.

VBA uses a Table object to represent each individual table. The Table objects in a document are gathered together into the Tables collection. To work with tables, you use the Tables property to return the Tables collection for the Document, Range, or Selection object in question.

Here is a sample of the collections and objects that are members of the Tables collection and the Table object:

  * The Rows collection contains the rows in the table. Each row is represented by a Row object.
  * The Columns collection contains the columns in the table. Each column is represented by a Column object.
  * The Cell object provides access to a specified cell directly from the Table object. You can also reach the cells in the table by going through the row or column in which they reside.
  * The Range object provides access to ranges within the table.
  * The Borders collection contains all the borders for the table.
  * The Shading object contains all the shading for the table.

For complete lists of the members of the Table object and of the Tables collection in Word 2013, see the MSDN reference pages for those two types.
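Before manipulating tables, it can help to see the collection in action. Here's a quick sketch that walks the active document's Tables collection and prints each table's dimensions to the Immediate window:

    'Enumerate the Tables collection of the active document
    Dim myTable As Table, i As Long
    For Each myTable In ActiveDocument.Tables
        i = i + 1
        Debug.Print "Table " & i & ": " & myTable.Rows.Count & _
            " rows x " & myTable.Columns.Count & " columns"
    Next myTable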
## Creating a Table

To create a new table from scratch (rather than converting existing text to a table), use the Add method with the Tables collection. The Add method takes the following syntax for the Tables collection:

    _expression_.Add(Range, NumRows, NumColumns, DefaultTableBehavior, AutoFitBehavior)

The arguments are as follows:

  * _expression_ is a required expression that returns a Tables collection. Typically, you'll want to use the Tables collection for the appropriate document.
  * Range is a required argument supplying the range where you want to insert the table. If the range is a selection (rather than being a collapsed selection, or insertion point), the table replaces the range.
  * NumRows is a required Long argument specifying the number of rows the table is to have.
  * NumColumns is a required Long argument specifying the number of columns the table is to have.
  * DefaultTableBehavior is an optional Variant argument specifying whether the table autofits its columns to their contents or to the window when you change the contents or the window width. Use wdWord9TableBehavior to have the table autofit its columns or wdWord8TableBehavior (the default) to have the columns retain their width.
  * AutoFitBehavior is an optional Variant argument specifying the autofit behavior for the table. This argument applies only when DefaultTableBehavior is wdWord9TableBehavior. Use wdAutoFitContent to resize the columns to their contents, wdAutoFitWindow to resize the columns to the window width, or wdAutoFitFixed to use a fixed column width.

For example, the following statement inserts a new, blank, non-autofitting table containing 10 rows and 5 columns at the current position of the insertion point in the active document:

    ActiveDocument.Tables.Add Range:=Selection.Range, NumRows:=10, _
        NumColumns:=5, DefaultTableBehavior:=wdWord8TableBehavior

## Selecting a Table

To select a table, specify the Document, Range, or Selection object involved, and then identify the Table object and use the Select method. This method takes no arguments.

The following statement selects the first table in the active document:

    ActiveDocument.Tables(1).Select

The following statements declare the variable tempTable, select the first table in the document named Log.docm, and assign that table's Range object to tempTable:

    Dim tempTable As Range
    Documents("Log.docm").Tables(1).Select
    Set tempTable = Selection.Tables(1).Range

The following statement selects the second table in the range named tempRange:

    tempRange.Tables(2).Select

This statement selects the first table in the current selection:

    Selection.Tables(1).Select

## Converting Text to a Table

To convert ordinary text to a table (as opposed to inserting a new table from scratch), use the ConvertToTable method with an appropriate Range or Selection object. The ConvertToTable method takes the following syntax:

    _expression_.ConvertToTable(Separator, NumRows, NumColumns,
        InitialColumnWidth, Format, ApplyBorders, ApplyShading, ApplyFont,
        ApplyColor, ApplyHeadingRows, ApplyLastRow, ApplyFirstColumn,
        ApplyLastColumn, AutoFit, AutoFitBehavior, DefaultTableBehavior)

The arguments are as follows:

  * _expression_ is a required argument specifying an expression that returns a Range object or a Selection object.
  * Separator is an optional Variant argument specifying the separator character (also known as the _delimiter_ character) to use to mark where the column divisions were. You can use these values for Separator:
  * wdSeparateByCommas separates column information at commas.
  * wdSeparateByDefaultListSeparator separates column information at the currently specified Other list separator character (the character shown in the text box alongside the Other option button in the Convert Text To Table dialog box).
  * wdSeparateByParagraphs separates column information at the paragraph marks.
+ * wdSeparateByTabs (the default separator if you don't specify one) separates column information at tabs. + * Alternatively, you can specify a single separator character of your choice as a string or between double quotation marks. For example, enter **Separator:="|"** to use a vertical bar [|] as the separator. + * NumRows is an optional Variant argument specifying the number of rows the table should have. If you omit the NumRows argument, Word decides the number of rows in the table based on the number of columns specified and/or the number of the chosen separator characters it finds. + * NumColumns is an optional Variant argument specifying the number of columns the table should have. As with NumRows, if you omit the NumColumns argument, Word decides the number of columns in the table based on the number of rows specified and/or the number of the chosen separator characters it finds. + * InitialColumnWidth is an optional Variant argument that you can use to specify the initial width (in points) of each column in the table. If you omit the InitialColumnWidth argument, Word uses the full width of the page—from margin to margin—and allocates an equal width to each column, regardless of the relative widths of the contents of the columns. The InitialColumnWidth argument is useful primarily for restraining tables from using the full width of the page automatically. In many cases, autofitting the columns provides a better solution. + * Format is an optional Variant argument that you can use to specify one of Word's built-in autoformat styles for tables. To use the Format argument, specify the appropriate WdTableFormat constant (such as wdTableFormatElegant to specify the Elegant autoformat style). If you choose to apply a format, you can specify which properties of the autoformat style to apply to the table by using the following optional Variant arguments: + * Set ApplyBorders to True to apply the border formatting, or to False not to apply it. + * Set ApplyShading to True to apply the shading, or to False not to apply it. + * Set ApplyFont to True to apply the font formatting, or to False not to apply it. + * Set ApplyColor to True to apply the color formatting, or to False not to apply it. + * Set ApplyHeadingRows to True to apply any heading-row formatting, or to False not to apply it. + * Set ApplyLastRow to True to apply any last-row formatting, or to False not to apply it. + * Set ApplyFirstColumn to True to apply any first-column formatting, or to False not to apply it. + * Set ApplyLastColumn to True to apply any last-column formatting, or to False not to apply it. + * AutoFit is an optional Variant argument you can set to True to have Word adjust the column width to best fit whatever contents are in the cells. When autofitting, Word doesn't increase the overall width of the table—it either reduces or retains the table's width. + * AutoFitBehavior and DefaultTableBehavior are as described in the section "Creating a Table," earlier in the chapter. + +The following statement converts the current selection to a five-column table, separating the information at commas. 
It applies autofitting to the table based on cell content and sets the cells to resize automatically:

    Dim myTable As Table
    Set myTable = Selection.ConvertToTable(wdSeparateByCommas, _
        Selection.Paragraphs.Count, 5, , , , , , , , , , , True, _
        wdAutoFitContent, wdWord9TableBehavior)

## Ensuring That a Selection Is within a Table

Before running any procedure that is intended to manipulate a table, it's a good idea to make sure that the current selection actually is within a table. Use the wdWithInTable argument of the Information property for the selection. wdWithInTable is Boolean, returning True if the selection is in a table and False if it isn't. Here's an example:

    If Selection.Information(wdWithInTable) = True Then
        'take actions here
    End If

## Finding Out Where a Selection Is within a Table

In addition to establishing whether the selection is in a table, you can use the Information property to find out other information that can be useful when working with tables via a Range object or Selection object.

Once you've established that the selection is within a table (probably by using the wdWithInTable argument), check whether the selection is at an end-of-row marker rather than being in a cell. If the selection is at an end-of-row marker, certain actions fail. For example, attempting to select the current cell or column fails because the selection is outside any cell or column, but attempting to select the current row succeeds.

To check whether the selection is at the end-of-row marker, use the AtEndOfRowMarker argument for the Information property. The following statement moves the selection left one character (into the last cell in the same row) if the selection is at the end-of-row marker:

    If Selection.Information(wdAtEndOfRowMarker) = True Then _
        Selection.MoveLeft Unit:=wdCharacter, Count:=1

If the selection contains the end-of-row marker rather than being a collapsed selection (an insertion point) before the marker, the wdAtEndOfRowMarker argument returns False. To avoid a selected end-of-row marker causing problems in your procedures, collapse the selection if it isn't collapsed before checking whether it's at the end-of-row marker. The following statements do this, using a Range variable named curSel to restore the selection they collapse unless collapsing the selection leaves the selection at an end-of-row marker:

    Dim curSel As Range
    With Documents("Communications.docm")
        If Selection.Type <> wdSelectionIP Then
            Set curSel = Selection.Range
            Selection.Collapse Direction:=wdCollapseStart
        End If
        If Selection.Information(wdAtEndOfRowMarker) = True Then
            Selection.MoveLeft Unit:=wdCharacter, Count:=1, Extend:=wdMove
        Else
            If Not curSel Is Nothing Then curSel.Select
            Set curSel = Nothing
        End If
    End With

After establishing that the selection is safely in a table, you can retrieve six useful pieces of information about the table:

  * wdStartOfRangeColumnNumber returns the number of the column in which the beginning of the selection or range falls. The following statement selects the column in which the current selection begins:

    Selection.Tables(1).Columns(Selection.Information _
        (wdStartOfRangeColumnNumber)).Select

  * wdEndOfRangeColumnNumber returns the number of the column in which the end of the selection or range falls.
The following statements delete the column in which the range testRange ends if the range is more than one column wide: + + With testRange + If .Information(wdStartOfRangeColumnNumber) <> _ + .Information(wdEndOfRangeColumnNumber) Then _ + .Tables(1).Columns(.Information _ + (wdEndOfRangeColumnNumber)).Delete + End With + + * wdStartOfRangeRowNumber returns the number of the row in which the beginning of the selection or range falls. + * wdEndOfRangeRowNumber returns the number of the row in which the end of the selection or range falls. + * wdMaximumNumberOfColumns returns the highest number of columns in any row in the selection or range. + * wdMaximumNumberOfRows returns the highest number of rows in the specified selection or range in the table. + +## Sorting a Table + +To sort a table, identify the table and use the Sort method. Sort takes the following syntax with the Table object: + + _expression_.Sort(ExcludeHeader, FieldNumber, SortFieldType, SortOrder, + FieldNumber2, SortFieldType2, SortOrder2, FieldNumber3, + SortFieldType3, SortOrder3, CaseSensitive, BidiSort, IgnoreThe, + IgnoreKashida, IgnoreDiacritics, IgnoreHe, LanguageID) + +The arguments are as follows: + + * _expression_ is an expression that returns a Table object. + * ExcludeHeader is an optional Variant argument that you can set to True to exclude the first row in the table (which is often the table header row) from the sort, or to False to include the first row in the table. + * FieldNumber, FieldNumber2, and FieldNumber3 are optional Variant arguments specifying the first, second, and third fields by which to sort (respectively). Usually you'll want to specify at least FieldNumber; if you don't, Word performs an alphanumeric sort on the table. + * SortFieldType, SortFieldType2, and SortFieldType3 are optional Variant arguments specifying the type of sorting you want to use for FieldNumber, FieldNumber2, and FieldNumber3, respectively. For U.S. English, the options are alphanumeric sorting (wdSortFieldAlphanumeric, the default), numeric sorting (wdSortFieldNumeric), and date sorting (wdSortFieldDate). + * SortOrder, SortOrder2, and SortOrder3 are optional Variant arguments specifying the sorting order for FieldNumber, FieldNumber2, and FieldNumber3. Use wdSortOrderAscending to specify an ascending sort (the default) or wdSortOrderDescending to specify a descending sort. + * CaseSensitive is an optional Variant argument that you can set to True to specify case-sensitive sorting. The default setting is False. + * The next five arguments (BidiSort, IgnoreThe, IgnoreKashida, IgnoreDiacritics, and IgnoreHe) are for specialized sorting (such as right-to-left languages, Arabic, and Hebrew). + * LanguageID is an optional Variant argument that you can use to specify the language in which to sort. For example, to sort in Lithuanian, you could specify wdLithuanian for LanguageID. For sorting in your default language, you can omit this argument. + +## Adding a Column to a Table + +To add a column to a table, use the Add method with the Columns collection for the appropriate Table object. The Add method takes the following syntax for the Columns collection: + + _expression_.Add [BeforeColumn] + +Here, _expression_ is a required expression that returns a Columns collection, and BeforeColumn is an optional Variant argument specifying the column to the left of which you want to insert the new column. + +The following example uses the Count property to check the number of columns in the first table in the active document. 
If this table contains fewer than five columns, one or more columns are added to bring the number of columns up to five. Each new column is added before (to the left of) the existing last column in the table: + + With ActiveDocument.Tables(1) + .Select + If .Columns.Count < 5 Then + Do Until .Columns.Count = 5 + .Columns.Add BeforeColumn:=.Columns(.Columns.Count) + Loop + End If + End With + +## Deleting a Column from a Table + +To delete a column, identify it and use the Delete method. Delete takes no arguments. The following statement deletes the first column in the table referenced by the object variable myTable: + + myTable.Columns(1).Delete + +## Setting the Width of a Column + +You can set the width of a column by using the AutoFit method, by using the SetWidth method, or by specifying the Width property for the column. + +The AutoFit method resizes each column automatically to a width suitable to its contents. AutoFit takes no arguments. The following statement uses the AutoFit method to resize each column in the first table in the active document: + + ActiveDocument.Tables(1).Columns.AutoFit + +The SetWidth method allows you to set the width of one or more columns and specify how the other columns in the table should change as a result. The syntax for the SetWidth method is as follows: + + _expression_.SetWidth ColumnWidth, RulerStyle + +Here, _expression_ is an expression that returns the Columns collection or Column object whose width you want to set. ColumnWidth is a required Single argument specifying the width of the column or columns, measured in points. RulerStyle is a required Long argument that specifies how Word should adjust the width of the columns: + + * The default value, wdAdjustNone, sets all the specified columns to the specified width, moving other columns to the left or right as necessary. This argument is analogous to Shift+dragging a column border when working interactively. + * wdAdjustFirstColumn applies the specified width to the first specified column, adjusting only as many columns to the right of this column as necessary. For example, widening the first column in a table slightly causes Word to narrow the second column but leave the third and subsequent columns unchanged. Widening the first column significantly causes Word to narrow the second and third columns, leaving the fourth and subsequent columns unchanged. This argument is analogous to dragging a column border when working interactively. + * wdAdjustProportional applies the specified width to the first specified column, keeping the right edge of the table in its previous position and adjusting all nonspecified columns proportionally to accommodate the change. + * wdAdjustSameWidth applies the specified width to the first specified column, keeping the right edge of the table in its previous position and adjusting all the other columns to an identical width to accommodate the change. This argument is analogous to Ctrl+dragging a column border when working interactively. + +The following statement sets the width of the second column in the first table in the active document to 50 points, adjusting the columns to the right of the second column proportionally: + + ActiveDocument.Tables(1).Columns(2).SetWidth ColumnWidth:=50, _ + RulerStyle:=wdAdjustProportional + +The Width property lets you change the width of a column without worrying about the effect on the other columns. 
Specify the width you want in points, as in this example:

    ActiveDocument.Tables(1).Columns(4).Width = 100

## Selecting a Column

To select a column, use the Select method with the appropriate Column object. Select takes no arguments. The following statement selects the second column in the third table in the document named Originals.docm:

    Documents("Originals.docm").Tables(3).Columns(2).Select

## Adding a Row to a Table

To add a row, use the Add method with the Rows collection for the table. The Add method takes the following syntax for the Rows collection:

    _expression_.Add [BeforeRow]

Here, _expression_ is a required expression that returns a Rows object, and BeforeRow is an optional Variant argument specifying the row before which you want to add the new row. If you omit BeforeRow, VBA adds the new row after the last existing row in the table.

The following statement adds a new first row to the table referenced by the object variable myTable:

    myTable.Rows.Add BeforeRow:=1

You can also insert a row into a table at the current selection by using the InsertRowsBelow or InsertRowsAbove method, specifying how many rows to insert. In this example, one row is inserted below the current selection:

    Selection.InsertRowsBelow 1

## Deleting a Row from a Table

To delete a row, use the Delete method with the appropriate Row object. The Delete method takes no arguments. The following statement deletes the first row in the table referenced by the object variable myTable:

    myTable.Rows(1).Delete

## Setting the Height of One or More Rows

You can set the height of rows by letting Word set the row height automatically, by using the SetHeight method to specify an exact height or a minimum height, or by setting the Height property of the row or rows directly.

To have Word set the height of a row automatically, set the row's HeightRule property to wdRowHeightAuto. Word then adjusts the height of the row to accommodate the cell with the tallest contents. The following statement sets the HeightRule property for the second row in the fourth table in the active document to wdRowHeightAuto:

    ActiveDocument.Tables(4).Rows(2).HeightRule = wdRowHeightAuto

To specify an exact height or a minimum height for one or more rows, use the SetHeight method with the row or rows. The syntax for the SetHeight method is as follows:

    _expression_.SetHeight RowHeight, HeightRule

Here, _expression_ is an expression that returns a Row object or a Rows collection. RowHeight is a required Single argument specifying the height of the row or rows, measured in points. HeightRule is a required Variant argument specifying the rule for setting the row height: use wdRowHeightAtLeast to specify a minimum height or wdRowHeightExactly to specify an exact height. (The third setting for HeightRule is wdRowHeightAuto, which specifies automatic row height and which you won't want to use in this case.)

Instead of using the SetHeight method, you can set the Height property of the row or rows in question by specifying the height in points:

    Documents("Tables.docm").Tables(3).Rows(3).Height = 33
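And here's a SetHeight counterpart to the examples above, sketched with illustrative values; it gives every row of the first table in the active document an exact height of 20 points:

    'Set an exact height for all rows of the first table
    ActiveDocument.Tables(1).Rows.SetHeight _
        RowHeight:=20, HeightRule:=wdRowHeightExactly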
## Selecting a Row

To select a row, use the Select method for the appropriate Row object. The Select method takes no arguments. The following statements select the last row in the last table in the document named Tables.docm:

    With Documents("Tables.docm")
        .Tables(.Tables.Count).Rows.Last.Select
    End With

## Inserting a Cell

To insert a cell, use the Add method with the Cells collection. The Add method takes the following syntax for the Cells collection:

    _expression_.Add [BeforeCell]

Here, _expression_ is an expression that returns a Cells collection, and BeforeCell is an optional Variant argument that specifies the cell to the left of which the new cell should be inserted. (If you omit the BeforeCell argument, VBA adds a new row of cells to the end of the table if you're using the Cells collection of the Columns collection, or it adds a new cell to the first row in the table if you're using the Cells collection of the Rows collection.)

The following statement inserts a cell before the second cell in the first row of the first table in the document named Tables.docm:

    Documents("Tables.docm").Tables(1).Rows(1).Cells.Add _
        BeforeCell:=Documents("Tables.docm").Tables(1).Rows(1).Cells(2)

## Returning the Text in a Cell

To return the contents of a cell, use the Text property of the Range object for the cell. The following statement returns the text in the first cell in the second row of the third table in the active document and assigns it to the variable strCellText:

    strCellText = ActiveDocument.Tables(3).Rows(2).Cells(1).Range.Text

Because the Text property includes the end-of-cell marker (which takes up two characters), you'll usually want to strip off the last two characters when assigning the Text property to a string, like this:

    strCellText = ActiveDocument.Tables(3).Rows(2).Cells(1).Range.Text
    strCellText = Left(strCellText, Len(strCellText) - 2)

When using the Range object, you can work with any of the objects and collections it contains. For example, to work with the paragraphs in a cell, use the Paragraphs collection.

## Entering Text in a Cell

To enter text in a cell, assign the text to the Text property of the Range object for the cell. The following statements enter text in the first three cells in the first row of the current selection:

    With Selection.Tables(1).Rows(1)
        .Cells(1).Range.Text = "Sample text in first cell."
        .Cells(2).Range.Text = "Sample text in second cell."
        .Cells(3).Range.Text = "Sample text in third cell."
    End With

## Deleting Cells

To delete cells, use the Delete method with the appropriate Cell object or Cells collection. When you delete one or more cells, you must specify what happens to the rest of the table—whether the cells to the right of those you deleted move to the left or whether the cells below those you deleted move up.

The syntax for the Delete method for the Cells collection and the Cell object is as follows:

    _expression_.Delete [ShiftCells]

Here, _expression_ is an expression that returns a Cells collection or a Cell object. ShiftCells is an optional Variant argument that specifies how the cells below or to the right of the deleted cell or cells should move. Use these values:

  * wdDeleteCellsEntireColumn deletes the whole column in which the specified cell (or cells) is located.
  * wdDeleteCellsEntireRow deletes the whole row.
  * wdDeleteCellsShiftLeft moves cells across to the left to fill the gap.
  * wdDeleteCellsShiftUp moves cells up to fill the gap.
## Entering Text in a Cell

To enter text in a cell, assign the text to the Text property of the Range object for the cell. The following statements enter text in the first three cells in the first row of the current selection:

    With Selection.Tables(1).Rows(1)
        .Cells(1).Range.Text = "Sample text in first cell."
        .Cells(2).Range.Text = "Sample text in second cell."
        .Cells(3).Range.Text = "Sample text in third cell."
    End With

## Deleting Cells

To delete cells, use the Delete method with the appropriate Cell object or Cells collection. When you delete one or more cells, you must specify what happens to the rest of the table—whether the cells to the right of those you deleted move to the left or whether the cells below those you deleted move up.

The syntax for the Delete method for the Cells collection and the Cell object is as follows:

    _expression_.Delete [ShiftCells]

Here, _expression_ is an expression that returns a Cells collection or a Cell object. ShiftCells is an optional Variant argument that specifies how the cells below or to the right of the deleted cell or cells should move. Use these values:

 * wdDeleteCellsEntireColumn deletes the whole column in which the specified cell (or cells) is located.
 * wdDeleteCellsEntireRow deletes the whole row.
 * wdDeleteCellsShiftLeft moves cells across to the left to fill the gap.
 * wdDeleteCellsShiftUp moves cells up to fill the gap.

The following statement deletes the first cell in the first row of the first table in the active document and shifts the other cells in the first row to the left to fill the gap:

    ActiveDocument.Tables(1).Rows(1).Cells(1).Delete _
        ShiftCells:=wdDeleteCellsShiftLeft

For procedures that rely on the user to make a selection within a table, you may want to determine how many rows or columns are in the selection before deciding how to shift the cells. The following example checks the number of rows and columns in a selection. If the selection is only one cell, or if the selection is all in one row, the code deletes the cell or cells and moves the other cells in the row to the left. If the selection is multiple cells in one column, the code deletes the cells and moves the other cells in the column up. If the selection spans columns and rows, the code displays a message box asking the user to make a selection in only one row or only one column:

    With Selection
        If .Columns.Count > 1 And .Rows.Count > 1 Then
            MsgBox "Please select cells in only one row " _
                & "or only one column."
            End
        Else
            If .Cells.Count > 1 Then
                If .Columns.Count > 1 Then
                    .Cells.Delete ShiftCells:=wdDeleteCellsShiftLeft
                Else
                    .Cells.Delete ShiftCells:=wdDeleteCellsShiftUp
                End If
            Else
                .Cells.Delete ShiftCells:=wdDeleteCellsShiftLeft
            End If
        End If
    End With

## Selecting a Range of Cells

To select a range of cells within a table, declare a Range variable, assign to it the cells you want to select, and then select the range. The following example declares the Range variable myCells, assigns to it the first four cells in the first table in the active document, and then selects the range:

    Dim myCells As Range
    With ActiveDocument
        Set myCells = .Range(Start:=.Tables(1).Cell(1, 1).Range.Start, _
            End:=.Tables(1).Cell(1, 4).Range.End)
        myCells.Select
    End With

## Converting a Table or Rows to Text

To convert an entire table or a row or number of rows to text, specify the table, row, or rows and use the ConvertToText method. This is frequently useful when you're copying and pasting from web pages, which often contain tables when all you want is the text contents, not the table structure itself. Due to limitations of the HTML language used to describe web-page layout, HTML tables are sometimes even used for spacing and other reasons unrelated to displaying actual tabular data. These faux "tables" can look bizarre when pasted as text into Word or other body text. To see how to get rid of these annoying artifacts, see the example macro at the end of this section. It's a useful macro to add to your Normal project in Word's VBA Editor.

The ConvertToText method takes the following syntax:

    _expression_.ConvertToText(Separator, NestedTables)

Here, _expression_ is a required expression that returns a Table object, a Row object, or a Rows collection. Separator is an optional Variant argument specifying the separator character (also known as the _delimiter_ character) to use to mark where the column divisions were. The possible values are as follows:

 * wdSeparateByCommas separates column information by commas.
 * wdSeparateByDefaultListSeparator separates column information by the currently specified Other list-separator character (the character shown in the text box alongside the Other option button in the Convert Table To Text dialog box).
 * wdSeparateByParagraphs separates column information with paragraph marks.
 * wdSeparateByTabs (the default separator if you don't specify one) separates column information by tabs.
 * Alternatively, you can specify a separator character of your choice as a string enclosed in double quotation marks. For example, enter **Separator:="|"** to use a vertical bar [|] as the separator. (Although you can supply more than one separator character here, Word uses only the first character.)

The following statement converts the first table in the current selection to text using an asterisk (*) as the separator character:

    Selection.Tables(1).ConvertToText Separator:="*"

You can use the ConvertToText method with a Table object, a Row object, or a Rows collection. The following statement converts only the first row of the selected table to tab-delimited text:

    Selection.Tables(1).Rows(1).ConvertToText Separator:=wdSeparateByTabs

If you need to continue working with the contents of the table once you've converted it, assign the range that the ConvertToText method returns to a Range variable as you convert the table. You can then work with the Range object afterward to manipulate the information. For example, the following statements convert the first table in the document named Cleveland Report.docm to text separated by paragraphs, assign the converted information to the range exTable, and then copy the range, create a new document, and paste in the information:

    Dim exTable As Range
    Set exTable = Documents("Cleveland Report.docm").Tables(1). _
        ConvertToText(Separator:=wdSeparateByParagraphs)
    exTable.Copy
    Documents.Add
    Selection.Paste

Often when you copy and paste information from a web page, it's in a tabular format. If you paste such tables into Word, the result usually doesn't look right, is too bulky, and can be difficult to edit or format. In other words, you want to remove the web-page table definitions but leave the data in a usable format within Word.

The following macro does just that:

    Sub Untable()

        On Error Resume Next

        Selection.Rows.ConvertToText Separator:=wdSeparateByCommas, NestedTables:= _
            True
        Selection.MoveDown Unit:=wdLine, Count:=1

        If Err Then MsgBox "No table was detected, dude."

    End Sub

To use this macro, click somewhere within the text you've pasted from the Internet to put the insertion cursor in a table (or a suspected table; these often don't look like tables, merely like an area of bizarre formatting), then execute the macro. You may need to execute this macro more than once to completely eliminate all the tabular formatting debris left over from the original HTML. The macro tells you when all table structures have been destroyed, and not only that—it calls you "dude."

# The Bottom Line

**Use Find and Replace via VBA.**

Word's Find and Replace utilities are frequently valuable to the VBA programmer. You'll want to master them and also some subtleties associated with their use.

Master It

Sometimes when replacing, you need to go through a document more than once—using a loop structure. Why would you ever need to repeatedly search and replace the same document? Doesn't the Replace All setting in fact _replace all_?

**Work with headers, footers, and page numbers.**

All Word documents contain headers and footers, even if they are empty. In addition, you can insert various types of headers and footers.

Master It

Name two types of headers you can use in a Word document.
**Manage sections, page setup, windows, and views.**

Among the various ways you can view a document, you sometimes want to have the document automatically scroll to a particular table, graphic, or other target.

Master It

What method of the Window object can be used to easily accomplish this task?

**Manipulate tables.**

When you need to manage tables in Word documents, you can employ VBA to work with the Table object, which represents a single table. If a document contains more than one table, they are referenced through a collection of Table objects.

Master It

Name two important and useful objects within the Tables collection or the Table object.

Chapter 22

Understanding the Excel Object Model and Key Objects

This chapter shows you how to start working with the Excel object model, the architecture underlying Excel. It also shows you how to perform common actions with the most immediately useful Excel objects. These objects include the Workbooks collection and the Workbook object, the ActiveCell object, and Range objects. You'll also see how to set options in Excel.

In this chapter you will learn to do the following:

 * Work with workbooks
 * Work with worksheets
 * Work with the active cell or selection
 * Work with ranges
 * Set options

# Getting an Overview of the Excel Object Model

As with the other Office applications, it's not necessary (or even possible for most people) to understand how the entire Excel object model fits together in order to work with VBA in Excel, but most people find that knowing the main objects in the object model is helpful. And often the code examples in the Help system's object-model reference are invaluable, showing you how and where to employ objects in your own programming.

To see the Excel object-model reference, follow these steps:

1. Launch or activate Excel, and then press Alt+F11 to launch or activate the VBA Editor.

2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).

3. Press F1 in the Editor to launch the web page for the VBA language reference for Office 2013.

4. In the Bing search field, type **excel 2013 object model** and press Enter.

5. Click the link _Object Model Reference_ (_Excel 2013 Developer Reference_). You'll now have access to the whole collection of syntax specifications, useful descriptions, and code examples, as shown in Figure 22.1.

Figure 22.1 The entries in the Excel object model reference will help you write your own VBA code.

* * *

Help When Migrating Legacy Code from Earlier Office Projects

If you've inherited VBA code written in earlier versions of Office, those procedures might contain objects, methods, and properties that have been changed in Office 2013. Though modifications to object models are generally few, some incompatibilities can crop up and "break" the code so it won't run correctly. Fortunately, you can download a free utility, the Office Code Compatibility Inspector, that will flag objects and their members that have changed. It does a text comparison of the Office 2013 object model against VBA code written in earlier versions of Office.
You can download the Compatibility Inspector from this web page:

www.microsoft.com/en-us/download/details.aspx?id=15001

* * *

# Understanding Excel's Creatable Objects

Excel _exposes_ (makes available for your use in code) various _creatable_ objects, meaning that you can employ most of the important objects in its object model without explicitly going through (mentioning) the Application object. For most programming purposes, these creatable objects are the most commonly used objects. Here's a list:

 * The Workbooks collection contains the Workbook objects that represent all the open workbooks. Within a workbook, the Sheets collection contains the Worksheet objects that represent the worksheets and the Chart objects that represent chart sheets. On a sheet, the Range object gives you access to ranges, which can be anything from an individual cell to a complete worksheet. Remember that, because the Workbooks collection is creatable, you need not write Application.Workbooks in your code. You can leave off the Application and merely write Workbooks, as shown in the example after this list.
 * The ActiveWorkbook object represents the currently active workbook.
 * The ActiveSheet object represents the active worksheet.
 * The Windows collection contains the Window objects that represent all the open windows.
 * The ActiveWindow object represents the active window. When using this object, be sure to check that the window it represents is the type of window you want to manipulate, because the object returns whatever window currently has the focus.
 * The ActiveCell object represents, you guessed it, the active cell. This object is especially valuable for simple procedures (for example, those that compute values or correct formatting) that work on a cell selected by the user.
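For instance, these two statements (a trivial sketch; the workbook name is an arbitrary example) do exactly the same thing:

    ' Fully qualified through the Application object
    Application.Workbooks("Data.xlsx").Activate
    ' Shorter form, relying on Workbooks being creatable
    Workbooks("Data.xlsx").Activate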
# Managing Workbooks

In many of your Excel procedures, you'll need to manipulate workbooks: creating new workbooks, saving them in various locations and formats, opening existing workbooks, and closing and printing workbooks. To accomplish these tasks, you employ the Workbooks collection, which contains a Workbook object for each open workbook in Excel.

## Creating a Workbook

To create a new workbook, use the Add method with the Workbooks collection. The syntax is as follows:

    Workbooks.Add(Template)

Here, Template is an optional Variant argument that specifies how to create the workbook. The following subsections discuss the available options.

### Creating a New Blank Workbook

To create a blank workbook (as if you'd clicked the File tab on the Ribbon, then clicked the New button), omit the Template argument:

    Workbooks.Add

The new workbook receives the number of sheets specified in the Excel Options dialog box (click the File tab on the Ribbon, then choose Options to display the When Creating New Workbooks section of the dialog box—you'll see a field where you can adjust the Include This Many Sheets option).

You can get or set this value in VBA by using the SheetsInNewWorkbook property of the Application object. For example, the following macro declares an Integer variable named mySiNW, stores the current SheetsInNewWorkbook property in it, sets the SheetsInNewWorkbook property to 12, creates a new workbook (with those 12 worksheets), and then restores the SheetsInNewWorkbook setting to its previous value:

    Sub MVBA_New_Workbook_with_12_Sheets()
        Dim mySiNW As Integer
        mySiNW = Application.SheetsInNewWorkbook
        Application.SheetsInNewWorkbook = 12
        Workbooks.Add
        Application.SheetsInNewWorkbook = mySiNW
    End Sub

### Creating a New Workbook Based on a Template

To create a workbook based on a template, specify the full path and name of the template file. For example, the following statement creates a new workbook based on the template Balance Sheet.xlt in the network folder \\server\template\excel:

    Workbooks.Add Template:="\\server\template\excel\Balance Sheet.xlt"

### Creating a New Workbook Based on an Existing Workbook

To create a workbook based on an existing workbook, specify the full name and path of the workbook file. For example, the following statement creates a new workbook based on the existing workbook named Personnel.xlsx in the C:\Business folder:

    Workbooks.Add Template:="C:\Business\Personnel.xlsx"

### Creating a Chart Workbook, a Macro Sheet, or a Worksheet

You can also create a workbook that contains a single chart, macro sheet, or worksheet by using the constants shown in Table 22.1 with the Template argument.

Table 22.1 Constants for creating a chart workbook, macro sheet, or worksheet

**Constant** | **Creates a Workbook Containing**
---|---
xlWBATChart | A chart sheet
xlWBATExcel4IntlMacroSheet | An international macro sheet
xlWBATExcel4MacroSheet | A macro sheet
xlWBATWorksheet | A worksheet

For example, the following statement creates a workbook containing a single chart sheet:

    Workbooks.Add Template:=xlWBATChart

## Saving a Workbook

The first time you save a workbook, you must specify the path and filename to use (this is the SaveAs option). After that, you can save the workbook under the same name or specify a different path, name, format, or all three (this is the Save option).

### Saving a Workbook for the First Time or as a Different File

To save a workbook for the first time, or to save a workbook using a different path, name, or format, use the SaveAs method. The syntax is as follows:

    _expression_.SaveAs(FileName, FileFormat, Password, WriteResPassword,
        ReadOnlyRecommended, CreateBackup, AccessMode, ConflictResolution,
        AddToMru, TextCodePage, TextVisualLayout, Local)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbook object.
 * FileName is an optional Variant argument that specifies the name for the workbook. If you omit FileName, VBA uses the current folder and the default filename of Book _n_.xlsx for a workbook, where _n_ is the next available number (for example, Book5.xlsx).

VBA uses the default file format, which is specified in the Options dialog box's Save page. (Click the File tab on the Ribbon, then click Options to display the Options dialog box, then click the Save button on the left. You'll see a Save Files In This Format drop-down list.)

You can get and set the default save format by using the DefaultSaveFormat property of the Application object.
For example, the following statement sets the default save format to xlNormal, the "Excel Workbook" format:

    Application.DefaultSaveFormat = xlNormal

 * FileFormat is an optional Variant argument that specifies the format in which to save the workbook. Table 22.2 lists the XlFileFormat constants for specifying commonly used formats.

* * *

Be Careful Not to Accidentally Overwrite a File

When saving a workbook to a folder, you should check whether a workbook with the same name already exists in the folder. If it does, and unless you prevent it, VBA overwrites it without warning, causing data loss. See "Using the Dir Function to Check Whether a File Exists" in Chapter 9, "Using Built-in Functions," for instructions on how to check whether a file with a particular filename already exists.

* * *

 * Password is an optional Variant argument that you can use to supply the password that is to be required to open the workbook (the "password to open"). Password is case sensitive. If the user can't provide the password, Excel won't open the workbook.
 * WriteResPassword is an optional Variant argument that you can use to supply the password that is required to open the workbook in a writable form (the "password to modify"). WriteResPassword is case sensitive. If the user can't provide the password, Excel will open the workbook as read-only.
 * ReadOnlyRecommended is an optional Variant argument that you can set to True to have Excel recommend that the user open the document as read-only. Such recommendations typically carry little force, and you'll do better to protect the workbook with a "password to modify."
 * CreateBackup is an optional Variant argument that you can set to True to make Excel automatically create a backup of the workbook. The default setting is False.
 * AccessMode is an optional argument that you can use to specify whether the workbook is shared or is in Exclusive mode. Specify xlExclusive for Exclusive mode, xlShared for Shared mode, and xlNoChange to leave the access mode unchanged (this is the default setting).
 * ConflictResolution is an optional argument that you can use to specify how to resolve any conflicting changes to the workbook. Use xlLocalSessionChanges to accept the changes in the current Excel session, xlOtherSessionChanges to accept the other user's or users' changes, and xlUserResolution to display the Resolve Conflicts dialog box so that the user can choose how to resolve the conflicts.
 * AddToMru is an optional Variant argument that you can set to True to add the workbook to the list of recently used files at the bottom of the File menu. The default setting is False.
 * TextCodePage and TextVisualLayout are optional Variant arguments used in international versions of Excel (not in U.S. English Excel).
 * Local is an optional Variant that controls whether the language used is that of Excel (True) or of VBA (False). (You'll seldom need to use Local.)
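To pull several of these arguments together, here's a hedged sketch (the path, filename, and password are invented for illustration) that saves the active workbook under a new name, sets a "password to modify," and recommends read-only opening:

    ActiveWorkbook.SaveAs FileName:="C:\Business\Draft Budget.xlsx", _
        WriteResPassword:="m0d1fy", ReadOnlyRecommended:=True, _
        AddToMru:=True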
Table 22.2 XlFileFormat constants for widely used formats

**Constant** | **Saves Document As**
---|---
xlNormal | A normal workbook
xlXMLSpreadsheet | An XML spreadsheet
xlWebArchive | A single-file web page
xlHtml | A web page
xlTemplate | A template
xlExcel9795 | An Excel workbook for Excel versions 95 and later

For example, the following statement saves the active workbook in the current folder under the name Salaries.xlsx and using the default save format:

    ActiveWorkbook.SaveAs FileName:="Salaries.xlsx"

The following statement saves the open workbook named Schedule.xlsx under the name Building Schedule.xlsx in the folder named \\server2\Public using the Microsoft Excel 97–2003 & 5.0/95 format (from Excel 2003):

    ActiveWorkbook.SaveAs Filename:="\\server2\Public\Building Schedule.xlsx", _
        FileFormat:=xlExcel9795

To see a complete list of all the Excel 2013 file formats, visit this web page:

### Saving a Workbook That Has Already Been Saved

Once a workbook has been saved, you can just save it again with the same name by using the Save method. For a Workbook object, the Save method takes no arguments. For example, the following statement saves the workbook named Data Book.xlsx:

    Workbooks("Data Book.xlsx").Save

### Saving All Open Workbooks

The Workbooks collection doesn't have a Save method, but you can save all open workbooks by using a loop such as that shown in the following subroutine:

    Sub Save_All_Workbooks()
        Dim myWorkbook As Workbook
        For Each myWorkbook In Workbooks
            myWorkbook.Save
        Next myWorkbook
    End Sub

Note that if any of the currently opened workbooks have not been previously saved, and if they include any macros, a security message will be displayed when this procedure executes. Users are told that they must agree to save the potentially dangerous executable content in a macro-enabled file format (.xlsm). However, if the file has already been saved with the .xlsm filename extension, no message is displayed. If you want to suppress such messages, you can insert the following code at the start of this procedure:

    Application.DisplayAlerts = False

However, be sure to set the DisplayAlerts property back to True as soon as you can in the code. This particular warning message is quite useful as a reminder to the user—so you likely won't want to suppress it.

## Accessing Cloud Storage

Having VBA access SkyDrive, Dropbox, or one of the other cloud storage systems is fairly easy. Just open or save a file from the SkyDrive or Dropbox folder.

The only thing to figure out is the file path, and it will look something like this: "C:\Users\_Richard_\SkyDrive\ExcelToCloudTest", with _Richard_ replaced by your name.

This example saves the current document to SkyDrive. Because this is a source of so many errors, I repeat: Change my name, Richard, to your name in the file path in this example code:

    ActiveWorkbook.SaveAs ("C:\Users\Richard\SkyDrive\ExcelCloudTest")

To save to Dropbox, it's pretty much the same:

    ActiveWorkbook.SaveAs ("C:\Users\Richard\DropBox\ExcelCloudTest")

## Opening a Workbook

To open a workbook, use the Open method with the Workbooks collection.
The syntax is as follows:

    _expression_.Open(FileName, UpdateLinks, ReadOnly, Format, Password,
        WriteResPassword, IgnoreReadOnlyRecommended, Origin, Delimiter,
        Editable, Notify, Converter, AddToMru, Local, CorruptLoad)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbooks collection. Often, you'll want to use the Workbooks collection itself.
 * FileName is a required String argument that supplies the path and name of the workbook to open.
 * UpdateLinks is an optional Variant that controls how Excel updates any links in the workbook. If you leave out this argument, the user is prompted to specify how to update the links. Table 22.3 shows the values and their effects. If Microsoft Excel is opening a file in the WKS, WK1, or WK3 format and the UpdateLinks argument is 2, Microsoft Excel generates charts from the graphs attached to the file. If the argument is 0, no charts are created.
 * ReadOnly is an optional Variant that you can set to True to open the workbook as read-only. The default is False.
 * Format is an optional Variant that you can use to specify the delimiter character when opening a text file. Use 1 for tabs, 2 for commas, 3 for spaces, 4 for semicolons, 5 for no delimiter character, and 6 for a delimiter you specify using the Delimiter argument.
 * Password is an optional Variant argument that you can use to provide the password required to open the workbook (the "password to open"). Password is case sensitive. If you omit Password and a password is required, Excel prompts the user for it.

* * *

Don't Include Passwords in Your Procedures

If possible, avoid placing passwords in your code, because it may be possible for other people to read them.

* * *

 * WriteResPassword is an optional Variant argument that you can use to provide the password required to open the workbook in a writable form (the "password to modify"). WriteResPassword is case sensitive. If you omit WriteResPassword and a password is required, Excel prompts the user for it.
 * IgnoreReadOnlyRecommended is an optional Variant argument that you can set to True to have Excel ignore a read-only recommendation on the workbook.
 * Origin is an optional Variant argument that you can use when opening a text file to specify the operating system used to encode it and thus how to treat carriage-return/line-feed characters and character encoding. Use xlWindows to indicate Windows, xlMacintosh to indicate Mac OS, or xlMSDOS to indicate DOS.
 * Delimiter is an optional Variant argument you can use with a Format value of 6 to specify one delimiter character to use when opening a text file.
 * Editable is an optional Variant argument that you can set to True when FileName specifies a template to open the template itself rather than start a workbook based on the template (False). Editable also applies to Excel 4.0 add-ins: True opens the add-in in a visible window, while False opens the add-in hidden. However, you can't employ this option with add-ins created in Excel 5.0 or later.
 * Notify is an optional Variant argument that you can set to True to have Excel add the workbook to the notification list when someone else has the workbook open for editing and VBA requests the workbook. Excel then notifies the user when the workbook becomes available. If you specify Notify:=False, opening the workbook fails if someone else has the workbook open.
 * Converter is an optional Variant argument that you can use to specify the first file converter to use when opening a file.
 * AddToMru is an optional Variant argument that you can set to True to add the workbook to the list of recently used files at the bottom of the File menu. The default setting is False.
 * Local is an optional Variant that controls whether the language used is that of Excel (True) or of VBA (False). (You'll seldom need to use Local.)
 * CorruptLoad is an optional Variant that you can use to control how Excel handles corruption it encounters when opening the workbook. Use xlNormalLoad to use normal behavior—first, opening the workbook as usual; second, repairing the file if there's a problem; and third, recovering the data from the workbook. Use xlRepairFile to go straight to the repair stage or xlExtractData to go straight to the recovery stage.

Table 22.3 Values for the UpdateLinks argument

**Value** | **Effect**
---|---
(If you omit this argument) | Excel prompts the user to decide how to update links.
1 | User specifies how links are to be updated.
2 | Links are never updated for this workbook when it's opened.
3 | Excel always updates links for this workbook when opening it.

For example, the following statement opens the workbook named Expenses.xlsx stored in the C:\Business folder without updating links:

    Workbooks.Open Filename:="C:\Business\Expenses.xlsx", UpdateLinks:=0

The following statement opens the workbook named Plan.xlsx stored in the D:\Planning folder, providing the password for opening the workbook:

    Workbooks.Open Filename:="D:\Planning\Plan.xlsx", Password:="s@cur1ng!"

The following statement opens the text file named Data13.txt in the folder z:\transfer using an exclamation point (!) as the delimiter character:

    Workbooks.Open _
        Filename:="z:\transfer\Data13.txt", Format:=6, Delimiter:="!"

## Closing a Workbook

To close a workbook, use the Close method with the appropriate Workbook object. The syntax is as follows:

    _expression_.Close(SaveChanges, Filename, RouteWorkbook)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbook object or the Workbooks collection.
 * SaveChanges is an optional Variant argument that lets you specify whether to save any unsaved changes in the workbook (True) or not (False). If you omit the SaveChanges argument, Excel prompts the user to save any workbook that contains unsaved changes.
 * Filename is an optional Variant that you can use to specify the filename under which to save the workbook if it contains changes. In most cases, it's best to use the SaveAs method to save the workbook under a different name before you use the Close method to close it.
 * RouteWorkbook is an optional Variant argument that you can set to True to route the workbook to the next recipient on its routing slip, or False to refrain from routing the workbook. If the workbook has no routing slip attached, RouteWorkbook has no effect.

For example, the following statement closes the active workbook without saving changes:

    ActiveWorkbook.Close SaveChanges:=False

### Closing All Open Workbooks

To close all open workbooks, use the Close method with the Workbooks collection:

    Workbooks.Close

The Close method takes no arguments. Excel prompts you to save any workbook that contains unsaved changes. If such prompts will be inconvenient in a procedure, use a loop with the Workbooks collection to close each open workbook individually, using the SaveChanges argument to control whether Excel saves or discards any unsaved changes, as in the sketch that follows.
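Here's a minimal sketch of such a loop. It counts backward because the Close method removes each workbook from the Workbooks collection as it runs, and counting forward could skip members as the collection shrinks:

    Sub Close_All_Workbooks()
        Dim i As Long
        ' Count down so closing a workbook doesn't disturb the remaining indexes
        For i = Workbooks.Count To 1 Step -1
            Workbooks(i).Close SaveChanges:=True
        Next i
    End Sub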
## Sharing a Workbook

To determine whether a workbook is shared, check its MultiUserEditing property. This is a read-only Boolean property.

To share a workbook, use the SaveAs method (discussed in "Saving a Workbook for the First Time or as a Different File," earlier in this chapter) to save the file using the xlShared value for the AccessMode argument.

For example, the following statements share the workbook named Brainstorming.xlsx if it is not already shared:

    With Workbooks("Brainstorming.xlsx")
        If .MultiUserEditing = False Then
            .SaveAs Filename:=.FullName, AccessMode:=xlShared
        End If
    End With

## Protecting a Workbook

To protect a workbook, use the Protect method with the appropriate Workbook object. The syntax is as follows:

    _expression_.Protect(Password, Structure, Windows)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Workbook object.
 * Password is an optional Variant argument that specifies the password for unprotecting the workbook. Password is case sensitive. You'll almost always want to supply Password—if you don't, anybody who can open your workbook can unprotect it.
 * Structure is an optional Variant argument that you can set to True to protect the workbook's structure (how the worksheets are positioned relative to each other) or leave at its default setting, False.
 * Windows is an optional Variant argument that you can set to True to protect the workbook windows or omit to leave the windows unprotected.

For example, the following statement protects the structure and windows of the active workbook with the password 0llsecurd:

    ActiveWorkbook.Protect Password:="0llsecurd", Structure:=True, Windows:=True

* * *

You Can Protect Workbooks against Both Writing (Editing) and Reading

In addition to protecting a workbook against modifications, you can protect it against being opened and viewed. See the sidebar "Setting Passwords and Read-Only Recommendations for a Workbook" later in this chapter for details.

* * *

## Working with the ActiveWorkbook Object

The ActiveWorkbook object returns a Workbook object that represents the active workbook (whichever workbook currently has the focus in the Excel window). The ActiveWorkbook object behaves like a Workbook object and is very useful in procedures that users execute (put another way, macros that users run) after opening the workbook that they want to manipulate.

If no workbook is open, there is no ActiveWorkbook object, so any code that tries to use the ActiveWorkbook object returns an error. Users can run macros when no workbook is open in Excel, so it's a good idea to verify that at least one workbook is open before trying to execute code that assumes there is an active workbook. One option is to check that the ActiveWorkbook object is not Nothing before running the code, as in the following example:

    If ActiveWorkbook Is Nothing Then
        MsgBox "Please open a workbook and click in it before running this macro." _
            & vbCr & vbCr & "This macro will now end.", _
            vbOKOnly + vbExclamation, "No Workbook Is Open"
        End
    End If

It's also a good idea to check that the workbook your code assumes is the active workbook actually _is_ the active workbook.
This problem can easily occur when a procedure starts with the active workbook and then creates a new workbook to work in; the new workbook becomes the active workbook, and from this point on, the code may start accessing the wrong workbook.

If there's any doubt about which workbook you're working with, declare a Workbook object variable and use that object variable in your code rather than the ActiveWorkbook object. For example, the following statements declare a Workbook object variable and assign the ActiveWorkbook object to it, so that subsequent code can work with the object variable:

    Dim myWorkbook As Workbook
    Set myWorkbook = ActiveWorkbook
    With myWorkbook
        'actions here
    End With

# Working with Worksheets

Most workbooks you need to manipulate via VBA will contain one or more worksheets. As a result, many procedures will need to work with worksheets—inserting them, deleting them, copying or moving them, or simply printing the appropriate range from them.

Each worksheet is represented by a Worksheet object. The Worksheet objects are contained within the Sheets collection.

## Inserting a Worksheet

To insert a worksheet into a workbook, use the Add method with the Sheets collection. The syntax is as follows:

    _expression_.Add(Before, After, Count, Type)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Sheets collection. Often, you'll want to use the Sheets collection itself.
 * Before is an optional Variant argument that specifies the sheet before which to add the new sheet. After is an optional Variant argument that specifies the sheet after which to add the new sheet. Typically, you'll want to specify either Before or After, but not both. You can also omit both arguments to make Excel insert the new sheet before the active worksheet.
 * Count is an optional Variant argument that specifies how many sheets to add. If you omit Count, VBA uses the default value, 1.
 * Type is an optional Variant that specifies the type of sheet to insert. The default is xlWorksheet, a standard worksheet. You can also insert a chart sheet (xlChart), an Excel 4 macro sheet (xlExcel4MacroSheet), or an Excel 4 international macro sheet (xlExcel4IntlMacroSheet).

For example, the following statements declare a Worksheet object variable named mySheet, insert a worksheet before the first sheet in the first open workbook and assign the new sheet to mySheet, and then set the Name property of mySheet to Summary (the Name property controls the text that appears on the worksheet's tab):

    Dim mySheet As Worksheet
    Set mySheet = Workbooks(1).Sheets.Add(Before:=Sheets(1))
    mySheet.Name = "Summary"

The following statements insert two chart sheets after the last worksheet in the active workbook. The chart sheets receive default names, such as Chart1 and Chart2:

    ActiveWorkbook.Sheets.Add _
        After:=Sheets(Sheets.Count), Count:=2, Type:=xlChart

## Deleting a Worksheet

To delete a worksheet, use the Delete method of the appropriate Worksheet object. The Delete method takes no arguments. For example, the following statement deletes the worksheet named Summary from the workbook referenced by the object variable myWorkbook:

    myWorkbook.Sheets("Summary").Delete

If you delete a worksheet, you lose any data stored on that worksheet, so Excel asks the user to confirm the deletion by default (see Figure 22.2).
If you need to avoid this user interaction—for example, in a procedure that adds a worksheet without the user's knowledge, uses it to manipulate data, and then deletes it—you can turn off alerts in Excel by setting the DisplayAlerts property of the Application object to False before deleting the worksheet and then turning alerts back on:

    Application.DisplayAlerts = False
    myWorkbook.Sheets("Summary").Delete
    Application.DisplayAlerts = True

Figure 22.2 When deleting a worksheet, you must either suppress alerts in Excel or have the user confirm the deletion in this dialog box.

## Copying or Moving a Worksheet

To copy a worksheet, use the Copy method of the appropriate Worksheet object. To move a worksheet, use the Move method. The syntax is as follows:

    _expression_.Copy(Before, After)
    _expression_.Move(Before, After)

Here, _expression_ is a required expression that returns a Worksheet object. Before is an optional Variant argument that specifies the sheet before which to place the copy or the moved sheet. After is an optional Variant argument that specifies the sheet after which to place it:

 * Typically, you'll want to specify either Before or After, but not both.
 * You can specify another workbook by name to copy or move the worksheet to another workbook.
 * You can also omit both arguments to make Excel create a new workbook containing the copied or moved sheet. The new workbook becomes the active workbook, so you can use the ActiveWorkbook object to start working with it or to assign it to an object variable.

For example, the following statement copies the worksheet named Costs - Materials in the workbook named Building Schedule.xlsx, placing the copy after the last of the current worksheets in the workbook:

    Workbooks("Building Schedule.xlsx").Sheets("Costs - Materials").Copy _
        After:=Sheets(Sheets.Count)

The following line of code moves the worksheet named Homes from the workbook named Planning.xlsx to the workbook named Building Schedule.xlsx, inserting the worksheet before the first existing worksheet in the workbook:

    Workbooks("Planning.xlsx").Sheets("Homes").Move _
        Before:=Workbooks("Building Schedule.xlsx").Sheets(1)

## Printing a Worksheet

To print a worksheet, use the PrintOut method with the appropriate Worksheet object.

* * *

The PrintOut Method Can Be Used with Several Objects

Various objects in addition to an individual worksheet have a PrintOut method, including the Worksheets collection, the Chart object and the Charts collection, the Workbook object, the Window object, and the Range object.

* * *

The syntax for the PrintOut method is as follows:

    _expression_.PrintOut(From, To, Copies, Preview, ActivePrinter,
        PrintToFile, Collate, PrToFileName, IgnorePrintAreas)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns the appropriate Worksheet object or other object to which the PrintOut method applies.
 * From is an optional Variant argument that specifies the number of the page at which to start printing. Omit From to start printing at the beginning of the object. Note that From and To refer to the pages in the printout, not to the overall number of pages that the object would take up.
 * To is an optional Variant argument that specifies the number of the page at which to stop printing. Omit the To argument to print to the end of the object.
 * Copies is an optional Variant argument that specifies the number of copies to print.
If you omit Copies, Excel prints one copy.

 * Preview is an optional Variant argument that you can set to True to display the object in Print Preview before printing it. Set Preview to False, or simply omit this argument, to print the object without previewing it. Use the PrintPreview method to display an object in Print Preview without printing it.
 * ActivePrinter is an optional Variant argument that you can use to specify the printer on which to print.
 * PrintToFile is an optional Variant argument that you can set to True to make Excel print to a print file rather than a printer. When printing to a file, you can use the PrToFileName property to specify the filename, or omit it and have Excel prompt the user for the filename.
 * Collate is an optional Variant argument that you can set to True to have Excel print multiple copies for collation rather than printing all the copies of one page, all the copies of the next, and so on.
 * PrToFileName is an optional Variant argument that you can use with PrintToFile:=True to specify the filename of the print file.
 * IgnorePrintAreas is an optional Variant argument. Set to False, this argument prints the entire specified print area; when it's True, the entire object is printed and any print area is ignored. A _print area_ can be defined in Excel and is useful as a way of printing only a specified range of cells. Once specified, the print area is retained by Excel until you either clear it or specify a new print area. You define a print area by selecting the cells you want to print, then clicking the Ribbon's Page Layout tab. Click the Print Area option in the Page Setup area of the Ribbon.

The following statement prints two copies of each page of the first worksheet in the active workbook, collating the pages:

    ActiveWorkbook.Sheets(1).PrintOut Copies:=2, Collate:=True

The following statement prints the first two pages of the worksheet named Summary in the workbook named Planning.xlsx to a file named Planning Summary.prn in the network folder \\server\to_print:

    Workbooks("Planning.xlsx").Sheets("Summary").PrintOut From:=1, To:=2, _
        PrintToFile:=True, _
        PrToFileName:="\\server\to_print\Planning Summary.prn"

## Protecting a Worksheet

To protect a worksheet, use the Protect method with the appropriate Worksheet object. The syntax is as follows:

    _expression_.Protect(Password, DrawingObjects, Contents, Scenarios,
        UserInterfaceOnly, AllowFormattingCells, AllowFormattingColumns,
        AllowFormattingRows, AllowInsertingColumns, AllowInsertingRows,
        AllowInsertingHyperlinks, AllowDeletingColumns, AllowDeletingRows,
        AllowSorting, AllowFiltering, AllowUsingPivotTables)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Worksheet object.
 * Password is an optional Variant argument that specifies the password for unprotecting the worksheet. Password is case sensitive. You'll almost always want to supply Password to prevent unauthorized people from unprotecting the workbook.
 * DrawingObjects is an optional Variant argument that you can set to True to protect shapes in the worksheet. The default setting is False.
 * Contents is an optional Variant argument that protects the locked cells when set to True, its default value. Set Contents to False to leave the locked cells unprotected.
 * Scenarios is an optional Variant argument that protects scenarios when set to True, its default value.
 * UserInterfaceOnly is an optional Variant argument that you can set to True to leave macros unprotected while protecting the user interface. The default value is False.
 * AllowFormattingCells, AllowFormattingColumns, and AllowFormattingRows are optional Variant arguments that you can set to True to allow the formatting of cells, columns, and rows, respectively. The default value for each argument is False.
 * AllowInsertingColumns, AllowInsertingRows, and AllowInsertingHyperlinks are optional Variant arguments that you can set to True to allow the user to insert columns, rows, and hyperlinks, respectively. The default value for each argument is False.
 * AllowDeletingColumns and AllowDeletingRows are optional Variant arguments that you can set to True to allow the user to delete columns or rows, respectively, where every cell in the column or row is unlocked. The default setting is False.
 * AllowSorting is an optional Variant argument that you can set to True to allow the user to sort unlocked cells on the protected worksheet. The default setting is False.
 * AllowFiltering is an optional Variant argument that you can set to True to allow the user to set filters or change filter criteria (but not enable or disable an autofilter) on a protected worksheet. The default setting is False.
 * AllowUsingPivotTables is an optional Variant argument that you can set to True to allow the user to work with pivot tables on the protected worksheet. The default value is False.

For example, the following statement protects the worksheet referenced by the object variable myWorksheet using the password no1gets1n:

    myWorksheet.Protect Password:="no1gets1n"

The following statement protects the myWorksheet worksheet with the same password but allows the formatting of cells and allows the sorting of unlocked cells:

    myWorksheet.Protect Password:="no1gets1n", AllowFormattingCells:=True, _
        AllowSorting:=True

## Working with the _ActiveSheet_ Object

The ActiveSheet object returns the active worksheet. If you specify a workbook, then the active worksheet in _that_ specified workbook is returned.

If no sheet is active, ActiveSheet returns Nothing. Before executing code that depends on there being an active sheet, it's a good idea to check, as in this example:

    If ActiveSheet Is Nothing Then End

# Working with the Active Cell or Selection

In a procedure that manipulates a selection that the user has made, you'll typically work with either the active cell or the selection. The active cell is always a single cell, but the selection can encompass multiple cells or other objects.

## Working with the Active Cell

The ActiveCell property of the Application object or the Window object returns a Range object that represents the active cell in the Excel application or in the specified window. If you use ActiveCell without specifying the window, VBA returns the active cell in the active window.

For example, the following statement returns the address of the active cell in the active workbook:

    ActiveCell.Address

The following statement returns the text in the active cell in the first window open on the workbook named Planning.xlsx:

    MsgBox Workbooks("Planning.xlsx").Windows(1).ActiveCell.Text

If no worksheet is active, or if a chart sheet is active, there is no active cell. If you try to access ActiveCell, VBA returns an error.
So before using code that assumes there is an active cell, check that ActiveCell is not Nothing:

    If ActiveCell Is Nothing Then End

### Getting and Setting the Value of the Active Cell

To get or set the value of the active cell, use the Value property. For example, the following statement sets the value of the active cell to 25:

    ActiveCell.Value = 25

And the following statement retrieves the value of the active cell:

    MsgBox ActiveCell.Value

### Moving the Active Cell to Another Address

The ActiveCell object is often convenient to work with in your code, so sometimes you'll want to make a different cell the active cell in order to work with it via the ActiveCell object. To make a cell the active cell, use the Activate method with the appropriate Range object. For example, the following statement makes cell B5 the active cell in the worksheet identified by the object variable myWorksheet:

    myWorksheet.Range("B5").Activate

Often, you'll need to move the active cell to a different range a specified number of rows or columns away (in other words, to an address _relative_ to the location of the active cell—as opposed to an _absolute_ address, such as C12). To do so, use the Offset property of the active cell object, specifying the number of rows with the RowOffset argument and the number of columns with the ColumnOffset argument. Use a positive offset to move the active cell right or down and a negative offset to move the active cell left or up. For example, the following statement moves the active cell up two rows (RowOffset:=-2) and four columns to the right (ColumnOffset:=4):

    ActiveCell.Offset(RowOffset:=-2, ColumnOffset:=4).Activate

In procedures that the user triggers (macros), it's often a good idea to return the active cell to where it was when the user started the procedure. To do so, you can store the location of the active cell and then return it to the stored location after your procedure is finished with its tasks. Here's an example:

    Set myActiveCell = ActiveCell
    Set myActiveWorksheet = ActiveSheet
    Set myActiveWorkbook = ActiveWorkbook

    'take actions here

    myActiveWorkbook.Activate
    myActiveWorksheet.Activate
    myActiveCell.Activate

* * *

Be Careful with Equations That Use Relative Cell Addresses

Always test your procedures carefully with various types of data. Errors can sometimes occur when you move cells that contain equations that use relative cell addresses.

* * *

### Working with the Region around the Active Cell

You can work with the range of cells around the active cell by using the CurrentRegion property, which returns a Range object representing the current region. The current region extends from the active cell to the first _blank_ row above and below and to the first blank column to the left and right. In other words, if there are no blank rows or columns in the entire worksheet, then the region is all the cells in the worksheet.

For example, the following statements use the Font property of the range returned by CurrentRegion to set the font of the current region to 12-point Times New Roman with no bold or italic:

    With ActiveCell.CurrentRegion.Font
        .Name = "Times New Roman"
        .Size = 12
        .Bold = False
        .Italic = False
    End With

## Working with the User's Selection

In macros designed to be run by a user, you will often need to work with cells that the user has selected. For example, a user might select a range of cells and then run a macro to manipulate the contents of the range.
To work with the range the user has selected, use the RangeSelection property of the appropriate Window object. For example, you might assign the RangeSelection property to a range so that you could work with it in a macro and then select it again at the end of the macro, leaving the user ready to work with their selection again. Here's an example:

    Dim myMacroRange As Range
    Set myMacroRange = ActiveWindow.RangeSelection
    With myMacroRange
        'take actions on the range here
    End With
    myMacroRange.Activate

# Working with Ranges

Within a worksheet, you'll often need to manipulate ranges of cells. You can work with _absolute_ ranges (ranges for which you specify the absolute addresses of the cells you want to affect, such as C12) or ranges relative to the active cell, where you merely describe an offset.

You can either specify a range by using the Range property or create a named range by using the Names collection. Excel also provides the UsedRange property for working with the used range on a worksheet, and the SpecialCells method of the Range object for working with cells that meet specific criteria.

## Working with a Range of Cells

To work with a range of cells, use the Range property of the appropriate Worksheet object to specify the cells. For example, the following statement sets the value of cell C12 on the active worksheet to 44:

    ActiveSheet.Range("C12").Value = 44

## Creating a Named Range

To create a named range, use the Add method with the Names collection. The syntax is as follows:

    _expression_.Add(Name, RefersTo, Visible, MacroType, ShortcutKey,
        Category, NameLocal, RefersToLocal, CategoryLocal, RefersToR1C1,
        RefersToR1C1Local)

The components of the syntax are as follows:

 * _expression_ is a required expression that returns a Names object.
 * Name is an optional Variant argument that specifies the name to assign to the named range. Name is required if you don't specify the NameLocal argument (later in this list). The name cannot be a cell reference, nor can it contain spaces.
 * RefersTo is an optional Variant argument that specifies the range for the named range. You need to specify RefersTo unless you use the RefersToLocal argument, the RefersToR1C1 argument, or the RefersToR1C1Local argument.
 * Visible is an optional Variant argument that you can omit, set to True to have Excel make the name visible in the user interface (in the Go To dialog box, the Paste Name dialog box, and other locations), or set to False to make the name hidden.
 * MacroType is an optional Variant argument that you can use to assign a macro type to the range: 1 for a user-defined Function procedure, 2 for a Sub procedure, and 3 or omitted for no macro.
 * ShortcutKey is an optional Variant argument that specifies the shortcut key for a command macro assigned to the named range.
 * Category is an optional Variant argument that specifies the category of the macro or function specified by MacroType. You can specify one of the categories used by the Function Wizard, or specify another name to have Excel create a new category with that name.
 * NameLocal is an optional Variant argument that specifies the name for the range in the local language. Use NameLocal when you omit Name.
 * RefersToLocal is an optional Variant argument that specifies the range for the named range. Use RefersToLocal when you omit RefersTo, RefersToR1C1, and RefersToR1C1Local.
 * CategoryLocal is an optional Variant argument that you use to specify the category of the macro or function specified by MacroType. Use CategoryLocal when you omit Category.
 * RefersToR1C1 is an optional Variant argument that specifies the range for the named range using R1C1 notation (R1C1 would mean row 1, column 1). Use RefersToR1C1 when you omit RefersTo, RefersToLocal, and RefersToR1C1Local.
 * RefersToR1C1Local is an optional Variant argument that specifies the range for the named range using R1C1 notation in the local language. Use RefersToR1C1Local when you omit RefersTo, RefersToLocal, and RefersToR1C1.

For example, the following statement defines a range named myRange that refers to the range A1:G22 on the worksheet named Materials in the workbook named Building Schedule.xlsx:

    Workbooks("Building Schedule.xlsx").Names.Add Name:="myRange", _
        RefersTo:="=Materials!$A$1:$G$22"

## Deleting a Named Range

To delete a named range, use the Delete method with the appropriate Name object. For example, the following statement deletes the range named myRange in the workbook named Building Schedule.xlsx:

    Workbooks("Building Schedule.xlsx").Names("myRange").Delete

## Working with a Named Range

To work with a named range, specify the name with the Range object. For example, the following statements set the row height of the rows in the named range myRange to 20 points and apply 16-point Arial font to the cells:

    With Range("myRange")
        .RowHeight = 20
        .Font.Name = "Arial"
        .Font.Size = 16
    End With

## Working with the Used Range

If you need to work with all the cells on a worksheet, but not with any unoccupied areas of the worksheet, use the UsedRange property. For example, the following statement autofits all the columns in the used range in the active worksheet:

    ActiveSheet.UsedRange.Columns.AutoFit

## Working with the Special Cells

If you need to work with only some types of cells on a worksheet or in a range, use the SpecialCells method of the Range object to return the cells you need. The syntax is as follows:

    _expression_.SpecialCells(Type, Value)

These are the components of the syntax:

 * _expression_ is a required expression that returns a Range object.
 * Type is a required argument that specifies which cells you want. Table 22.4 lists the constants you can use.
 * Value is an optional Variant argument that you can use when Type is xlCellTypeConstants or xlCellTypeFormulas to control which cells Excel includes. Table 22.5 shows the constants and what they return.
Table 22.4 Constants for the Type argument for the SpecialCells method

**Constant** | **Returns This Kind of Cell**
---|---
xlCellTypeAllFormatConditions | All formats
xlCellTypeAllValidation | Cells that use validation
xlCellTypeBlanks | Empty
xlCellTypeComments | Containing notes
xlCellTypeConstants | Containing constants
xlCellTypeFormulas | Containing formulas
xlCellTypeLastCell | The last cell in the used range
xlCellTypeSameFormatConditions | Having the same format
xlCellTypeSameValidation | Containing the same validation criteria
xlCellTypeVisible | All visible

Table 22.5 Constants for the Value argument for the SpecialCells method

**Constant** | **Returns Cells Containing**
---|---
xlErrors | Errors
xlLogical | Logical values
xlNumbers | Numbers
xlTextValues | Text formulas

For example, the following statement activates the last cell in the used range of the worksheet referenced by the object variable myWorksheet:

    myWorksheet.Cells.SpecialCells(Type:=xlCellTypeLastCell).Activate

The following statement selects all the cells that contain formulas resulting in errors in the active worksheet:

    ActiveSheet.Cells.SpecialCells(Type:=xlCellTypeFormulas, _
        Value:=xlErrors).Activate

## Entering a Formula in a Cell

To enter a formula in a cell, set the Formula property of the appropriate Cell object. For example, the following statement enters the formula =SUM($G$12:$G$22) in the active cell:

    ActiveCell.Formula = "=SUM($G$12:$G$22)"

# Setting Options

Unlike with Word, in which most of the options that you find in the Word Options dialog box (click the File tab, then click Options) are available through the Options object, most of Excel's options are located in the Application object. Workbook-specific properties that appear in the Excel Options dialog box, however, are accessed through the appropriate Workbook object.

## Setting Options in the Application Object

The following sections show three examples of setting widely useful options in the Application object.

### Controlling Excel's Calculation

In complex worksheets that perform many calculations, you may need to turn off automatic calculation so that a procedure can enter data quickly without the calculations taking place.

To do so, set the Calculation property of the Application object to xlCalculationManual, enter the data, and then set the Calculation property back to its previous value:

    Dim varAutoCalculation As Variant
    varAutoCalculation = Application.Calculation
    Application.Calculation = xlCalculationManual
    'enter the data here
    Application.Calculation = varAutoCalculation

### Clearing the Recently Used Files List

Sometimes you may find it useful to clear all the entries from recently displayed documents (shown when you click the File tab on the Ribbon, then click Recent). Perhaps, for example, your macro creates some temporary files that you want to delete.

You can do this by setting the Maximum property of the RecentFiles object to 0. After doing so, you likely want to restore the user's previous setting, as the following example illustrates:

    Dim myMax As Long
    With Application.RecentFiles
        myMax = .Maximum 'store the user's preference, currently in effect
        .Maximum = 0
        .Maximum = myMax
    End With

After you execute this code and then click the File tab on the Ribbon and click Recent, no files will be displayed in the Recent Documents list.
+ +### Setting a Default File Location + +To set the default location for saving and opening files, use the DefaultFilePath property of the Application object, as in this example: + + Application.DefaultFilePath = "\\server3\users\mjones\files" + +## Setting Options in a Workbook + +Workbook-specific options include the following: + + * Security options (such as those shown in the following section and the sidebar "Setting Passwords and Read-Only Recommendations for a Workbook") + * Whether to update remote references in the workbook (the Boolean UpdateRemoteReferences property) and whether to save external link values (the Boolean SaveLinkValues property) + * Whether to use AutoRecover (the Boolean EnableAutoRecover property) + * Whether to accept labels in formulas (the Boolean AcceptLabelsInFormulas property) and whether to use the 1904 date system (the Boolean Date1904 property) + +### Forcing Excel to Remove Personal Information from the File Properties When You Save + +To make Excel remove personal information from a workbook's properties when you save it, set the RemovePersonalInformation property of the workbook to True: + + ActiveWorkbook.RemovePersonalInformation = True + +* * * + +**Setting Passwords and Read-Only Recommendations for a Workbook** + +Office's protection works well in a typical workplace. To protect a workbook against an unauthorized user opening it or modifying it, you can set a "password to open" (for reading only) or a "password to modify" on the workbook. You can also specify that when anyone opens a workbook, Excel will recommend that they open it as read-only rather than read/write. + +To set a "password to open," set the Password property of the Workbook object. For example, the following statement sets the active workbook to use the "password to open" 1mpass4: + + ActiveWorkbook.Password = "1mpass4" + +To set a "password to modify," set the WritePassword property of the Workbook object. For example, the following statement sets the active workbook to use the "password to modify" n0mods: + + ActiveWorkbook.WritePassword = "n0mods" + +To apply a read-only recommendation to a workbook, set its ReadOnlyRecommended property to True: + + Workbooks("Strategy.xlsx").ReadOnlyRecommended = True + +* * * + +## Accessing OneNote + +Earlier in this chapter you saw how to access SkyDrive and Dropbox. Simple enough. Dealing with OneNote is another matter because its contents are stored in the tricky XML format. When you write code to deal with XML, the words _efficient, straightforward_ , and _sensible_ do not come to mind. + +VBA isn't built into OneNote, but you can access OneNote from VBA in other Office applications. + +The following example gets the metadata (data about data) from your OneNote notebooks. + +Before you try this code, choose Tools ⇒ References in Excel's VBA Editor and ensure that both Microsoft OneNote 15.0 Object Library and Microsoft XML v6.0 are selected (checked) in the References dialog box. + + 1. Sub GetMetaData() + 2. + 3. 'If it's not currently running, OneNote will be launched + 4. Dim ONote As oneNote.Application + 5. Set ONote = New oneNote.Application + 6. + 7. Dim strXML As String + 8. + 9. ONote.GetHierarchy "", hsNotebooks, strXML, xs2010 'don't use xs2013 + 10. + 11. MsgBox strXML + 12. End Sub + +Lines 4 and 5 create an instance of OneNote and assign it to the ONote object variable. Next we create a string variable in line 7 to hold the metadata. Line 9 uses the GetHierarchy method to fill strXML with the metadata. 
hsNotebooks represents the collection of notebooks in OneNote. The message box displays the results.
+
+# The Bottom Line
+
+**Work with workbooks.**
+
+You often need to create a new, blank workbook in a macro (mimicking a user clicking the File tab on the Ribbon, then clicking the New button). And writing code that accomplishes this is not difficult. It requires only two words.
+
+**Master It**
+
+What code would you write to create a new, blank workbook?
+
+**Work with worksheets.**
+
+Most workbooks you access via VBA will contain one or more worksheets, so most procedures will need to work with worksheets—inserting, deleting, copying, or moving them, or simply printing the appropriate range from them.
+
+**Master It**
+
+Name the object you use in VBA code to represent a worksheet.
+
+**Work with the active cell or selection.**
+
+In a procedure that manipulates a selection that the user has made, you'll typically work with either the active cell or the current selection.
+
+**Master It**
+
+What is the difference between the active cell and a selection?
+
+**Work with ranges.**
+
+Within a worksheet, you'll often need to manipulate ranges of cells. Excel includes a special kind of range—represented by the UsedRange property.
+
+**Master It**
+
+What is unique about UsedRange?
+
+**Set options.**
+
+Word employs an Options object to contain most of the options that you find in the Word Options dialog box (click the File tab on the Ribbon, then click Options). Excel uses a different object to contain its options.
+
+**Master It**
+
+From which object do you access most of Excel's options?
+
+Chapter 23
+
+Working with Widely Used Objects in Excel
+
+In the previous chapter, you learned to work with some of the main objects in the Excel object model, such as Workbook objects, the ActiveCell object, Range objects, and the Options object. This chapter shows you how to expand your programming facility with VBA in Excel by working with charts, windows, and Find and Replace.
+
+In this chapter you will learn to do the following:
+
+ * Work with charts
+ * Work with windows
+ * Work with Find and Replace
+
+# Working with Charts
+
+The following sections show you how to use VBA to create and format charts, either as entire chart sheets in a workbook or as objects on an existing worksheet.
+
+## Creating a Chart
+
+VBA uses the Chart object to represent a chart on a chart sheet and a ChartObject object to represent an embedded chart on a worksheet. The ChartObject object contains a Chart object, which you can manipulate by accessing it through the ChartObject object. Confused? Object classification schemes can be a bit bewildering.
+
+When writing a macro, you create a chart or chart object in a different order than when working interactively and doing things by hand within Excel. Here are the steps you take when creating charts _programmatically_ (via code rather than interactively via a mouse and keyboard):
+
+1. Create a Chart object variable.
+
+2. Instantiate (bring into existence) the Chart object using the Set command.
+
+3. Specify the source range for its data using the SetSourceData method.
+
+4. Specify the chart type using the ChartType property.
+
+5. Specify any other items you need to.
+
+### Creating a Chart on a New Chart Sheet
+
+To create a chart on a new chart sheet, use the Add method with the Charts collection.
The syntax is as follows: + + _expression_.Add(Before, After, Count, Type) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Charts collection. + * Before is an optional Variant argument that you can use to specify the sheet before which to add the new chart sheet. After is an optional Variant argument that you can use to specify the sheet after which to add the new sheet. Typically, you'll use either Before or After. If you omit both arguments, VBA adds the new chart sheet before the active sheet. + * Count is an optional Variant argument that you can use to specify how many chart sheets to add. The default is one. + * Type is an optional Variant argument that you can use to specify which kind of chart you want displayed. The choices are xlWorksheet, xlChart, xlExcel4MacroSheet, and xlExcel4IntlMacroSheet. The default value is xlWorksheet, so you have to specify xlChart in the following code example because it adds a chart, not an ordinary worksheet. + +The following code declares an object variable named myChartSheet as being of the Chart type (a chart worksheet) and then assigns to myChartSheet a new chart sheet added after the last existing sheet in the active workbook: + + Dim myChartSheet As Chart + Set myChartSheet = ActiveWorkbook.Sheets.Add _ + (After:=ActiveWorkbook.Sheets(ActiveWorkbook.Sheets.Count), _ + Type:=xlChart) + +### Creating a Chart on an Existing Worksheet + +To create a chart on an existing worksheet, use the Add method with the ChartObjects collection. The syntax is as follows: + + _expression_.Add(Left, Top, Width, Height) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a ChartObjects collection. + * Left is a required Double (variable type) argument that specifies the position of the upper-left corner of the chart in points from the left edge of cell A1. + * Top is a required Double argument that specifies the position of the upper-left corner of the chart in points from the top edge of cell A1. + * Width is a required Double argument that specifies the width of the chart in points. + * Height is a required Double argument that specifies the height of the chart in points. + +For example, the following statements declare a new ChartObject object named myChartObject and assign to it a new chart object (chart area) 400 points wide by 300 points deep, positioned 200 points from the left edge and 200 points from the top of the worksheet: + + Dim myChartObject As ChartObject + Set myChartObject = ActiveSheet.ChartObjects.Add(Left:=200, Top:=200, _ + Width:=400, Height:=300) + +To work with the chart inside the ChartObject, return the Chart property of the ChartObject object. + +## Specifying the Source Data for the Chart + +So far, the chart (on the chart sheet or in the Chart object) is blank. To give it contents, specify the chart's source data by using the SetSourceData method of the Chart object. For example, the following statement specifies the range A1:E5 on the worksheet named Chart Data in the active workbook as the source data of the Chart object in the ChartObject object named myChartObject: + + myChartObject.Chart. **SetSourceData** Source:= _ + ActiveWorkbook.Sheets("Chart Data").Range("A1:E5") + +## Specifying the Chart Type + +To specify the chart type, set the ChartType property of the Chart object. 
Excel offers too great a variety of charts to list here (73 different types), but you can easily identify the chart types from their enumeration-constant names. For example, the constant xl3DArea represents the 3-D Area chart type, xlColumnStacked represents the Stacked Column chart type, and xlDoughnutExploded represents the Exploded Doughnut chart type. + +The following statement sets the type of the chart represented by the object variable myChart to the Stacked Column type: + + myChart.ChartType = xlColumnStacked + +## Working with Series in the Chart + +To work with series in a chart, you use the SeriesCollection collection, which contains all the series in the specified chart. + +### Creating a New Series + +To create a new series, use the NewSeries method with the SeriesCollection collection. For example, the following statement adds a new series to the chart represented by the object variable myChart: + + myChart.SeriesCollection.NewSeries + +### Adding a New Series + +To add a new series to a SeriesCollection collection, use the Add method with the appropriate SeriesCollection object. The syntax is as follows: + + _expression_.Add(Source, Rowcol, SeriesLabels, CategoryLabels, Replace) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a SeriesCollection collection. + * Source is a required Variant argument that specifies the source of the data for the new series. You can supply the data either as a range or as an array of data points. + * Rowcol is an optional argument that you can set to xlRows to specify that the new values are in rows in the specified range, or use the default setting, xlColumns, to specify that the new values are in columns. If you omit this argument, Excel uses xlColumns. + * SeriesLabels is an optional Variant argument that you can set to True to specify that the first row or column in the source area contains the series labels, or False to specify that the first row or column in the source area contains the first data point for the series. If you omit this argument, Excel tries to work out whether the first row or column contains a series label. It's best to specify this argument to avoid confusion. However, if Source is an array, VBA ignores this argument. + * CategoryLabels is an optional Variant argument that you can set to True to specify that the first row or column contains the name for the category labels, or set to False to specify that it does not contain them. If you omit this argument, Excel tries to work out whether the first row or column contains a category label. It's best to specify this argument to avoid confusion. Again, if Source is an array, VBA ignores this argument. + * Replace is an optional Variant argument that you can set to True when CategoryLabels is True to make the categories replace the existing categories for the series, or set to False (the default value) to prevent the existing categories from being replaced. + +The following procedure brings together several elements used in the previous code examples in this chapter. It illustrates how to create a complete chart and add a new series to the chart identified by the object variable myChart. 
The procedure draws the data from the range A4:K4 on the active worksheet in the active workbook, using rows:
+
+    Sub test()
+
+        Dim myChartObject As ChartObject
+        Dim MyChart As Chart
+
+        Set myChartObject = ActiveSheet.ChartObjects.Add(Left:=100, Top:=100, _
+            Width:=400, Height:=300)
+
+        Set MyChart = myChartObject.Chart
+        MyChart.ChartType = xlConeBarStacked
+
+        MyChart.SeriesCollection.Add _
+            Source:=ActiveSheet.Range("A4:K4"), Rowcol:=xlRows
+
+    End Sub
+
+If you execute this example, you'll see results similar to those shown in Figure 23.1. A chart will be generated based on whatever data lies within the specified range.
+
+Figure 23.1 This chart was generated in a procedure, using the Add method of the SeriesCollection object.
+
+### Extending an Existing Series
+
+To extend an existing series, use the Extend method with the appropriate SeriesCollection object. The syntax is as follows:
+
+    _expression_.Extend(Source, Rowcol, CategoryLabels)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a SeriesCollection object.
+ * Source is a required Variant argument that specifies the source of the data for the new series. You can supply the data either as a range or as an array of data points.
+ * Rowcol is an optional argument that you can set to xlRows to specify that the new values are in rows in the specified range, or use the default setting, xlColumns, to specify that the new values are in columns. If you omit this argument, Excel uses xlColumns.
+ * CategoryLabels is an optional Variant argument that you can set to True to specify that the first row or column contains the name for the category labels, or set to False to specify that it does not contain them. If you omit this argument, Excel tries to work out whether the first row or column contains a category label. It's best to specify this argument to avoid confusion. If Source is an array, VBA ignores this argument.
+
+For example, the following statement extends the series in the chart identified by the object variable myChart using the data in the cells P3:P8 on the worksheet named Chart Data:
+
+    myChart.SeriesCollection.Extend _
+        Source:=Worksheets("Chart Data").Range("P3:P8")
+
+## Adding a Legend to the Chart
+
+To add a legend to the chart, set the chart's HasLegend property to True. To manipulate the legend, work with the properties of the Legend object. Key properties include these:
+
+ * The Position property controls where the legend appears: xlLegendPositionBottom, xlLegendPositionCorner, xlLegendPositionLeft, xlLegendPositionRight, or xlLegendPositionTop.
+ * The Height property and the Width property control the height and width of the legend, respectively, in points.
+ * The Font property returns the Font object, whose properties you can set to specify the font size, name, and effects.
+
+For example, the following statements add the legend to the chart represented by the object variable myChart and apply 16-point Arial font to it. (Note that HasLegend belongs to the Chart object itself, not to the Legend object.)
+
+    With myChart
+        .HasLegend = True
+        .Legend.Font.Size = 16
+        .Legend.Font.Name = "Arial"
+    End With
+
+## Adding a Chart Title
+
+To add a title to the chart, set its HasTitle property to True, as in this example:
+
+    myChart.HasTitle = True
+
+Excel adds the title with the default text Chart Title. To change the text, set the Text property of the ChartTitle object, which represents the chart title.
Here's an example:
+
+    myChart.ChartTitle.Text = "Industrial Mixups in North Dakota"
+
+To position the title, set its Top property (specifying the number of points from the top edge of the worksheet) and its Left property (specifying the number of points from the left edge of the worksheet), as in this example:
+
+    With myChart.ChartTitle
+        .Top = 100
+        .Left = 150
+    End With
+
+To format the text of the title, work with its Font object, as follows:
+
+    myChart.ChartTitle.Font.Name = "Arial"
+
+## Working with a Chart Axis
+
+To work with an axis of a chart, use the Axes method to access the appropriate axis. The syntax is as follows:
+
+    _expression_.Axes(Type, AxisGroup)
+
+Here, _expression_ is a required expression that returns a Chart object. Type is an optional Variant argument that specifies the axis to return. Use xlValue to return the value axis, xlCategory to return the category axis, or xlSeriesAxis to return the series axis (on 3D charts only). AxisGroup is an optional argument that you can set to xlSecondary to specify the second axis group instead of xlPrimary (the default setting), which specifies the first axis group.
+
+For example, the following statements work with the category axis in the primary group of the chart, turning on its title, setting the title text, font, and font size, and turning major gridlines on and minor gridlines off. Note that this With structure should be placed within a second, outer With structure representing the chart itself:
+
+    With MyChart
+        With .Axes(Type:=xlCategory, AxisGroup:=xlPrimary)
+            .HasTitle = True
+            .AxisTitle.Text = "Years"
+            .AxisTitle.Font.Name = "Times New Roman"
+            .AxisTitle.Font.Size = 12
+            .HasMajorGridlines = True
+            .HasMinorGridlines = False
+        End With
+    End With
+
+## Formatting Headers and Footers
+
+You can manipulate headers and footers easily via VBA by using a built-in set of format and content constants. These include format specifications such as &U for underlining and &C for centering. Content constants include &D, which inserts the current date, &P for the page number, and &F for the document's name. The complete list of Excel 2013 header and footer constants can be found here:
+
+
+
+This code turns on italics and underlining, and on the right side of the header prints _Doctor Dancy Page_ followed by the current page number and the total number of pages: _Doctor Dancy Page 2 of 7_. If there is no header, one is created.
+
+    ActiveSheet.PageSetup.RightHeader = "&U&I Doctor Dancy Page &P of &N"
+
+# Working with Windows
+
+The Windows collection contains a Window object for every open window in the Excel application. Normally, when you open a workbook, Excel opens a window so that you can see it. You can also open further windows as necessary—for example, by clicking the Ribbon's View tab, then clicking the New Window button in the Window area.
+
+In most cases, using Window objects isn't a very useful way to access data via VBA because you can access it more easily using objects such as the ActiveSheet object or the ActiveCell object. However, you may want to open, close, activate, or arrange windows programmatically (via a procedure rather than having the user do it by hand interactively) to display data to the user in a particular way.
+
+## Opening a New Window on a Workbook
+
+To open a new window on a workbook, use the NewWindow method of the appropriate Window object. This method takes no arguments.
For example, the following statement opens a new window showing the contents of the first window open on the workbook identified by the object variable myWorkbook:
+
+    myWorkbook.Windows(1).NewWindow
+
+## Closing a Window
+
+To close a window, use the Close method with the appropriate Window object. The syntax is as follows:
+
+    _expression_.Close(SaveChanges, Filename, RouteWorkbook)
+
+Here, _expression_ is a required expression that returns a Window object. This syntax is the same as for closing a workbook (see "Closing a Workbook" in the previous chapter). The difference is that if two or more windows are open on the same workbook, closing the second or subsequent window does not close the workbook, so the arguments are not relevant. (If the window you're closing is the workbook's last window, however, you do need to specify the arguments—otherwise, Excel prompts the user to save any unsaved changes.) For example, the following statements close all windows open on the workbook referenced by the object variable myWorkbook except for one window:
+
+    Do While myWorkbook.Windows.Count > 1
+        myWorkbook.Windows(myWorkbook.Windows.Count).Close
+    Loop
+
+## Activating a Window
+
+To activate a window, use the Activate method of the appropriate Window object. For example, the following statement activates the first window open on the workbook Planning.xlsx:
+
+    Workbooks("Planning.xlsx").Windows(1).Activate
+
+Similarly, you can activate the previous window by using the ActivatePrevious method or the next window by using the ActivateNext method.
+
+## Arranging and Resizing Windows
+
+To arrange windows, use the Arrange method with the appropriate Windows collection. The syntax is as follows:
+
+    _expression_.Arrange(ArrangeStyle, ActiveWorkbook, SyncHorizontal, SyncVertical)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Windows collection.
+ * ArrangeStyle is an optional argument that you can set to xlArrangeStyleTiled to tile the windows (the default setting), xlArrangeStyleHorizontal to arrange the windows horizontally, xlArrangeStyleVertical to arrange the windows vertically, or xlArrangeStyleCascade to cascade the windows in an overlapping arrangement that lets you see the title bar of each window but the contents of only the front window.
+ * ActiveWorkbook is an optional Variant argument that you can set to True to make VBA arrange only the windows of the active workbook. The default value is False, which arranges all open windows.
+ * SyncHorizontal and SyncVertical are optional Variant arguments that you can set to True when you use ActiveWorkbook:=True to make the windows of the active workbook scroll horizontally or vertically in sync (when you scroll one window, the other windows scroll by the same amount in the same direction). The default is False.
+
+For example, the following statement arranges the windows in the workbook Budget.xlsx vertically and sets synchronized scrolling on them:
+
+    Workbooks("Budget.xlsx").Windows.Arrange _
+        ArrangeStyle:=xlArrangeStyleVertical, _
+        ActiveWorkbook:=True, SyncVertical:=True
+
+You can maximize, minimize, or restore the application window by setting the WindowState property of the Application object to xlMaximized, xlMinimized, or xlNormal. Similarly, within the application window, you can maximize, minimize, or restore a document by setting its WindowState property.
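+
+As a quick illustration, here is a minimal sketch (assuming at least one workbook window is open) that maximizes the Excel application window and then returns the active document window to its normal state:
+
+    'Maximize the application window, then un-maximize the active document window
+    Application.WindowState = xlMaximized
+    ActiveWindow.WindowState = xlNormal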
+
+When a window is in a "normal" state (xlNormal; not maximized or minimized), you can position it by using the Top and Left properties to specify the position of the upper-left corner of the window and size it by setting its Height and Width properties. Check the UsableWidth property and the UsableHeight property of the Application object to find the amount of space available in the Application window. (Similarly, you can check the UsableWidth property and the UsableHeight property of the Window object to see how much space is available in the window—for example, so that you can size or position an object correctly.)
+
+The following example declares two Window object variables, myWindow1 and myWindow2, and assigns myWindow1 to the active window and myWindow2 to a new window showing the same worksheet as myWindow1. The example then sizes and positions the two windows so that each is the full height available in the application window, with myWindow1 taking one-quarter of the available width and myWindow2 taking the remaining three-quarters of the available width:
+
+    Dim myWindow1 As Window, myWindow2 As Window
+    Set myWindow1 = ActiveWindow
+    Set myWindow2 = myWindow1.NewWindow
+    With myWindow1
+        .WindowState = xlNormal
+        .Top = 0
+        .Left = 0
+        .Height = Application.UsableHeight
+        .Width = Application.UsableWidth * 0.25
+    End With
+    With myWindow2
+        .WindowState = xlNormal
+        .Top = 0
+        .Left = (Application.UsableWidth * 0.25) + 1
+        .Height = Application.UsableHeight
+        .Width = Application.UsableWidth * 0.75
+    End With
+
+## Zooming a Window and Setting Display Options
+
+To change the zoom, set the Zoom property of the appropriate Window object. For example, the following statement zooms the active window to 150 percent:
+
+    ActiveWindow.Zoom = 150
+
+In some procedures, you may need to change the display of the Excel window to ensure that certain features are (or are not) available to the user. Use the Boolean properties DisplayScrollBars, DisplayStatusBar, and DisplayFormulaBar to control whether Excel displays the scroll bars, status bar, and formula bar. Use the DisplayFullScreen property to toggle full-screen view on and off.
+
+For example, the following statements make sure that the scroll bars and status bar are hidden and that the formula bar is displayed:
+
+    With Application
+        .DisplayScrollBars = False
+        .DisplayStatusBar = False
+        .DisplayFormulaBar = True
+    End With
+
+# Working with Find and Replace
+
+Excel's Find and Replace features can be useful for locating data in your procedures. In Excel, Find and Replace are implemented through methods rather than (as in Word) through a Find object.
+
+Both the Range object and the WorksheetFunction object have Find methods and Replace methods (but with different syntax). For most find and replace operations, you'll want to use the Range object—for example, to replace the contents of specific cells on a worksheet.
+
+## Searching with the Find Method
+
+The syntax for the Range object's Find method is as follows:
+
+    _expression_.Find(What, After, LookIn, LookAt, SearchOrder, SearchDirection, MatchCase, MatchByte, SearchFormat)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Range object.
+ * What is a required Variant argument that specifies the data to find. This data can be a string of text or any Excel data type.
+ * After is an optional Variant argument that you can use to specify the cell after which to begin searching. After must be a cell in the range that's being searched.
If you omit After, Excel begins the search at the upper-left cell in the range.
+ * LookIn is an optional Variant argument that you can use to specify whether to search in formulas (xlFormulas), values (xlValues), or comments (xlComments).
+ * LookAt is an optional Variant argument that you can set to xlWhole to search for the entire contents of a cell, or to xlPart to search for the match within the contents of cells.
+ * SearchOrder is an optional Variant argument that you can set to xlByRows to search by rows, or to xlByColumns to search by columns.
+ * SearchDirection is an optional Variant argument that you can set to xlNext to search downward, or to xlPrevious to search upward.
+ * MatchCase is an optional Variant argument that you can set to True to use case-sensitive searching. The default setting is False.
+ * MatchByte is an optional Variant argument used only if you've installed double-byte language support.
+ * SearchFormat is an optional Variant argument that controls whether Excel searches for specified formatting (True) or not (False).
+
+* * *
+
+**Practical Searching: Beware Persistent Settings**
+
+The LookIn, LookAt, SearchOrder, and MatchByte arguments of the Range object's Find method _persist_ —Excel retains them from one search to the next. Unless you know that the settings used in the previous search are suitable for your current needs, you should set these arguments explicitly for each new search to avoid getting unexpected results.
+
+Pay particular attention to the LookAt setting. This setting corresponds to the Match Entire Cell Contents check box in the Find And Replace dialog box. To see this option manually in an Excel window, click any cell, then click the Find And Select button on the Ribbon's Home tab. Choose Replace on the menu that drops down, then click the Options button in the Find And Replace dialog box, if necessary, to display all the options available in that dialog box.
+
+Remember that format settings such as font and subscript persist as well. So you may want to specify those explicitly too if you're concerned that they might have been set previously by your code or by the user.
+
+And, finally, remember to always be courteous to users by restoring their settings. Users know that Find and Replace settings persist, so they expect them to remain as they left them, no matter what you might do with them while your procedure executes. So at the start of your procedure, store the user's current settings in variables. Then at the end of your procedure, save these settings back to the various options.
+
+Excel has no global command equivalent to Word's ClearFormatting statement, described in Chapter 21, "Working with Widely Used Objects in Word."
+
+* * *
+
+The following example code searches for 2008 in formulas in cells after the active cell, matching the entire cell contents and the case of the search term, without searching for formatting:
+
+    Cells.Find(What:="2008", After:=ActiveCell, LookIn:=xlFormulas, LookAt _
+        :=xlWhole, SearchOrder:=xlByRows, SearchDirection:=xlNext, MatchCase:= _
+        True, SearchFormat:=False).Activate
+
+Notice that in this code each argument is named. And one, MatchByte, is omitted. Recall that if you leave out an argument in an argument list, you must either insert a comma as a placeholder or use named arguments. Given that Excel's Find arguments are persistent, it is a good idea to use named arguments here to remind yourself that they need to be restored to the user's previous settings.
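+
+One more precaution: the Find method returns Nothing when it finds no match, so chaining .Activate directly onto Find (as in the preceding statement) raises a runtime error on an empty result. Here's a minimal defensive sketch; the variable name foundCell is just for illustration:
+
+    Dim foundCell As Range
+    Set foundCell = Cells.Find(What:="2008", LookIn:=xlFormulas, _
+        LookAt:=xlWhole, SearchOrder:=xlByRows, MatchCase:=True, _
+        SearchFormat:=False)
+    'Activate the found cell only if the search succeeded
+    If Not foundCell Is Nothing Then foundCell.Activate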
+
+## Continuing a Search with the _FindNext_ and _FindPrevious_ Methods
+
+After you have executed a search using the Find method, you can use the FindNext method to find the next instance of the search item, or the FindPrevious method to find the previous instance. The syntax is as follows:
+
+    _expression_.FindNext(After)
+    _expression_.FindPrevious(After)
+
+Here, _expression_ is a required expression that returns a Range object, and After is an optional Variant argument that specifies the cell after which you want to search (for the FindNext method) or before which you want to search (for the FindPrevious method). After must be a single cell.
+
+For example, the following statement finds the next instance of the search item:
+
+    Cells.FindNext
+
+## Replacing with the _Replace_ Method
+
+To replace using VBA, use the Replace method with the Range object. The syntax is as follows:
+
+    _expression_.Replace(What, Replacement, LookAt, SearchOrder, MatchCase, MatchByte, SearchFormat, ReplaceFormat)
+
+The components of the syntax are the same as for the Find method except for the following:
+
+ * Replacement is a required Variant argument that specifies the replacement string for the search.
+ * ReplaceFormat is an optional Variant argument that controls whether Excel replaces formatting in the search (True) or not (False).
+
+For example, the following statement replaces the instances of the word Sales in column B of the active worksheet with the words Sales & Marketing, using case-sensitive matching:
+
+    ActiveSheet.Columns("B").Replace What:="Sales", _
+        Replacement:="Sales & Marketing", SearchOrder:=xlByColumns, _
+        MatchCase:=True
+
+## Searching for and Replacing Formatting
+
+To search for formatting, use the FindFormat property of the Application object to define the formatting, and then set the SearchFormat argument of the Find method to True. Similarly, use the ReplaceFormat property of the Application object to define the replacement formatting, and then set the ReplaceFormat argument of the Replace method to True.
+
+For example, the following statements use a With structure to set the Application.FindFormat.Font properties for which to search, a With structure to set the Application.ReplaceFormat.Font with which to replace them, and the Replace method of the Cells collection to effect the replacement:
+
+    With Application.FindFormat.Font
+        .Name = "Arial"
+        .Size = 12
+        .Bold = True
+    End With
+    With Application.ReplaceFormat.Font
+        .Name = "Arial Black"
+        .Bold = False
+    End With
+    Cells.Replace What:="5", Replacement:="5", LookAt:=xlPart, SearchOrder _
+        :=xlByColumns, MatchCase:=False, SearchFormat:=True, ReplaceFormat:=True
+
+# Adding Shapes
+
+It's easy to add shapes to a worksheet. This technique can be used to draw attention to important points or liven up statistical data for a presentation. Here's an example that adds two explosion graphics to a worksheet:
+
+    Sub AutoShapes()
+
+        ActiveSheet.Shapes.AddShape(msoShapeExplosion2, 425, 145, 86, 101).Select
+        ActiveSheet.Shapes.AddShape(msoShapeExplosion1, 265, 224, 190, 190).Select
+
+    End Sub
+
+The AddShape method takes the following arguments:
+
+    AddShape(Type, Left, Top, Width, Height)
+
+The Type argument specifies one of a set of msoShape constants that can be found in Excel's VBA Editor. There are dozens of shapes, including a moon, a heart, and a tear. Press F2 to display the Object Browser. In the list box at the top left of the Object Browser, you'll likely see All Libraries displayed by default.
Instead, open this list and select Office. (This list box specifies the library of objects that will be searched.) Now in the field directly below that, type **msoshape** and click the binoculars icon next to the field.
+
+# The Bottom Line
+
+**Work with charts.**
+
+You can create either full chart sheets or embedded charts within an ordinary Excel worksheet.
+
+**Master It**
+
+What object is used in a procedure to represent an embedded chart?
+
+**Work with windows.**
+
+To open a new window on a workbook, you use the NewWindow method of the appropriate Window object.
+
+**Master It**
+
+Does the NewWindow method take any arguments?
+
+**Work with Find and Replace.**
+
+When working with the Find and Replace features in Excel, you need to be aware of a phenomenon known as _persistence_.
+
+**Master It**
+
+What is persistence, and why should it concern you?
+
+Chapter 24
+
+Understanding the PowerPoint Object Model and Key Objects
+
+This chapter shows you how to start working with the PowerPoint object model, the architecture underlying PowerPoint, and how to perform common actions with the most immediately useful PowerPoint objects. These objects include the Presentations collection and the Presentation object, the ActivePresentation object, the Slides collection and Slide objects, Window objects, and Master objects.
+
+In this chapter you will learn to do the following:
+
+ * Understand the PowerPoint object model
+ * Understand PowerPoint's creatable objects
+ * Work with presentations
+ * Work with windows and views
+ * Work with slides
+ * Work with masters
+
+# Getting an Overview of the PowerPoint Object Model
+
+As with all Office applications that include VBA, you can write macros without understanding how the PowerPoint object model fits together, but most people find that familiarity with the main objects in the object model is helpful. Also, the code examples in the Help system's object-model reference can be invaluable. They show how and where to employ objects in your own programming.
+
+To begin exploring the PowerPoint object model, follow these steps:
+
+1. Launch or switch to PowerPoint, and then press Alt+F11 to launch or switch to the VBA Editor.
+
+2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).
+
+3. Press F1 in the Editor to open the VBA language reference web page for Office 2013.
+
+4. In the Bing search field, type **powerpoint 2013 object model** and press Enter.
+
+5. Click the link _Object model reference_ (_PowerPoint 2013 Developer Reference_). You'll now have access to the whole collection of syntax specifications, useful descriptions, and code examples, as shown in Figure 24.1.
+
+Figure 24.1 The entries in the PowerPoint object-model reference will help you write your own VBA code.
+
+* * *
+
+Help When Migrating Legacy Code from Earlier Office Projects
+
+If you've inherited VBA code written in earlier versions of PowerPoint, those procedures might contain objects, methods, and properties that have been changed in Office 2013. Though changes to previous object models are generally few, some incompatibilities can crop up and "break" the code so it won't run correctly. Fortunately, you can download a free utility to assist you in mending the broken code. See the sidebar in Chapter 22 titled "Help When Migrating Legacy Code from Earlier Office Projects" for more information.
+ +* * * + +# Understanding PowerPoint's Creatable Objects + +In PowerPoint, the Application object gives you access to all the objects in the PowerPoint application. But for many operations, you can go directly through one of the "creatable" objects available in PowerPoint. (Recall that _creatable_ merely means you can optionally leave out the word Application when specifying a creatable object in your code.) The four most useful creatable objects are listed here: + + * The ActivePresentation object represents the active presentation, the presentation that would respond if you typed something. + * The Presentations collection contains the Presentation objects, each of which represents one of the currently open presentations. + * The ActiveWindow object represents the active window in the application. + * The SlideShowWindows collection contains the SlideShowWindow objects, each of which represents an open slide-show window. This collection is useful for manipulating a slide show that's currently displayed. + +Within a presentation, you'll typically find yourself working with the Slides collection, which contains all the Slide objects that represent the slides. On a slide, most items are represented by Shape objects gathered into the Shapes collection. For example, the text in a typical placeholder is contained in the Text property of the TextRange object in the TextFrame object within a Shape object on a slide. + +# Working with Presentations + +To get any work done in PowerPoint, you'll usually need to work with one or more presentations. VBA uses the Presentation object to represent a presentation and organizes the open Presentation objects into the Presentations collection. + +## Creating a New Presentation Based on the Default Template + +You can create a new presentation based on the default template. This is equivalent to clicking the File tab on PowerPoint's Ribbon, then clicking the New option in PowerPoint. To do this, use the Add method with the Presentations collection. The syntax is as follows: + + _expression_.Add(WithWindow) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Presentations object. Often, it's easiest to use the Presentations object itself. + * WithWindow is an optional Long argument. Set WithWindow to msoFalse to prevent the new presentation from being visible—for example, so that you can create and manipulate it without the user seeing the details. (You may want to temporarily hide the presentation so that the user doesn't have to endure the irritating flickering effect that PowerPoint tends to exhibit while creating presentation objects programmatically.) The default value is msoTrue, making the new presentation visible. + +For example, the following statements declare an object variable of the Presentation type named myPresentation, create a new presentation, assign the new presentation to myPresentation, and make it invisible to the user: + + Dim myPresentation As Presentation + Set myPresentation = Presentations. **Add** (WithWindow:=msoFalse) + +* * * + +Understanding Tri-State Values + +This is a bit rarified, but let's leave no stone unturned. The Add method of the Presentations object allows you to set its WithWindow argument to four different states: msoFalse, msoTrue, msoTriStateToggle, or msoTriStateMixed. True and false are common and easily understood states. But PowerPoint makes extensive use of two unusual states called MsoTriState values, both of which represent a kind of super-Boolean state. 
Instead of being limited to merely True or False, a tri-state value can also be in special third and fourth states. msoTriStateMixed means that something is both true and false at the same time, like lovers. Here's an example: if a string contains three words, one of which is bold, is the string bold or not? Well, the answer in VBA is that it is a _mixed_ string. msoTriStateToggle means that the state is potentially true or false; in other words, the user could click a two-state control to reset the status either way, or your code could reset it.
+
+In most cases, you'll want to set a tri-state value to either msoTrue or msoFalse. In fact, I can't imagine a situation in which your code would actually ever have a need to _set_ msoTriState values; instead, you would only ever need to check this value (to _read_ it) to find out if the property you were dealing with contained a mixture of msoTrue and msoFalse values. Remember that _mixed_ means something is true and false at the same time; _toggle_ means that something is _potentially_ either true or false. Actually, don't remember this, because it's so infrequently useful. I mention it only because it pervades the documentation on PowerPoint programming.
+
+* * *
+
+## Creating a New Presentation Based on a Template
+
+To create a new presentation based on a template other than the default template, use the Open method of the Presentations collection. The syntax is as follows:
+
+    _expression_.Open(FileName, ReadOnly, Untitled, WithWindow)
+
+The components of the syntax are explained here (ReadOnly, Untitled, and WithWindow are all msoTriState values, but pay no attention to that):
+
+ * _expression_ is a required expression that returns a Presentations object. Often, it's easiest to use the Presentations object itself.
+ * FileName is a required String argument that specifies the path and name of the file to use as a template for the new presentation. This file can be either a template in the conventional sense or a presentation that you want to use as a template.
+ * ReadOnly is an optional argument that specifies whether the file is opened with read-only status (msoTrue) or with read/write status (msoFalse). When creating a new presentation based on a template, you don't need to specify ReadOnly.
+ * Untitled is an optional argument that specifies whether to open the file as itself (msoFalse) or as a copy (msoTrue). When creating a new presentation based on a template, set Untitled to msoTrue.
+ * WithWindow is an optional argument that you can set to msoFalse to prevent the new presentation from being visible. The default value is msoTrue, making the new presentation visible.
+
+For example, the following statement creates a new presentation based on the template named Capsules.potm in the C:\Users\_Richard_\Documents\Custom Office Templates\ folder:
+
+    Presentations. **Open** _
+        FileName:="C:\Users\_Richard_\Documents\Custom Office Templates\Capsules.potm", Untitled:=msoTrue
+
+As usual, replace my name, _Richard_, with your name.
+
+## Opening an Existing Presentation
+
+To open an existing presentation already on the hard drive, use the Open method of the Presentations collection. The syntax is as shown in the previous section. The difference is that you use the FileName argument to specify the presentation you want to open (as opposed to the file that you want to use as the template for creating a new presentation) and either omit the Untitled argument or set it to msoFalse.
You may also need to use the OpenConflictDocument argument to specify how to handle any conflict file that exists for the presentation you're opening.
+
+For example, the following statement opens the existing presentation named Train Time.pptm stored in the folder Z:\Public, opening the presentation for editing rather than opening it as read-only:
+
+    Presentations. **Open** FileName:="Z:\Public\Train Time.pptm", ReadOnly:=msoFalse
+
+## Opening a Presentation from the Cloud
+
+Chapters 20 and 22, "Understanding the Word Object Model and Key Objects" and "Understanding the Excel Object Model and Key Objects," demonstrated how to save documents to SkyDrive and Dropbox. Here we'll go the other way and open a presentation that's been stored on SkyDrive. The mechanics of contacting the cloud are, blessedly, handled for us by the various cloud services. There _are_ security issues—particularly during transmission to and from the storage servers—which I personally am glad to leave to these companies' programmers.
+
+All we VBA programmers have to do to store to or open from the cloud is to get the file path right. It's as if you are storing something on your hard drive—which in fact you are. The only difference is that the files in this location on your hard drive are also automatically stored (synced) somewhere else in the world, in a server farm.
+
+Let's assume you have a presentation named PX.pptm stored in SkyDrive. The file path will normally be "C:\Users\_Richard_\SkyDrive\PX.pptm".
+
+So, to open this PX presentation, you can use this code, replacing _Richard_ with whatever your name is:
+
+    Presentations.Open FileName:="C:\Users\_Richard_\SkyDrive\PX.pptm", ReadOnly:=msoFalse
+
+## Saving a Presentation
+
+The first time you save a presentation, you must specify the path and filename to use. After that, you can save the presentation under the same name or specify a different path, name, format, or all three. This is the same as the distinction between the Save and Save As options on the File tab of the Ribbon.
+
+### Saving a Presentation for the First Time or under a Different Name
+
+To save a presentation for the first time, or to save a presentation using a different path, name, or format, use the SaveAs method. The syntax is as follows:
+
+    _expression_. **SaveAs** (FileName, FileFormat, EmbedTrueTypeFonts)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Presentation object.
+ * FileName is a required String argument that specifies the filename under which to save the presentation. Normally, you include the path in FileName; if you omit the path, PowerPoint uses the current folder.
+ * FileFormat is an optional argument that specifies the file format to use. Although there are 27 total SaveAs constants, Table 24.1 lists only the 7 most widely useful formats.
+ * EmbedTrueTypeFonts is an optional argument that you can set to msoTrue to embed TrueType fonts in the presentation, or to msoFalse (the default) to not embed them.
+
+Table 24.1 Useful FileFormat constants for saving PowerPoint files
+
+**Format Name** | **Constant**
+---|---
+PowerPoint format | ppSaveAsPresentation
+Default format (set on the Save tab of the Options dialog box) | ppSaveAsDefault
+Single-file web page | ppSaveAsWebArchive
+Web page | ppSaveAsHTML
+Presentation | ppSaveAsPresentation
+Design template | ppSaveAsTemplate
+PowerPoint show | ppSaveAsShow
+
+For example, the following statement saves the presentation identified by the object variable myPresentation under the name HR.pptm in the folder Z:\Shared\Presentations, using the web-page format and not embedding fonts:
+
+    myPresentation. **SaveAs** FileName:="Z:\Shared\Presentations\HR.pptm", _
+        FileFormat:= **ppSaveAsHTML** , EmbedTrueTypeFonts:=msoFalse
+
+* * *
+
+Using the Object Browser to Quickly See Constants and Objects
+
+Here's a useful reminder. When you don't need code samples or extra details, you don't need to take the time to look through the full online Help system for an object's members or constants (such as the ppSaveAs constants shown in Table 24.1). Instead, just press F2 in the VBA Editor to bring up the Object Browser. Then, in the search field (to the left of the binoculars icon), type the object's name, a member (property or method), or a constant name. For example, you could type **ppSaveAsPresentation**, then click the binoculars icon. You would then see the entire list of 27 ppSaveAs constants.
+
+To see the full list in the online Help system, visit this web page:
+
+
+
+* * *
+
+### Saving a Presentation under Its Existing Name
+
+To save a presentation under its existing name, use the Save method. This method takes no arguments because it has only one possible behavior. For example, the following statement saves the active presentation:
+
+    ActivePresentation.Save
+
+If the presentation on which you use the Save method has never been saved, PowerPoint doesn't prompt the user to specify the filename and location. Instead, PowerPoint saves the presentation using the default name assigned to its window (for example, a presentation whose window is called Presentation11 will be saved as Presentation11.pptm) and in the current folder. To avoid this default name and location, check the Path property of the Presentation object before using the Save method to determine whether the presentation has been saved. If it has not been saved (if Path = ""), use the SaveAs method to specify the folder and title you want to use, as in this example:
+
+    If ActivePresentation.Path = "" Then
+        ActivePresentation.SaveAs FileName:="z:\public\presentations\Corporate.pptm"
+    Else
+        ActivePresentation.Save
+    End If
+
+### Saving a Copy of a Presentation
+
+Instead of using the SaveAs method to save a presentation under a different name, you can use the SaveCopyAs method to save a copy of the open presentation without affecting the open presentation (the presentation remains open, and any unsaved changes remain unsaved).
The syntax and arguments for the SaveCopyAs method are the same as for the SaveAs method:
+
+    _expression_.SaveCopyAs(FileName, FileFormat, EmbedTrueTypeFonts)
+
+For example, the following statement saves a copy of the active presentation under the name Copy 1.pptm in the folder Z:\Public\Presentations, using the same file format as the presentation currently uses:
+
+    ActivePresentation.SaveCopyAs FileName:="Z:\Public\Presentations\Copy 1.pptm"
+
+### Saving All Open Presentations
+
+The Presentations collection doesn't have a Save method, but you can save all open presentations by using a loop such as that shown in the following subroutine. This subroutine leaves unsaved any presentation that doesn't yet have a filename assigned.
+
+    Sub Save_All_Presentations()
+        Dim myPresentation As Presentation
+        For Each myPresentation In Presentations
+            If myPresentation.Path <> "" Then myPresentation.Save
+        Next myPresentation
+    End Sub
+
+## Closing a Presentation
+
+To close a presentation, use the Close method of the appropriate Presentation object. The Close method takes no arguments. For example, the following statement closes the active presentation:
+
+    ActivePresentation.Close
+
+If the presentation you're closing contains unsaved changes, PowerPoint prompts the user to save them. To avoid the user's being prompted, set the Saved property of the Presentation object to True before using the Close method. Here's an example:
+
+    With Presentations("Karelia Industry.pptm")
+        .Saved = True
+        .Close
+    End With
+
+## Exporting a Presentation or Some Slides to Graphics
+
+You can export an entire presentation, a single slide, or a range of slides by using the Export method of the Presentation object, the Slide object, or a SlideRange object. The syntax for the Export method with a Presentation object is as follows:
+
+    _expression_.Export(Path, FilterName, ScaleWidth, ScaleHeight)
+
+The syntax for the Export method with a Slide object or a SlideRange object is almost the same:
+
+    _expression_.Export(FileName, FilterName, ScaleWidth, ScaleHeight)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Presentation object, a Slide object, or a SlideRange object, as appropriate.
+ * Path (for a Presentation object) is a required String argument that specifies the path of the folder in which to save the graphics files of the slides.
+ * FileName (for a Slide object or a SlideRange object) is a required String argument that specifies the filename to use for the exported graphic. Include the path in FileName unless you want PowerPoint to use the current folder.
+ * FilterName is a required String argument that specifies the filter to use. Use the registered filename extension (JPG, TIF, BMP, or PNG) for FilterName.
+ * ScaleWidth is an optional Long argument that you can include to specify the width of the graphic in pixels.
+ * ScaleHeight is an optional Long argument that you can include to specify the height of the graphic in pixels.
+
+For example, the following statement exports all the slides in the active presentation to 800×600 JPG graphics in the Z:\Public\Presentations folder.
PowerPoint names the graphics Slide1, Slide2, and so on:
+
+    ActivePresentation.Export Path:="Z:\Public\Presentations", _
+        FilterName:="JPG", ScaleWidth:=800, ScaleHeight:=600
+
+The following statement exports the sixth slide in the active presentation to the file named Slide6.png in the Z:\Public\Presentations folder, using the PNG format:
+
+    ActivePresentation.Slides(6).Export _
+        FileName:="Z:\Public\Presentations\Slide6.png", FilterName:="PNG"
+
+## Printing a Presentation
+
+To print a presentation, use the PrintOut method of the appropriate Presentation object. The syntax is as follows:
+
+    _expression_.PrintOut(From, To, PrintToFile, Copies, Collate)
+
+Here are the components of this syntax:
+
+ * _expression_ is a required expression that returns a Presentation object.
+ * From and To are optional Integer arguments that specify the first slide and last slide to print. If you omit From, PowerPoint prints from the first slide; if you omit To, PowerPoint prints through the last slide.
+ * PrintToFile is an optional String argument that you can include to make PowerPoint print to the specified file rather than to the printer.
+ * Copies is an optional Integer argument that specifies how many copies of the presentation or slides to print. Omit Copies to use the default value, 1.
+ * Collate is an optional argument that you can set to msoFalse to prevent PowerPoint from collating multiple copies; collating is the default behavior.
+
+For example, the following statement prints all the slides in the active presentation:
+
+    ActivePresentation.PrintOut
+
+The following example prints slides 5 through 12 of the presentation identified by the object variable myPresentation:
+
+    myPresentation.PrintOut From:=5, To:=12
+
+## Applying a Template to a Presentation, to a Slide, or to a Range of Slides
+
+You can apply a design template to a presentation, to a single slide within a presentation, or to a range of slides by using the ApplyTemplate method with the Presentation object, the Slide object, or the SlideRange object. The syntax is as follows:
+
+    _expression_.ApplyTemplate(FileName)
+
+Here, _expression_ is a required expression that returns a Presentation object, a Slide object, or a SlideRange object. FileName is a required String argument that specifies the path and name of the design template.
+
+For example, the following statement applies the design template named Clouds.potm, stored in the C:\Users\_Richard_\AppData\Roaming\Microsoft\Templates\ folder, to the first slide in the active presentation:
+
+    ActivePresentation.Slides(1). **ApplyTemplate** FileName:= _
+        "C:\Users\_Richard_\AppData\Roaming\Microsoft\Templates\Clouds.potm"
+
+As usual, replace my name, _Richard_, with your name.
+
+The following statement applies the design template named Mountain Top.potm stored in the Z:\Public\Template folder to the first slide in the presentation named Success.pptm:
+
+    Presentations("Success.pptm").Slides(1). **ApplyTemplate** FileName:= _
+        "Z:\Public\Template\Mountain Top.potm"
+
+The following example applies the design template named Disaster.potm stored in the Z:\Public\Template folder to a range of slides consisting of the first, fourth, and sixth slides in the active presentation:
+
+    ActivePresentation.Slides.Range(Array(1, 4, 6)).ApplyTemplate _
+        FileName:="Z:\Public\Template\Disaster.potm"
+
+## Working with the Active Presentation
+
+The ActivePresentation property of the Application object returns a Presentation object that represents the active presentation (the presentation in the active window).
The ActivePresentation object can be very useful for procedures that the user starts. + +If no window is open, trying to use the ActivePresentation object returns an error. Unless you're sure that there is an active presentation, it's a good idea to check that a window is open before you access the ActivePresentation object, as in this example: + + If Windows.Count = 0 Then + MsgBox "Please open a presentation before running this macro." + End + End If + +# Working with Windows and Views + +To get the PowerPoint window into the state you want, you'll often need to work with the window and with the view. PowerPoint uses two types of windows: + + * _Document windows_ are windows that contain documents (presentation files) rather than slide shows. VBA considers document windows to be DocumentWindow objects organized into the DocumentWindows collection but represents them with Window objects organized into the Windows collection. (Sounds mad, but you'll see how this works shortly.) + * _Slide-show windows_ are windows that contain open slide shows. VBA uses SlideShowWindow objects and the SlideShowWindows collection to represent slide-show windows. + +The following sections show you how to work with document windows. You'll learn how to work with slide-show windows in "Setting Up and Running a Slide Show" in Chapter 25, "Working with Shapes and Running Slide Shows." + +The Windows collection contains a Window object for every open window in the PowerPoint application. When you open a presentation while working interactively, PowerPoint opens a window so that you can see the presentation. When you open a presentation via VBA, you can set the WithWindow argument of the Add method to msoFalse to prevent PowerPoint from displaying a window for the presentation. In the user interface, you can also open further windows as necessary—for example, by clicking the New Window button in the Window section of the Ribbon's View tab. + +## Working with the Active Window + +PowerPoint uses the ActiveWindow object to represent the window that is active (the window that currently has the _focus_ and is thus the one that accepts mouse clicks or typing). + +Only one window is active at a time. The active window is always the first Window object in the Windows collection—Windows(1). + +If no window is open at all, or all open windows are hidden, there is no active window, and using the ActiveWindow object causes VBA to return an error. To make sure that a window is open, check whether the Count property of the Windows collection is 0. Here's an example: + + If Windows.Count = 0 Then MsgBox "There is no active window.", vbOKOnly + _ + vbExclamation, "No Window Is Open" + +When you're working with presentations using VBA, you may sometimes find that the ActiveWindow object is a handy way to access a presentation, especially for a macro that the user runs after choosing the presentation, slide, or other object that they want to affect. In other cases, you may find that the ActivePresentation object is a more convenient way to access the presentation you need to work with, or you may prefer to access the presentation via the Presentations collection. + +## Opening a New Window on a Presentation + +To open a new window, use the NewWindow method of the appropriate Window object. This method takes no arguments.
For example, the following statement opens a new window showing the contents of the active window: + + ActiveWindow.NewWindow + +## Closing a Window + +To close a window, use the Close method with the appropriate Window object. In PowerPoint, the Close method takes no arguments. + +* * * + +Be Careful When Closing Windows Programmatically + +Recall that _programmatically_ means _by programming_, by executing code (as opposed to by user interaction). If the window you're closing is the last window open for the presentation, PowerPoint simply closes the window without prompting the user to save any unsaved changes. For this reason, be careful when closing windows, or your code can cause the user to lose data. + +* * * + +For example, you might close all windows but one on a presentation: + + Do While ActivePresentation.Windows.Count > 1 + ActivePresentation.Windows(ActivePresentation.Windows.Count).Close + Loop + +Alternatively, you might use the Save method to save a presentation before closing its last window, as in the next example. (More simply, you could use the Close method to close the presentation itself after saving it.) + + Dim myWindow As DocumentWindow + With ActivePresentation + If .Path = "" Then + MsgBox "Please save this presentation.", vbOKOnly + Else + .Save + ' Close each window explicitly; a bare .Close here would + ' close the presentation instead of the window + For Each myWindow In Windows + myWindow.Close + Next myWindow + End If + End With + +## Activating a Window + +To activate a window or one of its panes, use the Activate method of the appropriate Window object. For example, the following statement activates the first window open on the presentation Benefits.pptm: + + Presentations("Benefits.pptm").Windows(1).Activate + +## Arranging and Resizing Windows + +To arrange windows, use the Arrange method with the appropriate Windows collection. The syntax is as follows: + + _expression_.Arrange(ArrangeStyle) + +Here, _expression_ is a required expression that returns a Windows collection. ArrangeStyle is a required argument that specifies how to arrange the windows: ppArrangeCascade (cascade the windows in an overlapping arrangement that lets you see the title bar of each window but the contents of only the front window) or ppArrangeTiled (tile the windows; the default setting). + +You can maximize, minimize, or restore the application window by setting the WindowState property of the Application object to ppWindowMaximized, ppWindowMinimized, or ppWindowNormal. Similarly, within the application window, you can maximize, minimize, or restore a document by setting its WindowState property. + +When a window is in a "normal" state (ppWindowNormal, not maximized or minimized), you can position it by using the Top and Left properties to specify the position of the upper-left corner of the window and size it by setting its Height and Width properties. + +The following example maximizes the application window and cascades the document windows within it: + + Application.WindowState = ppWindowMaximized + Windows.Arrange ArrangeStyle:=ppArrangeCascade + +## Changing the View + +To change the view in a window, set the ViewType property of the appropriate Window object to one of these 12 constants: ppViewHandoutMaster, ppViewMasterThumbnails, ppViewNormal, ppViewNotesMaster, ppViewNotesPage, ppViewOutline, ppViewPrintPreview, ppViewSlide, ppViewSlideMaster, ppViewSlideSorter, ppViewThumbnails, or ppViewTitleMaster.
For example, the following statement switches the active window into Slide Sorter view: + + ActiveWindow.ViewType = ppViewSlideSorter + +To zoom the view, specify a value from 10 to 400 for the Zoom property of the View object for the appropriate window. The value represents the zoom percentage, but you don't include a percent sign. For example, the following statement zooms the active window to 150 percent: + + ActiveWindow.View.Zoom = 150 + +## Working with Panes + +The Pane object represents a pane of the PowerPoint window in Slide view. The Outline pane is represented by index number 1, the Slide pane by index number 2, and the Notes pane by index number 3. You can activate a pane by using the Activate method with the appropriate Pane object. The following example switches the view in the active window to Slide view and activates the Outline pane: + + With ActiveWindow + .ViewType = ppViewSlide + .Panes(1).Activate + End With + +To change the arrangement of the panes in a PowerPoint window in Slide view, use the SplitHorizontal property and the SplitVertical property of the Window object. + +The SplitHorizontal property controls the percentage of the document window's width that the Outline pane occupies, and the SplitVertical property controls the percentage of the document window's height that the Slide pane occupies. The following example sets the Outline pane to 25 percent of the width of the document window (leaving 75 percent to the Slide pane) and the Slide pane to 75 percent of the height of the window (leaving 25 percent to the Notes pane): + + With ActiveWindow + .SplitHorizontal = 25 + .SplitVertical = 75 + End With + +# Working with Slides + +Once you have created or opened the presentation you want to affect, you can access the slides it contains by using the Slides collection, which contains a Slide object for each slide in the presentation. Each slide is identified by its index number, but you can also refer to slides in three other ways: + +**Using object variables** + +Then you can refer to each slide by its object variable name. + +**Using ID numbers** + +See the section titled "Finding a Slide by Its ID Number" later in this chapter. + +**Using the Name property** + +See the section titled "Accessing a Slide by Name" later in this chapter. + +Having a unique name for a slide is especially useful when you add slides to or delete slides from a presentation, because this causes the index numbers of the slides to change. It's much easier to just name the slides than to try to keep track of their shifting index numbers. + +## Adding a Slide to a Presentation + +To add a slide to a presentation, use the Add method with the Slides collection. The syntax is as follows: + + _expression_.Add(Index, Layout) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Slides collection. In many cases, it's easiest to use the Slides collection itself. + * Index is a required Long argument that specifies the index number for positioning the slide in the presentation. For example, the number 2 makes the new slide the second slide in the presentation. + * Layout is a required Long argument that specifies the layout for the new slide. The layout names correspond closely to the names you'll see in the Insert Slide dialog box or the Slide Layout task pane. For example, ppLayoutBlank specifies a blank slide, ppLayoutTitleOnly a title-only slide, and ppLayoutChartAndText a chart-and-text slide.
The following statements declare an object variable named mySlide and assign to it a new title slide added at the beginning of the active presentation: + + Dim mySlide As Slide + Set mySlide = ActivePresentation.Slides.Add(Index:=1, _ + Layout:=ppLayoutTitle) + +* * * + +Understanding the "Mixed" Constants + +If you look at the list of constants for the Layout property, you'll notice one is called ppLayoutMixed. There's no "Mixed" layout in PowerPoint's list of slide layouts, and if you try to apply ppLayoutMixed to a slide, VBA returns an error. This is because ppLayoutMixed is the value VBA returns for the Layout property of a slide range that contains multiple slides with different designs. + +Other properties have similar Mixed values to indicate that the objects use different values. For example, ppTransitionSpeedMixed means that the slides or shapes use different transition speeds. Don't try to set a property to a Mixed value, because doing so always gives an error. + +* * * + +## Inserting Slides from an Existing Presentation + +When creating presentations automatically, it's often useful to insert slides from an existing presentation. To do so, use the InsertFromFile method of the Slides collection. The syntax is as follows: + + _expression_.InsertFromFile(FileName, Index, SlideStart, SlideEnd) + +Here are the components of this syntax: + + * _expression_ is a required expression that returns a Slides collection. Often, you'll want to use the Slides collection itself. + * FileName is a required String argument that specifies the file from which to insert the slides. + * Index is a required Long argument that specifies the slide position in the open presentation at which to insert the slides. + * SlideStart is an optional Long argument that specifies the first slide to insert. If you omit SlideStart, PowerPoint starts at the first slide. + * SlideEnd is an optional Long argument that specifies the last slide to insert. If you omit SlideEnd, PowerPoint goes up to the last slide. + +For example, the following statement inserts slides 2 through 8 from the presentation named Handbook.pptm stored in the folder Z:\Transfer\Presentations, placing the slides starting at the fifth slide in the open presentation Corporate.pptm: + + Presentations("Corporate.pptm").Slides.InsertFromFile _ + FileName:="Z:\Transfer\Presentations\Handbook.pptm", Index:=5, _ + SlideStart:=2, SlideEnd:=8 + +## Finding a Slide by Its ID Number + +When working programmatically with a presentation, it can be difficult to track which slide is which, especially when you add, delete, insert, copy, or move slides—thereby changing the slides' index numbers. + +To help you, PowerPoint assigns a slide ID number to each slide when it's created. The slide ID number doesn't change when you move a slide to a different position in the presentation, unlike the index number, which always reflects the slide's position in the presentation. You can check a slide's ID number by returning the SlideID property of the appropriate Slide object. + +To find a slide by its ID number, use the FindBySlideID method of the Slides collection. The syntax is as follows: + + _expression_.FindBySlideID(SlideID) + +Here, _expression_ is a required expression that returns a Slides collection. SlideID is a required Long argument that specifies the ID number of the slide you want to return.
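+ +As a quick illustration (a minimal sketch using only the SlideID property and the FindBySlideID method just described), you can store a slide's ID number and then retrieve the same slide later, even after its index number has changed: + + Dim myID As Long + ' Remember the first slide's permanent ID number + myID = ActivePresentation.Slides(1).SlideID + ' ...slides may be added, moved, or deleted here... + ActivePresentation.Slides.FindBySlideID(myID).Select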
+ +The following example declares a Long variable named TargetSlide and assigns to it the ID number of a new slide added at the fifth index position in the active presentation, inserts a full presentation at the third index position, and then uses the FindBySlideID method to return the slide identified by TargetSlide and apply a different design template to it. This approach is similar to creating object variables for slides, as described earlier in this chapter. However, here you create Long variables to hold the ID numbers instead of object variables: + + Dim TargetSlide As Long + TargetSlide = ActivePresentation.Slides.Add(Index:=5, _ + Layout:=ppLayoutFourObjects).SlideID + Presentations("Corporate.pptm").Slides.InsertFromFile _ + FileName:="Z:\Transfer\Presentations\Handbook.pptm", Index:=3 + ActivePresentation.Slides.FindBySlideID(TargetSlide).ApplyTemplate _ + FileName:="C:\Program Files\Microsoft Office\Templates\" & _ + "Presentation Designs\Brain Blitz.potm" + +## Changing the Layout of an Existing Slide + +To change the layout of an existing slide, set its Layout property. For example, the following statement changes the layout of the first slide in the active presentation to the clip-art-and-vertical-text layout: + + ActivePresentation.Slides(1).Layout = ppLayoutClipArtAndVerticalText + +When you change the layout of a slide, PowerPoint rearranges its existing contents to make room for any new objects the layout requires. + +## Deleting an Existing Slide + +To delete an existing slide, use the Delete method with the appropriate Slide object. For example, the following statement deletes the first slide in the active presentation: + + ActivePresentation.Slides(1).Delete + +Be aware that PowerPoint doesn't confirm the deletion of a slide via VBA. + +## Copying and Pasting a Slide + +To copy a slide, use the Copy method of the appropriate Slide object. The Copy method takes no arguments. (You can also cut a slide by using the Cut method, which also takes no arguments.) + +To paste a slide, use the Paste method of the Slides collection. The Paste method takes an Index argument that specifies the slide position at which to paste in the slide. + +For example, the following statements copy the first slide in the active presentation and paste it in so that it is the fifth slide: + + ActivePresentation.Slides(1).Copy + ActivePresentation.Slides.Paste Index:=5 + +## Duplicating a Slide + +Instead of copying and pasting, you can directly duplicate a slide by using the Duplicate method of the Slide object. This method takes no arguments and places the duplicate of the slide immediately after the original in the index-number list. For example, the following statement duplicates the fourth slide in the active presentation, placing the copy at the fifth index position: + + ActivePresentation.Slides(4).Duplicate + +## Moving a Slide + +Instead of cutting and pasting a slide, you can move it directly by using the MoveTo method with the appropriate Slide object. Moving a slide has the same ultimate effect as cutting and pasting it but has the advantage of not changing the contents of the Clipboard (which you might need to preserve for the user or for other purposes). The syntax for the MoveTo method is as follows: + + _expression_.MoveTo(ToPos) + +Here, _expression_ is a required expression that returns a Slide object, and ToPos is a required Long argument that specifies the index position to which you want to move the slide.
+ +For example, the following statement moves the third slide in the presentation identified by the object variable myPresentation to the beginning of the presentation: + + myPresentation.Slides(3).MoveTo ToPos:=1 + +## Accessing a Slide by Name + +Instead of accessing a slide by its index number, you can assign a name to it by using the Name property of the Slide object. For example, the following statements assign the name Chairman's Introduction to the fifth slide in the active presentation and then use the Select method of the Slide object to select that slide by name: + + ActivePresentation.Slides(5).Name = "Chairman's Introduction" + ActivePresentation.Slides("Chairman's Introduction").Select + +## Working with a Range of Slides + +To work with a range of slides, use the Range method of the Slides collection to return a SlideRange object that represents the slides. The SlideRange object can represent a single slide, but you're usually better off using it to represent a range of slides. (You can access a single slide more easily by its index number or by a name you assign to it than through a SlideRange object.) + +To return a SlideRange object that encompasses two or more slides, use the Array function with a comma-delimited list of the slides. The list can use either the index numbers or the names of the slides. For example, the following statements declare the SlideRange object variable mySlideRange and assign to it the first five slides in the open presentation named HR.pptm: + + Dim mySlideRange As SlideRange + Set mySlideRange = _ + Presentations("HR.pptm").Slides.Range(Array(1, 2, 3, 4, 5)) + +The following statement assigns to the SlideRange object variable mySlideRange the slides named Intro and Outro in the active presentation: + + Set mySlideRange = ActivePresentation.Slides.Range(Array("Intro", "Outro")) + +## Formatting a Slide + +You can apply a design template to a slide by using the ApplyTemplate method, as discussed in "Applying a Template to a Presentation, to a Slide, or to a Range of Slides," earlier in this chapter. You can also apply a background or a color scheme, as discussed in the following sections. + +### Applying a Background to One or More Slides + +To apply a background to a slide or several slides, use the Background property of the appropriate Slide object or SlideRange object to return the ShapeRange object representing the background of the slide or slides. You can then use the Fill object to set a color, fill, gradient, or picture in the background. + +The following example applies the picture Winter.jpg from the folder C:\Sample Pictures to the fourth slide in the presentation named Corporate.pptm. The example sets the FollowMasterBackground property to msoFalse, making the slide use a different background than the slide master, and also sets the DisplayMasterShapes property to msoFalse, making the slide not display the shapes on the slide master: + + With Presentations("Corporate.pptm").Slides(4) + .FollowMasterBackground = msoFalse + .DisplayMasterShapes = msoFalse + With .Background + .Fill.ForeColor.RGB = RGB(255, 255, 255) + .Fill.BackColor.SchemeColor = ppAccent1 + .Fill.UserPicture "C:\Sample Pictures\Winter.jpg" + End With + End With + +### Applying a Color Scheme to a Slide + +A color scheme is a group of eight colors that are used to create the look of the title, background, and other elements of a slide, handout, or notes page. VBA uses an RGBColor object to represent each color, and a ColorScheme object to represent each color scheme.
The ColorScheme objects are gathered in a ColorSchemes collection for the entire presentation. + +To change the color scheme of a slide or several slides, use the ColorScheme property of the appropriate Slide object or SlideRange object to return the ColorScheme object, and then work with the Colors method to specify the color. The syntax is as follows: + + _expression_.Colors(SchemeColor) + +Here, _expression_ is a required expression that returns a ColorScheme object. SchemeColor is a required argument that specifies which color in the color scheme to set—for example, ppAccent1 (for the first accent in the color scheme), ppBackground (for the background color), or ppTitle (for the title color). + +The following statement sets the background color of the color scheme for the first three slides in the active presentation to black, which is RGB(0, 0, 0): + + ActivePresentation.Slides.Range(Array(1, 2, 3)) _ + .ColorScheme.Colors(ppBackground).RGB = RGB(0, 0, 0) + +## Setting a Transition for a Slide, a Range of Slides, or a Master + +To set a transition for a slide, a range of slides, or a master, use the SlideShowTransition property of the Slide object, the SlideRange object, or the Master object to return the SlideShowTransition object. + +To specify the speed at which the transition runs, set its Speed property to ppTransitionSpeedFast, ppTransitionSpeedMedium, or ppTransitionSpeedSlow. + +* * * + +Creating Effective Transitions between Slides + +Using transitions between slides can make a presentation look smooth and professional or awkward and amateurish. + +To specify the effect to use, set the EntryEffect property to the constant for the effect. There are too many constants to list here, but their names are generally descriptive enough to be easy to decipher. For example, the ppEffectBlindsHorizontal constant generates a transition that resembles an adjustment of window blinds, the ppEffectDissolve constant causes a rather crude kind of melting effect, and the ppEffectNone constant represents the No Transition setting. + +You should avoid all of these, and most of the other available transitions, unless you want to go back several decades to early TV transition effects. Contemporary television and movies employ smooth, subtle, and unobtrusive transitions between scenes. And slides are simple scenes. So, you should generally stay away from trick transitions like window blinds or crude transitions like the ppEffectDissolve, which is highly pixelated. + +To figure out which of the transition effects are sophisticated and discreet, try them out. You can preview all the transitions by clicking the Transitions tab on the PowerPoint Ribbon. Then click any slide transition you want to see. + +The default transition is quite a good dissolve, but if you want to try another classy transition, experiment with ppEffectFade. It's similar to the default. Also try experimenting with the Animations tab on the PowerPoint Ribbon, which governs how you animate the various shape objects on a slide. + +And also try experimenting with the transition speed to suit the animation to the subject of your presentation. + +* * * + +To control how the slide advances, set the AdvanceOnTime property to msoTrue (for automatic advancing) or msoFalse (for manual advancing). If you use automatic advancing, use the AdvanceTime property to specify the number of seconds. If you want the slide to advance when the user clicks, set the AdvanceOnClick property to msoTrue. 
(You can set both AdvanceOnTime and AdvanceOnClick to msoTrue. The slide advances manually if the user clicks before the AdvanceTime interval has elapsed.) + +To play a preset sound effect with the transition, use the SoundEffect property of the SlideShowTransition object to return the SoundEffect object, use the Name property to specify the name of the sound effect, and then use the Play method to play the sound effect. You can also play any compatible sound file by using the ImportFromFile method of the SoundEffect object and using the FullName argument to specify the path and filename of the sound file. + +PowerPoint 2013 can play any of the following audio-file types: .aiff, .au, .mid, .midi, .mp3, .m4a, .mp4, .wav, or .wma. But be aware that even if a file has one of these filename extensions, it still might not be playable if the proper codec isn't available. That's why everyone urges you to never give a _naked presentation_. Always first do a test run of any presentation on the equipment you'll be using for the official presentation when people are there closely watching you. + +If you want the sound to loop until the next sound, set the LoopSoundUntilNext property of the SlideShowTransition object to msoTrue. The default value is msoFalse. + +The following example sets up a transition for the second slide in the active presentation. The transition uses the Fade effect running at medium speed, sets advancing to either on click or after a delay of 30 seconds, and plays a sound file from an external source without looping: + + With ActivePresentation.Slides(2) + With .SlideShowTransition + .EntryEffect = ppEffectFade + .Speed = ppTransitionSpeedMedium + .AdvanceOnClick = msoTrue + .AdvanceOnTime = msoTrue + .AdvanceTime = 30 + .SoundEffect.ImportFromFile _ + FullName:="d:\Sounds\Crescendo.wav" + .LoopSoundUntilNext = msoFalse + End With + End With + +# Working with Masters + +VBA uses the Master object to represent the various masters that PowerPoint uses: the slide master, title master, handout master, and notes master. + +## Working with the Slide Master + +To work with the slide master for a presentation, use the SlideMaster property of the Presentation object. + +To return the slide master for a slide, use the Master property of the appropriate Slide object. For example, the following statement adds a title to the slide master for the active presentation (if the slide master already has a title, VBA returns an error): + + ActivePresentation.SlideMaster.Shapes.AddTitle.TextFrame.TextRange.Text = _ + "Orientation" + +## Working with the Title Master + +To find out whether a presentation has a title master, check the HasTitleMaster property. If it doesn't, you can use the AddTitleMaster method of the Presentation object to add a title master, as in the following example. If the presentation already has a title master, VBA returns an error when you try to add a title master: + + If Not ActivePresentation.HasTitleMaster Then _ + ActivePresentation.AddTitleMaster + +To return the title master for the presentation, use the TitleMaster property of the Presentation object.
The following example checks that the title master exists and, if it does, formats the date and time to be visible and to use the dMMMyy format with automatic updating: + + With myPresentation + If .HasTitleMaster Then + With .TitleMaster.HeadersFooters.DateAndTime + .Visible = msoTrue + .Format = ppDateTimedMMMyy + .UseFormat = msoTrue + End With + End If + End With + +## Working with the Handout Master + +To work with the handout master, use the HandoutMaster property of the Presentation object to return the Master object. The following example uses the HandoutMaster property of the ActivePresentation object to fill the background of the handout master with a picture: + + With ActivePresentation.HandoutMaster.Background + .Fill.ForeColor.RGB = RGB(255, 255, 255) + .Fill.BackColor.SchemeColor = ppAccent1 + .Fill.UserPicture "d:\igrafx\dawn.jpg" + End With + +## Working with the Notes Master + +To work with the notes master, use the NotesMaster property of the Presentation object to return the Master object. For example, the following statement clears the HeaderFooter objects in the notes master in the first open presentation: + + Presentations(1).NotesMaster.HeadersFooters.Clear + +## Deleting a Master + +You can delete the title master or handout master, but not the slide master or notes master. To delete the title master or handout master, use the Delete method of the Master object. The following example checks that the active presentation has a title master and then deletes it: + + If ActivePresentation.HasTitleMaster Then _ + ActivePresentation.TitleMaster.Delete + +# The Bottom Line + +**Understand PowerPoint's creatable objects.** + +Creatable objects are commonly used objects that can be employed in VBA code without requiring that you qualify them with the Application object. You can leave that word out of your code; it's optional, and rarely used. + +Master It + +Name one of the objects or collections that are creatable in PowerPoint procedures. + +**Work with presentations.** + +You can create a new presentation programmatically, but PowerPoint generates an annoying flicker on most systems while it brings the new presentation into view. You can block this unpleasant, strobelike effect to avoid disturbing your audience. + +Master It + +How do you prevent a newly created presentation from being visible so that you can create and manipulate it in your code without the user seeing the flickering effect onscreen? + +**Work with windows and views.** + +To get the PowerPoint window into the state you want, you'll often need to work with the window and with the view. + +Master It + +PowerPoint uses two types of windows. What are they? + +**Work with slides.** + +Once you have created or opened the presentation you want to manipulate, you can access the slides it contains by using the Slides collection. This collection contains a Slide object for each slide in the presentation. Each slide is identified by its index number, but you can also use object variables to refer to slides or assign names to slides. + +Master It + +Why would you want to assign names to slides rather than using the default index numbers that are automatically assigned to the slides? + +**Work with masters.** + +Before attempting to manipulate a master in your code, you should determine whether the master actually exists in the presentation. + +Master It + +How do you find out whether a presentation has a title master?
+Chapter 25 + +Working with Shapes and Running Slide Shows + +In the previous chapter you learned to work with Presentation objects, Slide objects, and Master objects. In this chapter you'll learn to work with Shape objects to manipulate the contents of slides and with HeaderFooter objects to control the contents of headers and footers. You'll also see how to set up and run a slide show using VBA. + +In this chapter you will learn to do the following: + + * Work with shapes + * Work with headers and footers + * Set up and run a slide show + +# Working with Shapes + +Most of the objects on a typical PowerPoint slide are Shape objects. For example, a title box is a Shape object, as is a picture or a Word table that you've pasted in. You access the Shape objects through the Shapes collection of a Slide object, a SlideRange object, or a Master object. + +## Adding Shapes to Slides + +Different methods of the Shapes collection add the different types of shapes. Table 25.1 lists the Shape objects you can add and the methods and arguments for adding them. The following sections explain the arguments, and a short sketch after the shared-arguments list shows several of them in use. You can find additional details about the Shapes object here: + + + +### Shared Arguments for Adding Shapes + +These are arguments that are shared among various shape-adding methods: + + * BeginX and EndX are required arguments (of the Single data type) that specify the horizontal starting position and ending position of the connector or line, measured in points from the left edge of the slide. + * BeginY and EndY are required Single data arguments that specify the vertical starting point and ending point of the connector or line, measured in points from the top of the slide. + * FileName is a required String argument used to specify the file to be used for creating the object (for example, the media file for creating a media object). + * Left is a required Single argument that specifies the position of the left edge of the shape from the left edge of the slide, measured in points. Top is a required Single argument that specifies the position of the top edge of the shape from the top edge of the slide, measured in points. + * Height is a required Single argument that specifies the height of the shape, measured in points. Width is a required Single argument that specifies the width of the shape, measured in points. + * LinkToFile is an optional argument that you can set to msoTrue to link the picture to its source file. + * NumColumns and NumRows are required Long arguments that specify the number of columns and rows in the table you're adding. + * Orientation is a required argument that specifies the orientation: msoTextOrientationHorizontal (horizontal) or msoTextOrientationVerticalFarEast (vertical). + * SafeArrayOfPoints is a required Variant argument that supplies an array of coordinate pairs that give the vertices and control points of a curve or polyline. The line begins at the first pair of coordinates and ends at the last pair. + * SaveWithDocument is a required argument that controls whether PowerPoint saves the linked picture in the presentation (msoTrue) or not (msoFalse). If you set LinkToFile:=msoFalse, you must set SaveWithDocument:=msoTrue.
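+ +To make these shared arguments concrete, here is a minimal sketch (the picture path is just an example, so point it at any image on your system) that uses the AddPicture method with the Left, Top, Width, Height, LinkToFile, and SaveWithDocument arguments: + + ' Embed a picture 100 points from the left edge and 50 points from the + ' top of the slide, 300 points wide by 200 points high + ActivePresentation.Slides(1).Shapes.AddPicture _ + FileName:="C:\Sample Pictures\Winter.jpg", LinkToFile:=msoFalse, _ + SaveWithDocument:=msoTrue, Left:=100, Top:=50, Width:=300, Height:=200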
+ +Table 25.1 Shapes and the methods for adding them to slides + +**To Add This Shape** | **Use This Method and These Arguments** +---|--- +Callout | AddCallout(Type, Left, Top, Width, Height) +Chart | AddChart(Type, Left, Top, Width, Height) +Chart2 | AddChart2(Style, Type, Left, Top, Width, Height, NewLayout) +Comment | AddComment(Left, Top, Width, Height) +Connector | AddConnector(Type, BeginX, BeginY, EndX, EndY) +Curve | AddCurve(SafeArrayOfPoints) +Label | AddLabel(Orientation, Left, Top, Width, Height) +Line | AddLine(BeginX, BeginY, EndX, EndY) +Media object | AddMediaObject(FileName, Left, Top, Width, Height) +Media object 2 | AddMediaObject2(FileName, LinkToFile, Left, Top, Width, Height) +Media object from embed tag | AddMediaObjectFromEmbedTag(EmbedTag, Left, Top, Width, Height) +OLE object | AddOLEObject(Left, Top, Width, Height, ClassName, FileName, DisplayAsIcon, IconFileName, IconIndex, IconLabel, Link) +Picture | AddPicture(FileName, LinkToFile, SaveWithDocument, Left, Top, Width, Height) +Placeholder | AddPlaceholder(Type, Left, Top, Width, Height) +Polyline | AddPolyline(SafeArrayOfPoints) +Shape | AddShape(Type, Left, Top, Width, Height) +Smart Art | AddSmartArt(Layout, Left, Top, Width, Height) +Table | AddTable(NumRows, NumColumns, Left, Top, Width, Height) +Textbox | AddTextbox(Orientation, Left, Top, Width, Height) +Text Effect | AddTextEffect(PresetTextEffect, Text, FontName, FontSize, FontBold, FontItalic, Left, Top) +Title | AddTitle + +### _Type_ Argument for Adding Shapes + +The Type argument is different for the various methods that use it. Here are some examples: + + * Type for the AddPlaceholder method is a required argument that specifies the type of placeholder to add. The names are self-explanatory: ppPlaceholderBitmap, ppPlaceholderBody, ppPlaceholderCenterTitle, ppPlaceholderChart, ppPlaceholderDate, ppPlaceholderFooter, ppPlaceholderHeader, ppPlaceholderMediaClip, ppPlaceholderMixed, ppPlaceholderObject, ppPlaceholderOrgChart, ppPlaceholderPicture, ppPlaceholderSlideNumber, ppPlaceholderSubtitle, ppPlaceholderTable, ppPlaceholderTitle, ppPlaceholderVerticalBody, ppPlaceholderVerticalObject, ppPlaceholderVerticalTitle + +* * * + +Limitations on Placeholders + +You can use the ppPlaceholderVerticalBody and ppPlaceholderVerticalTitle placeholders only on slides that use vertical text—the slide layouts ppLayoutVerticalText, ppLayoutClipArtAndVerticalText, ppLayoutVerticalTitleAndText, and ppLayoutVerticalTitleAndTextOverChart. + +* * * + + * Type for the AddCallout method is a required argument that specifies the type of callout line to add: msoCalloutOne (a one-segment line that can be vertical or horizontal), msoCalloutTwo (a one-segment line that rotates freely), msoCalloutThree (a two-segment line), or msoCalloutFour (a three-segment line). + * Type for the AddShape method is a required argument that specifies the type of AutoShape to add. There are too many constants to list here, but most are easy to identify from their names. For example, msoShapeHeart is a heart shape, msoShapeLightningBolt gives a lightning bolt, and so on. To see a list of the constants, search for the AddShape method in the VBA Editor Help file, and then click the link for the msoAutoShapeType entry. Or type **msoautoshapetype** in the editor's Object Browser search field.
+ * Type for the AddDiagram method is a required argument that specifies the diagram type: msoDiagramCycle (a cycle diagram), msoDiagramOrgChart (an org chart), msoDiagramPyramid (a pyramid diagram), msoDiagramRadial (a radial diagram), msoDiagramTarget (a target diagram), or msoDiagramVenn (a Venn diagram). + +* * * + +**What Is MSO? Practical Advice for the Perplexed Programmer** + +You may have noticed that many of the enumerations and constants you're running into in PowerPoint are prepended (the opposite of _appended_ ) by _mso_. This strange little acronym can stand for several things: Martha Stewart's stock market name (Martha Stewart Omnimedia), or Milwaukee Symphony Orchestra, or Microsoft Office. In this case, it stands for Microsoft Office. And why it is prepended to PowerPoint enumerations and not to other Office 2013 enumerations is just one of those mysteries that keep all us programmers on our toes. After all, even those of us who are semiconscious are likely aware that we're using VBA in Microsoft Office. + +Here's another example of a mystery. Throughout the decades of BASIC programming history, and in all other versions of BASIC and VBA, you use the words True and False to mean true and false. That makes a certain kind of sense when you think about it. However, in PowerPoint 2013, you can also use the built-in constants msoTrue and msoFalse to, for example, set the Visible property of a footer on a slide. Luckily, these constants are optional. You can still use the traditional True and False. There is no difference between Microsoft Office's truth and truth in general. (I'm speaking here strictly in the context of these constants.) + +_Mso_ also appears in the MsoTriState variable type—that bizarro uber-Boolean type that you ran into in the previous chapter. You remember it: it's like the famous quantum mechanical tri-state cat, which can be alive, dead, or a mixture of the two. + +In my opinion, you should not worry much over these weird usages such as msoTriStateMixed; tri-state entities have no precedent outside electronic chip diagrams and advanced physics. Think of them as Boolean (true or false). And remember that although you can _read_ the third (the mixed true and false) status of a tri-state type, you can't _set_ (assign) anything other than the traditional True or False values to it. + +* * * + +### Arguments Specific to the _AddTextEffect_ Method + +The following arguments apply only to the AddTextEffect method: + + * PresetTextEffect is a required argument that specifies the preset text effect to use. These preset text effects are identified by the constants msoTextEffect1 through msoTextEffect30, which correspond to the order in which the samples appear in the WordArt Gallery dialog box (1 through 6 are the first row, 7 through 12 the second row, and so on). + * Text is a required String argument that specifies the text to use in the WordArt object. + * FontBold is a required argument that you set to msoTrue to make the font bold or msoFalse to make it not bold. + * FontItalic is a required argument that you set to msoTrue to make the font italic and msoFalse to make it not italic. + * FontName is a required String argument that specifies the name of the font to use. + * FontSize is a required Single argument that specifies the font size to use. 
+ +### Arguments Specific to the _AddOLEObject_ Method + +The following arguments apply only to the AddOLEObject method (a short sketch later in this section shows several of them in use): + + * ClassName is an optional String argument that specifies the program ID (the ProgID) or OLE long class name for the object. You must use either ClassName or FileName, but not both. In most cases, it's easiest to use FileName. + * DisplayAsIcon is an optional argument that you can set to msoTrue to display the OLE object as an icon rather than as itself (the default). + * IconFileName is an optional String argument that you can use with DisplayAsIcon:=True to specify the filename of the icon you want to display for the object. + * IconIndex is an optional Integer argument that specifies the index of the icon to use within the icon file specified by IconFileName. If you omit the IconIndex argument, VBA uses the first icon in the icon file, the icon at position 0. + * IconLabel is an optional String argument that you can use to specify the caption (or label) to display under the icon. + * Link is an optional argument that you can set to msoTrue to link the OLE object to its source file when you use the FileName argument. Link must be msoFalse when you use ClassName to specify a class name. + +### An Example of Using the _AddShape_ Method + +The following statement uses the AddShape method to add a bent up-arrow to the upper-right corner of the last slide in the active presentation. Before executing this example, click the File tab on PowerPoint's Ribbon, then click the New option in the left pane to see some of the available templates and themes. Double-click one of the templates so you'll have some slides to work with in this example. (In versions of PowerPoint prior to 2013, you'll have to open the Sample Templates folder before choosing a template.) + +Open the Visual Basic Editor by pressing Alt+F11. Locate the project in the Project window, right-click on its name (it will be boldface), and choose Insert ⇒ Module. Type the following into the new module, and then press F5 with your blinking cursor inside this subroutine to execute the code and see the effect: + + Sub test() + + ActivePresentation.Slides(ActivePresentation.Slides.Count) _ + .Shapes.AddShape Type:=msoShapeBentUpArrow, Left:=575, Top:=10, _ + Width:=150, Height:=75 + + End Sub + +To see what happened, look at the last slide and notice that a shape has been added to it—a bent up-arrow. + +### An Example of Using the _AddTextEffect_ Method + +The following example uses the AddTextEffect method to superimpose a WordArt item onto the third slide. Ensure that you have at least three slides by pressing Ctrl+M a few times to add some new slides. + +This code draws the text _Questions_ & _Answers_ (on three lines) on the slide. This WordArt item is instructed in our code to use 54-point bold Garamond: + + ActivePresentation.Slides(3).Shapes.AddTextEffect _ + PresetTextEffect:=msoTextEffect14, _ + Text:="Questions" + Chr$(CharCode:=13) + _ + "&" + Chr$(CharCode:=13) + "Answers", _ + FontName:="Garamond", FontSize:=54, FontBold:=msoTrue, _ + FontItalic:=msoFalse, Left:=230, Top:=125 + +There are 30 msoTextEffect constants you can experiment with. They range from msoTextEffect1 to msoTextEffect30. msoTextEffect14 is nice; it provides a kind of metallic effect.
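+ +### An Example of Using the _AddOLEObject_ Method + +The AddOLEObject method, whose arguments were described earlier in this chapter, follows the same pattern. The following is a minimal sketch (the workbook path is hypothetical, so substitute a file that exists on your system); it embeds an Excel workbook on the first slide, displaying it as an icon with a custom label: + + ' Embed a workbook as an icon; Link:=msoFalse embeds it rather than + ' linking it to the source file + ActivePresentation.Slides(1).Shapes.AddOLEObject _ + Left:=100, Top:=100, Width:=300, Height:=200, _ + FileName:="C:\Data\Budget.xlsx", DisplayAsIcon:=msoTrue, _ + IconLabel:="Budget Workbook", Link:=msoFalse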
+ +### An Example of Using the _AddTextbox_ Method + +The following example adds a text box to the second slide in the active presentation and assigns text to it: + + Dim myTextBox As Shape + + With ActivePresentation.Slides(2) + Set myTextBox = .Shapes.AddTextbox _ + (Orientation:=msoTextOrientationHorizontal, Left:=100, Top:=50, _ + Width:=400, Height:=100) + myTextBox.TextFrame.TextRange.Text = "Corrective Lenses" + End With + +## Deleting a Shape + +To delete a shape, use the Delete method with the appropriate Shape object. For example, the following statement deletes the first Shape object on the second slide in the active presentation: + + ActivePresentation.Slides(2).Shapes(1).Delete + +## Selecting All Shapes + +To select all the shapes on a slide, use the SelectAll method of the appropriate Shapes collection. For example, the following statement selects all the Shape objects on the first slide in the active presentation: + + ActivePresentation.Slides(1).Shapes.SelectAll + +## Repositioning and Resizing a Shape + +To reposition a shape, set its Left property (to specify the distance in points from the left edge of the slide to the left edge of the shape) and its Top property (to specify the distance in points from the top edge of the slide to the top edge of the shape). + +To change the size of a shape, set its Width and Height properties to the appropriate number of points. + +For example, the following statements position the first shape on the first slide in the active presentation 200 points from the left side of the slide and 100 points from its top and make the shape 300 points wide by 200 points high: + + With ActivePresentation.Slides(1).Shapes(1) + .Left = 200 + .Top = 100 + .Width = 300 + .Height = 200 + End With + +You can also move a shape relative to its current location by using the IncrementLeft method and the IncrementTop method. Rotate it by using the IncrementRotation method. Note that these methods don't specify absolute locations within a slide; instead, they work relative to the shape's current position or rotation. Each of these methods takes an Increment argument: + + * For the IncrementLeft and IncrementTop methods, the Increment argument specifies the number of points to move the shape. A negative number moves the shape to the left or upward, while a positive number moves the shape to the right or downward. + * For the IncrementRotation method, the Increment argument specifies the number of degrees to rotate the shape. A positive number rotates the shape clockwise; a negative number rotates the shape counterclockwise. + +The following example works with the first shape on the third slide of the active presentation, moving it 100 points to the left and 200 points down and rotating it 90 degrees counterclockwise: + + With ActivePresentation.Slides(3).Shapes(1) + .IncrementLeft Increment:=-100 + .IncrementTop Increment:=200 + .IncrementRotation Increment:=-90 + End With + +## Copying Formatting from One Shape to Another + +Often, it's useful to be able to apply the same formatting to multiple shapes. When one shape has the formatting you want, you can use the PickUp method of the Shape object to copy the formatting from that shape and then use the Apply method to apply that formatting to another shape. + +Neither the PickUp method nor the Apply method uses any arguments.
The following example copies the formatting from the first shape on the second slide in the active presentation and applies it to the third shape on the fourth slide: + + With ActivePresentation + .Slides(2).Shapes(1).PickUp + .Slides(4).Shapes(3).Apply + End With + +## Working with Text in a Shape + +The text within a shape is contained in a TextRange object, which itself is contained in a TextFrame object. To work with the text in a shape, you use the TextFrame property of the Shape object to return the TextFrame object and then use the TextRange property of the TextFrame object to return the TextRange object. Got it? + +Within the TextRange object, the Text property contains the text, the Font object contains the font formatting, the ParagraphFormat object contains the paragraph formatting, and the ActionSettings collection contains the action settings for the text range. + +### Finding Out Whether a Shape Has a Text Frame + +Not every shape has a text frame, so prior to manipulating text it's a good idea to first determine whether the shape you're dealing with in fact even _has_ a text frame. + +To do so, check that the HasTextFrame property of the Shape object is msoTrue, as in this example: + + If ActivePresentation.Slides(1).Shapes(1).HasTextFrame = msoTrue Then + MsgBox "The shape contains a text frame." + End If + +You may also need to check whether the text frame contains text. To do so, check that the HasText property of the TextFrame object is msoTrue. Here's an example: + + With ActivePresentation.Slides(1).Shapes(1).TextFrame + If .HasText = msoTrue Then MsgBox .TextRange.Text + End With + +### Returning and Setting the Text in a Text Range + +To return (read) or set (specify) the text in a text range, you can simply use the Text property of the TextRange object. For example, the following statement sets the text in the first shape on the fourth slide in the presentation identified by the object variable myPresentation to Strategic Planning Meeting: + + Sub Test() + + Dim myPresentation As Presentation + Set myPresentation = Presentations(1) + + myPresentation.Slides(4).Shapes(1).TextFrame.TextRange.Text _ + = "Strategic Planning Meeting" + + End Sub + +You can also return parts of the text by using the Paragraphs method, the Sentences method, the Lines method, the Words method, the Characters method, or the Runs method. The syntax for these methods is shown here, using the Paragraphs method as the example: + + _expression_.Paragraphs(Start, Length) + +The components of the syntax are as follows: + + * _expression_ is a required expression that returns a TextRange object. + * Start is an optional Long argument that specifies the first item (paragraph, sentence, line, word, character, or text run) to return. + * Length is an optional Long argument that specifies how many items to return—for example, two paragraphs, three sentences, or four words. + +* * * + +Understanding Text Runs + +A _text run_ is a sequence of characters that have the same font formatting. Text runs can be useful for picking out parts of text ranges that are formatted in a particular way.
+ +* * * + +The following code example returns the second through fifth words (the four words starting with the second word) from the first shape on the first slide in the active presentation: + + MsgBox ActivePresentation.Slides(1).Shapes(1).TextFrame _ + .TextRange.Words(Start:=2, Length:=4) + +The next code example sets the text of the second paragraph in the second shape on the sixth slide in the presentation identified by the object variable myPresentation to VP of Business Development: + + myPresentation.Slides(6).Shapes(2).TextFrame.TextRange _ + .Paragraphs(Start:=2, Length:=1).Text = "VP of Business Development" + +### Formatting the Text in a Text Range + +To format the text in a text range, use the ParagraphFormat object to control the paragraph formatting (including the alignment and the space before and after) and the Font object to control the font formatting. + +These are the most useful properties of the ParagraphFormat object: + + * The Alignment property controls the alignment. Use ppAlignLeft for left alignment, ppAlignCenter for centering, ppAlignJustify for justified alignment, ppAlignDistribute for distributed alignment (justified using all available space), or ppAlignRight for right alignment. + * The Bullet property returns the BulletFormat object, which represents the bullet formatting. See the next section for details. + * The LineRuleBefore property, the LineRuleAfter property, and the LineRuleWithin property determine whether the measurements set by the SpaceBefore property, the SpaceAfter property, and the SpaceWithin property use lines (msoTrue) or points (msoFalse). + * The SpaceBefore property and the SpaceAfter property control the amount of space before and after each paragraph. The SpaceWithin property controls the amount of space between baselines in a paragraph. All measurements are in points. + +The following example sets left alignment, 18 points of spacing before and after paragraphs, and 12 points of spacing between lines for the second shape on the slide identified by the object variable mySlide: + + Dim mySlide As Slide + Set mySlide = Presentations(1).Slides(2) + + With mySlide.Shapes(2).TextFrame.TextRange.ParagraphFormat + .Alignment = ppAlignLeft + .LineRuleAfter = msoFalse + .SpaceAfter = 18 + .LineRuleBefore = msoFalse + .SpaceBefore = 18 + .LineRuleWithin = msoFalse + .SpaceWithin = 12 + End With + +### Formatting the Bullets for a Text Range + +Bullets and numbers are vital to the lists used in many PowerPoint slides. To control whether and how bullets and numbers appear, use the Bullet property of the TextRange object to return the BulletFormat object, and then work with the BulletFormat object's properties and methods. + +To make bullets and numbers visible, set the Visible property of the BulletFormat object to msoTrue; to hide bullets and numbers, set Visible to msoFalse. + +To specify which type of bullet or numbering to use, set the Type property of the BulletFormat object to ppBulletUnnumbered (for a bullet), ppBulletNumbered (for numbers), ppBulletPicture (for a picture), or ppBulletNone (for no bullet). + +* * * + +Another Mixed Data Type + +The Type property of the BulletFormat object returns the value ppBulletMixed when the selection includes multiple types of bullets. You can't set Type to ppBulletMixed. You can only read it. + +* * * + +To specify the bullet character, use the Character property and the character number.
You can find out the character number from the Symbol dialog box or the Character Map applet, which you can run by pressing the Windows key to get to the Modern view in Windows 8 and then typing **Character Map**. + +Getting to this applet in previous versions of Windows is somewhat clumsier (yay! Windows 8 demonstrates that it is an improvement in some ways). For Windows 7 and earlier, click Start ⇒ All Programs ⇒ Accessories ⇒ System Tools ⇒ Character Map. + +Unfortunately, the character codes are given in the _hexadecimal_ numbering system. If you look up the check-box symbol for the Wingdings font that's used in the following code example, the Character Map utility doesn't say 254 in our human decimal numbering system. Instead, it says Character Code: 0xFE (the hex way of expressing 254). + +This tedious holdover from the early days of computing serves no particular purpose in character codes, but you have to deal with it. Why? Because some people think that pointless complexity is cute, or it helps make programming seem somehow more mysterious than it in fact is. This type of thing can also help with job security because managers will usually be dazzled by what they assume are complicated programming mysteries like hex. Hex (short for hexadecimal) is based on 16 digits: 0 1 2 3 4 5 6 7 8 9 A B C D E F. People with eight fingers on each hand have an advantage here. + +To solve the hex-character-code problem, you can either use a calculator that can translate between hex and decimal or just prepend the characters &H in front of the hex code and let VBA translate it for you when executing your procedure. + +For example, in the following code example, I used 254 (a decimal number) because I can translate hex (well, my HP programming calculator can). But if you can't, or more likely don't want to be bothered, just click the character you want to use in the Character Map dialog box and then look at its hex code in the lower left of the dialog box. In this example, it's listed as 0xFE (which means, you guessed it, decimal 254). Since the Wingdings font has only 256 characters, ignore the _0x_ part and use the _FE_, like this, in your code: + + .Character = &HFE + +Use the Font property to specify the font name, size, and color. The following example sets the bullet for the first shape on the slide identified by the object variable mySlide to Wingdings character 254, a check box, using the color white, which is RGB(255, 255, 255), and 44-point size: + + With mySlide.Shapes(1).TextFrame.TextRange.ParagraphFormat.Bullet + .Type = ppBulletUnnumbered + .Character = 254 + With .Font + .Name = "Wingdings" + .Size = 44 + .Color = RGB(255, 255, 255) + End With + End With + +Color is of course an important element in any design. You can easily find out which RGB values you need to employ for various colors by visiting this web page: + + + +To use your own custom picture as a bullet, set the Type property of the BulletFormat object to ppBulletPicture and then use the Picture method with the Picture argument, a required String argument that specifies the path and filename of the file to use as the bullet. You can use most common types of graphics files, including .bmp, .eps, .gif, .jpg, .jpeg, .pcx, .png, .tiff, and .wmf files.
The following example uses the file Face1.jpg stored in the folder Z:\Public\Pictures as the bullet for the first shape on the slide identified by the object variable mySlide:

    With mySlide.Shapes(1).TextFrame.TextRange.ParagraphFormat.Bullet
        .Type = ppBulletPicture
        .Picture Picture:="Z:\Public\Pictures\Face1.jpg"
    End With

## Animating a Shape or a Range of Shapes

To animate a shape or a range of shapes, use the AnimationSettings property of the Shape object or the ShapeRange object to return the AnimationSettings object.

To specify the animation effect to use, set the EntryEffect property to the constant for the effect. Let's see how to figure out which animation effect looks best for the shape you're working with. First, click a shape in a slide to select the shape. Now display the Add Animation pane.

There are too many animation constants to list here, but they are easy to understand from the names listed in the Add Animation pane. To open this pane, click the Animations tab in PowerPoint's Ribbon, then click the Add Animation icon in the Advanced Animation section.

A pane drops down in PowerPoint's window, as shown in Figure 25.1.

Figure 25.1 Here's a selection of animation effects available for use in PowerPoint.

As usual with animations, less is more. Choose subtle effects unless you're presenting to an audience of louts who will appreciate vulgarity.

To write code that creates an animation, set the Animate property to msoTrue. (To turn off an animation, set Animate to msoFalse.)

To control how the text in a shape is animated, set the TextLevelEffect property to ppAnimateLevelNone (no animation), ppAnimateByFirstLevel, ppAnimateBySecondLevel, ppAnimateByThirdLevel, ppAnimateByFourthLevel, ppAnimateByFifthLevel, or ppAnimateByAllLevels.

If you set TextLevelEffect to any value other than ppAnimateByAllLevels or ppAnimateLevelNone, you can use the TextUnitEffect property to specify how to animate the text. Use ppAnimateByParagraph to animate by paragraph, ppAnimateByWord to animate by word, or ppAnimateByCharacter to animate by character.

To reverse the order of the animation, set the AnimateTextInReverse property to msoTrue. (The default is msoFalse.)

To control how the animation advances, set the AdvanceMode property to ppAdvanceOnTime (for automatic advancing using a timing) or ppAdvanceOnClick (for manual advancing). If you use automatic advancing, use the AdvanceTime property to specify the number of seconds to wait before advancing.

To play a built-in sound effect with the transition, use the SoundEffect property of the AnimationSettings object to return the SoundEffect object, use the Name property to specify the name of the sound effect, and then use the Play method to play the sound effect. You can also play your own sound file by using the ImportFromFile method of the SoundEffect object and using the FullName argument to specify the path and filename of the sound file.

To control how a media clip is played, use the PlaySettings property of the AnimationSettings object to return the PlaySettings object. For example, if you want the sound to loop until the next sound, set the LoopSoundUntilNext property of the PlaySettings object within the AnimationSettings object to msoTrue. The default value is msoFalse.

You can find all these options by pressing F2 to display the Object Browser in the VBA Editor and then searching for them.
For example, search for ppEntryEffect to see all possible constants for the various possible lead-in animations.

The following example applies a custom animation to the first shape on the slide identified by the object variable mySlide. The animation uses the entry effect Fly In From Right, plays a sound effect from a file, animates the text by first-level paragraphs and by whole paragraphs, and advances when the user clicks:

    Dim mySlide As Slide
    Set mySlide = Presentations(1).Slides(2)

    With mySlide.Shapes(1).AnimationSettings
        .EntryEffect = ppEffectFlyFromRight
        .AdvanceMode = ppAdvanceOnClick
        .SoundEffect.ImportFromFile FullName:="D:\Media\Whistle4.wav"
        .TextLevelEffect = ppAnimateByFirstLevel
        .TextUnitEffect = ppAnimateByParagraph
    End With

To test this (or other code examples you try in PowerPoint), just press F5 in the main PowerPoint window, and then repeatedly click the screen to activate the various transitions and effects. Press Esc when you're done.

# Working with Headers and Footers

PowerPoint uses HeaderFooter objects to represent the headers, footers, slide numbers, and date and time on slides. The HeaderFooter objects are organized into the HeadersFooters collection, which you access through the HeadersFooters property of the Master object, a Slide object, or a SlideRange collection.

Be warned: Before you can execute the following code examples, you must first _add a footer to the slides in your active presentation_. The code examples expect to modify an existing footer, not to create it (unlike in Excel, where a new header or footer _will_ be created automatically).

So, before executing these examples, click the Insert tab on PowerPoint's Ribbon, and then in the Text area, click the Header And Footer button to open the Header And Footer dialog box. In this dialog box, click the Date And Time check box and the Footer check box. Then click the Apply To All button.

## Returning the Header or Footer Object You Want

To access the object you want, use the appropriate property of the HeadersFooters collection:

 * Use the DateAndTime property to return the date and time.
 * Use the Footer property to return the footer itself.
 * Use the Header property to return the header on a notes page or handout. Slides themselves can't have a header.
 * Use the SlideNumber property to return the slide number on a slide or the page number on a notes page or a handout.

The following example uses the Footer property to set the text of the HeaderFooter object of the first slide in the active presentation:

    ActivePresentation.Slides(1).HeadersFooters.Footer.Text = "Sentence 102"

## Displaying or Hiding a Header or Footer Object

To display the HeaderFooter object, set its Visible property to msoTrue (or just True). To hide the HeaderFooter object, set its Visible property to msoFalse. For example, the following statement hides the footer on the fifth slide in the active presentation:

    ActivePresentation.Slides(5).HeadersFooters.Footer.Visible = False

## Setting the Text in a Header or Footer

To set the text that you want in a HeaderFooter object, assign a string containing the text to the object's Text property. For example, the following statement sets the text of the footer of the fifth slide in the active presentation to Confidential:

    ActivePresentation.Slides(5).HeadersFooters.Footer.Text = "Confidential"

If you executed the previous example code, executing this example will trigger an error message.
That's because the previous code made the footer on that slide (#5) invisible. Before you can set the footer's text, the footer must first be visible:

    ActivePresentation.Slides(5).HeadersFooters.Footer.Visible = **True**
    ActivePresentation.Slides(5).HeadersFooters.Footer.Text = "Confidential"

## Setting the Format for Date and Time Headers and Footers

If your slides, notes pages, or handouts use dates and times in their footers or headers, use the Format property to specify how the dates and times should appear. Table 25.2 lists the constants you can use.

Table 25.2 Format property constants for date and time headers and footers

**Format** | **Example**
---|---
ppDateTimeddddMMMMddyyyy | Saturday, October 05, 2013
ppDateTimedMMMMyyyy | 5 October 2013
ppDateTimedMMMyy | 5-Oct-13
ppDateTimeFormatMixed | 10/5/2013
ppDateTimeHmm | 10:17
ppDateTimehmmAMPM | 10:17AM
ppDateTimeHmmss | 10:17:16
ppDateTimehmmssAMPM | 10:17:16AM
ppDateTimeMdyy | 10/5/2013
ppDateTimeMMddyyHmm | 10/5/2013 10:17
ppDateTimeMMddyyhmmAMPM | 10/5/2013 10:17AM
ppDateTimeMMMMdyyyy | October 5, 2013
ppDateTimeMMMMyy | October 13
ppDateTimeMMyy | Oct-13

Set the UseFormat property of the HeaderFooter object to msoTrue if you want the date and time to be updated automatically. Set UseFormat to msoFalse if you want the date and time to remain unchanged.

The following example displays the current date in the format Friday, April 12, 2013:

    Sub SetFooter()

        Dim objPresTation As Presentation
        Set objPresTation = Application.ActivePresentation

        With objPresTation.Slides(2).HeadersFooters.DateAndTime

            .UseFormat = True

            **.Format = ppDateTimeddddMMMMddyyyy**

        End With

    End Sub

# Setting Up and Running a Slide Show

Not only can you assemble and format a slide show using VBA; you can also run it using VBA. To set up a slide show, use the SlideShowSettings property of the Presentation object to return the SlideShowSettings object. When you run the slide show, VBA creates a SlideShowWindow object, which you can then manipulate to control the slide show.

## Controlling the Show Type

To specify the type of show, set the ShowType property of the SlideShowSettings object to ppShowTypeSpeaker (for a standard full-screen presentation presented by a speaker), ppShowTypeKiosk (for a kiosk presentation), or ppShowTypeWindow (for a "browsed by an individual" presentation that appears in a window). For a show in a window, you can use the Left and Top properties to specify the position of the upper-left corner of the window and the Height and Width properties to specify its size.

To control whether animation and narration are used, set the ShowWithAnimation property and the ShowWithNarration property of the SlideShowSettings object to msoTrue or msoFalse.

To control whether the presentation loops until stopped, set the LoopUntilStopped property of the SlideShowSettings object to msoTrue or msoFalse.

To control how the presentation advances, set the AdvanceMode property to ppSlideShowManualAdvance (for manual advancing), ppSlideShowUseSlideTimings (for automatic advancing using timings already set), or ppSlideShowRehearseNewTimings (to rehearse new timings while the show plays).
The following example sets the active presentation running as a kiosk presentation that will advance automatically using its timings and loop until it is stopped:

    With ActivePresentation.SlideShowSettings
        .LoopUntilStopped = msoTrue
        .AdvanceMode = ppSlideShowUseSlideTimings
        .ShowType = ppShowTypeKiosk
        .Run
    End With

This next example sets the presentation named Corporate.pptm running in speaker (full-screen) mode, sizing the image to 800×600 pixels and positioning it at the upper-left corner of the screen. The show uses manual advancing:

    With Presentations("Corporate.pptm").SlideShowSettings
        .LoopUntilStopped = msoFalse
        .ShowType = ppShowTypeSpeaker
        .AdvanceMode = ppSlideShowManualAdvance
        With .Run
            .Height = 600
            .Width = 800
            .Left = 0
            .Top = 0
        End With
    End With

## Creating a Custom Show

Custom shows within a presentation are represented by the NamedSlideShows collection within the SlideShowSettings object. Use the NamedSlideShows property of the SlideShowSettings object to return the NamedSlideShows collection.

To create a custom show, use the Add method of the NamedSlideShows collection. The syntax is as follows:

    _expression_.Add(Name, SafeArrayOfSlideIDs)

Here, _expression_ is a required expression that returns a NamedSlideShows object. Name is a required String argument that specifies the name to assign to the new custom show. SafeArrayOfSlideIDs is a required Variant argument that specifies the IDs of the slides to include in the custom show.

For example, the following statements declare an array of the Long data type; assign to it the slide IDs of slides 2, 4, 5, and 10 from the open presentation named Corporate.pptm; and create a new custom show named Short Show from that array:

    Dim myArray(4) As Long
    With Presentations("Corporate.pptm")
        myArray(1) = .Slides(2).SlideID
        myArray(2) = .Slides(4).SlideID
        myArray(3) = .Slides(5).SlideID
        myArray(4) = .Slides(10).SlideID
        .SlideShowSettings.NamedSlideShows.Add Name:="Short Show", _
            SafeArrayOfSlideIDs:=myArray
    End With

## Deleting a Custom Show

To delete a custom show, use the Delete method with the appropriate NamedSlideShow object. For example, the following statement deletes the custom show named Overview from the active presentation:

    ActivePresentation.SlideShowSettings.NamedSlideShows("Overview").Delete

## Starting a Slide Show

To start a slide show using the whole presentation, use the Run method of the SlideShowSettings object. For example, the following statement starts the slide show running in the presentation identified by the object variable myPresentation:

    myPresentation.SlideShowSettings.Run

To show only a range of slides from a presentation, set the RangeType property of the SlideShowSettings object to ppShowSlideRange, use the StartingSlide property of the SlideShowSettings object to specify the first slide and the EndingSlide property to specify the last slide, and then use the Run method to run the presentation. The following example shows slides 4 through 8 in the presentation named Corporate.pptm:

    With Presentations("Corporate.pptm").SlideShowSettings
        .RangeType = ppShowSlideRange
        .StartingSlide = 4
        .EndingSlide = 8
        .Run
    End With

To start running a custom show, set the RangeType property of the SlideShowSettings object to ppShowNamedSlideShow, use the SlideShowName property to specify the name of the custom show, and then use the Run method to run the custom show.
The following example shows the custom show named Short Show in the active presentation:

    With ActivePresentation.SlideShowSettings
        .RangeType = ppShowNamedSlideShow
        .SlideShowName = "Short Show"
        .Run
    End With

When you start a slide show, VBA creates a SlideShowWindow object representing the running show. You can access the SlideShowWindow object either through the SlideShowWindows collection (a creatable object that contains a SlideShowWindow object for each open slide show) or through the SlideShowWindow property of the Presentation object. If you know which presentation is running, it's easier to go through the appropriate Presentation object.

## Changing the Size and Position of a Slide Show

To find out whether a slide show is displayed full screen or in a window, check the IsFullScreen property of the SlideShowWindow object. If the IsFullScreen property returns -1, the presentation is full screen; if the property returns 0, the presentation is running in a window.

To set the height and width of the slide-show window in pixels, use the Height property and the Width property. To set its position, use the Top property to specify the distance in pixels of the top edge of the presentation from the top of the window or screen, and the Left property to specify the distance in pixels of the left edge of the presentation from the left edge of the window or the screen.

## Moving from Slide to Slide

Apart from controlling the position and size of the presentation, most of the actions you can take with a running presentation involve the View object. To find out which slide is displayed, return the CurrentShowPosition property:

    MsgBox ActivePresentation.SlideShowWindow.View.CurrentShowPosition

To display the first slide in the presentation, use the First method. To display the last slide, use the Last method:

    ActivePresentation.SlideShowWindow.View.First
    ActivePresentation.SlideShowWindow.View.Last

To display the next slide, use the Next method. To display the previous slide, use the Previous method. Here's an example:

    ActivePresentation.SlideShowWindow.View.Previous

To display a particular slide in the slide show, use the GotoSlide method of the View object, using the Index argument to specify the slide number. For example, the following statement displays slide 5 in the first open slide-show window:

    Application.SlideShowWindows(1).View.GotoSlide Index:=5

## Pausing the Show and Using White and Black Screens

To display a white screen, set the State property of the View object to ppSlideShowWhiteScreen; to display a black screen, set it to ppSlideShowBlackScreen:

    ActivePresentation.SlideShowWindow.View.State = ppSlideShowWhiteScreen
    ActivePresentation.SlideShowWindow.View.State = ppSlideShowBlackScreen

To toggle the black screen or white screen off and start the show running again, set the State property to ppSlideShowRunning.

To pause the presentation, set the State property of the View object to ppSlideShowPaused. To start the show again, set the State property to ppSlideShowRunning, as in this example:

    With ActivePresentation.SlideShowWindow.View
        .State = ppSlideShowPaused
        .State = ppSlideShowRunning
    End With

## Starting and Stopping Custom Shows

To start a custom show running, use the GotoNamedShow method and use the SlideShowName argument to specify the name of the custom show.
For example, the following statement starts the custom show named New Show running:

    SlideShowWindows(1).View.GotoNamedShow SlideShowName:="New Show"

To exit a custom show, use the EndNamedShow method and then use the Next method to advance the presentation. PowerPoint then displays the first slide in the full presentation:

    With ActivePresentation.SlideShowWindow.View
        .EndNamedShow
        .Next
    End With

## Exiting a Slide Show

To exit the slide show, use the Exit method of the SlideShowWindow object's View object. For example, the following statement exits the slide show in the active presentation:

    ActivePresentation.SlideShowWindow.View.Exit

# The Bottom Line

**Work with shapes.**

PowerPoint VBA provides many ways to access and manipulate shapes.

**Master It**

Describe what the following line of code does:

    ActivePresentation.Slides(2).Shapes(1).Delete

**Work with headers and footers.**

Using PowerPoint headers and footers can be a convenient way to provide continuity for presentations as well as to identify each element.

**Master It**

In this chapter, you worked with several examples showing how to manipulate footers for slides. Why were there no examples illustrating how to manipulate headers for slides?

**Set up and run a slide show.**

To create a custom slide show, you use the Add method of the NamedSlideShows collection.

**Master It**

The syntax when using the Add method of the NamedSlideShows collection is

    _expression_.Add(Name, SafeArrayOfSlideIDs)

Explain what the four components of this line of code are and do.

Chapter 26

Understanding the Outlook Object Model and Key Objects

In this chapter, you'll begin to come to grips with the Outlook object model and start using VBA to manipulate Outlook. You'll learn where Outlook stores VBA items, meet the VBA objects for Outlook's creatable objects and main user-interface items, and work with some of the main Outlook objects. You'll explore a variety of objects, from the Application object that represents the entire application through the objects that represent individual messages, calendar items, and tasks. You'll also learn how to search programmatically.

In this chapter you will learn to do the following:

 * Work with the Application object
 * Work with messages
 * Work with calendar items
 * Work with tasks and task requests
 * Search for items

# Getting an Overview of the Outlook Object Model

Many people find Outlook harder to work with programmatically than other Office applications, so it's particularly helpful to explore the Outlook object model to see which objects Outlook uses and how they're related. Above all, when working with objects, seeing VBA code examples in the Help system or online can be invaluable.

You can find the Outlook object-model reference by following these steps:

1. Launch or switch to Outlook, and then press Alt+F11 to launch or switch to the VBA Editor.

2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).

3. Press F1 in the editor to launch MSDN (the Microsoft Developer Network). At the time of this writing, you'll see a message that the page you requested cannot be found. This is because Outlook has the incorrect link built into its Help feature. Never mind. We just want to use the Bing search anyway.

4. In the Bing search field, type **outlook 2013 object model** and press Enter.

5. Click the link _Object Model_ ( _Outlook 2013 Developer Reference_ ).
You'll now have access to the whole collection of Outlook 2013 VBA syntax specifications, useful descriptions, and code examples (one of which is shown in Figure 26.1).

Figure 26.1 Sample code found in the Outlook object-model reference will help you write your own VBA code.

## Understanding Where Outlook Stores VBA Macros

As you've seen earlier in this book, Word and Excel let you store VBA projects either in a global location (the Normal.dotm template in Word or the Personal Macro Workbook in Excel) or in individual templates or document files. PowerPoint lets you store VBA projects in presentation files and templates.

Outlook, by contrast, doesn't let you store VBA projects in individual items (such as Outlook's email messages or contacts). Instead, Outlook saves all projects in a single VBA project called VbaProject.OTM, which is stored in the following folder (instead of _Richard_ in this path, substitute your username):

    C:\Users\ _Richard_ \AppData\Roaming\Microsoft\Outlook

## Understanding Outlook's Most Common Creatable Objects

In Outlook VBA, the Application object represents the entire Outlook application, so you can access any Outlook object by going through the Application object. However, Outlook also exposes various creatable objects, allowing you to reach some of the objects in its object model without explicitly going through the Application object. Recall that "creatable" merely means that when you're writing code involving these objects, using the word Application is optional. You can get the same result by using either of the following versions:

    Application.Explorers

or more simply,

    Explorers

Here is a list of Outlook's most common creatable objects; you'll work with most of them in more detail later in this chapter and in the next chapter:

 * The Explorers collection contains an Explorer object for each window that displays the contents of a folder.
 * The Inspectors collection contains an Inspector object for each window that's open displaying an Outlook item.
 * The COMAddIns collection contains a COMAddIn object for each COM (Component Object Model) add-in loaded in Outlook.
 * The Reminders collection contains a Reminder object for each reminder.

The most prominent objects in the Outlook user interface are represented in VBA by items whose names are descriptive of their purpose, such as these:

 * The MailItem object represents a mail item.
 * The ContactItem object represents a contact.
 * The TaskItem object represents a task.
 * The AppointmentItem object represents an appointment.
 * The JournalItem object represents a journal entry.
 * The NoteItem object represents a note.

You'll learn how to work with these objects later in this chapter and in the next chapter.

# Working with the Application Object

You can have only one instance of Outlook running at a time. (By contrast, you can run multiple instances of Word or Excel at the same time.) You probably won't find this a limitation when you're writing macros that work within Outlook. But if you create a procedure in another application (such as Word) that will communicate with and manipulate Outlook, you will need to check whether there is an instance of Outlook currently running on the computer before you create an instance programmatically. (See Chapter 30, "Accessing One Application from Another Application," for instructions on accessing one application programmatically from another application.)
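As a preview, here is a minimal sketch of such a check, written to run from another Office application via late binding (so no reference to the Outlook object library is needed); the variable name olApp is simply illustrative:

    Dim olApp As Object
    On Error Resume Next
    ' Try to grab an instance of Outlook that is already running
    Set olApp = GetObject(, "Outlook.Application")
    On Error GoTo 0
    If olApp Is Nothing Then
        ' No running instance was found, so start a new one
        Set olApp = CreateObject("Outlook.Application")
    End If
    MsgBox olApp.GetNamespace("MAPI").CurrentUser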
## Working with the NameSpace Object

Here is a new concept: the _NameSpace_. Among all the VBA-enabled Office applications, only Outlook employs this technique. That the NameSpace approach is unique to Outlook demonstrates beyond all doubt that the various Microsoft Office application teams work at least partly independently when building their object-model structures.

Many Outlook VBA activities, such as accessing email messages, tasks, or contacts programmatically, require that you use the GetNameSpace method of Outlook's Application object to return the NameSpace object that represents the root object of the data source. Anyway, that's the official version. Just remember that you use the following syntax to get most jobs done in Outlook VBA:

    _expression_.GetNameSpace(Type)

Here, _expression_ is a required expression that returns an Application object. Type is a required String argument that specifies the type of namespace you want to return. Outlook supports only the MAPI data source, so you always use Type:="MAPI" with the GetNameSpace method. For example, the following statement returns the NameSpace and uses the CurrentUser property to display the name of the current user in a message box:

    MsgBox Application.**GetNamespace("MAPI")**.CurrentUser

* * *

What Is an API?

MAPI stands for Messaging Application Programming Interface. It's a collection of functions written by Microsoft that can be used in programming related to email. There are all kinds of APIs used for various purposes. _API_ is just another term for a library of built-in functions. Come to think of it, _namespace_ is a near synonym. The general term _namespace_ in computer programming means a collection of functions that is self-contained. This allows you to have functions with identical names that are distinguished by their individual namespaces. That way VBA or another language knows which function to trigger when it appears in the code. It's similar to a teacher using full names to distinguish John Thompson from John Ortega.

* * *

### Accessing Default Folders within the NameSpace Object

The NameSpace object contains the folders that Outlook uses—both the collection of default folders used to store default items such as email messages, tasks, and contacts as well as any other folders created by the user or by custom procedures. These folders are represented in Outlook's VBA by MAPIFolder objects that are organized into a Folders collection.

You'd probably expect that to locate the current default folders, you would use a method of the Folders collection. Nope. Given that we're in a special situation here (dealing with email), GetDefaultFolder is a method of the NameSpace object. The syntax is as follows:

    _expression_.GetDefaultFolder(FolderType)

Here, _expression_ is a required expression that returns a NameSpace object. FolderType is a required argument that specifies which default folder you want to return. The constants are self-explanatory: olFolderCalendar, olFolderConflicts, olFolderContacts, olFolderDeletedItems, olFolderDrafts, olFolderInbox, olFolderJournal, olFolderJunk, olFolderLocalFailures, olFolderManagedEmail, olFolderNotes, olFolderOutbox, olFolderRSSFeeds, olFolderSentMail, olFolderServerFailures, olFolderSuggestedContacts, olFolderSyncIssues, olFolderTasks, olFolderToDo, or olPublicFoldersAllPublicFolders.
The following example creates the object variable myCal and assigns the default calendar folder to it:

    Dim myCal As MAPIFolder
    Set myCal = Application.GetNamespace("MAPI") _
        .GetDefaultFolder(FolderType:=olFolderCalendar)

### Accessing Other Folders within the NameSpace Object

Accessing the default folders in the NameSpace object via the GetDefaultFolder method is easy, but often you'll need to access other folders. In this case, you _do_ use the Folders collection.

The following example displays a message box (see Figure 26.2) containing a list of all the folders contained in the namespace:

    Sub List_All_NameSpace_Folders()
        Dim myNS As NameSpace
        Dim myFolder As MAPIFolder
        Dim mySubfolder As MAPIFolder
        Dim strFolderList As String

        strFolderList = "Your Outlook NameSpace contains these folders:" _
            & vbCr & vbCr

        Set myNS = Application.GetNamespace("MAPI")
        With myNS
            For Each myFolder In .Folders
                strFolderList = strFolderList & myFolder.Name & vbCr
                For Each mySubfolder In myFolder.Folders
                    strFolderList = strFolderList & "* " & mySubfolder.Name & vbCr
                Next mySubfolder
            Next myFolder
        End With

        MsgBox strFolderList, vbOKOnly + vbInformation, "Folders in NameSpace"

    End Sub

Figure 26.2 Listing the folders contained in the NameSpace object

### Creating a New Folder

To create a new folder, use the Add method with the Folders collection. The syntax is as follows:

    _expression_.Add(Name, Type)

Here, _expression_ is a required expression that returns a Folders collection. Name is a required String argument that specifies the display name to assign to the new folder. Type is an optional Long argument that you can use to specify the type of folder to create: olFolderCalendar, olFolderContacts, olFolderDrafts, olFolderInbox, olFolderJournal, olFolderNotes, or olFolderTasks. If you omit Type, Outlook assigns the new folder the same type as its parent folder (the folder in which you create the new folder).

The following statement creates a new folder named Personal Tasks in the Tasks folder, assigning the new folder the olFolderTasks folder type explicitly for clarity:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderTasks) _
        .Folders.Add Name:="Personal Tasks", Type:=olFolderTasks

### Deleting a Folder

To delete a folder, use the Delete method with the appropriate MAPIFolder object. This method takes no arguments. The following example deletes the folder named Personal Tasks in the Tasks folder:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderTasks) _
        .Folders("Personal Tasks").Delete

* * *

Deletion Is Dangerous

Be careful when deleting objects in Outlook. First, Outlook doesn't request any confirmation before deleting an object. Second, the deletion is permanent; there's no Recycle Bin backup.

* * *

## Understanding Inspectors and Explorers

VBA uses two major Outlook objects that most users wouldn't recognize from working with the Outlook user interface alone:

 * An Inspector is an object that represents a window displaying a specific Outlook item, such as an email message or an appointment.
 * An Explorer object represents a window that displays the contents of a folder, such as a list of emails.

* * *

Objects within Objects

Unlike the behavior of many collections, an Explorer object is included in the Explorers collection even if it is not visible.
* * *

### Opening an Inspector Window

To open an inspector window for an item, use the item's Display method. For example, the following statement displays an inspector window for the object referenced by the object variable myItem:

    myItem.Display

### Returning the Inspector Associated with an Item

To return the inspector associated with an item, use the GetInspector property of the appropriate object. The following example returns the inspector for the item identified by the object variable myItem:

    myItem.GetInspector

### Returning the Active Window, Inspector, or Explorer

Unlike Word, Excel, and PowerPoint, Outlook doesn't have an ActiveWindow object that represents the active window. However, Outlook's Application object does have an ActiveWindow method, which returns the topmost Outlook window. (If there is no window, ActiveWindow returns Nothing.)

This window will be either an Inspector object or an Explorer object. Similarly, the ActiveExplorer method of the Application object returns the active explorer, and the ActiveInspector method of the Application object returns the active inspector. Got it?

You can use the TypeName function to determine which type of window is active. The following example displays a message box that states which window type is active _if_ there is an active window:

    If Not TypeName(ActiveWindow) = "Nothing" Then
        MsgBox "An " & TypeName(ActiveWindow) & " window is active."
    End If

Notice that we say here If Not... Nothing. The double negative means "if the active window isn't nothing."

### Working with the Active Inspector

In many procedures, you'll need to determine what the topmost inspector in the Outlook application is, either so that you can work with that inspector or so that you can restore the inspector to the topmost position at the end of a procedure that manipulates other inspectors. (Remember, you should always try to restore an application to the state it was in when your procedure started execution. This is a courtesy to the user and evidence of careful, quality programming.)

To find out which is the topmost inspector, use the ActiveInspector method of the Application object. For example, the following statement maximizes the window of the topmost inspector:

    Application.ActiveInspector.WindowState = olMaximized

Note that this example attempts to maximize an inspector window, so there must actually _be_ an inspector window open when you run the code. In other words, double-click an email message in Outlook to open it in a window separate from the Outlook window. This separate window, showing a single email, is an inspector. If you want to _trap_ this error (and you should) to prevent your macro from crashing when no inspector exists, check whether the TypeName function returns "Nothing" when run on the ActiveInspector method of the Application object, and bail out if it does, like this:

    If TypeName(Application.ActiveInspector) = "Nothing" Then
        MsgBox "No item is currently open."
        End 'shut down the macro
    End If

## Creating Items

To create new items in Outlook, you use the CreateItem method or the CreateItemFromTemplate method of the Application object. The CreateItem method creates default items, while the CreateItemFromTemplate method creates items based on the templates you specify.

* * *

You Can Use Custom Forms to Create New Objects

You can also create new objects using a custom form.
To do so, use the Add method with the Items collection.

* * *

### Using the CreateItem Method to Create Default Items

The syntax for the CreateItem method is as follows:

    _expression_.CreateItem(ItemType)

Here, _expression_ is a required expression that returns an Application object. ItemType is a required argument that specifies the type of item to create: olAppointmentItem, olContactItem, olDistributionListItem, olJournalItem, olMailItem, olMobileItemMMS, olMobileItemSMS, olNoteItem, olPostItem, or olTaskItem.

The following example creates a new email message; assigns a recipient (by setting the To property), a subject (by setting the Subject property), and body text (by setting the Body property); and then displays the message window:

    Dim myMessage As MailItem
    Set myMessage = Application.CreateItem(ItemType:=olMailItem)
    With myMessage
        .To = "test@example.com"
        .Subject = "Test message"
        .Body = "This is a test message."
        .Display
    End With

### Using the CreateItemFromTemplate Method to Create Items Based on Templates

Instead of creating a default item by using the CreateItem method, you can alternatively use the CreateItemFromTemplate method of the Application object to create a new item based on a template. The syntax for the CreateItemFromTemplate method is as follows:

    _expression_.CreateItemFromTemplate(TemplatePath, InFolder)

Here, _expression_ is a required expression that returns an Application object. TemplatePath is a required String argument that specifies the path and filename of the template on which to base the new item. InFolder is an optional Variant argument that you can use to specify the folder in which to create the item. If you omit the InFolder argument, Outlook creates the item in the default folder for that item type.

Before you can test the following example, you must create a note template to work with. Press Ctrl+Shift+N in Outlook to create a new note, then choose File ⇒ Save As and choose the Outlook Template option in the Save As Type list box in the Save As dialog box. Save it as tpltNote.oft.

The following example creates a new note item based on the custom template tpltNote.oft you just stored in the C:\Users\ _Richard_ \AppData\Roaming\Microsoft\Templates folder within the user's user profile (substitute your username for _Richard_ ). The example then displays the new note item:

    Dim myNoteItem As NoteItem

    Set myNoteItem = Application.CreateItemFromTemplate _
        ("C:\Users\ _Richard_ \AppData\Roaming\" _
        & "Microsoft\Templates\tpltNote.oft")
    myNoteItem.Display

## Quitting Outlook

To quit Outlook, use the Quit method of the Application object. This method takes no arguments:

    Application.Quit

You may also want to work with the events available to the Application object. See Chapter 27, "Working with Events in Outlook," for a discussion of how to work with these application-level events and with item-level events.

# Understanding General Methods for Working with Outlook Objects

Many of the objects in Outlook use the methods covered in the following sections. You'll see brief examples showing you how to use the methods, as well as further examples on the individual types of objects—email messages, appointments, contacts, tasks, and so on—later in this chapter and in the next.

## Using the Display Method

To open an item in an inspector window, use the Display method.
The syntax is as follows:

    _expression_.Display(Modal)

Here, _expression_ is a required expression that returns the type of object you want to display—for example, a ContactItem object or a MailItem object. Modal is an optional Variant argument that you can set to True to make the window modal. A window is modeless by default, or becomes modeless if you set Modal to False. Making the window modal means that users must close the window before they can work with another window.

Note that the Modal argument isn't available for Explorer and MAPIFolder objects.

For example, the following statement uses the Display method to display the Inbox:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderInbox).Display

## Using the Close Method

To close a window, use the Close method. The syntax is as follows:

    _expression_.Close(SaveMode)

Here, _expression_ is a required expression that returns the object you want to close. SaveMode is a required argument that specifies whether to save changes (olSave), discard the changes (olDiscard), or prompt the user to decide whether to save the changes (olPromptForSave).

The following example closes the active inspector and saves any changes to its contents:

    ActiveInspector.Close SaveMode:=olSave

Remember that this code requires that an inspector be currently open. See the warning earlier in this chapter in the section titled "Working with the Active Inspector."

## Using the Delete Method

To delete an item, use the Delete method. This method takes no arguments. The following example deletes the item with the index number 1 in the Contacts folder. Be careful if you want to give this code a test run. It will delete a contact, but exactly _which_ contact is unpredictable. The sidebar "Practical Programming" explains why. So, if you value your contacts list, don't test this example. Take my word for it; I had to test this code and I still don't know which of my contacts were deleted. I hope it was some long-ago acquaintance.

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderContacts) _
        .Items(1).Delete

* * *

**Practical Programming: The Items Collection Is Unsorted**

You often need to sort and search data. Be warned that the index numbers in the Items collection of your contacts are not ordered in any way. The collection is not alphabetical, nor is it ordered in any other fashion (by the date the contact was entered or modified, or by any other criterion). Using the Delete, Display, or other methods with the Items collection accesses what to us, as programmers, will be a random item. In the previous example, Items(1) will almost certainly not be the first contact in your list of contacts. Or, as the Outlook online Help system puts it, "The items in the Items collection object are not guaranteed to be in any particular order."

However, you _can_ sort items yourself if you wish, and then search the sorted list that results. You do this by using the Sort method, as the following example illustrates. These statements sort your contacts alphabetically by the Full Name field in the Contacts dialog box. You can optionally sort by due date (for tasks), by last name (for contacts), and many other ways.
    Sub SortContacts()
        Dim strNames As String
        Dim myNameSpace As Outlook.NameSpace
        Dim myFolder As Outlook.Folder
        Dim myItem As Outlook.ContactItem
        Dim myItems As Outlook.Items

        Set myNameSpace = Application.GetNamespace("MAPI")
        Set myFolder = myNameSpace.GetDefaultFolder(olFolderContacts)
        Set myItems = myFolder.Items
        myItems.Sort "[FullName]", False

        For Each myItem In myItems

            strNames = strNames & ", " & myItem.FullName

        Next myItem

        MsgBox strNames

    End Sub

Notice that you could use this For Each... Next loop to search for a particular item in the collection of items.

Alternatively, you can use the AdvancedSearch method of the Application object, as described in the section "Searching for Items" later in this chapter.

* * *

## Using the PrintOut Method

To print an item, use the PrintOut method. This method takes no arguments. The following example prints the item with the index number 1 in the Inbox:

    Application.GetNamespace("MAPI").GetDefaultFolder(olFolderInbox) _
        .Items(1).PrintOut

## Using the Save Method

To save an item, use the Save method. This method takes no arguments. The following example creates a new task; assigns it a subject, start date (today), and due date (a week from today); turns off the reminder for the task; and then saves it:

    Dim myTask As TaskItem
    Set myTask = Application.CreateItem(ItemType:=olTaskItem)
    With myTask
        .Subject = "Arrange Review Meeting"
        .StartDate = Date
        .DueDate = Date + 7
        .ReminderSet = False
        .Save
    End With

This item will appear in the To-Do List of the My Tasks section of your Outlook Tasks.

## Using the SaveAs Method

To save an item as a separate file, use the SaveAs method. The syntax is as follows:

    _expression_.SaveAs(Path, Type)

Here, _expression_ is a required expression that returns the object to be saved. Path is a required String argument that specifies the path and filename under which to save the file. Type is an optional Variant argument that you can use to control the file type used for the file, as shown in Table 26.1.

Table 26.1 Type arguments for the SaveAs method

**Argument** | **Type of File**
---|---
olHTML | HTML file
olMSG | Outlook message format (.msg filename extension)
olRTF | Rich Text format
olTemplate | Template
olDoc | Word document format (email messages using WordMail)
olTXT | Text file
olVCal | vCal file
olVCard | vCard file
olICal | iCal file
olMSGUnicode | Outlook Unicode message format (.msg filename extension)

The following example saves the message open in the active inspector. So before testing this example, be sure that a message has been double-clicked and is thus open in its own window separate from the main Outlook window. Remember that code involving the active inspector requires that an inspector be currently open. See the warning, and a way to error-trap this, earlier in this chapter in the section titled "Working with the Active Inspector."

If the IsWordMail property of the ActiveInspector object returns True, the example saves the message as a .doc file; if the IsWordMail property returns False, the example saves the message as an .rtf file.
If no inspector window is active, the example displays a message box pointing out the problem to the user:

    If TypeName(ActiveInspector) = "Nothing" Then
        MsgBox "This macro cannot run because " & _
            "there is no active window.", vbOKOnly, "Macro Cannot Run"
        End
    Else
        If ActiveInspector.IsWordMail Then
            ActiveInspector.CurrentItem.SaveAs "c:\keep\message.doc"
        Else
            ActiveInspector.CurrentItem.SaveAs "c:\keep\message.rtf"
        End If
    End If

# Working with Messages

If you or your colleagues use Outlook's email capabilities extensively, you may be able to save time by programming Outlook to create or process messages automatically. The following sections show you how to create a new message, work with its contents, add an attachment, and send the message.

## Creating a New Message

To create a new message, use the CreateItem method of the Application object and specify olMailItem for the ItemType argument. The following example creates a MailItem object variable named myMessage and assigns to it a new message:

    Dim myMessage As MailItem
    Set myMessage = Application.CreateItem(ItemType:=olMailItem)

## Working with the Contents of a Message

To work with the contents of a message, set or get the appropriate properties. These are the most widely useful properties:

 * To is the recipient or recipients of the message.
 * CC is the recipient or recipients of copies of the message.
 * BCC is the recipient or recipients of blind copies of the message.
 * Subject is the subject line of the message.
 * Body is the body text of the message.
 * BodyFormat is the message's formatting type: olFormatPlain for text only, olFormatRichText for text with formatting, and olFormatHTML for HTML formatting.
 * Importance is the relative importance of the message. Set it to olImportanceHigh, olImportanceNormal, or olImportanceLow.

The following example creates a new message item and assigns it to the object variable myMessage. It then adds an addressee, a subject, and body text; applies the HTML format; sets the importance to high; and sends the message:

    Dim myMessage As MailItem
    Set myMessage = Application.CreateItem(ItemType:=olMailItem)
    With myMessage
        .To = "petra_smith@ourbigcompany.com"
        .Subject = "Preparation for Review"
        .Body = "Please drop by tomorrow and spend a few minutes" _
            & " discussing the materials we need for Darla's review."
        .BodyFormat = olFormatHTML
        .Importance = olImportanceHigh
        .Send
    End With

When this message, shown in Figure 26.3, arrives at Petra's machine, Outlook 2013 briefly displays it in the upper-right corner.

Figure 26.3 A portion of a message of high importance is briefly displayed in Outlook.

## Adding an Attachment to a Message

To add an attachment to a message, use the Add method with the Attachments collection, which you return by using the Attachments property of the MailItem object. The syntax is as follows:

    _expression_.Add(Source, Type, Position, DisplayName)

Here are the components of the syntax:

 * _expression_ is a required expression that returns an Attachments collection.
 * Source is a required String argument that specifies the path and filename of the attachment.
 * Type is an optional argument that you can use to specify the type of attachment.
 * Position is an optional argument that you can use with rich-text messages to specify the character at which the attachment is positioned in the text.
Use character 0 to hide the attachment, 1 to position the attachment at the beginning of the message, or a higher value to position the attachment at the specified character position. To position the attachment at the end of the message, use a number higher than the number of characters in the message. + * DisplayName is an optional String argument that you can specify to control the name displayed for the attachment in the message. + +The following example attaches to the message referenced by the object variable myMessage the file Corporate Downsizing.pptm stored in the folder Y:\Sample Documents, positioning the attachment at the beginning of the message and setting its display name to Downsizing Presentation: + + myMessage.Attachments. **Add** _ + Source:="Y:\Sample Documents\Corporate Downsizing.pptm", _ + Position:=1, DisplayName:="Downsizing Presentation" + +To test this, insert this code into the example code from the previous section ("Working with the Contents of a Message"), like this: + + Dim myMessage As MailItem + Set myMessage = Application.CreateItem(ItemType:=olMailItem) + + **myMessage.Attachments.Add _** + **Source:="Y:\Sample Documents\Corporate Downsizing.pptm", _** + **Position:=1, DisplayName:="Downsizing Presentation"** + + With myMessage + .To = "petra_smith@ourbigcompany.com" + .Subject = "Preparation for Review" + .Body = "Please drop by tomorrow and spend a few minutes" _ + & " discussing the materials we need for Darla's review." + .BodyFormat = olFormatHTML + .Importance = olImportanceHigh + .Send + End With + +## Sending a Message + +To send a message, use the Send method. This method takes no arguments. The following example sends the message referenced by the object variable myMessage: + + myMessage.Send + +* * * + +Multiple Sends + +The Send method applies to the AppointmentItem, MeetingItem, and TaskItem objects as well as to the MailItem object. + +* * * + +To check whether a message has been sent, check its Sent property. This Boolean property returns True if the message has been sent and False if it has not. + +# Working with Calendar Items + +If you create or receive many calendar items, you may be able to save time or streamline your scheduling by using VBA. The following sections show you how to create a calendar item and work with its contents. + +## Creating a New Calendar Item + +To create a new calendar item, use the CreateItem method of the Application object and specify olAppointmentItem for the ItemType argument. The following example creates an AppointmentItem object variable named myAppointment and assigns to it a new appointment item: + + Dim myAppointment As AppointmentItem + Set myAppointment = Application.CreateItem(ItemType:=olAppointmentItem) + +## Working with the Contents of a Calendar Item + +To work with the contents of a calendar item, set or get the appropriate properties. These are the most widely useful properties: + + * Subject is the subject of the appointment. + * Body is the body text of the appointment. + * Start is the start time of the appointment. + * End is the end time of the appointment. + * BusyStatus is your status during the appointment: olBusy, olFree, olOutOfOffice, or olTentative. + * Categories is the category or categories assigned to the item. + * ReminderSet determines whether the appointment has a reminder (True) or not (False). + * ReminderMinutesBeforeStart is the number of minutes before the event that the reminder should occur. 
The following example creates a new AppointmentItem object and assigns it to the object variable myAppointment. It then sets the subject, body, start date (2:30 p.m. on the day seven days after the present date), and end date (one hour after the start); marks the time as busy; assigns the Personal category; sets a reminder 30 minutes before the appointment; and saves the appointment:

    Dim myAppointment As AppointmentItem
    Set myAppointment = Application.CreateItem(ItemType:=olAppointmentItem)
    With myAppointment
        .Subject = "Dentist"
        .Body = "Dr. Schmitt " & vbCr & "4436 Acacia Blvd."
        .Start = Str(Date + 7) & " 2:30 PM"
        .End = Str(Date + 7) & " 3:30 PM"
        .BusyStatus = olBusy
        .Categories = "Personal"
        .ReminderMinutesBeforeStart = 30
        .ReminderSet = True
        .Save
    End With

The AppointmentItem object has a grand total of 71 properties. If you want to explore more of them, take a look at this MSDN web page:

 

* * *

Allowing Users to Manually Assign Categories

Assigning categories to an item programmatically can be difficult, especially because many users create custom categories or assign categories in an idiosyncratic manner. In many cases, it's better to allow each user to assign their preferred categories manually by displaying the Categories dialog box at the appropriate point in your procedure. You can do so by using the ShowCategoriesDialog method of the item—for example, myAppointment.ShowCategoriesDialog for an item referenced by the object variable myAppointment.

* * *

# Working with Tasks and Task Requests

VBA can automate tasks and task requests. The following sections show you how to create a task, work with the contents of a task item, and send a task request.

## Creating a Task

To create a new task item, use the CreateItem method of the Application object and specify olTaskItem for the ItemType argument. The following example creates a TaskItem object variable named myTask and assigns to it a new task item:

    Dim myTask As TaskItem
    Set myTask = Application.CreateItem(ItemType:=olTaskItem)

## Working with the Contents of a Task Item

To work with the contents of a task item, set or get the appropriate properties. These are the most widely useful properties:

 * Subject is the subject of the task.
 * Body is the body text of the task.
 * Start is the start time of the task.
 * DueDate is the due date of the task.
 * Importance is the importance of the task. Set it to olImportanceHigh, olImportanceNormal, or olImportanceLow.
 * Status is the status of the task: olTaskNotStarted, olTaskWaiting, olTaskDeferred, olTaskInProgress, or olTaskComplete.
 * PercentComplete is the percentage of the task completed.
 * Companies is the companies associated with the task.
 * BillingInformation is the company or department to bill for the task.

The following example creates a TaskItem object variable named myTask and assigns to it a new task item. It then sets the subject and body of the task, specifies a due date, sets the status to olTaskInProgress and the percentage complete to 10, specifies the company involved and whom to bill, sets the importance to High, and then saves the task:

    Dim myTask As TaskItem
    Set myTask = Application.CreateItem(ItemType:=olTaskItem)
    With myTask
        .Subject = "Create a business plan"
        .Body = "The business plan must cover the next four years." & _
            vbCr & vbCr & "It must provide a detailed budget, " & _
            "staffing projections, and a cost/benefit analysis."
        .DueDate = Str(Date + 28)
        .Status = olTaskInProgress
        .PercentComplete = 10
        .Companies = "Acme Polyglot Industrialists"
        .BillingInformation = "Sales & Marketing"
        .Importance = olImportanceHigh
        .Save
    End With

The TaskItem object has 69 properties. If you want to explore more of them, take a look at this MSDN web page:

 

## Assigning a Task to a Colleague

To assign a task to a colleague, use the Assign method of the TaskItem object, and then use the Add method of the Recipients collection to add one or more recipients. Finally, you can use the Send method to send the task to your colleague.

The following example creates a task, uses the Assign method to indicate that it will be assigned, specifies a recipient, and sends the task:

    Dim myTaskAssignment As TaskItem
    Set myTaskAssignment = Application.CreateItem(ItemType:=olTaskItem)
    With myTaskAssignment
        .Assign
        .Recipients.Add Name:="Peter Nagelly"
        .Subject = "Buy Bagels for Dress-Down/Eat-Up Day"
        .Body = "It's your turn to get the bagels on Friday."
        .Body = .Body & vbCr & vbCr & "Remember: No donuts AT ALL."
        .DueDate = Str(Date + 3)
        .Send
    End With

# Searching for Items

To search for items, use the AdvancedSearch method of the Application object. The syntax is as follows:

    _expression_.AdvancedSearch(Scope, Filter, SearchSubFolders, Tag)

Here are the components of the syntax:

 * _expression_ is a required expression that returns an Application object.
 * Scope is a required String argument that specifies the scope of the search (which items to search). Usually you'll search a particular folder. For example, you might search the Inbox for messages that match certain criteria, or you might search the Tasks folder for particular tasks.
 * Filter is an optional String argument that specifies the search filter. While this argument is optional, you will need to use it unless you want to return all the items within the scope you've specified.
 * SearchSubFolders is an optional Variant argument that you can set to True to search through any subfolders of the folder specified by the Scope argument, or False to search only the specified folder. The default is False.
 * Tag is an optional Variant argument that you can use to specify a name for the search you're defining. If you create a name, you can call the search again.

The following example searches the Inbox (Scope:="Inbox") for messages with the subject line containing _Office_. If any messages are found, the procedure produces a list of sender names, which it assigns to the String variable strMessages and displays in a message box.

Note that at the time of this writing, there appears to be a timing bug in the advanced search feature. If you press F5 to execute the following code, no search hits are found. However, if you press F8 repeatedly to step through the code, it works as expected and hits are found.

I'm including this code because _it should work_. If you find a way to insert an effective delay or otherwise fix the problem, please email me at my address in the introduction to this book. Or perhaps by the time this book is published Microsoft will have fixed it.
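If the root cause is that AdvancedSearch runs asynchronously, the Results collection is being read before the search has finished. One avenue worth exploring is the Application object's AdvancedSearchComplete event, which fires when a search completes. Here is a rough sketch (not a guaranteed fix); it belongs in ThisOutlookSession, and it assumes the search was started with the Tag argument set to "OfficeSubjectSearch":

    ' In ThisOutlookSession
    Private Sub Application_AdvancedSearchComplete(ByVal SearchObject As Search)
        ' Fires when an AdvancedSearch finishes; the Results
        ' collection should be safe to read at this point
        If SearchObject.Tag = "OfficeSubjectSearch" Then
            MsgBox SearchObject.Results.Count & " item(s) found.", _
                vbOKOnly, "Search Complete"
        End If
    End Sub

With that caveat noted, here is the search example described above: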
    Sub Sample_Advanced_Search()

        Dim mySearch As Search
        Dim myResults As Results
        Dim intCounter As Integer
        Dim strMessages As String
        Dim intTotal As Integer

        Dim strFilter As String
        strFilter = Chr(34) & "urn:schemas:httpmail:subject" & Chr(34) & " like '%Office%'"

        Set mySearch = Application.AdvancedSearch(Scope:="Inbox", Filter:=strFilter)

        Set myResults = mySearch.Results
        intTotal = myResults.Count

        For intCounter = 1 To intTotal
            strMessages = strMessages & _
                myResults.Item(intCounter).SenderName & vbCr
        Next intCounter

        MsgBox strMessages, vbOKOnly, "Search Results"

    End Sub

* * *

You Can Execute 100 Searches Simultaneously, But Should You?

If necessary, you can run two or more searches at the same time. To do so, use the AdvancedSearch method in successive lines of code. Actually, you can run up to 100 searches at the same time, but doing so puts a considerable load on your computer and may make it run slowly or appear to stop responding.

* * *

# The Bottom Line

**Work with the Application object.**

VBA uses two major Outlook objects that most users wouldn't recognize from working with the Outlook user interface alone.

Master It

One of these objects represents a window that displays the contents of a folder. The other represents a window displaying an Outlook item, such as an email message or an appointment. What are the names of these two objects?

**Work with messages.**

To work with the contents of a message in VBA, you set or get various properties.

Master It

Name one of the most widely useful properties employed when manipulating the contents of a message in a procedure.

**Work with calendar items.**

You can create new calendar appointment items via VBA.

Master It

To create a new calendar item, you use a particular method of the Application object and specify olAppointmentItem for the ItemType argument. What is the method?

**Work with tasks and task requests.**

You can assign a task to a colleague and then add one or more recipients. You can then send the task to your colleague and, optionally, the additional recipients.

Master It

What methods do you use to assign, add, and send a task to others?
Chapter 27

Working with Events in Outlook

If you want to automate the way that Outlook works, you may sometimes need to write code that responds to Outlook events. Outlook has two classes of events, application-level events and item-level events, and between them, they enable you to write code that responds to almost anything that happens in Outlook. In this chapter, you will learn how to work with both types of events, and you will see code examples showing how to manage some of the events.

* * *

How Event-Handler Procedures Differ from Ordinary Macros

Both writing and testing an event-handler procedure differ from the techniques you've been employing throughout this book when creating and testing ordinary macro procedures. If you intend to test the examples in this chapter, I suggest that you first read the sidebar titled "How to Test Event-Handler Procedures" later in this chapter.

The following points summarize the qualities of event-handler procedures that differ from ordinary procedures:

  * An event handler must be located within a class module, not in an ordinary macro module. Therefore, you're entering the world of object-oriented programming (OOP). And in spite of some useful qualities, OOP can sink us programmers into a quagmire of complexity.
A complete example demonstrating how to add an event handler to Outlook and then test it can be found in the sidebar "How to Test Event-Handler Procedures."
  * You must declare an object variable.
  * You must initialize the object variable to connect it to an actual object.
  * You then write code in a procedure triggered by the event you're interested in.
  * You test your code differently than you would in ordinary modules. In a class module, you cannot simply test the event handler by pressing F5 to run it directly. Pressing F5 brings up the Macros dialog box. Instead you test your code indirectly by triggering the event it's designed to service—for example, by modifying a contact in the Contacts folder.

* * *

In addition to the events discussed in this chapter, Outlook supports form events such as those discussed in "Using Events to Control Forms" in Chapter 15, "Creating Complex Forms." However, as is so often the case with Outlook and Access, the forms in Outlook are somewhat unique and are different from the VBA forms you've worked with earlier in this book. Outlook's custom form is described later in this chapter in the sidebar titled "What Is VBScript?" As you'll see, you even use a special variation on the VBA language when programming these custom forms.

We'll conclude this chapter with a brief look at Outlook's Quick Steps feature. For those who don't wish to, or can't, write macros, Quick Steps provides an alternative, if seriously limited, way to automate some tasks. This tool is similar to Access's Macro Designer, though even more simplistic.

In this chapter you will learn to do the following:

  * Create event handlers
  * Work with application-level events
  * Work with item-level events
  * Understand the Quick Steps tool

# Working with Application-Level Events

By default, macros are disabled in Outlook 2013. To work with the examples in this chapter, or to use macros in general, you must select an enabling option in Outlook's Trust Center. To do so, follow these steps:

1. Click the File tab on the Ribbon.

2. Choose Options in the left pane of the File window.

3. Click Trust Center in the left pane of the Outlook Options dialog box.

4. Click the Trust Center Settings button.

5. Click Macro Settings in the left pane of the Trust Center dialog box.

6. Now choose one of the two lower options: Notification For All Macros (which gets old quickly) or Enable All Macros.

Recall that an event is something that happens to an object, such as a click, a mouse drag, a keystroke, and so on. You can write code (an _event procedure_, as it's called) to respond to the click or other event.

An application-level event is an event that happens to the Outlook application as a whole rather than to an individual item within it. For example, the Startup event is an application-level event that occurs when Outlook starts, and the Quit event is an application-level event that occurs when Outlook closes. By contrast, item-level events represent things that happen to individual items—for example, the opening of a particular email message or contact record, or a user's switching from one folder to another.

The application-level events are easier to access than the item-level events because the Application object is the topmost object and is always available when Outlook is running. This means that you don't have to use an event handler to create the Application object. It just always exists.
You do, however, have to write code to create an object for an item-level event.

To access the application-level events, you use the built-in ThisOutlookSession class module, which is automatically included in the Outlook VBA project. Look in the Project Explorer and expand the Project1 item that represents the Outlook VBA project, then expand the Microsoft Outlook Objects item. You now see the ThisOutlookSession item. Double-click it to open a Code window showing its contents. (If this is the first time you've opened the ThisOutlookSession class module, it will have no contents.)

Each of the events described in the following sections works with the Application object. For simplicity, most of the following examples directly use the Outlook Application object itself, but you could declare an object variable, then use it to return the Application object if you wish.

Recall that you can find the Application object in the drop-down list on the top left of the VBA Editor's Code window. All the events available to the Application object can be selected from the drop-down list on the top right of the Code window, as shown in Figure 27.1.

Figure 27.1 The drop-down list on the right shows all the events available in the Application object.

You can select these various events from the drop-down list (causing the editor to type in the procedure structure for you) or just type the event name yourself as a sub directly in the Code window. However, if you select from the drop-down list, the VBA Editor will automatically add any necessary arguments as well. So that's the easier approach.

Also, if you declare object variables using the WithEvents statement, like this, the Editor's drop-down lists will include these objects and their available events:

    Public WithEvents myInspectors As Inspectors
    Public WithEvents myInspector As Inspector

That can be a useful shortcut while programming because you can view every event available in an object—and also have the editor type in any necessary arguments. Later in this chapter you'll experiment with the Inspectors collection and the Inspector argument.

## Using the Startup Event

The Startup event, which takes no arguments, occurs when Outlook starts. In other words, every time the user starts Outlook, any code you might have written in the Sub Application_Startup() procedure will automatically execute.

The Startup event is useful for making sure that Outlook is correctly configured for the user to start work. Say that someone always starts off by writing notes, and the first note is always a reminder about time cards. The following example creates a new NoteItem object (a note), assigns text to its Body property, and uses the Display method to display it:

    Private Sub **Application_Startup**()
        Dim myNoteItem As NoteItem
        Set myNoteItem = Application.CreateItem(ItemType:=olNoteItem)
        myNoteItem.Body = "Please start a new time card for the day."
        myNoteItem.Display
    End Sub

You can also put the Startup event to good use by writing code with the Set command to connect an object variable to a real object it is supposed to represent. More on this later in this chapter, in the section titled "Declaring an Object Variable and Initializing an Event."

## Using the Quit Event

The Quit event occurs when Outlook is shut down. This event can be triggered in three ways:

  * By the user choosing Exit in the File tab of the Ribbon.
  * By the user clicking the red X icon in the upper right of the Outlook window.
  * By the programmer using the Quit method of the Application object in VBA.

By the time that the Quit event fires (is triggered), all of Outlook's windows have already been closed and all global variables have been released, so there's little left for a programmer to access via code in this event procedure. One possibility, however, is to display a parting message to the user, as in the following example, which displays a message on the workday that precedes a national holiday to remind the user of the holiday:

    Private Sub Application_Quit()

        Dim strMessage As String
        Select Case Format(Date, "MM/DD/YYYY")
            Case "01/18/2013"
                strMessage = "Next Monday is Martin Luther King Day."
            Case "02/15/2013"
                strMessage = "Next Monday is President's Day."
            Case "05/24/2013"
                strMessage = "Next Monday is Memorial Day."
            Case "07/03/2013"
                strMessage = "Tomorrow is Independence Day." & _
                    " Friday is a company holiday."
            Case "08/30/2013"
                strMessage = "Next Monday is Labor Day."
            'other National Holidays here
        End Select

        If strMessage = "" Then Exit Sub

        MsgBox strMessage, vbOKCancel + vbExclamation, "Don't Forget..."

    End Sub

## Using the ItemSend Event

The ItemSend event occurs when an item is sent, either by the user issuing a Send command (for example, by clicking the Send button in a message window) or by executing the Send method in VBA code. The syntax for the ItemSend event is as follows:

    Sub _expression_ _ItemSend(ByVal Item As Object, Cancel As Boolean)

Here, _expression_ is a required expression that returns an Application object. Item is a required argument that specifies the item that's being sent. Cancel is an optional Boolean argument that you can set to True to prevent the item from being sent.

The following example examines the Subject property of the Item object being sent. If the Subject property is an empty string, the message box prompts the user to add a subject line, and the Cancel = True statement cancels the sending of the item:

    Private Sub Application_ItemSend(ByVal Item As Object, Cancel As Boolean)
        If Item.Subject = "" Then
            MsgBox "Please add a subject line to this message before sending it."
            Cancel = True
        End If
    End Sub

## Using the NewMail and NewMailEx Events

The NewMail event occurs when one or more new mail items arrive in the Inbox. The NewMail event can be useful for processing messages automatically as they arrive. (You can also define Outlook rules to sort messages automatically without code.) The NewMail event takes no arguments.

The following example displays a message box that offers to show the Inbox when new mail arrives, triggering the NewMail event:

    Private Sub Application_NewMail()
        If MsgBox("You have new mail. Do you want to see your Inbox?", _
            vbYesNo + vbInformation, "New Mail Alert") = vbYes Then

            Application.GetNamespace("MAPI").GetDefaultFolder(olFolderInbox).Display
        End If
    End Sub

The NewMailEx event is a more complex version of the NewMail event that passes to your code a list of the items received in the Inbox since that event last fired. The NewMailEx event passes this list only for Exchange Server and other mailboxes that provide notification of messages received. The syntax is as follows:

    Sub _expression_ _NewMailEx(EntryIDCollection As String)

Here, _expression_ is a required expression that returns an Application object. EntryIDCollection is a string that contains the entry IDs of the messages that have been received.
Each entry ID is separated from the next by a comma; if there is a single entry ID, there is no comma in the EntryIDCollection string.

The following example of a NewMailEx event procedure uses a Do While...Loop to separate the individual message IDs (by using the InStr function to identify each section of the EntryIDCollection string, up to the next comma, in turn). Then the code builds a string that contains introductory text followed by the subject line of each message, one message to a line. Because the final (or only) entry ID has no trailing comma, the code picks it up separately after the loop. Finally, the procedure displays the string in a message box so that when Outlook receives new mail, the user receives an executive summary of the subject lines:

    Private Sub Application_NewMailEx(ByVal EntryIDCollection As String)

        Dim myMailItem As Object
        Dim intMsgIDStart As Integer, intMsgIDEnd As Integer
        Dim intCutPoint As Integer, strMailItemID As String, strMailList As String

        intMsgIDStart = 1
        intCutPoint = Len(EntryIDCollection)

        intMsgIDEnd = InStr(intMsgIDStart, EntryIDCollection, ",")
        strMailList = "You have the following messages:"

        Do While intMsgIDEnd <> 0
            strMailItemID = Strings.Mid(EntryIDCollection, intMsgIDStart, _
                (intMsgIDEnd - intMsgIDStart))
            Set myMailItem = Application.Session.GetItemFromID(strMailItemID)
            strMailList = strMailList & vbCr & myMailItem.Subject
            intMsgIDStart = intMsgIDEnd + 1
            intMsgIDEnd = InStr(intMsgIDStart, EntryIDCollection, ",")
        Loop

        'The last entry ID has no trailing comma, so extract it here
        If intMsgIDStart <= intCutPoint Then
            strMailItemID = Strings.Mid(EntryIDCollection, intMsgIDStart, _
                intCutPoint - intMsgIDStart + 1)
            Set myMailItem = Application.Session.GetItemFromID(strMailItemID)
            strMailList = strMailList & vbCr & myMailItem.Subject
        End If

        MsgBox strMailList, vbOKOnly + vbInformation, "Mail Alert"

    End Sub

* * *

An Alternative to the NewMail Events

Instead of using a NewMail or NewMailEx event, you can use an ItemAdd event with the items in the Inbox to process each new message that arrives.

* * *

## Using the AdvancedSearchComplete and the AdvancedSearchStopped Events

Outlook provides two events for working with advanced searches created using the AdvancedSearch method. The AdvancedSearchComplete event fires when the AdvancedSearch method is run via VBA and finishes searching. The AdvancedSearchStopped event fires when the AdvancedSearch method is run via VBA and is stopped by using the Stop method of the search.

The syntax for the AdvancedSearchComplete event is as follows:

    Private Sub _expression_ _AdvancedSearchComplete(ByVal SearchObject As Search)

Here, _expression_ is a required expression that returns an Application-type object variable that has been declared with events in a class module. SearchObject is the Search object that the AdvancedSearch method returns.

The following example uses the AdvancedSearchComplete event to return the number of search results that were found by the AdvancedSearch method:

    Private Sub Application_AdvancedSearchComplete(ByVal SearchObject As Search)
        MsgBox "The search has finished running and found " & _
            SearchObject.Results.Count & " results.", vbOKOnly + vbInformation, _
            "Advanced Search Complete Event"
    End Sub

The following example uses the AdvancedSearchStopped event to inform the user that the search has been stopped:

    Private Sub Application_AdvancedSearchStopped(ByVal SearchObject As Search)
        MsgBox "The search was stopped by a Stop command.", vbOKOnly
    End Sub

## Using the MAPILogonComplete Event

The MAPILogonComplete event occurs when the user has successfully logged on to Outlook. You can use the MAPILogonComplete event to ensure that Outlook is configured correctly for the user or simply to display some information in a message. The MAPILogonComplete event takes no arguments.
The following example of a MAPILogonComplete procedure displays a message about current trading conditions when the user has successfully logged on to Outlook. The code includes a commented line indicating where the String variables strPubDowBegin and strPubForecast would be declared and assigned data in a real-world implementation of this example:

    Private Sub Application_MAPILogonComplete()

        Dim strMsg As String

        'strPubDowBegin and strPubForecast declared and assigned strings here

        strMsg = "Welcome to the UltraBroker Trading System!" & vbCr & vbCr
        strMsg = strMsg & "Today's starting value is " & strPubDowBegin & "." _
            & vbCr & vbCr
        strMsg = strMsg & "Today's trading forecast is " & strPubForecast & "."
        MsgBox strMsg, vbOKOnly + vbInformation, _
            "UltraBroker Trading System Logon Greeting"
    End Sub

## Using the Reminder Event

The Reminder event fires immediately before the reminder for a meeting, task, or appointment is displayed to the user. You can use the Reminder event to take an action related to the reminder. Because the reminder itself is usually adequate for reminding the user of the meeting, task, or appointment, the Reminder event tends to be more useful when accessing Outlook programmatically than when a user is working interactively with Outlook. The syntax is as follows:

    Sub _expression_ _Reminder(ByVal Item As Object)

Here, _expression_ is a required expression that returns an Application object, and Item is the AppointmentItem, MailItem, ContactItem, or TaskItem object associated with the reminder.

## Using the OptionsPagesAdd Event

The OptionsPagesAdd event occurs when either the Options dialog box (File ⇒ Options) or the Properties dialog box for a folder, such as the Inbox, is opened. (To open the Properties dialog box for a folder, right-click the folder, and then choose Properties from the context menu.) You can use this event to add a custom page (which is contained in a COM [Component Object Model] add-in that you have created) to the Options dialog box or the Properties dialog box. The syntax for the OptionsPagesAdd event is as follows:

    Sub _expression_ _OptionsPagesAdd(ByVal Pages As PropertyPages, _
        ByVal Folder As MAPIFolder)

Here, _expression_ is a required expression that returns an Application object or a NameSpace object. Pages is a required argument that gives the collection of custom property pages added to the dialog box. Folder is a required argument used when _expression_ returns a NameSpace object. Folder returns the MAPIFolder object for which the Properties dialog box is being opened.

# Working with Item-Level Events

In addition to the application-level events discussed so far, Outlook supports a wide variety of _item-level events_ —events that fire when specific items are manipulated, as opposed to events related to Outlook as a whole.

You can handle item-level events in Outlook in two ways:

  * By declaring an event in a class module and running an initialization procedure so that VBA then traps the event when it fires. This chapter takes this approach.
  * By creating Visual Basic Script (VBScript) code and placing it in a "custom form" used by the item. Custom forms are not to be confused with the UserForms we've been working with in the VBA Editor throughout this book. You create a custom form in Outlook by clicking the Developer tab on the Ribbon, then choosing options displayed in the Custom Forms group of the Ribbon.

* * *

What Is VBScript?
Script versions of computer languages were originally designed to execute when a user visits a web page, so these languages are deliberately given fewer capabilities than ordinary languages. For example, VBScript doesn't have a command that deletes a folder in Outlook, whereas VBA does (FolderRemove). Why? You don't want Outlook folders deleted—or similar damaging actions triggered—just because you opened a malicious web page in your browser.

Although the original intent was that script languages would be lightweight, Web-oriented versions of their parent languages, as always seems to happen with mission creep, they have changed over time to perform various tasks and to have a variety of implementations. This sort of corruption is typical in computer software: there are many versions of "standards" like XML, HTML, and the like. They start out with the intention to be uniform across platforms, to be governed by certain laws, and so on. Then they deconstruct. It reminds you of Mae West's famous remark: "I used to be Snow White, but I drifted."

In spite of VBScript's limitations, you might want to employ it for one specialized job in Outlook: sharing items with others. VBScript code is contained within its custom form, so you can send it to other people. You can't directly export VBA to others inside items you share.

If you're interested in pursuing Outlook's Custom Forms and the VBScript that drives them, consult the useful tutorial here:



* * *

## Declaring an Object Variable and Initializing an Event

Follow these steps to declare an object variable and initialize an event:

1. Use a class module to contain your object-variable declaration, in one of the following three ways:

  * Use the built-in ThisOutlookSession module. In the Project Explorer, expand the project name (it's in boldface and by default is named Project1). Expand the Microsoft Outlook Objects item, and double-click the ThisOutlookSession item to open its Code window.
  * Create a new class module by right-clicking the project name in the Project Explorer and choosing Insert ⇒ Class Module from the context menu. The VBA Editor automatically opens a Code window for the class.
  * If there is one, you can open an existing class module by double-clicking it in the Project Explorer.

2. In the declarations area at the beginning of your class module (at the top of the Code window), declare a variable to represent the object to which the event applies. Use the WithEvents keyword to specify that this object has events. The following example creates a public variable named myPublicContactItem:

    Public **WithEvents** myPublicContactItem As ContactItem

3. Initialize the object variable by setting it to represent the appropriate object. The following example sets our myPublicContactItem variable to represent the first item in the default contacts folder:

    Set myPublicContactItem = Application.GetNamespace("MAPI") _
        .GetDefaultFolder(olFolderContacts).Items(1)

Once you've initialized the object variable, the event-handler procedure will run whenever the event fires.

You can initialize the object variable manually if necessary, and you may find it convenient to do so when you're writing and testing code to handle events. But if you need to handle the event each time Outlook runs—if you want to make the macro a permanent part of your macro collection—it's obviously best to run the code to initialize the object variable automatically.
For example, you might use the Startup event of the Application object (discussed in "Using the Startup Event," earlier in this chapter) to run event-handling initialization code automatically each time Outlook starts. In other words,

    Private Sub Application_Startup()

        Set myPublicContactItem = Application.GetNamespace("MAPI") _
            .GetDefaultFolder(olFolderContacts).Items(1)

    End Sub

## Understanding the Events That Apply to All Message Items

Table 27.1 lists the common message events. I'm using the term _message_ here to refer to the AppointmentItem, MailItem, ContactItem, and TaskItem objects. In other words, Table 27.1 lists the most common events that are available to these four objects.

But be aware that there are additional "item" objects in Outlook, such as the DocumentItem, DistListItem, JournalItem, MeetingItem, and so on. To view these various items, and see descriptions of their events, visit this web page:



Also note that although Table 27.1 describes 16 common events, each of the "item" objects actually has 26 events. As an example, the complete list of events for the MailItem object in Outlook 2013 is provided on this web page:



Table 27.1 Common item-level events

**Event** | **Event Occurs**
---|---
AttachmentAdd | After an attachment is added to the item
AttachmentRead | When the user opens an email attachment for reading
BeforeAttachmentSave | When the user chooses to save an attachment but before the command is executed
BeforeCheckNames | Before Outlook checks the names of the recipients of an item being sent
BeforeDelete | Before an item is deleted
Close | When an inspector is being closed but before the closing occurs
CustomAction | When the custom action of an item is executed
CustomPropertyChange | When a custom property of an item is changed
Forward | When the user forwards an item
Open | When an item is opened in an inspector
PropertyChange | When a standard property (as opposed to a custom property) in the item is changed
Read | When an item is opened for editing in an inspector window or is selected for editing in-cell
Reply | When the user issues a Reply command for an item
ReplyAll | When the user issues a Reply All command
Send | When a Send command has been issued but before the item is sent
Write | When an item is saved, either explicitly by the user or implicitly by Outlook

Note that the Close event applies to the Inspector object and the Explorer object as well as to the objects just mentioned.

The events that fire before an action occurs allow you to cancel the action, preventing it from happening at all. The syntax for these events uses a Boolean argument named Cancel that you can set to True to prevent the action from taking place. For example, the syntax for the BeforeDelete event is as follows:

    Sub _expression_ _BeforeDelete(ByVal Item As Object, **Cancel** As Boolean)

Here, _expression_ is a required expression that returns one of the message items to which the event applies (for example, a TaskItem object). The following example uses the BeforeDelete event to see whether the TaskItem object that's open in an inspector is marked as complete when the user tries to delete it.
If the task is not marked as complete, a message box prompts the user to complete the task, and the example then sets the Cancel argument to True to prevent the deletion:

    Private Sub myTaskItem_BeforeDelete(ByVal Item As Object, Cancel As Boolean)
        If myTaskItem.Complete = False Then
            MsgBox "Please complete the task before deleting it.", _
                vbOKOnly + vbExclamation, "Task Is Incomplete"
            **Cancel = True**
        End If
    End Sub

* * *

The Difference between the Read and Open Events

The Read event and the Open event both occur when the user opens an existing item for editing. The difference between the two events is that the Open event occurs only when the item is being opened in an inspector window, whereas the Read event occurs both when the item is being opened in an inspector window and also when it is being selected for editing in a cell.

* * *

## Understanding the Events That Apply to Explorers, Inspectors, and Views

Table 27.2 lists the events that apply to explorers, inspectors, and views. Some events apply to both explorers and inspectors.

Table 27.2 Events that apply to explorers, inspectors, or views

**Event** | **Applies To** | **Event Occurs**
---|---|---
BeforeFolderSwitch | Explorer | Before the explorer displays a new folder
BeforeItemCopy | Explorer | When the user issues a Copy command but before the Copy operation takes place
BeforeItemCut | Explorer | When an item is cut from a folder
BeforeItemPaste | Explorer | Before an item is pasted
BeforeViewSwitch | Explorer | Before the view changes in the Outlook window
Close | Explorer, Inspector | When an explorer window or an inspector window is closing
FolderSwitch | Explorer | After an explorer displays a new folder
SelectionChange | Explorer | When the focus is moved to a different item in a folder, or when Outlook selects the first item in a folder when the user selects that folder
AttachmentSelectionChange | Explorer, Inspector | When a new or different attachment is selected
ViewSwitch | Explorer | When the view changes in the explorer window
Activate | Explorer, Inspector | When an explorer window or an inspector window is activated (becomes the active window)
Deactivate | Explorer, Inspector | When an explorer window or an inspector window is deactivated (stops being the active window)
BeforeMaximize | Explorer, Inspector | When the user maximizes the explorer or inspector but before maximization takes place
BeforeMinimize | Explorer, Inspector | When the user minimizes the explorer or inspector but before minimization takes place
BeforeMove | Explorer, Inspector | When the user moves an explorer window or an inspector window but before the action takes place
BeforeSize | Explorer, Inspector | When the user resizes the explorer window or inspector window but before the resizing takes place
PageChange | Inspector | When the active form page changes
InlineResponse | Explorer | When an inline response appears in the reading pane
InlineResponseClose | Explorer | When an inline response in the reading pane closes
NewExplorer | Explorers | When a new explorer window is opened
NewInspector | Inspectors | When a new inspector window is opened
ViewAdd | Views | When a view is added to the Views collection
ViewRemove | Views | When a view is removed from the Views collection

If you work on a small screen (for example, a laptop screen), you might prefer to use the NewInspector event to maximize each inspector window you open and to hide any toolbars you don't need.
The first procedure in the following example (which includes the necessary declarations) uses the NewInspector event to make sure the Standard toolbar is displayed, hide the Advanced toolbar, and assign the Inspector object representing the new inspector to the Public object variable myInspector. The second procedure uses the Activate event of the myInspector object to maximize its window by setting the WindowState property to olMaximized.

The net effect of these two event procedures is to configure the toolbars as described earlier and maximize the inspector window. Put more simply, if you, for example, double-click an email, it opens in a new window. That window is the "inspector" object. The Activate event procedure is necessary because the NewInspector event runs before the inspector window is displayed, which means the NewInspector event procedure cannot maximize the inspector window.

    Public WithEvents myInspectors As Inspectors
    Public WithEvents myInspector As Inspector

    Private Sub myInspectors_NewInspector(ByVal Inspector As Outlook.Inspector)
        With Inspector
            With .CommandBars
                .Item("Standard").Visible = True
                .Item("Advanced").Visible = False
            End With
            Set myInspector = Inspector
        End With
    End Sub

    Private Sub myInspector_Activate()
        myInspector.WindowState = olMaximized
    End Sub

* * *

**How to Test Event-Handler Procedures**

You don't test event handlers the same way that you test ordinary VBA modules. In an ordinary module, you click to put the blinking insertion cursor inside the macro you want to execute, then press F5 to execute that procedure.

In a _class_ module, by contrast, pressing F5 merely opens the Macros dialog box rather than directly running the code.

If you are confused about where to put handler code, and how to test it, don't be discouraged. An event handler must be put into a class module. And whenever you use classes, you're venturing into OOP. OOP, whatever its merits, always adds a layer of complexity for the programmer. So, let's briefly review this topic so you'll see how to write, then test, event handlers.

In this next example, you want to respond to any changes the user might make to one of the user's contacts. In other words, you need to write some code in the ItemChange event of the Contacts folder. Perhaps you want your code to alert the user that they need to make further changes. Or that they need to send this new information to their assistant. Whatever the reason, your purpose is to write code that executes when a Contact item changes—when the user modifies a contact, then clicks the Save button, thereby triggering the ItemChange event.

"Handling" an event (writing your own code that executes when an event takes place) requires that you take three steps:

1. **Create an object variable**—using the WithEvents command—that will represent the object whose event you want to handle. Where does this code go? At the top of a class module in the General Declarations section above any subs. Outlook has that special built-in class module named ThisOutlookSession. So instead of creating a new class module, let's keep things simple and just use the existing ThisOutlookSession class module to declare our object variable.

2. **Point or connect (Set) your new object variable to the actual object whose event you want to handle**. In our example, we want to handle an event of the Items collection in Outlook's Contacts folder. Where does this code go? It could be put into a macro.
Or, because we want to have this connection made automatically, let's put it in Outlook's startup event. That way the connection is made whenever the user runs Outlook. Remember that the various Office applications have specially reserved names: if you name a procedure in Word AutoExec, for example, its code executes when you start Word. If you name a procedure Application_Startup in Outlook, that's the equivalent of Word's AutoExec.

3. **Write the event-handler code—the actions you want taken when this event occurs**. Where does this code go? In the same class module where you declared the object variables (step 1, above).

There are other ways to handle events, but this is a straightforward example. To keep it simple, we'll put the code for all three steps in Outlook's built-in ThisOutlookSession class module. Now let's follow the preceding steps, only this time we'll insert the actual code:

1. First open Outlook's VBA Editor by pressing Alt+F11. Expand Project1 in the project window until you see the ThisOutlookSession class module (under Microsoft Outlook Objects). Double-click ThisOutlookSession to open its code window.

2. At the top of the ThisOutlookSession Code window, type the object variable's declaration:

    Public **WithEvents** objContacts As Items

3. Now, in the Application Startup event, we'll write code that connects the object variable to the real Outlook object we're interested in: the Items collection of the Contacts folder:

    Private Sub Application_Startup()

        **Set** objContacts = Application.GetNamespace("MAPI") _
            .GetDefaultFolder(**olFolderContacts**).**Items**

    End Sub

4. Finally, we'll write the event-handler code that does the job we want done. This code also goes in the ThisOutlookSession module:

    Private Sub objContacts_**ItemChange**(ByVal Item As Object)

        MsgBox "This Contact Item Has Been Changed"

    End Sub

Now to test this event handler. First restart Outlook (or click inside the Application_Startup procedure and press F5) so that the Set statement actually runs and connects objContacts to the Contacts folder. Then, in Outlook, open your Contacts folder. In Outlook 2013 this folder is named _People_ and is found in the lower-left corner next to the Mail and the Calendar links. Double-click some random contact and type something in the Notes field, then click the Save button. This should cause your event-handler code to execute, displaying a message box telling you that the contact info has changed.

I'm not going to pretend that any of this is easy. Although OOP has its merits, writing code employing OOP rules can be a real wrestling match. Complexities involving diction, punctuation, reference, scope, precedence, and other issues will often draw you into a world of multiplying interactions—leading to unpredictable and perplexing test-code-retest cycles. Your best bet when working with class modules is to try to find working example code online that's close to what you're trying to accomplish, then modify it to suit your purposes.

* * *

## Understanding the Events That Apply to Folders

Outlook provides three events (see Table 27.3) that apply to folders.

Table 27.3 Events that apply to folders

**Event** | **Event Occurs**
---|---
FolderAdd | When a folder is added to the specified Folders collection
FolderChange | When a folder in the specified Folders collection is changed
FolderRemove | When a folder is removed from the specified Folders collection

## Understanding the Events That Apply to Items and Results

Table 27.4 lists the events that apply to items and results.
Table 27.4 Events that apply to items and results

**Event** | **Event Occurs**
---|---
ItemAdd | When one or more items are added to the collection but not when a large number of items are added all at once
ItemChange | When an item in the Items collection or the Results collection is changed
ItemRemove | When an item is deleted from the Items collection or the Results collection but not when 16 or more items are deleted at once from a Personal Folders file, an Exchange mailbox, or an Exchange public folder; also not when the last item in a Personal Folders file is deleted

The example in the sidebar "How to Test Event-Handler Procedures" earlier in this chapter employs the ItemChange event to monitor when any contact is changed in the Contacts folder.

## Understanding the Events That Apply to Reminders

Table 27.5 explains the events that Outlook provides for reminders. You can use these events to take actions when a reminder fires, before the reminder dialog box appears, when the user clicks the Snooze button to defer a reminder, or when reminders are added, changed, or removed.

Table 27.5 Events that apply to reminders

**Event** | **Event Occurs**
---|---
BeforeReminderShow | Before Outlook displays the Reminder dialog box
ReminderAdd | When a reminder is added
ReminderChange | After a reminder has been changed
ReminderFire | Before a reminder is executed
ReminderRemove | When a reminder is removed from the Reminders collection
Snooze | When the user defers a reminder by clicking the Snooze button

## Understanding the Events That Apply to Synchronization

If you write procedures to synchronize Outlook, you may need to use the events that apply to the SyncObject object, which represents a Send/Receive group for a user. (You can access the SyncObject object by using the SyncObjects property of the NameSpace object to return the SyncObjects collection.) Table 27.6 explains the events that apply to the SyncObject object.

Table 27.6 Events that apply to the SyncObject object

**Event** | **Event Occurs**
---|---
SyncStart | When Outlook starts synchronizing a user's folders
Progress | Periodically during the synchronization of Outlook folders
SyncEnd | After synchronization ends
OnError | When an error occurs during synchronization

The following example uses the OnError event with the object variable mySyncObject. If an error occurs during synchronization of the SyncObject represented by mySyncObject, the procedure displays an error message giving the error code and description:

    Private Sub mySyncObject_OnError(ByVal Code As Long, _
        ByVal Description As String)

        Dim strMessage As String
        strMessage = "An error occurred during synchronization:" & vbCr & vbCr
        strMessage = strMessage & "Error code: " & Code & vbCr
        strMessage = strMessage & "Error description: " & Description
        MsgBox strMessage, vbOKOnly + vbExclamation, "Synchronization Error"

    End Sub

# Understanding Quick Steps

The Quick Steps feature allows non-programmers to combine actions in Outlook without having to record a macro (Outlook has no macro recorder anyway) or write a procedure using VBA.

While looking at the Mail page in Outlook, click the Home tab on the Ribbon. You'll see the Quick Steps area right in the middle of the Ribbon.
The rationale for Quick Steps is the same as the rationale for writing or recording macros: After you've specified and saved a set of actions, you need not manually repeat those actions in the future—you merely run the macro and the behaviors are carried out automatically.

Quick Steps is similar to Access's Macro Designer: You're presented with a list of common actions and you can choose to combine two or more of them into a macro-like little "program." And, like a macro, a Quick Steps one-click button saves time by launching the "program" anytime the user chooses. Non-programmers can build the Quick Steps "programs" out of actions that they frequently perform—thus saving time.

Although Quick Steps is not nearly as flexible and powerful as writing macros in VBA, for a common task you might consider seeing whether a Quick Step can handle it.

Some sample Quick Steps are already available in the Ribbon, and when you first click them you're asked to customize their behavior to suit your way of working. Click, for example, the _MoveTo: ?_ sample, and the First Time Setup dialog box opens, as shown in Figure 27.2.

Figure 27.2 Experiment with the sample Quick Steps to get an idea how to create and customize them.

As you see in Figure 27.2, you're allowed to customize this Quick Step by changing its name, specifying the target folder, and deciding whether or not to mark it as read. So this little program performs two actions at the click of a button. That could be a time-saver if you frequently store read email in a particular folder. Also notice the Options button, where you can further modify the behavior of this Quick Step. You can add more actions, delete actions, specify a shortcut key, and write a tooltip.

Quick Steps makes 20 actions available to you, so it's no competition for the thousands of things you can do with VBA. Nonetheless, you might want to consider employing the Quick Steps tool for quick and easy automation of common mail-related tasks in Outlook.

# The Bottom Line

**Work with application-level events.**

Event handlers are procedures that contain code that responds to an event. In other words, if a user modifies one of their contacts, an event can detect this modification and execute code you've written to respond to the modification.

Master It

Event-handler procedures are unlike ordinary macro procedures in several ways. Name one of the differences.

**Work with item-level events.**

Outlook has two primary kinds of events.

Master It

What are the two types of events in Outlook? And how do they differ?
Chapter 28

Understanding the Access Object Model and Key Objects

If you work with Access databases, forms, or reports, you'll find many opportunities for customizing Access using VBA to streamline your work and that of your colleagues. Depending on the purposes for which you use Access, you might program Access to automatically extract data sets you need, to create custom reports on a regular schedule, and to perform many other tasks.

Even if your work in Access consists simply of entering data into databases and checking that it is correct, you may be able to program VBA to make mundane tasks less onerous. For example, you might use VBA to simplify the process of data entry or to validate the data that the user enters to avoid problems further down the line.

This chapter first shows you how to get started with VBA in Access because Access implements VBA in a different way from the other applications this book has discussed.
You'll then come to grips with the Access object model and learn about its most important creatable objects. After that, the chapter shows you how to open and close databases, set startup properties for a database, work with the Screen object, and use the DoCmd object to run Access commands.

The next chapter discusses how to manipulate the data in an Access database via VBA.

In this chapter you will learn to do the following:

  * Get started with VBA in Access
  * Understand Access-style macros
  * Open and close databases
  * Work with the Screen object
  * Use the DoCmd object to run Access commands

# Getting Started with VBA in Access

Access implements VBA differently than the other Office applications do. Here are the main differences:

  * Collections in Access are zero-based—the first item in a collection is numbered 0 (zero) rather than 1. For example, Forms(0).Name returns the Name property of the first Form object in the Forms collection. Zero-based collections make your job as a programmer more difficult, particularly when employing loops.
  * The term _macro_ is used in a special way in Access, unlike the way it's used in other Office applications, not to mention all other forms of computing. An Access "macro" is a historical entity—a holdover from the early days of this database system. Some consider the whole approach rather harebrained because it's limited to a subset of the available programming statements, and it's not nearly as useful or flexible or efficient (in most cases) as just writing VBA code. With an Access macro, you enter a list of actions that you want to perform by using a special utility—the Macro Designer (formerly known as the Macro Builder)—that's built into Access. You choose these actions from a list, then type in arguments in the next cell in a table displayed by the Macro Designer. So it's all a bit like filling in a form and not that much like real programming. It's similar to Outlook's Quick Steps tool described in Chapter 27, "Working with Events in Outlook."
  * Access's so-called "macros" are created by clicking the Macro button on the Ribbon's Create tab, which opens the Macro Designer. From now on we'll call these self-styled "macros" _Access_ - _style macros_ , to distinguish them from the true macros we've worked with throughout this book.
  * When you write VBA code in the Access VBA Editor—as you would in the other Office 2013 applications—you create true macros, properly so called. (Just remember that Access doesn't describe these VBA procedures as _macros_. You just have to get used to the difference in terminology.) We'll focus our attention on the VBA capabilities in Access rather than on the legacy Macro Designer.
  * For a user to execute a macro Sub, you must first create a function that calls the subprocedure. While you, the programmer, are working on a macro in the VBA Editor, you can debug and run the subprocedure by using the VBA Editor's usual commands (for example, press F5 to run and test the subprocedure). But a user will not be able to run the macro directly from the Access user interface. Instead, you must employ the RunCode action, as you'll see. There is an exception to this rule. In Chapter 31, "Programming the Office 2013 Ribbon" (see the section titled "Direct Communication with VBA"), you'll learn how to directly trigger VBA by modifying the Access Ribbon.

The following sections provide a complete, start-to-finish example of how to work with VBA in Access.
You create a module, then write a procedure in that module, and finally, use the Macro Designer to create an Access-style macro whose sole purpose is to start the execution of the VBA procedure.

This chapter shows you how to create "macros" in Access, so first you need to ensure that macros are, in fact, enabled in Access. Follow these steps to enable Access macros:

1. Click the File tab on the Ribbon.

2. Click Options in the left pane.

3. Click Trust Center in the Access Options dialog box.

4. Click the Trust Center Settings button.

5. Click Macro Settings in the left pane of the Trust Center dialog box.

6. Click the Enable All Macros option button.

7. Click OK twice to close the dialog boxes.

## Creating a Module in the VBA Editor

To create a module where you can write VBA code, open an Access database and click the Ribbon's Database Tools tab. Click the Visual Basic button on the Ribbon (or simply press Alt+F11).

The VBA Editor opens. Choose Insert ⇒ Module in the VBA Editor or right-click the project's name (it's boldface) in the Project Explorer pane, and choose Insert ⇒ Module from the shortcut menu.

## Creating a Function

After creating a VBA module in the VBA Editor, you can create a function within it as described earlier in this book. The following example creates a function named Standard_Setup that simply displays a message box to indicate that it is running (a later section uses this function as an example):

    Public Function Standard_Setup()
        'put your choice of commands here
        MsgBox "The Standard_Setup macro is running."
    End Function

You can test this code as usual by clicking somewhere inside the procedure, then pressing F5.

After creating the function, switch back to Access by pressing Alt+F11 or clicking the View Microsoft Access button on the far left of the Standard toolbar in the VBA Editor. Of course, you could also use the traditional Windows Alt+Tab shortcut.

## Using the Macro Designer

Although this and the next chapter focus on automating Access via the more flexible and powerful VBA language, some readers may be interested to know how to work with the Macro Designer tool. So we'll explore it briefly before moving on to VBA examples.

## Creating an Access-Style Macro to Run a Function

Recall that a user can't directly trigger a VBA procedure interactively from the main Access interface (although you, the programmer, can press F5 to test procedures in the VBA Editor). You'll find no Macros dialog box like the one in Word and other Office 2013 applications. True, there _is_ a Run Macro button on the Database Tools tab of the Access Ribbon, but this feature cannot directly trigger a VBA procedure. (It only triggers an Access-style macro.)

For a user to run a VBA procedure, you have to create an Access-style macro in Access's Macro Designer and use its RunCode action (command) to call the VBA procedure. We'll see how to do that now:

1. Display the database window if it's not already displayed. For example, click the word _View_ (the _word_ with the small black down-arrow, not the icon) on the Ribbon's Home tab, then select Datasheet View from the options displayed.

2. Click the Macro button on the Ribbon's Create tab to open the Macro Designer window (see Figure 28.1). This also opens a Design tab on the Ribbon.

Figure 28.1 Use the Macro Designer window to create a new Access-style "macro" in Access.

3.
In the Action Catalog pane on the right, open the Macro Commands folder and double-click the RunCode item. This inserts the RunCode command into the middle pane. (The RunMacro command, by contrast, can execute only Access-style macros. Likewise, a button added to the Quick Access Toolbar above the Ribbon can execute only Access-style macros.)

4. In the Function Name field, type **Standard_Setup()**, the name of the VBA test function you created earlier in this chapter. _The empty parentheses are required, so don't omit them._

5. Click the Save icon in the Quick Access Toolbar above the Ribbon, or press Ctrl+S.

6. Type the name **test** in the Save As dialog box, and click the OK button. (Tip: If you modify the macro later and want to change its name, choose File ⇒ Save As ⇒ Save Object As, then click the Save button. Isn't Access remarkably roundabout sometimes? Or you can right-click the macro's name in the left pane of the main Access window, then choose Rename.)

7. Now test this macro (and consequently the VBA procedure it triggers) by clicking the Run icon on the Ribbon. It's the icon with the red exclamation point. This icon appears on the Design tab of the Ribbon only when the Macro Designer is active. You now see the message box telling you that your macro is running.

The user can execute Access-style macros when the Macro Designer is closed. Just double-click _test_ in the All Access Objects list (the pane on the left side of the main window). It may be necessary to click the small down arrow at the top of this pane and choose Show All.

Or the user can click the Database Tools tab of the Ribbon, then click the Run Macro icon in the Macro section (it too has a red exclamation point). Access's Run Macro dialog box opens. Select _test_ as the macro name you want to run, then click OK to close the dialog box and execute your macro.

* * *

Three Ways to Execute an Access-Style Macro

To sum up, a user can execute an Access-style macro in only three ways:

  * Choose the Run Macro option from the Ribbon's Database Tools tab. This opens a small Run Macro dialog box from which you can select an Access-style macro and execute it.
  * Double-click the Access-style macro's name in the All Access Objects list in the left pane of the main Access window.
  * Add a button to the Quick Access Toolbar that will execute the Access-style macro.

Add a button to the Quick Access Toolbar by following these steps:

1. Click the Customize Quick Access Toolbar button (the down arrow icon on the right of the Quick Access Toolbar at the top left of the Access window).

2. Click the More Commands option in the drop-down list. The Access Options dialog box opens.

3. Select Macros in the Choose Commands From drop-down list.

4. Double-click your macro's name to move it into the list on the right side (where the toolbar's displayed items are listed).

5. Click OK to close the dialog box and put your macro on the toolbar.

Note that you can't trigger a macro from a keyboard shortcut (Access doesn't permit you to create custom keyboard shortcuts).

* * *

## Translating an Access-Style Macro into a VBA Macro

Given that VBA is far more powerful than the Access-style macros, you might want to convert an Access-style macro into VBA to enhance it. You can have Access automatically translate Access-style macros into VBA functions. Follow these steps:

1. Display the database window if it's not already displayed.
For example, click the word _View_ (the _word_ with the small black down-arrow, not the icon) on the Ribbon's Home tab, then select Datasheet View from the options displayed.

2. Click the tab named _test_ at the top of the main Access window to view the Access-style macro you created earlier in this chapter. With the Macro Designer active, the Design tab appears on the Ribbon.

3. On the left side of the Ribbon, click Convert Macros To Visual Basic.

4. A dialog box appears in which you can optionally choose not to include error handling or comments.

5. Click the Convert button.

6. Press Alt+F11 to open the VBA Editor.

7. In the Project Explorer, locate and double-click the module named Converted Macro-test. You now see the translated code:

    '------------------------------------------------------------
    ' test1
    '
    '------------------------------------------------------------
    Function test1()
    On Error GoTo test1_Err

        Run_SampleProcedure

    test1_Exit:
        Exit Function

    test1_Err:
        MsgBox Error$
        Resume test1_Exit

    End Function

If you opted to omit the error trapping and commenting, it's simpler:

    '------------------------------------------------------------
    ' test1
    '
    '------------------------------------------------------------
    Function test1()

        Run_SampleProcedure

    End Function

## Using an _AutoExec_ Macro to Initialize an Access Session

To set up preconditions for an Access session, you can use an AutoExec macro. When Access starts, it checks whether there is a macro named AutoExec; if so, that macro runs automatically. This AutoExec feature is also available in other Office applications, like Word.

For example, you might choose to maximize the application window, open a particular item (for example, a table), or display a particular record. Note that AutoExec must be the name of an Access-style macro, not a VBA procedure.

By the way, you can prevent an AutoExec macro from running when you open a database by holding down the Shift key while the database opens.

To create an AutoExec macro, start a new macro as described in the previous section, add to it the actions that you want the macro to perform, and save it with the special reserved name AutoExec. The macro then runs the next time you open the database.

We'll now turn our attention to regular VBA programming, but if you're interested in learning more about the Macro Designer, see the tutorial on this web page:



## Running a Subprocedure

Until now, you've mostly created traditional subs when writing or recording a macro. And for consistency, the Access VBA code examples in this chapter and elsewhere will also be subs.

But beware. If you want to permit the user to execute Access VBA procedures, they must be turned into functions. Just replace the word Sub with Function in your code. VBA will then automatically change the line at the end of your procedure from End Sub to End Function. Easy enough.

So, just remember that in this way, and many others, Access differs from other Office applications. When you're writing VBA code in Access, there's no good reason to put it in a subprocedure rather than in a function, because a sub cannot be triggered directly in Access.

Only functions can be directly triggered, as the example in the previous section illustrated.
If you feel you must create a Sub, the only way to execute it is to create a function whose single job is, in turn, to execute your subprocedure. So what is the point?

This function-triggering indirection is clumsy, but it can be made to work if for some unimaginable reason you want to use a subprocedure. Here is a simple example:

1. In the VBA Editor, create a subprocedure that performs the actions you want:

    Sub SampleProcedure()
        MsgBox "The subprocedure named Sample Procedure is running."
    End Sub

2. Still in the VBA Editor, create a function that runs the subprocedure:

    Public Function Run_SampleProcedure()
        Call SampleProcedure
    End Function

3. Then switch to Access and create an Access-style macro that uses the RunCode action to run the function that runs the subprocedure. (See the section earlier in this chapter titled "Creating an Access-style Macro to Run a Function.")

## Understanding the _Option Compare Database_ Statement

When you launch the VBA Editor in Access (by pressing Alt+F11 or clicking the Visual Basic button on the Ribbon's Database Tools tab) and then insert a code module, you'll notice that Access automatically enters an Option Compare Database statement in the General Declarations area of the Code window.

As an aside, recall that if you've selected the Require Variable Declaration check box on the Editor tab of the VBA Editor Options dialog box (Tools ⇒ Options) to make the VBA Editor force you to declare all variables explicitly, you'll see an Option Explicit statement in the General Declarations area as well.

Access supports three different ways of comparing text strings: Option Compare Database, Option Compare Binary, and Option Compare Text. Here's what these options mean:

  * Option Compare Database is the default comparison type for Access databases and performs string comparisons using the sort order for the locale that Windows is using (for example, U.S. English). Sorting is not case-sensitive. Access automatically inserts an Option Compare Database statement in the declarations section of each module that you insert. You can delete the Option Compare Database statement, in which case Access will use Option Compare Binary instead.
  * Option Compare Binary performs case-sensitive sorting. To use Option Compare Binary, either delete the Option Compare Database statement in the declarations section or change it to an Option Compare Binary statement.
  * Option Compare Text performs case-insensitive sorting. To use Option Compare Text, change the Option Compare Database or Option Compare Binary statement to an Option Compare Text statement.

# Getting an Overview of the Access Object Model

It's not crucial to understand how the Access object model fits together in order to work with VBA in Access, but most people find it helpful to know the main objects in the object model. And sometimes the code examples in the Help system's object-model reference can be invaluable—showing you how and where to employ objects in your own programming.

To explore the Access object model, follow these steps:

1. Launch or activate Access, and then press Alt+F11 to launch or activate the VBA Editor.

2. Move your cursor to a blank space in the code window (to avoid context-sensitive help).

3. Press F1 in the editor to launch the Help web page for the VBA language reference for Office 2013.

4. In the Bing search field, type **Access 2013 object model** and press Enter.
5. Click the link _Access object model reference_ (_Access 2013 developer reference_). You now see the list of primary Access objects, as shown in Figure 28.2.

Figure 28.2 The entries in the Access object-model reference will help you write your own VBA code.

# Understanding Creatable Objects in Access

Access _exposes_ (makes available for your use in code) various _creatable_ objects, meaning that you can employ most of the important objects in its object model without explicitly going through (mentioning in your code) the Application object.

For most programming purposes, these creatable objects are the most commonly used objects. The main creatable objects in Access are as follows:

  * The Forms collection contains all the Form objects, which represent the open forms in a database. Because it's creatable, you need not write Application.Forms in your code. You can leave off the Application and merely write Forms.
  * The Reports collection contains all the Report objects, which represent the open reports in a database.
  * The DataAccessPages collection contains all the DataAccessPage objects, which represent the open data access pages in a project or a database. (An Access _project_ is a file that connects to a SQL Server database.)
  * The CurrentProject object represents the active project or database in Access.
  * The CurrentData object represents the objects stored in the current database.
  * The CodeProject object represents the project containing the code database of a project or database.
  * The CodeData object represents the objects stored in the code database.
  * The Screen object represents the screen object that currently has the focus (the object that is receiving input or ready to receive input). The object can be a form, a report, or a control.
  * The DoCmd object enables you to run Access commands.
  * The Modules collection contains the Module objects, which represent the code modules and class modules in a database.
  * The References collection contains the Reference objects, which represent the references set in the Access application.
  * The DBEngine object represents the Microsoft Jet Database Engine and is the topmost object in the Data Access Objects (DAO) hierarchy. The DBEngine object provides access to the Workspaces collection, which contains all the Workspace objects available to Access, and to the Errors collection, which contains an Error object for each operation involving DAO.
  * The Workspace object contains a named session for a given user. When you open a database, Access creates a workspace by default and assigns the open database to it. You can work with the current workspace or create more workspaces as needed.
  * The Error object contains information about the data-access errors that have occurred in a DAO operation.

# Opening and Closing Databases

The following sections show you how to open and close databases. You can use the CurrentDb method to return the current database, open a database and treat it as the current database, or even open multiple databases at once. You can also create and remove workspaces.

## Using the CurrentDb Method to Return the Current Database

To work with the database that's currently open in Access, use the CurrentDb method on the Application object or an object variable representing the Application object. The CurrentDb method returns a Database object variable representing the currently open database that has the focus.
The following example declares an object variable of the Database type named myDatabase and then uses the CurrentDb method to assign the active database to it:

    Dim myDatabase As Database
    Set myDatabase = Application.CurrentDb

## Closing the Current Database and Opening a Different Database

In Access, you can choose from among several ways of opening and closing a database. This section discusses the simplest method of opening and closing a database—by treating it as the current database. This method is similar to opening and closing a database when working interactively in Access. See the next section for another method of opening and closing databases that lets you have two or more databases open at the same time.

To open a database as the current database, use the OpenCurrentDatabase method of the Application object. The syntax is as follows:

    _expression_.OpenCurrentDatabase(Filepath, Exclusive, bstrPassword)

Here are the components of the syntax:

  * _expression_ is a required expression that returns an Application object.
  * Filepath is a required String argument that specifies the path and filename of the database to open. You should specify the filename extension; if you omit it, Access assumes the extension is .accdb.
  * Exclusive is an optional Boolean argument that you can set to True to open the database in Exclusive mode rather than in Shared mode (the default, or the result of an explicit False setting).
  * bstrPassword is an optional String argument that specifies the password required to open the database.

To close the current database, use the CloseCurrentDatabase method with the Application object. This method takes no arguments.

You can run the CloseCurrentDatabase method from the current database, but you can't do anything after that because the code stops after VBA executes the CloseCurrentDatabase method and the database containing the code closes. To close the current database and open another by using the OpenCurrentDatabase method, you must run the code from outside the databases involved—for example, by using automation from another application. Chapter 30, "Accessing One Application from Another Application," describes this technique.

* * *

Prepare the Northwind Database to Use with This Book's Examples

To test and experiment with some of the Access code examples in this and the following chapters, you need to do a little preliminary housekeeping. Put simply, we all need to be experimenting with the same database so we get the same results.

Traditionally, when authors have written about Access, they've employed a sample database named Northwind that Microsoft included with Access. Northwind is a full-featured and therefore useful example database. It can be particularly valuable when you want to experiment with Access but don't want to use your own database (both to keep it safe and because your database might not have some of the features that Northwind has).

I'll use Northwind in some of the examples in this book so that all readers can be working with the same data and the same structures. Therefore, before you test some of the upcoming code examples, please put a copy of Northwind.accdt in your C:\Temp directory so the example code in this book can locate it. If you don't have a C:\Temp directory, create one.

You may already have Northwind on your hard drive. To see if you do, press the Start button to display the Windows 8 Modern home page, and type **Northwind.accdt**.
Then click the Files search option in the right pane. If it shows up in the search, right-click it, choose Open File Location, then copy it and paste it into your C:\Temp directory.

Then double-click this Northwind.accdt file. It will open in Access. Give the database the name **Northwind** and save it to C:\Temp. You want to end up with a file named Northwind.accdb in your C:\Temp directory.

If you don't find Northwind.accdt on your hard drive, you can download it from Microsoft's website:



The downloaded file will be named TS01128997.accdt. At some point Windows may ask your permission to download or install an ActiveX object. Agree to that; don't worry. Just double-click TS01128997.accdt to open Northwind in Access. You'll see the File New Database dialog box open. In the File Name field, change the name to **Northwind.accdb** and click OK to close the File New Database dialog box and save Northwind.accdb to C:\Temp. As I said earlier, you want to end up with a file named Northwind.accdb in your C:\Temp directory.

Now the code examples in this book can reference this file path to open Northwind:

    filepath:="C:\Temp\Northwind.accdb"

Next you'll want to remove the default login dialog box so you can work with the database more easily from code. Open Northwind.accdb by double-clicking its name in Windows Explorer.

By default a login dialog box appears asking you to select one of the "employees" from this imaginary company. Click the Login button to close the dialog box and see Northwind in Access.

If it's not already open, click the >> symbol in Access's left pane to open the _Navigation pane_. Locate the Supporting Objects entry in the Navigation pane and click it to expand it. Scroll down until you locate the macro named AutoExec. Right-click AutoExec, choose Cut from the context menu, and then close Access.

Now that login dialog box won't interrupt you anymore when you open the Northwind example database.

* * *

There's an additional requirement when you're writing code that communicates _between_ Office applications. You can't simply declare an object variable to point to an application object, like this:

    Dim myAccess As Access.Application

This code will run only if you first provide a _reference_ in the host application. For example, if you're trying to manipulate Access from VBA code within a Word macro, you need to set a reference in Word's VBA Editor.

The following example illustrates a way to contact and manipulate Access from another VBA host—for example, from Excel or from Word. But before you can execute this code from Word or some other application, you must first choose Tools ⇒ References in the Word VBA Editor, then select _Microsoft Access 15.0 Object Library_ in the list of available references. For this example, you must also have a database currently loaded and running in an instance of Access.

This next example declares the object variable myAccess as the Access.Application type and the object variable myDatabase as the Object type. The example uses the GetObject method to assign to myAccess the copy of Access that's running, uses the CloseCurrentDatabase method to close this database, and then uses the OpenCurrentDatabase method to open another database, namely Northwind, in Exclusive mode.
The final statement uses the CurrentDb method to assign the open database to the myDatabase object variable:

    Dim myAccess As Access.Application
    Dim myDatabase As Object

    Set myAccess = GetObject(, "Access.Application")
    myAccess.CloseCurrentDatabase
    myAccess.OpenCurrentDatabase _
        filepath:="C:\Temp\Northwind.accdb", Exclusive:=True
    Set myDatabase = myAccess.CurrentDb

When you test this code by executing it in the Word VBA Editor, you'll know it works because whatever database was open in Access will be replaced by Northwind (see the "Prepare the Northwind Database to Use with This Book's Examples" sidebar). Also note that when running this code, you might get an error message saying "User-defined type not defined." And the Editor will highlight this line of code:

    Dim myAccess As Access.Application

This means that the editor can't locate the object named Access. For reasons unknown, a newly added library is sometimes deselected in References. To fix this problem, just repeat the steps described previously to use Tools ⇒ References to add a reference to the _Microsoft Access 15.0 Object Library_ again.

## Opening Multiple Databases at Once

Instead of using the OpenCurrentDatabase method to open a database as the current database, you can use the OpenDatabase method of the Workspace object to open another database and return a reference to the Database object representing it. The syntax for the OpenDatabase method is as follows:

    Set _database_ = _workspace_.OpenDatabase(Name, Options, ReadOnly, Connect)

* * *

Creating New Databases, Forms, and Reports in Access

The discussions of the other Office applications in this part of the book (Part 6) have emphasized creating and saving new files—for example, creating new documents in Word or new workbooks in Excel and saving them under suitable names and in the appropriate formats.

Access, too, has its own VBA commands for creating new databases, forms, reports, tables, and other objects programmatically:

  * To create a new database, use the NewCurrentDatabase method of the Application object.
  * To create a new form, use the CreateForm method. To place controls on the form, use the CreateControl method.
  * To create a new report, use the CreateReport method. To place controls on the report, use the CreateReportControl method.

While creating a new database programmatically is quite feasible, it is not only complex but also something that you probably won't need to do often, if ever. In most cases, the goal of your Access VBA programming will be to manipulate existing databases and objects that you have built manually.

* * *

Here are the components of the syntax:

  * _database_ is an object variable that will represent the database you open.
  * _workspace_ is an optional object variable that specifies the workspace in which you want to open the database. If you omit _workspace_, Access opens the database in the default workspace. Although you can open the database in the default workspace without problems, you may find it more convenient to create another workspace and use it to keep the database separate. See "Creating and Removing Workspaces" later in this chapter for details.
  * Name is a required String argument that specifies the name of the database to open. An error results if the database doesn't exist or isn't available or if another user has opened the database for exclusive access.
  * Options is an optional Variant argument that specifies any options you want to set for the database. For an Access database, you can specify True to open the database in Exclusive mode or False (the default) to open it in Shared mode. For ODBCDirect workspaces, you can use other options; see the Access Visual Basic Help file for details.
  * ReadOnly is an optional Variant argument that you can set to True to open the database in read-only mode. The default value is False, which opens the database in read/write mode.
  * Connect is an optional Variant that you can use to pass any necessary connection information, such as a password for opening the database.

The following example "opens" the Northwind database in a special sense: it's opened behind the scenes for our code to contact it and have access to its data, structure, and other features. But it _is not opened in Access where the user can see it_. In other words, an instance of the database is fully exposed to our code, but there's no user interface. There's no display in Access of the Northwind database. For this reason, I've included a message box in the code example to prove to you that the code example has actually opened Northwind and fetched some data from it.

Also, when you use this invisible database technique, it's a good idea to finish up by closing any recordsets or other objects you've opened, as well as closing the database instance itself. This way, unattached and useless entities aren't left floating in your computer's memory.

This example will not work if you have Northwind open in Access. You must test this code while a different database is open in Access.

This example declares a Workspace object variable named myWorkspace and a Database object variable named myDatabase, assigns to myWorkspace the first Workspace object in the Workspaces collection (the default workspace), and assigns to myDatabase the database Northwind.accdb, which it opens in Exclusive mode with read/write access.

To show you that Northwind did come into existence, we fetch the City data from the first record in the Customers table. Finally, we display the city name, then clean up memory by closing both the recordset and the database instance.

You can try this by entering this code in a module in the Access VBA Editor, but just do this while some database other than Northwind is open in Access. Press F5, and you'll see the city data.

    Sub test()

        Dim myWorkspace As Workspace
        Set myWorkspace = DBEngine.Workspaces(0)

        Dim myDatabase As Database
        Dim RecSet As Recordset

        Set myDatabase = myWorkspace.OpenDatabase _
            (Name:="C:\temp\northwind.accdb", _
            Options:=True, ReadOnly:=False)

        Set RecSet = myDatabase.OpenRecordset("Customers", dbOpenDynaset)

        MsgBox RecSet!City

        RecSet.Close
        myDatabase.Close

    End Sub

## Closing a Database

To close a database that you've opened by using the OpenDatabase method, use the Close method of the object variable to which you've assigned the database. For example, the following statement closes the database assigned to the object variable myDatabase:

    myDatabase.Close

## Creating and Removing Workspaces

To keep different databases in separate sessions, you can create a new workspace as needed and remove it when you have finished working with it.

### Creating a New Workspace

To create a new workspace, use the CreateWorkspace method of the DBEngine object.
The syntax is as follows:

    Set _workspace_ = CreateWorkspace(Name, UserName, Password, UseType)

Here are the components of the syntax:

  * _workspace_ is the object variable to which you want to assign the workspace you're creating.
  * Name is a required String argument that specifies the name to assign to the new workspace.
  * UserName is a required String argument that specifies the owner of the new workspace.
  * Password is a required String argument that specifies the password for the new workspace. The password can be up to 14 characters long. Use an empty string if you want to set a blank password.
  * UseType is an optional argument that indicates the type of workspace to create. Use dbUseJet to create a Microsoft Jet workspace. Use dbUseODBC to create an ODBCDirect workspace. Omit this argument if you want the DefaultType property of the DBEngine object to determine the type of data source connected to the workspace.

The following example declares an object variable named myWorkspace of the Workspace type and assigns to it a new Jet workspace named Workspace2. The example makes the admin account the owner of the new workspace:

    Dim myWorkspace As Workspace
    Set myWorkspace = CreateWorkspace(Name:="Workspace2", _
        UserName:="admin", Password:="", UseType:=dbUseJet)

After creating a new workspace, you can use it to open a new database (as described earlier in this chapter).

### Removing a Workspace

Before removing a workspace from the Workspaces collection, you must close all the open connections and databases. You can then use the Close method to close the Workspace object. For example, the following statement closes the Workspace object identified by the object variable myWorkspace:

    myWorkspace.Close

# Working with the _Screen_ Object

If you've used VBA in the other Office applications, you've probably written code that works with whichever object is currently active. For example, in Word you can use the ActiveDocument object to work with the active document or the Selection object to work with the current selection. In PowerPoint you can work with the ActivePresentation object to work with whichever presentation happens to be active.

In Access, you can use the Screen object to work with the form, report, or control that has the focus. The Screen object has various properties, including the following:

  * The ActiveForm property returns the active form. If there is no active form, trying to use the ActiveForm property returns the error 2475.
  * The ActiveDatasheet property returns the active datasheet. If there is no active datasheet, trying to use the ActiveDatasheet property returns the error 2484.
  * The ActiveReport property returns the active report. If there is no active report, trying to use the ActiveReport property returns the error 2476.
  * The ActiveDataAccessPage property returns the active data access page. If there is no active data access page, trying to use the ActiveDataAccessPage property returns the error 2022.
  * The ActiveControl property returns the active control. If there is no active control, trying to use the ActiveControl property returns the error 2474.
  * The PreviousControl property lets you access the control that previously had the focus.

To avoid errors, you should check which object is active before trying to manipulate it by using the Screen object.
The following example uses the error numbers listed above to determine whether a form, report, datasheet, or data access page is active and then displays a message box identifying the item and giving its name:

    On Error Resume Next

    Dim strName As String
    Dim strType As String
    strType = "Form"
    strName = Screen.ActiveForm.Name
    If Err = 2475 Then
        Err = 0
        strType = "Report"
        strName = Screen.ActiveReport.Name
        If Err = 2476 Then
            Err = 0
            strType = "Data access page"
            strName = Screen.ActiveDataAccessPage.Name
            If Err = 2022 Then
                Err = 0
                strType = "Datasheet"
                strName = Screen.ActiveDatasheet.Name
            End If
        End If
    End If

    MsgBox "The current Screen object is a " & strType & vbCr _
        & vbCr & "Screen object name: " & strName, _
        vbOKOnly + vbInformation, "Current Screen Object"

If you test this, use the Create tab on the Ribbon (and click the Form icon) to ensure that there is a form active in Access.

# Using the _DoCmd_ Object to Run Access Commands

The DoCmd object enables you to execute normal Access commands, such as Find or Rename, in your VBA code.

To run a command, you use one of the methods of the DoCmd object. Table 28.1 lists the 66 DoCmd methods available in Access 2013 and explains briefly what they do.

The following sections include examples showing how to use some of the methods described in Table 28.1.

Table 28.1 Methods of the DoCmd object

**Method** | **Explanation**
---|---
AddMenu | Adds a menu to the global menu bar or to a custom menu bar.
ApplyFilter | Applies a filter so that only records that match certain criteria are displayed.
Beep | Makes the computer beep—for example, to attract the user's attention when an error has occurred.
BrowseTo | BrowseTo is an Access-style macro action that helps you either create a custom user interface on top of an existing wizard navigation control or build your own.
CancelEvent | Cancels the event that has occurred.
ClearMacroError | Use after you handle an Access-style macro error to reset the data about the error so you can check for any future errors (in the MacroError object) while the macro continues to execute.
Close | Closes the specified object—for example, a form or a report.
CloseDatabase | Closes the database, just as if you'd clicked the File tab on the Ribbon and chosen the Close Database option. A Save dialog box will appear if necessary, asking for your disposition of any unsaved objects.
CopyDatabaseFile | Copies the database connected to the current project to a SQL Server file.
CopyObject | Copies the specified object (for example, a query or a table) into the specified database (or to a new table in the current database).
DeleteObject | Deletes the specified object from the database.
DoMenuItem | Performs a command from a menu or toolbar. This is an older command that has been replaced by the RunCommand method (described later in this table).
Echo | Provides backward compatibility for running the Echo action in earlier versions of VBA. It's better to use Application.Echo now.
FindNext | Finds the next record matching the search criteria specified by the FindRecord method.
FindRecord | Performs a search for a record that matches the specified criteria.
GoToControl | Moves the focus to the specified control or field in a form or datasheet.
GoToPage | Moves the focus to the specified page of a form.
GoToRecord | Makes the specified record the current record.
Hourglass | Changes the mouse pointer to an hourglass (a wait pointer) or back to a normal pointer.
LockNavigationPane | Prevents the user from right-clicking a database object displayed in the left pane (Navigation pane) and then selecting the Cut or Delete option from the context menu that appears. Other options on that menu, such as Copy and Paste, are still enabled.
Maximize | Maximizes the active window.
Minimize | Minimizes the active window.
MoveSize | Moves or resizes (or both) the active window.
NavigateTo | Allows you to specify how objects are displayed in the Navigation pane (left pane). For example, you could reorganize the list of objects, or even prevent some objects from being displayed at all.
OpenDataAccessPage | Opens the specified data access page in the specified view.
OpenDiagram | Opens the specified database diagram.
OpenForm | Opens the specified form and optionally applies filtering.
OpenFunction | Opens the specified user-defined function in the specified view (for example, datasheet view) and mode (for example, for data entry).
OpenModule | Opens the specified VBA module at the specified procedure.
OpenQuery | Opens the specified query in the specified view and mode.
OpenReport | Opens a report in Design view or Print Preview. Alternatively, you can use this method to print a hard copy of the report.
OpenStoredProcedure | A macro action that opens a stored procedure in Design view, Datasheet view, or Print Preview.
OpenTable | Opens the specified table in the specified view and mode.
OpenView | Opens the specified view in the specified view and mode.
OutputTo | Outputs the data in the specified object (for example, a report or a data access page) in the specified format.
PrintOut | Prints the specified object.
Quit | Provides backward compatibility with Access 95. With later versions of Access, use Application.Quit instead.
RefreshRecord | Refreshes a record.
Rename | Renames the specified object with the name given.
RepaintObject | Repaints the specified object, completing any screen updates that are pending.
Requery | Updates the data in the specified control by querying the data source again.
Restore | Restores the active window to its nonmaximized and nonminimized size.
RunCommand | Runs the specified built-in menu command or toolbar command.
RunDataMacro | Calls a named data macro.
RunMacro | Runs the specified macro.
RunSavedImportExport | Runs a saved import or export specification.
RunSQL | Runs an Access action query using the specified SQL statement.
Save | Saves the specified object or (if no object is specified) the active object.
SearchForRecord | Searches for a specific record in a table, form, query, or report.
SelectObject | Selects the specified object in the database window or in an object that's already open.
SendObject | Sends the specified object (for example, a form or a report) in an email message.
SetDisplayedCategories | Specifies which categories are displayed under the Navigate To Category option in the Navigation pane. If you click anywhere in the Navigation pane's title bar, you'll see the various options.
SetFilter | Applies a filter to the records in the active datasheet, form, or report by specifying a WHERE clause.
SetMenuItem | Sets the state of a menu item—for example, enabling or disabling a menu item.
SetOrderBy | Sorts the records in the active datasheet, form, or report in ascending or descending order.
SetParameter | Sets the values of parameters.
SetProperty | Sets various properties of a control or field, such as BackColor, Width, Enabled, and Caption.
SetWarnings | Turns system messages on or off.
ShowAllRecords | Removes any existing filters from the current form, query, or table.
ShowToolbar | Displays or hides the specified toolbar.
SingleStep | Pauses the currently executing macro and displays a Macro Single Step dialog box.
TransferDatabase | Imports data into or exports data from the current database or project.
TransferSharePointList | Imports (or links) data from a Microsoft Windows SharePoint Services 3.0 site.
TransferSpreadsheet | Imports data from or exports data to a spreadsheet.
TransferSQLDatabase | Transfers the specified SQL Server database to another SQL Server database.
TransferText | Imports data from or exports data to a text file.

## Using the OpenForm Method to Open a Form

To open a form, use the OpenForm method of the DoCmd object. The syntax is as follows:

    _expression_.OpenForm(FormName, View, FilterName, WhereCondition, DataMode, WindowMode, OpenArgs)

Here are the components of the syntax:

  * _expression_ is a required expression that returns a DoCmd object. In many cases, it's easiest to use the DoCmd object itself.
  * FormName is a required Variant argument that specifies the name of the form you want to open. The form must be in the current database.
  * View is an optional argument that specifies the view to use: acNormal (the default), acDesign, acFormDS, acFormPivotChart, acFormPivotTable, or acPreview.
  * FilterName is an optional Variant argument that you can use to specify the name of a query. The query must be stored in the current database.
  * WhereCondition is an optional Variant that you can use to specify a SQL WHERE clause. Omit the word WHERE from the clause.
  * DataMode is an optional argument for specifying the mode in which to open the form: acFormPropertySettings, acFormAdd, acFormEdit, or acFormReadOnly. acFormPropertySettings is the default setting and opens the form using the mode set in the form.
  * WindowMode is an optional argument for specifying how to open the form. The default is acWindowNormal, a normal window. You can also open the form as a dialog box (acDialog) or as an icon (acIcon) or keep it hidden (acHidden).
  * OpenArgs is an optional Variant that you can use to specify arguments for opening the form—for example, to move the focus to a particular record.

The following example uses the DoCmd object to open a form in the Northwind sample database (you must have this database open in Access for this to work). Press Alt+F11 to open Access's VBA Editor, and then type in this code. When you execute the code by pressing F5, Access displays the first record for which the Employee field matches Jan Kotas:

    Sub test()

        DoCmd.OpenForm FormName:="Sales Analysis Form", View:=acNormal, _
            WhereCondition:="Employee ='Jan Kotas'"

    End Sub

## Using the PrintOut Method to Print an Object

To print an object, use the PrintOut method. The syntax is as follows:

    _expression_.PrintOut(PrintRange, PageFrom, PageTo, PrintQuality, Copies, CollateCopies)

Here are the components of the syntax:

  * _expression_ is a required expression that returns a DoCmd object.
  * PrintRange is an optional argument that specifies what to print: all of the object (acPrintAll, the default), specific pages (acPages), or the selection (acSelection).
  * PageFrom and PageTo are optional Variant arguments that you use with PrintRange:=acPages to specify the starting and ending page numbers of the print range.
  * PrintQuality is an optional argument that you can use to specify the print quality. The default setting is acHigh, but you can also specify acLow, acMedium, or acDraft (draft quality, to save ink and time).
  * Copies is an optional Variant argument that you can use to specify how many copies to print. The default is 1.
  * CollateCopies is an optional Variant argument that you can set to True to collate the copies, and False not to. The default setting is True.

The following example prints one copy (the default) of the first page in the active object at full quality without collating the copies:

    DoCmd.PrintOut PrintRange:=acPages, _
        PageFrom:=1, PageTo:=1, CollateCopies:=False

Be sure to add error trapping to this code in case you've requested a printout of something that doesn't exist—such as a range of 1 to 4 for a single-page form. In fact, it's always a good idea to trap errors in code that contacts peripherals such as printers or hard drives. What if the printer isn't turned on or the hard drive is full? Your code should anticipate and manage situations like these.

## Using the _RunMacro_ Method to Run an Access-Style Macro

To run an Access-style macro, use the RunMacro method. The syntax is as follows:

    _expression_.RunMacro(MacroName, RepeatCount, RepeatExpression)

Here are the components of the syntax:

  * _expression_ is a required expression that returns a DoCmd object.
  * MacroName is a required Variant argument that specifies the macro name.
  * RepeatCount is an optional Variant argument that you can use to specify an expression to control the number of times that the macro should run. The default is 1.
  * RepeatExpression is an optional Variant argument that contains a numeric expression to be evaluated each time the macro runs. The macro stops when this expression evaluates to 0 (False).

The following example runs an Access-style macro named RemoveDuplicates:

    DoCmd.RunMacro "RemoveDuplicates"

# The Bottom Line

**Get started with VBA in Access.**

Access allows you to write macros in the VBA Editor using VBA code. But it also features a legacy Macro Designer utility (formerly known as the Macro Builder) with which you create an entirely different kind of macro, what we've been calling an Access-style macro.

Master It

The term _macro_ is used in a special way in Access (referring to only one of the two types of custom procedures Access permits you to construct: VBA and Macro Designer). This usage of _macro_ is unlike the way the term _macro_ is used in other Office applications, not to mention all other forms of computing. Describe what Access means by the term _macro_.

**Open and close databases.**

Access permits you to open a database in several ways.

Master It

Two common commands that open a database in Access are OpenCurrentDatabase and OpenDatabase. What is the difference between these two commands?

**Work with the Screen object.**

You became familiar with using ActiveDocument objects in Word to access the document that currently has the focus. Or you used the ActivePresentation object to work with whichever presentation happened to be active in PowerPoint. Access, however, employs the Screen object as the parent of whatever object has the focus.
Master It

The Screen object represents the screen object that currently has the focus in Access (that is, the object that is receiving input or ready to receive input). Three types of common Access objects can have the focus when you employ the Screen object. What are they?

**Use the DoCmd object to run Access commands.**

Many of the tools that Access makes available to users, such as printing a report or maximizing a window, are also available to the programmer via the methods of the DoCmd object.

Master It

The DoCmd object has 66 methods in Office 2013. Describe the purpose of the DoCmd object's Beep method.

Chapter 29

Manipulating the Data in an Access Database via VBA

This chapter shows you how to begin manipulating the data in an Access database. You can do so either from within Access or from another VBA-enabled application—for example, from Excel or from Word. This chapter shows you how to work from within Access.

There are two main ways to manage data in an Access database: via Data Access Objects (DAO) or via ActiveX Data Objects (ADO). DAO is the older technology for accessing data, and it works for both Microsoft Jet databases (Microsoft Jet is the Access database engine) and ODBC-compliant data sources. (ODBC is Open Database Connectivity, a long-established standard for accessing databases. ODBC is also useful for accessing open-source solutions, such as MySQL.) ADO is a high-level programming interface that can be used with a wide range of data sources.

Access offers you the choice of methods, but you will probably find it easier to use ADO than DAO. Additional information about choosing between these two technologies can be found at the following location:



In this chapter you will learn to do the following:

  * Open a recordset
  * Access a particular record in a recordset
  * Search for a record
  * Edit a record
  * Insert and delete records

# Understanding How to Proceed

Once you've chosen between ADO and DAO, you take the following primary steps to manipulate the data in the database from Access:

1. Add a reference to the object library you'll be using.

2. Create a recordset that contains the records with which you want to work.

3. Work with the records in the recordset.

4. Close the recordset.

All the steps work in more or less the same way for ADO and DAO, except that you create the recordset in different ways. The following sections take you through these steps, splitting the path where necessary to cover the differences between ADO and DAO.

# Preparing to Manage the Data in a Database

Given that there are two distinct ways to manage data in Access—ADO and DAO—you have to specify which one you're planning to employ. You can think of libraries as collections of prewritten functions.

Why bother fooling around with multiple libraries? The answer is that there can't be a single, massive, all-purpose library because, among other issues, there would be name confusion. Two different functions in two different libraries might well share the same name. But they could perform different tasks or perform the same task differently. It's like having various libraries in a large university. The word _positive_ means entirely different things in the law library than it does in the medical library.

Note that some of the following code examples will work just fine no matter which library you are currently referencing.
However, to ensure consistency and avoid bugs, create a reference to the object library you want to use (ADO or DAO). And in your code you'll specify the appropriate connection to the data source—the Microsoft ActiveX Data Objects 6.1 Library for an ADO connection or the Microsoft DAO 3.6 Object Library for a DAO connection. (Note that these 6.1 and 3.6 version numbers might not match the versions of these libraries available on your machine. Just choose the latest, highest version number you see.)

## Adding a Reference to the Appropriate Object Library

To create a reference to the object library you need, follow these steps:

1. Launch Access.

2. Launch or activate the VBA Editor by pressing Alt+F11.

3. In the VBA Editor, choose Tools ⇒ References to display the References dialog box.

4. Scroll down the Available References list box to the appropriate object library item, and then select its check box and click OK to close the References dialog box:

  * For an ADO connection, select the check box for the Microsoft ActiveX Data Objects 6.1 Library item.
  * For a DAO connection, select the check box for the Microsoft DAO 3.6 Object Library item.

You can't select both libraries at the same time. And if you don't include the correct library, you'll get a compile error when you try to execute one of the objects in that library (such as a DAO.Recordset). The message will refer to this as a "user-defined" object because the Editor can't find the object in the currently referenced libraries—so it assumes the object is a new one introduced by you, the programmer, and that you forgot to declare it.

## Establishing a Connection to the Database

It's possible to establish connections to databases in a variety of ways, but in this chapter we'll use a simple, direct line of code. In Chapter 28, "Understanding the Access Object Model and Key Objects," you saw what steps to take to go online and obtain the Northwind.accdb sample database and where to store it on your hard drive so you could experiment with the example code in these final chapters of the book. If you haven't already taken those steps, see the sidebar in Chapter 28 titled "Prepare the Northwind Database to Use with This Book's Examples."

To open a connection (but not make it visible to the user in Access) to the Northwind sample database, you can use this code if you're employing DAO:

    Dim myDatabase As DAO.Database
    Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")

You'll see this approach used in examples later in this chapter. You'll also see how to manipulate Northwind while it's loaded into Access where the user can see it. Recall from the previous chapter that you can open a database in two ways: to get to its data but not display it in Access, or to load it into Access and make it visible to the user.

# Opening a Recordset

To get to the records in the database to which you're establishing the connection, you must open a recordset. ADO and DAO use different approaches. The following subsections give you the details.

## Opening a Recordset Using ADO

To open a recordset using ADO, you use the Open method of the RecordSet object. The syntax for the Open method is as follows:

    _recordset_.Open Source, ActiveConnection, CursorType, LockType, Options

Here are the components of the syntax:

  * _recordset_ is the RecordSet object that you want to open. Often, you'll use an object variable that references the RecordSet object.
  * Source is an optional Variant argument that specifies the table, command, SQL statement, or file that contains the recordset.
  * ActiveConnection is an optional Variant argument. This can be either an object variable of the Connection type or a Variant/String containing parameters for the connection.
  * CursorType is an optional argument for specifying the type of cursor to use in the recordset. Table 29.1 explains the cursor types.
  * LockType is an optional argument for specifying how to lock the recordset while it is open. Table 29.2 explains the lock options.
  * Options is an optional Long argument that you can use to control how the Source value is evaluated if it is not a Command object. Table 29.3 explains the available constants, which fall into two categories: command-type options and execute options. You can use two or more constants for the Options argument.

* * *

An Alternative to Providing Arguments for the Open Method

Instead of specifying the arguments with the Open method, you can set the Source, ActiveConnection, CursorType, and LockType properties of the RecordSet object you're opening and then use the Open method without arguments. You may find that this approach makes your code easier to read.

* * *

Table 29.1 Cursor-type constants for opening a recordset

**Constant** | **Cursor Type and Explanation**
---|---
adOpenForwardOnly | Forward-only cursor. You can scroll through the recordset only forward. This is the default cursor and provides the best performance when you need to go through the records only once.
adOpenDynamic | Dynamic cursor. You can move freely through the recordset, and you can see changes that other users make to records.
adOpenKeyset | Keyset cursor. You can move freely through the recordset and see changes that other users make to records. You cannot see records that other users add, and records that other users delete are inaccessible.
adOpenStatic | Static cursor. You can't see changes that other users make. Use a static cursor when you need only to search for data or create reports from the data that exists when you open the recordset.

Table 29.2 Lock options for opening a recordset via ADO

**Constant** | **Opens the Recordset With**
---|---
adLockReadOnly | Data in read-only mode, so you cannot alter it. Use this constant if you need to search or analyze the data but not manipulate it.
adLockOptimistic | Optimistic locking, which locks a record only when you run the Update method to update it explicitly.
adLockBatchOptimistic | Optimistic batch locking, which enables you to perform a simultaneous update on several records that you've changed.
adLockPessimistic | Pessimistic locking, which locks a record immediately after you change it.

Table 29.3 Choices for the Options argument when opening a recordset

**Constant** | **Explanation**
---|---
**Command-Type Options** |
adCmdText | Evaluates Source as text specifying a command or stored procedure call.
adCmdTable | Evaluates Source as the name of a table consisting of columns returned by an internally generated SQL query.
adCmdStoredProc | Evaluates Source as the name of a stored procedure.
adCmdFile | Evaluates Source as the filename of a stored recordset.
adCmdTableDirect | Evaluates Source as a table name and returns all columns of the table. Do not use with adAsyncExecute.
adCmdUnknown | This means that the type is unknown. This is the default.
**Execute Options** |
adAsyncExecute | Executes the command asynchronously. Does not work with adCmdTableDirect.
adAsyncFetch | Retrieves the rows specified by the CacheSize property synchronously and the remaining rows asynchronously.
adAsyncFetchNonBlocking | Prevents the main thread from blocking other data access while retrieving data.
adExecuteRecord | The CommandText (adCmdText, described earlier in this table) is a stored procedure or a command that fetches a single row of data. It is returned as a Record object.
adExecuteNoRecords | Used to improve performance when you know that no records will be returned (for example, you're merely adding, not fetching, data).
adExecuteStream | Treats the data returned by Source as a single row that becomes a Record object.

You'll see examples of opening a recordset a little later in this chapter. First, you must decide how to access the data in the recordset. The easiest methods are to use an existing table or a SQL SELECT statement.

## Choosing How to Access the Data in an ADO Recordset

How you actually get to the data in the recordset you open depends on whether you want to fetch all the data in a table or just part of it. If you want all the data in a table, you can use a table to access the data. If you want to return only particular records, you can use an SQL SELECT statement to fetch them.

### Using a Table to Access the Data in an ADO Recordset

To open a whole table from a database in a recordset, specify the table name as the Source argument in the Open statement. The following example declares a RecordSet object variable, uses a Set statement to assign the appropriate recordset type to it, uses the ActiveConnection property to connect to the currently active database (whatever you have loaded into Access at the time), and then uses the Open method to open the entire Customers table. We'll use the Northwind sample database (which you installed in Chapter 28), which has a Customers table.

This example demonstrates how to bring into an ADO recordset the data from an entire table and then move around within this recordset. Your code will not need to instantiate a database object but instead will work with the Northwind database that's currently loaded into Access. (This is a very simple example to illustrate some basic concepts. Normally when accessing a database, you'll want to employ an SQL statement and check for recordset boundary conditions—using the BOF and EOF properties. SQL and BOF/EOF are described later in this chapter. For now, just consider the following example code an illustration of elementary principles, to which you'll add real-world maneuvers demonstrated in the code examples later in this chapter.)

As always, it's necessary for you to first ensure that the ADO library is referenced. So in the VBA Editor, choose Tools ⇒ References and select the check box next to Microsoft ActiveX Data Objects 6.1 Library. Finally, load the Northwind.accdb sample database into Access.

    1.  Sub ExploreRecordset()
    2.      Dim myRecordset As ADODB.Recordset
    3.      Set myRecordset = New ADODB.Recordset
    4.
    5.      'point to the currently loaded database
    6.      myRecordset.ActiveConnection = CurrentProject.Connection
    7.      myRecordset.CursorType = adOpenStatic
    8.      myRecordset.Open Source:="Customers"
    9.
    10.     'Display the First Name from the first row
    11.     MsgBox myRecordset("First Name")
    12.
    13.     'Move to the last row and show the Last Name
    14.     myRecordset.MoveLast
    15.     MsgBox myRecordset("Last Name")
    16.
    17.     'Move to the previous row and display the Job Title
    18.     myRecordset.MovePrevious
    19.     MsgBox myRecordset("Job Title")
    20.
    21.     'Move back to the first row and display the Phone Number
    22.     myRecordset.MoveFirst
    23.     MsgBox myRecordset("Business Phone")
    24.
    25.     'Move to the next row and show the Last Name
    26.     myRecordset.MoveNext
    27.     MsgBox myRecordset("Last Name")
    28.
    29.
    30.     myRecordset.Close
    31.     Set myRecordset = Nothing
    32. End Sub

In this code, you first declare a recordset variable, and in line 6 you point it to the database currently loaded in Access. Line 7 defines the cursor type as static, and line 8 loads the data—the entire Customers table—into your recordset.

Line 11 doesn't move anywhere within the recordset, so by merely supplying the recordset's name, myRecordset, along with one of the table's field names, First Name, to a MsgBox function, you can display data from the first record in the table.

Line 14 does move to a different record within the recordset—the last record—before displaying the data in that record's Last Name field. Line 18 moves to the penultimate record, line 22 moves to the first record, and line 26 moves to the second record. Finally, line 30 closes the recordset and line 31 assigns Nothing to the object variable, which has the effect of eliminating it.

* * *

What Is a "First Record" in a Table?

It's important for beginners to understand the practical difference between a table of raw data in a database and an organized _recordset_ extracted from that database. The concept of a "first record" within a relational database is essentially meaningless until you use an SQL statement to organize (sort or group) the records in some fashion.

Records in a relational database (the type of database Access employs) are not necessarily organized. For example, they are not necessarily alphabetized by any particular field (such as LastName) or numerically listed by an ID number, or organized using some other scheme. True, data is stored in tables, and a table _does_ have structure: its fields separate the data into logical categories such as LastName, Address, CellPhone, and so on. But its records (rows of actual data) are _not necessarily_ maintained in any particular order.

A set of records (a _recordset_) is extracted from a database when you execute an SQL statement. This statement allows you to specify how you want to see the records organized (grouped by city, alphabetized, or whatever). SQL is flexible: You can organize records in many ways when you extract a recordset from a database. You can sort records by any of their fields; you can also sort in either ascending (the default) or descending order (specify DESC for descending). Which record is first also depends on which field you sort the recordset by, as specified in the ORDER BY statement.

In the example in the section "Using a Table to Access the Data in an ADO Recordset" in this chapter, the records are moved into the recordset unsorted. As each action is carried out in this code—moving forward and backward through the recordset—message boxes display the records in their unsorted order.
However, if you want to organize the records in alphabetical order by each customer's last name, add an ORDER BY clause to your SQL statement, like this:

    myRecordset.Open "SELECT * FROM Customers ORDER BY [Last Name]"

(The brackets are required because the field name contains a space.)

Just remember that you can get a recordset without using an SQL statement, like this:

    myRecordset.Open Source:="Customers"

But the concept of a "first record" in this recordset probably will have no meaning.

However, you can get a recordset by using an SQL statement, like this:

    myRecordset.Open strSQL

In this case, the "first record" will have meaning to you—based on the criteria you specified in the SQL statement (strSQL here would be a string you previously defined that contains an SQL statement). The section titled "Using an SQL SELECT Statement to Access a Subset of the Data in an ADO Recordset," later in this chapter, explains how to use an SQL statement.

* * *

### Using an SQL _SELECT_ Statement to Access a Subset of the Data in an ADO Recordset

If you want to add to your recordset only those records that match criteria you specify, use an SQL SELECT statement. SELECT statements can be constructed in complex ways, but you can also create straightforward statements with a little practice using this syntax:

    SELECT [DISTINCT] _fields_ FROM _table_ WHERE _criteria_ ORDER BY _fields_ [DESC]

The words in uppercase are the SQL keywords, and the words in lowercase italics are placeholders for the data you supply, such as the actual name of a real table. Here are the details:

  * The SELECT keyword indicates that you're creating a statement to select records (as opposed to, say, delete records).
  * You can include the optional DISTINCT keyword (the brackets indicate that it is optional) to make the statement return only unique records, discarding any duplicates that the statement would otherwise return. If you omit DISTINCT, you get any duplicates as well.
  * _fields_ is a list of the fields that you want to have appear in the recordset. If you use two or more field names, separate them with commas—for example, contact, company, address. To return all field names, enter an asterisk (*).
  * FROM _table_ specifies the name of the table from which to draw the data.
  * WHERE _criteria_ specifies the criteria for filtering the records. Enter the field name, an equal sign, a single straight quote, the value you're looking for, and another single straight quote. For example, WHERE City = 'Taos' returns only the results where Taos appears in the City field.
  * ORDER BY _fields_ specifies the field or fields on which to sort the results. If you use two or more fields, put them in the order of precedence you want (the first sort field first, the second sort field second, and so on) and separate them with commas. The default sort order is ascending, but you can force a descending sort by adding the DESC keyword. For example, ORDER BY Zip DESC produces a descending sort by the Zip field, while ORDER BY State, City produces an ascending sort by the State field and, within that, by City.

Because SQL SELECT statements contain so many elements, putting a SELECT statement as an argument in an Open statement can create uncomfortably long lines of code. You can break the lines of code in the editor with the underscore symbol as usual, but you may find it easier to use the properties of the RecordSet object to specify the details of the recordset rather than using the Open arguments.
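Here's a minimal sketch of that property-based approach, mentioned in the earlier sidebar "An Alternative to Providing Arguments for the Open Method." It assumes the Northwind Customers table from the earlier examples is available through the current connection; the field and table names are just the ones used elsewhere in this chapter:

    Dim myRecordset As ADODB.Recordset
    Set myRecordset = New ADODB.Recordset

    'Set the recordset's properties individually instead of
    'passing a long list of arguments to the Open method.
    myRecordset.ActiveConnection = CurrentProject.Connection
    myRecordset.Source = "SELECT * FROM Customers ORDER BY [Last Name]"
    myRecordset.CursorType = adOpenStatic
    myRecordset.LockType = adLockReadOnly

    'With the properties already set, Open needs no arguments.
    myRecordset.Open

Each property here corresponds directly to one of the Open arguments described earlier, so this is purely a readability choice rather than a change in behavior.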
Another way to avoid using a large SQL statement as an argument for the Open method is to first assign the SELECT statement to a String variable and then use that string to supply the argument. The following code illustrates that approach. Before executing this example, press Ctrl+G in the VBA Editor to open the Immediate window, where the results will be displayed.

    Sub SubSet()

        Dim strSQL As String

        Dim myRecordset As ADODB.Recordset
        Set myRecordset = New ADODB.Recordset
        myRecordset.ActiveConnection = CurrentProject.Connection
        **strSQL = "Select * FROM Customers WHERE ID > 17"**
        myRecordset.Open strSQL

        Do Until myRecordset.EOF
            Debug.Print myRecordset("Last Name")
            myRecordset.MoveNext
        Loop

    End Sub

In this example, you want to import into the recordset only those records that have an ID higher than 17, so you set up an SQL statement that specifies that condition. Then you loop through the recordset until EOF (end of file), displaying each last name in the Immediate window.

#### _Opening a Recordset Using DAO_

When working with DAO, you use a different approach than the ADO techniques explored so far in this chapter. You use the OpenRecordset method of the Database object to create a new recordset and add it to the Recordsets collection.

The syntax for the OpenRecordset method is as follows:

    Set _recordset_ = _object_.OpenRecordset(Name, Type, Options, LockEdit)

Here are the components of the syntax:

  * _recordset_ is an object variable representing the RecordSet object you're opening.
  * _object_ is an object variable representing the database from which to create the new RecordSet object.
  * Name is a required String argument that specifies the table, query, or SQL statement that provides the records for the recordset. If you're using a Jet database and returning a table-type recordset, you can use only a table name for the Name argument.
  * Type is an optional argument that you can use to specify the type of recordset you're opening. Table 29.4 explains the constants you can use for Type.
  * Options is an optional argument that you can use to specify constants that control how Access opens the recordset. Table 29.5 explains the constants you can use for Options.
  * LockEdit is an optional constant that you can use to specify how the recordset is locked. Table 29.6 explains the constants you can use for LockEdit. (Access 2013 no longer supports ODBCDirect workspaces. So if you need to connect to external data stores directly (not through Access's database engine), then you must use ADO rather than DAO.)

Table 29.4 Constants for the Type argument for the OpenRecordSet method

**Constant** | **Opens This Type of Recordset**
---|---
dbOpenTable | Table-type. This works only in Microsoft Jet workspaces. This is the default setting if you open a recordset in a Jet workspace without specifying the Type.
dbOpenDynamic | Dynamic-type. This works only in ODBCDirect workspaces. The recordset is similar to an ODBC dynamic cursor and enables you to add, remove, or edit rows from a database table.
dbOpenDynaset | Dynaset-type. This recordset is similar to an ODBC keyset cursor and enables you to add, remove, or edit rows from a database table. You can also move freely through the rows in the dynaset.
dbOpenSnapshot | Snapshot-type. This recordset is similar to an ODBC static cursor. It opens a snapshot of the records but does not update them when other users make changes. To update the snapshot, you must close the recordset and reopen it.
dbOpenForwardOnly | Forward-only. You can move only forward through the recordset.

Table 29.5 Constants for the Options argument

**Constant** | **Explanation** | **Limitations**
---|---|---
dbAppendOnly | Users can add new records but cannot edit or delete existing records. | Jet dynaset-type recordsets only
dbSQLPassThrough | Passes an SQL statement to an ODBC data source connected via Jet. | Jet snapshot-type recordsets only
dbSeeChanges | Causes a runtime error if a user attempts to change data that another user is already editing. | Jet dynaset-type recordsets only
dbDenyWrite | Prevents other users from adding or modifying records. | Jet recordsets only
dbDenyRead | Prevents other users from reading data. | Jet table-type recordsets only
dbForwardOnly | Forces a forward-only recordset. This is an older option included for backward compatibility. Use Type:=dbOpenForwardOnly instead. | Jet snapshot-type recordsets only
dbReadOnly | Prevents users from changing the recordset. This is an older option included for backward compatibility. Use LockEdits:=dbReadOnly instead. If you must use Options:=dbReadOnly, do not include the LockEdits argument. | Jet recordsets only
dbRunAsync | Runs a query asynchronously (so that some results are returned while others are still pending). | ODBCDirect workspaces only
dbExecDirect | Runs a query by calling SQLExecDirect. | ODBCDirect workspaces only
dbInconsistent | Permits inconsistent updates, enabling you to update a field in one table of a multitable recordset without updating another table in the recordset. You can use either this constant or dbConsistent, but not both. | Jet dynaset-type and snapshot-type recordsets only
dbConsistent | Permits only consistent updates so that shared fields in tables underlying a multitable recordset must be updated together. You can use either this constant or dbInconsistent, but not both. | Jet dynaset-type and snapshot-type recordsets only
dbFailOnError | If an error occurs, updates are rolled back. | Jet recordsets only

Table 29.6 Constants for the LockEdit argument

**Constant** | **Explanation** | **Default or Limitations**
---|---|---
dbPessimistic | Uses pessimistic locking, which locks a record immediately after you change it. | Default for Jet workspaces
dbOptimistic | Uses optimistic locking, which locks a record only when you run the Update method to update it explicitly. |
dbOptimisticValue | Uses optimistic concurrency, comparing the data values in old and new records to find out if changes have been made since the record was last accessed. The concurrency is based on row values. | ODBCDirect workspaces only
dbOptimisticBatch | Uses optimistic batch locking, which enables you to perform a simultaneous update on several records that you've changed. | ODBCDirect workspaces only

### Opening a DAO Recordset Using a Table

The easiest way to open a DAO recordset is to open an entire table by specifying the table name for the Name argument and using Type:=dbOpenTable to explicitly state that you're opening a table.
The following example declares the object variable myRecordset as a DAO.Recordset object and then assigns to it the records from the Customers table in the database identified by the myDatabase object variable:

    Sub DAOTest()
        Dim myRecordset As DAO.Recordset
        Dim myDatabase As DAO.Database

        'Open the copy of Northwind on the hard drive
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")

        'Create the DAO-style Recordset

        **Set myRecordset = myDatabase.OpenRecordset(Name:="Customers", _**
        **Type:=dbOpenTable)**

        MsgBox myRecordset("ID")
        MsgBox myRecordset("Company")
        MsgBox myRecordset("Address")
        MsgBox myRecordset("City")

        Set myRecordset = Nothing
    End Sub

### Opening a DAO Recordset Using an SQL _SELECT_ Statement

If you want to return only a subset of records rather than an entire table, use an SQL SELECT statement to open the DAO recordset. (See "Using an SQL SELECT Statement to Access a Subset of the Data in an ADO Recordset," earlier in this chapter, for an explanation of the essentials of SQL SELECT statements.)

Specify the SQL statement as the Name argument for the OpenRecordset method, as the following example illustrates. This code declares a Database object variable, assigns the Northwind sample database to it, declares a RecordSet object variable, and then assigns to the object variable the results of a SELECT statement run on the database:

    Sub DAOSelect()

        Dim myDatabase As DAO.Database
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")

        Dim myRecordset As DAO.Recordset
        Set myRecordset = myDatabase.OpenRecordset _
            (Name:="SELECT * FROM Customers WHERE City = 'Boston'", _
            Type:=dbOpenDynaset)
        Do Until myRecordset.EOF
            Debug.Print myRecordset("Last Name")
            myRecordset.MoveNext
        Loop

        Set myRecordset = Nothing
    End Sub

Note that the results in this example are printed in the VBA Editor's Immediate window, so press Ctrl+G to open that window before pressing F5 to test this procedure.

# Accessing a Particular Record in a Recordset

To work with a particular record in a recordset, you can either move through (loop) the records until you find the one you want or search for the record using the Seek or Find methods. The RecordSet object includes these methods for moving about the records in the recordset:

**Method** | **Moves to Record**
---|---
MoveFirst | First
MoveNext | Next
MovePrevious | Previous
MoveLast | Last
Move | A specified number of records forward or backward from the current record

## Using the MoveFirst, MoveNext, MovePrevious, and MoveLast Methods

The MoveFirst method and MoveLast method are always safe to use because as long as the recordset contains one or more records, there's always a first record and a last record. (If the recordset contains only one record, that record is considered both first and last.)

But if you use the MovePrevious method from the first record in the recordset or the MoveNext method from the last record, you move beyond the recordset, accessing what is sometimes called a "phantom record"—one that isn't there. When you try to access the contents of such a record, VBA gives the runtime error 3021 ("No current record"). Figure 29.1 shows this error.

Figure 29.1 The runtime error "No current record" usually means that you've moved outside the recordset.

BOF means beginning of file, and EOF means end of file. Note that you can visualize the end of a recordset as a point just beyond the last record. EOF, therefore, is not the same as the last record.
BOF, likewise, is not the first record, but a point just before it. (I mention this because we have a tendency to view the first item in a set as the "beginning" of the set; we would consider the first float as the beginning of a parade. Recordsets aren't like that.)

To check whether you're at the beginning or end of the recordset, use the BOF property or the EOF property of the RecordSet object. The BOF property returns True when the current record is at the beginning of the file, and the EOF property returns True when the current record is at the end of the file. To avoid errors, after using the MovePrevious method, check whether the beginning of the file has been reached, as in this example:

    With myRecordset
        .MovePrevious
        **If .BOF = True Then .MoveNext**
    End With

Similarly, after using the MoveNext method, check whether the end of the file has been reached:

    myRecordset.MoveNext
    **If myRecordset.EOF Then myRecordset.MovePrevious**

## Using the _Move_ Method to Move by Multiple Records

To move by several records at once, but not to the first record or last record in the recordset, use the Move method. The syntax for ADO differs from that used with DAO.

Here's the syntax for the Move method with ADO:

    _recordset_.Move NumRecords, Start

The syntax for the Move method with DAO is as follows:

    _recordset_.Move Rows, StartBookmark

Here, _recordset_ is the recordset involved, NumRecords or Rows is the number of records by which to move (use a positive number to move forward or a negative number to move back), and Start or StartBookmark is an optional argument that you can use to specify a bookmark from which you want to start the movement. If you omit Start or StartBookmark, movement starts from the current record.

For example, the following statement moves 10 records forward from the current record in an ADO recordset:

    myRecordset.Move NumRecords:=10

The following statement moves 5 records backward from the current record in a DAO recordset:

    myRecordset.Move Rows:=-5

To create a bookmark, move to the record that you want to mark, and then use the Bookmark property of the RecordSet object. The following example declares a Variant variable named myBookmark and then assigns to it a bookmark representing the current record in an ADO recordset:

    Dim myBookmark As Variant
    myBookmark = myRecordset.**Bookmark**

After setting a bookmark, you can use it as the starting point of a move. For example, the following statement moves to the eighth record after the bookmark myBookmark in an ADO recordset:

    myRecordset.Move NumRecords:=8, **Start:=myBookmark**

# Searching for a Record

The process of searching for a record in a recordset differs in ADO and in DAO. The following sections show you how to search using either technology.

* * *

Also Consider the Seek Method

Both ADO recordsets and DAO recordsets include a method called Seek, which is more complex and more powerful than the Find method for ADO and the four Find methods for DAO discussed here. Consult the Access VBA Help file for additional details on the Seek method.

* * *

## Searching for a Record in an ADO Recordset

To search for a record in an ADO recordset, you can use the Find method of the RecordSet object. The syntax is as follows:

    _recordset_.Find Criteria, SkipRows, SearchDirection, Start

Here are the components of the syntax:

  * _recordset_ is the recordset involved.
  * Criteria is a required String argument that specifies the column name, type of comparison, and value to use. For example, to locate a record where the state is California, you could specify that the State column is equal (=) to CA.
  * SkipRows is an optional Long value that you can use to specify an offset from the current row (or from the bookmark specified by the Start argument) at which to start searching instead of starting from the current row. For example, an offset of 3 starts the search three rows later than the current row.
  * SearchDirection is an optional argument for specifying whether to search forward or backward. The default is adSearchForward; specify adSearchBackward to search backward instead.
  * Start is an optional Variant argument that specifies the bookmark from which to start the search. If you omit Start, the search starts from the current row.

When you run the search, it stops at the first matching record. If no record matches and you're searching forward, it stops at the end of the recordset; if you're searching backward, it stops at the beginning of the recordset. If the end or beginning of the recordset is reached, you know that there was no match for the search.

The following example begins by moving to the first record in the recordset that is represented by the object variable myRecordset. Then the code searches for the first record that matches the criterion "City = 'Denver'". The example checks the EOF property to ensure that the end of the recordset has not been reached. If it has not, a record containing Denver in the City field was found, so the example displays a message box with the last name data from that record. If the end of the recordset has been reached, the example displays a message box stating that no match was found:

    Sub SearchADO()

        Dim strSQL As String

        Dim myRecordset As ADODB.Recordset
        Set myRecordset = New ADODB.Recordset
        myRecordset.ActiveConnection = CurrentProject.Connection

        myRecordset.Open Source:="Select * from Customers", _
            Options:=adCmdText

        With myRecordset
            .MoveFirst
            .**Find Criteria:="City='Denver'"**

            If Not .EOF Then
                MsgBox .Fields("Last Name")
            Else
                MsgBox "No matching record was found."
            End If

        End With

    End Sub

To continue your search for the same criteria, you can use the SkipRows argument to specify an offset so that you don't simply find the current record again. For example, you'll likely want to move ahead just one row, like this:

    myRecordset.Find Criteria:="City='Denver'", SkipRows:=1

## Searching for a Record in a DAO Recordset

To search for a record in a DAO recordset, you can use one of these four methods:

  * The FindFirst method starts searching at the beginning of the recordset and searches forward.
  * The FindNext method starts searching at the current record and searches forward.
  * The FindPrevious method starts searching at the current record and searches backward.
  * The FindLast method starts searching at the end of the recordset and searches backward.

The syntax for these four methods is as follows:

    _recordset_.FindFirst _Criteria_
    _recordset_.FindNext _Criteria_
    _recordset_.FindPrevious _Criteria_
    _recordset_.FindLast _Criteria_

Here, _recordset_ is a required object variable that represents the RecordSet object involved. _Criteria_ is a required String argument that specifies the criteria for the search.
_Criteria_ works in the same way as the WHERE clause in an SQL statement, except that it does not use the word WHERE.

The following example uses the FindFirst method to search from the beginning of the recordset for the first record that matches the criterion City = 'Las Vegas':

    Sub DAOSearch()

        Dim myDatabase As DAO.Database
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")
        Dim myRecordset As DAO.Recordset
        Set myRecordset = myDatabase.OpenRecordset _
            (Name:="SELECT * FROM Customers", _
            Type:=dbOpenDynaset)

        myRecordset.**FindFirst "City = 'Las Vegas'"**

        MsgBox myRecordset("Last Name")

        Set myRecordset = Nothing
    End Sub

After you run one of the four Find methods in a DAO recordset, the NoMatch property of the RecordSet object tells you whether the search succeeded: NoMatch is False if the method found a match and True if it did not. So you can test the NoMatch property to tell whether or not the search found a match, as in this example:

    If myRecordset.NoMatch = False Then
        MsgBox myRecordset("Last Name")
    End If

# Returning the Fields in a Record

Once you've moved to a record, you can return the fields it contains by using the appropriate Field object from the Fields collection. Fields is the default property of the RecordSet object, so you can omit it if you choose. For example, both the following statements return the Last Name field from the current record:

    myRecordset.Fields("Last Name")
    myRecordset("Last Name")

# Editing a Record

To change the data in a record, first use the Edit method to make the record available for editing, then assign the new value to the field, and finally use the Update method of the RecordSet object to save the change to the underlying table. The following example prepares a record for editing with the Edit method, changes the value in the Last Name field to Schmidtz, and then uses the Update method to save the change:

    With myRecordset
        .Edit
        .Fields("Last Name").Value = "Schmidtz"
        .Update
    End With

# Inserting and Deleting Records

To insert a new record, use the AddNew method of the RecordSet object. You can then assign data to the fields in the record. After that, use the Update method to save the data to the table in the database. The following example uses a With statement to perform these actions:

    Sub AddOne()

        Dim myDatabase As DAO.Database
        Set myDatabase = DBEngine.OpenDatabase("C:\temp\Northwind.accdb")
        Dim myRecordset As DAO.Recordset
        Set myRecordset = myDatabase.OpenRecordset _
            (Name:="SELECT * FROM Customers", _
            Type:=dbOpenDynaset)

        With myRecordset
            .AddNew
            .Fields("ID").Value = 32
            .Fields("Last Name").Value = "Murphy"
            .Fields("First Name").Value = "Andrea"
            .Fields("Company").Value = "Company RP"
            .Fields("City").Value = "City of Industry"
            'add data for the other fields here
            .Update
        End With

        Set myRecordset = Nothing

    End Sub

After you press F5 in the VBA Editor to test this code, switch to Access and display the Customers table; _you will need to press F5 to refresh the view in Access before you can see this new record_.

To delete a record, identify it by either moving to it or searching for it, and then use the Delete method. In a DAO recordset, the deletion takes effect immediately; you don't need to call Edit or Update (in fact, calling Update without a pending Edit or AddNew raises an error):

    myRecordset.Delete

# Closing a Recordset

After working with an object, you should close it.
To close a recordset, use the Close method with the appropriate RecordSet object or the object variable that represents the RecordSet object. The following example closes the recordset represented by the object variable myRecordset:

    myRecordset.Close

After closing the recordset, set its object variable to Nothing to release the memory it occupied:

    Set myRecordset = Nothing

# Saving a Recordset to the Cloud

You might want to store a recordset on your hard drive or in the cloud. As you've seen in cloud-access examples in previous chapters, saving files to the cloud is much the same as saving to an ordinary hard-drive folder. By the way, this example also illustrates how to use the Save method of the RecordSet object:

    1. Sub SaveToCloud()
    2.
    3. Dim myRecordset As ADODB.Recordset
    4. Set myRecordset = New ADODB.Recordset
    5. myRecordset.ActiveConnection = CurrentProject.Connection
    6.
    7. Dim strSQL As String
    8. Dim strFilepath As String
    9. strFilepath = "C:\Users\Richard\**SkyDrive**\Cities.xml"
    10.
    11. strSQL = "SELECT city FROM Employees"
    12. myRecordset.Open strSQL
    13.
    14. myRecordset.**Save** strFilepath, adPersistXML
    15.
    16. Set myRecordset = Nothing
    17.
    18. End Sub

To test this, open Northwind and press Alt+F11 to open the VBA Editor. Paste this code into a module, but change _Richard_ in line 9 to your own name.

Most of this code should be understandable from previous examples in this chapter. Line 9 specifies a folder on my hard drive whose contents are automatically synced to SkyDrive after files are saved there. You could just as easily save this recordset to any ordinary hard-drive folder, like this:

    myRecordset.Save "c:\temp\Cities.xml", adPersistXML

The Save method used here stores this recordset in the XML format, about which I'll have much more to say in Chapter 31, "Programming the Office 2013 Ribbon." If you're curious, open the saved Cities.xml file in a text editor or browser: you'll see a schema section describing the recordset's structure, followed by one row element for each of the nine city values in the Employees table.

# The Bottom Line

**Open a recordset.**

You can open an ADO recordset in two different ways.

**Master It**

One way to open an ADO recordset is to provide an argument list following the Open method. What is the other way to open an ADO recordset, which doesn't involve using arguments? Some people say that this second approach makes their code easier to read.

**Access a particular record in a recordset.**

Both ADO and DAO technologies have methods that allow you to move around within a recordset.

**Master It**

One method you can use to traverse a recordset is the MoveFirst method. It takes you to the first record in the recordset. What does the _first record_ mean in a recordset in a relational database? Is it the record that's the lowest numerically, the lowest alphabetically, or what?

**Search for a record.**

Both ADO and DAO offer methods to directly search for a particular record.

**Master It**

ADO offers a Find method. How many methods does DAO offer, and what are they?

**Edit a record.**

When editing a record, you first use the Edit method, and then you can change the value in a field.

**Master It**

After you have made a change to a value in a record, what method do you use to save this change to make it part of the database?

**Insert and delete records.**

It's not difficult to insert new records or delete existing ones.
In both situations, you use the Update method when finished to save the changes to the database.

**Master It**

To insert a new record into a recordset, what method do you use before you can assign data to the fields in the new record?

Chapter 30

Accessing One Application from Another Application

So far, this book has focused on how to work with VBA to perform actions _within_ a VBA host application, such as Word or Access.

But you might sometimes (perhaps often) need to communicate between applications as well. This chapter shows you the tools for contacting and manipulating one application from another: Automation, data objects, Dynamic Data Exchange (DDE), and SendKeys.

In this chapter you will learn to do the following:

  * Use Automation to transfer information
  * Use the Shell function to run an application
  * Use data objects to store and retrieve information
  * Communicate via DDE
  * Communicate via SendKeys

# Understanding the Tools Used to Communicate between Applications

Most VBA host applications (such as the Office applications that this chapter uses as examples) offer several tools for communicating with other applications:

**Automation**

Formerly known as Object Linking and Embedding (OLE), Automation is usually the most effective method for transferring information from one Windows application to another. If the applications you're using support Automation, use it in preference to the alternatives, DDE and SendKeys.

**Dynamic Data Exchange (DDE)**

An older method of transferring information between applications that remains a good fallback when Automation isn't available. DDE is available in only some applications.

**SendKeys**

The oldest and most primitive method of communicating between applications, SendKeys relies on sending keystroke equivalents to the other application. It's an attempt to pretend that someone is typing on the keyboard, but this can cause timing and other issues. Although rudimentary by comparison to Automation and DDE, SendKeys can still be effective in some situations.

Beyond these three communications tools, this chapter discusses the DataObject object, which you can use to store information and to transfer information to and from the Windows Clipboard.

* * *

Don't Forget the Command Line

If an application doesn't offer any of the control methods discussed in this chapter, you may be able to control it through the command line. For example, you can use the /p command-line switch in many applications to print a file without any user interaction. Search the Web for "command line, vba" and the application's name to find relevant tutorials.

* * *

# Using Automation to Transfer Information

Automation is the most powerful and efficient way to communicate between applications. Each application that supports Automation offers one or more Component Object Model (COM) objects that you can access programmatically—usually an object representing the application, an object representing the various types of files the application uses, objects representing its major components, and so on.

For any Automation transaction, there's a _server application_ that provides the information or tools and a _client application_ that receives or employs them. (There's also another pair of terms that distinguish between two communicating applications: the server application is also sometimes known as the _object application_, and the client application is known as the _controlling application_.)
Automation lets the client application harness the built-in capabilities of the server application. For example, Excel has better calculation features than Word and can generate useful charts, data maps, and so on based on its calculations and data. By using Automation, Word can borrow Excel's calculation engine and then insert the results into a Word document. Or Word could use Excel to create a chart that it then inserts into a document as well. Word can also take more-limited actions, such as causing Excel to open a workbook, copy a group of cells from a spreadsheet in it, and paste-link them into a document.

To use Automation through VBA, you create an object in VBA that references the application you want to work with. You use the CreateObject function to create a new object in another application and the GetObject function to retrieve an existing object in another application.

When using Automation, you can choose whether to display the server application or keep it hidden from the user. For some procedures, you'll need to display it—for example, the user might need to choose a file or a folder or make another choice that requires live intervention. In other situations, it can be best to keep the server application hidden so that the user isn't distracted by an application suddenly launching itself spontaneously and robotically carrying out actions in front of the user's startled eyes. This can make some users uneasy, as if the computer has gotten out of control. A colleague of mine, something of a prankster, used to torment new hires by inserting a procedure in their word processor that caused individual characters in a document to start swinging and then drop off the bottom of the screen. As if they'd "come loose." Then he would walk over and tell them that this wouldn't be a problem as long as they didn't jar their desk while typing.

But even if you decide to hide a server application from the user when the procedure runs, in most cases it's helpful to display the server application to yourself while you're writing and testing the procedure. Doing so makes it much easier to see what's going wrong when your code doesn't work as expected.

## Understanding Early and Late Binding

When you use Automation to access another application, you can choose which type of _binding_ to use—that is, how to establish the connection between the client application and the server application.

_Early binding_ involves adding a reference to the application's object library by using the References dialog box (Tools ⇒ References) at design time and then declaring an object at the start of the code by using a Dim statement that declares the specific object class type rather than declaring the object generically As Object.

For example, the following code connects to a slide within a PowerPoint presentation by using early binding:

    Dim myPowerPoint As PowerPoint.Application
    Dim myPresentation As Presentation
    Dim mySlide As Slide
    Set myPowerPoint = CreateObject("PowerPoint.Application")
    Set myPresentation = myPowerPoint.Presentations.Add
    Set mySlide = myPresentation.Slides.Add(Index:=1, Layout:=ppLayoutTitleOnly)

With late binding, you create an object that references the other application when you run the code. If you declare the object explicitly, you declare it as a generic object—As Object—rather than declare it as a specific object class type.
For example, the following statements declare the Object variable myOutlook and then assign to it a reference to an Outlook.Application object:

    Dim myOutlook As Object
    Set myOutlook = CreateObject("Outlook.Application")

* * *

Early Binding Isn't Universal

Not all applications that support Automation support early binding. Some applications cannot provide direct access to their functions at design time, while you're writing your code, as is required for early binding. They provide access to their functions only at runtime, when the code itself is executing. With such applications, you have no choice; you must use late binding.

* * *

If the server application you're using supports early binding, use it in preference to late binding. There are three advantages to early binding:

  * Once you've added to the project the reference to the application's object library, you can work in your code with the outside (server) application's objects, properties, and methods through the VBA Editor in the client application. This makes it much easier to use the Editor's built-in IntelliSense features to find the objects, properties, and methods you need in the application you're referring to, and to avoid mistakes such as typos and missing arguments.
  * Because you specify the particular type of object when you declare the object variable, you're less likely to attempt to work with the wrong object by mistake.
  * Because VBA can compile more information about the object, elements of its methods and properties need not be resolved during runtime, so the code runs faster.

On the other hand, late binding can avoid object-library issues such as having to make the right references and other library-version problems.

## Creating an Object with the _CreateObject_ Function

The CreateObject function creates and returns a reference to an Automation object exposed to other applications. The syntax is as follows:

    CreateObject( _class_ [, _servername_ ])

Here, _class_ is a required argument specifying the class (the formal definition) of the object to create. The _class_ argument consists of the name of the library that will provide the object and the type of object to be provided, so it looks like this:

    applicationname.objecttype

For example, to specify the Excel Application object as a class, use a _class_ argument of Excel.Application. Here, Excel is the name of the application that provides the object, and Application is the type of object that we want Excel to provide. Likewise, Excel.Sheet would specify a worksheet object in Excel.

_servername_ is an optional string Variant that specifies the name of the network server on which to create the object. If you merely want to connect to an application located on the user's machine (in other words, if both applications—the client and server applications—are located on the same hard drive), omit _servername_ or specify an empty string. To connect with an application located on a remote server machine, you must have DCOM (the Distributed Component Object Model) installed, and the object on the server computer must be configured to allow remote creation.

Typically, you'll use a CreateObject function with a Set statement to assign to an object variable the object that you create.
For example, the following statements declare an object variable named myNewSheet and assign an Excel worksheet object to it:

    Dim myNewSheet As Object
    Set myNewSheet = CreateObject("Excel.Sheet")

* * *

Can Be Used with Any COM Object

You can use the CreateObject function with any COM object on your computer system, not just with application objects.

* * *

## Returning an Object with the _GetObject_ Function

The GetObject function returns a reference to an existing Automation object. The syntax is as follows:

    GetObject([ _pathname_ ] [, _class_ ])

You can provide either argument—but you must provide _one_ of them. Here, _pathname_ is an optional string Variant specifying the full path and name of the file that contains the object you want to retrieve. _pathname_ is optional, but if you don't specify it, you must specify the _class_ argument. _class_ (which is optional if you specify _pathname_, but required if you don't) is a string Variant specifying the class of the object you want to return.

As with CreateObject, typically you'll use a GetObject function with a Set statement to assign to an object variable the object that you return with the GetObject function. For example, in the second of the following statements, the GetObject function returns an object consisting of the workbook Z:\Finance\Revenue.xlsm. The Set statement assigns this object to the object variable named Revenue declared in the first statement:

    Dim Revenue As Object
    Set Revenue = GetObject("Z:\Finance\Revenue.xlsm")

Here, the workbook is associated with Excel. When this code runs, VBA starts Excel if it isn't already running and activates the workbook. You can then reference the object by referring to its object variable; in this example, you could manipulate the Revenue object to affect the Z:\Finance\Revenue.xlsm workbook.

## Examples of Using Automation with the Office Applications

The following sections show three examples of using Automation with Office applications.

### Transferring Information from an Excel Spreadsheet to a Word Document

This example transfers information from an Excel spreadsheet to a Word document.

First, you need to add to the target Word project (the client project that will contain the code that accesses Excel) a reference to the Excel object library. Follow these steps:

1. Start or activate Word, and then press Alt+F11 to launch the VBA Editor.

2. In the Project Explorer, click the project to which you want to add the reference. For example, if the procedure or procedures will reside in the Normal.dotm template, select the Normal project in the Project Explorer before adding the reference. Or just choose Insert ⇒ Module to create a brand-new module to play around with.

3. Choose Tools ⇒ References to display the References dialog box.

4. Select the check box for the Microsoft Excel 15.0 Object Library item.

5. Click the OK button to close the References dialog box.

Once you've added the reference, you can use the VBA Editor's Object Browser to browse Excel objects. Display the Object Browser as usual by pressing F2 or choosing View ⇒ Object Browser, and then choose Excel in the Object Browser's Project/Library drop-down list. The Object Browser will display the contents of the Excel object library, as shown in Figure 30.1. You can display the help (code examples, syntax) for a selected Excel object by clicking the Help button (the question-mark icon) in the Object Browser.
Figure 30.1 Once you've loaded the Excel object library, you can view its contents in the Object Browser from the VBA Editor session launched from the host application (in this case, Microsoft Word).

To create and test the next code example, first set up in Excel the preconditions that this procedure expects: namely, a range object named SalesTotal. To do this, open Excel, and right-click a cell anywhere in the displayed sheet in Book1 (the default name of the first blank workbook). If you don't see a workbook named Book1, choose File ⇒ New, then click the blank workbook icon in the displayed templates.

In the context menu that opens when you right-click a cell in Book1, choose the Define Name option. In the New Name dialog box that opens, type **SalesTotal** in the Name field. Then click OK to close the dialog box.

Now double-click the same cell you just named and type in **145** or some other value. It's this value that your macro in Word will pluck from this workbook. Now click the File tab in the Ribbon, choose Save As, and save this workbook as **Book1.xlsx** in the C:\temp subdirectory. (Note that you're saving it as an .xlsx file.) Now you can either leave Excel running or just close it. It won't matter because your macro will open the file on the hard drive.

Okay, now in Word's VBA Editor, add the code. Because you used early binding, you have the Editor's IntelliSense assistance and code-completion features available. Create the procedure shown in Listing 30.1. This procedure uses the GetObject function to retrieve the information from the specified cell in the Excel spreadsheet you previously created and inserts this data in the active Word document at the current insertion point (where the blinking cursor is).

**Listing 30.1**: Getting data from an Excel cell and inserting it into Word

    1. Sub Return_a_Value_from_Excel()
    2.
    3. Dim mySpreadsheet As Excel.Workbook
    4. Dim strSalesTotal As String
    5.
    6. Set mySpreadsheet = _
    7. **GetObject("C:\Temp\Book1.xlsx")**
    8.
    9. strSalesTotal = **mySpreadsheet.Application.Range("SalesTotal").Value**
    10.
    11. Set mySpreadsheet = Nothing
    12.
    13. Selection.TypeText "Current sales total: $" & strSalesTotal & "."
    14.
    15. Selection.TypeParagraph
    16.
    17. End Sub

This subprocedure retrieves one piece of information from an Excel spreadsheet that's on the hard drive in the C:\temp directory. Here's what happens in the subprocedure:

  * Line 3 declares the object variable mySpreadsheet of the type Excel.Workbook. Line 4 declares the String variable strSalesTotal.
  * Line 6 uses a Set statement and the GetObject function to make mySpreadsheet reference the spreadsheet C:\Temp\Book1.xlsx.
  * Line 9 assigns to the String variable strSalesTotal the Value property (the actual data) of the Range object named SalesTotal in the Excel Application object. You defined the SalesTotal range as a single cell, so strSalesTotal receives the value of that cell.
  * Line 11 assigns to the mySpreadsheet object the special value Nothing, releasing the memory it occupied. (Because the procedure ends almost immediately afterward, this statement isn't strictly necessary here; VBA destroys the object at the end of the procedure anyway. But it's good practice to free the memory assigned to an object when you no longer need the object, just to get into the habit.)
  * Line 13 uses the TypeText method of the Selection object in Word to enter a string of text and the strSalesTotal string at the current selection.
Line 15 uses the TypeParagraph method to insert a paragraph after the text.

If you have trouble getting this example to work, double-check the following:

  * Choose Tools ⇒ References in the editor to ensure that the check box next to Microsoft Excel 15.0 Object Library is checked.
  * If you see an error message stating "Run-time error '432': File name or class name not found during Automation operation," it means that there's something wrong in this line of code:

    Set mySpreadsheet = _
    GetObject("C:\Temp\Book1.xlsx")

Either you've mistyped this path in your code (such as typing C:\Docs rather than C:\Temp) or you have not saved an Excel file named Book1.xlsx to this folder.

  * If you see an error message stating "Run-time error '1004': Method 'Range' of object '_Application' failed," this is an error in the following line of code:

    strSalesTotal = mySpreadsheet.Application.Range("SalesTotal").Value

A failure of this code means either you've got a typo in the code, such as specifying the wrong range name, or there is no range by the name SalesTotal in the Excel workbook you're opening.

### Transferring Information from a Word Document to an Excel Workbook

We managed to send data from Excel to Word in the previous section. Now let's go the other way.

This next procedure (Listing 30.2) runs as a macro in Word. The procedure requires that Excel be currently running, so the procedure checks for the possibility that Excel isn't executing and handles the problem itself by starting Excel if necessary. The procedure creates a new Excel workbook and then transfers information from Word to the workbook.

For this example to work, you must store a Word .docm file named test.docm in your C:\temp directory.

As before, you'll find creating this procedure easier if you first add to the current Word project a reference to the Excel object library. (See the previous section for instructions.)

**Listing 30.2**: Sending data from Word to Excel

    1. Sub Send_Word_Count_to_Excel_Spreadsheet()
    2.
    3. Dim WordCount As Variant
    4. Dim strPath As String
    5. Dim strFile As String
    6. Dim docCurDoc As Document
    7. Dim myXL As Excel.Application
    8. Dim myXLS As Excel.Workbook
    9. Const errExcelNotRunning = 429
    10. Const errDocNotAvailable = 5174
    11.
    12. On Error GoTo Handle
    13.
    14. ' open the Word document:
    15. strPath = "C:\temp"
    16. strFile = "test.docm"
    17. Set docCurDoc = Documents.Open(strPath & "\" _
    18. & strFile, AddToRecentFiles:=False)
    19.
    20.
    21. 'is Excel already running?
    22. Set myXL = GetObject(, "Excel.application")
    23.
    24. myXL.Visible = True
    25. Set myXLS = myXL.Workbooks.Add
    26. myXL.ActiveCell.Range("A1").Select
    27. myXL.ActiveCell = "Word Count"
    28.
    29. WordCount = docCurDoc _
    30. .BuiltInDocumentProperties(wdPropertyWords)
    31.
    32. myXL.ActiveCell.Range("A2").Select
    33. myXL.ActiveCell = WordCount
    34.
    35. docCurDoc.Close SaveChanges:=wdDoNotSaveChanges
    36.
    37. Shutdown:
    38. Set myXL = Nothing
    39. Set myXLS = Nothing
    40.
    41. Exit Sub
    42.
    43. Handle:
    44. If Err.Number = errExcelNotRunning Then
    45. 'If no instance of Excel is running, then run it:
    46. Set myXL = CreateObject("Excel.Application")
    47. Err.Clear
    48. Resume Next
    49. ElseIf Err.Number = errDocNotAvailable Then
    50. MsgBox "No Word Document named Test.docm Found"
    51. GoTo Shutdown
    52. Else
    53. Resume Next
    54. End If
    55.
    56. End Sub

Here's what happens in Listing 30.2:

  * Line 2 is a spacer.
In fact, all blank lines are just spacers—so I won't mention them again.

  * Line 3 declares the Variant variable that will hold the number of words in the Word document. Later, in line 33, this variable's value is assigned to an Excel cell. Line 4 declares the String variable strPath that will hold the file path to the Word document, and line 5 declares the String variable strFile that will hold the Word document's filename.
  * Line 6 declares the Document variable docCurDoc; it will point to the Word document when it is opened using the Open method of the Documents object. Line 7 declares an Excel.Application object variable myXL, and line 8 declares an Excel.Workbook object variable myXLS.
  * Line 9 declares the constant errExcelNotRunning, setting its value to 429. This error number indicates that the procedure attempted to manipulate Excel while no instance of Excel was currently executing. Line 10 declares the constant errDocNotAvailable, setting its value to 5174. This error number indicates that the Word document your procedure attempted to open could not be found.
  * Line 12 starts error handling for the procedure, directing execution to the code below the label Handle in the event of an error.
  * Line 17 opens the Word document specified by strPath, a backslash, and strFile, assigning the document object to the docCurDoc variable. If the document isn't available, an error occurs and execution is transferred to the error-handler code that starts in line 43. The resulting error number (5174) matches the constant defined in the procedure as errDocNotAvailable, so a message box informs the user that the Word document wasn't found. Then execution is transferred to the Shutdown label, where the two object variables are destroyed and the procedure is exited.
  * Line 22 can also potentially trigger an error condition. It attempts to assign a currently executing instance of Excel to the object variable myXL. If this attempt fails, execution is transferred to the Handle label. If Excel isn't running at this point, error 429 ("ActiveX component cannot create object") occurs, so line 44 in the error handler checks for this error by using the constant errExcelNotRunning. If it matches the error number, line 46 assigns to myXL a _new_ instance of Excel that it creates by using the CreateObject function. Line 47 then uses an Err.Clear statement to clear the error, and line 48 contains a Resume Next statement to make VBA resume execution at the statement following the one that caused the error.
  * One way or another, by the time line 24 is executed, myXL refers to a running instance of Excel. Line 24 sets the Visible property of myXL to True so that it appears onscreen.
  * Line 25 assigns to myXLS a new workbook created by using the Add method of the Workbooks object in myXL.
  * Line 26 positions the insertion pointer in the first cell.
  * Line 27 assigns to the active cell in myXL the text Word Count.
  * Line 29 assigns the document's word count to the variable WordCount. This value is accessed by using the wdPropertyWords property from the BuiltInDocumentProperties collection of docCurDoc.
  * Line 32 moves the insertion cursor down one row in Excel to cell A2, and line 33 displays the word count in that cell.
  * Finally, line 35 closes the Word document without saving any changes that may have been made to it while it was opened for inspection.
  * Line 41 contains an Exit Sub statement to exit the procedure at this point—to avoid permitting execution to continue down into the zone where the error-handling statements are. Using an Exit Sub like this is common when a procedure includes an error handler at the end.

### Placing a PowerPoint Slide in an Outlook Message

The next procedure shows how to communicate between PowerPoint and Outlook. This procedure, run from PowerPoint, returns the existing instance of Outlook or (if there is none) creates a new instance. The procedure then uses PowerPoint to send a message that gives details drawn from the presentation.

Listing 30.3 shows the procedure. There's one complication: Because PowerPoint doesn't have a central macro storage project like Word's Normal.dotm or Excel's Personal Macro Workbook, the code must be stored in an open presentation. This could be the presentation that is the subject of the email, but it is much more convenient to maintain a code-only presentation that you open at the beginning of all PowerPoint sessions that require the use of code. This becomes your own personal macro-storage system.

In any case, you need some slides from which to pick information that will be sent (and you also need to provide your email address), so follow these steps to set up the necessary preconditions for the upcoming example.

First, prepare the target PowerPoint project (the project that will contain the code that accesses Outlook and will contain the slides you're accessing):

1. Start PowerPoint. Click the Photo Albums link at the top of the sample templates (just below the search field). Click the Contemporary Photo Album presentation, then click the Create button to load it into PowerPoint.

2. Launch the VBA Editor by pressing Alt+F11.

3. In the VBA Editor, choose Insert ⇒ Module to open a code module where you can put this procedure.

4. Choose Tools ⇒ References to display the References dialog box.

5. Select the check box for the Microsoft Outlook 15.0 Object Library item.

6. Click OK to close the References dialog box.

Now enter the code from Listing 30.3 into the module you inserted in step 3. Be sure to replace my email address in line 23 with your email address.

**Listing 30.3**: Placing a PowerPoint Slide in an Outlook Message

    1. Sub Notify_of_New_Presentation()
    2.
    3. Dim myPresentation As Presentation
    4. Dim strPresentationFilename As String
    5. Dim strPresentationTitle As String
    6. Dim strPresentationPresenter As String
    7. Dim myOutlook As Outlook.Application
    8. Dim myMessage As Outlook.MailItem
    9. Const errOutlookNotRunning = 429
    10.
    11. On Error GoTo ErrorHandler
    12.
    13. Set myPresentation = ActivePresentation
    14. With myPresentation
    15. strPresentationFilename = .FullName
    16. strPresentationTitle = _
        .Slides(1).Shapes(3).TextFrame.TextRange.Text
    17. strPresentationPresenter = _
        .Slides(1).Shapes(1).TextFrame.TextRange.Text
    18. End With
    19.
    20. Set myOutlook = GetObject(, "Outlook.Application")
    21. Set myMessage = myOutlook.CreateItem(ItemType:=olMailItem)
    22. With myMessage
    ' replace the following line with your email address:
    23. **.To = "richard41@pri.r.com"**
    24.
    25. .Subject = "Presentation for review: " & strPresentationTitle
    26. .BodyFormat = olFormatHTML
    27. .Body = "Please review the following presentation:" & _
        vbCr & vbCr & "Title: " & strPresentationTitle & vbCr & _
        "Presenter: " & strPresentationPresenter & vbCr & vbCr & _
        "The presentation is in the file: " & _
        strPresentationFilename
    28. .Send
    29. End With
    30.
    31. myOutlook.Quit
    32.
    33. Set myMessage = Nothing
    34. Set myOutlook = Nothing
    35. Exit Sub
    36. ErrorHandler:
    37. If Err.Number = errOutlookNotRunning Then
    38. Set myOutlook = CreateObject("Outlook.Application")
    39. Err.Clear
    40. Resume Next
    41. Else
    42. MsgBox Err.Number & vbCr & Err.Description, vbOKOnly + _
        vbCritical, "An Error Has Occurred"
    43. End If
    44.
    45. End Sub

Here's what happens in Listing 30.3:

  * Line 3 declares a Presentation object variable named myPresentation. Line 4 declares a String variable named strPresentationFilename, which is used for storing the path and filename of the presentation. Line 5 declares a String variable named strPresentationTitle, which is used to store the title of the presentation. Line 6 declares a String variable named strPresentationPresenter, which is used to store the name of the presenter of the presentation.
  * Line 7 declares an Outlook.Application object variable named myOutlook that is used to represent the Outlook application. Line 8 declares an Outlook.MailItem object variable named myMessage that is used to represent the message that the procedure creates. Line 9 declares a constant named errOutlookNotRunning and assigns to it the number 429, the error number returned if no instance of Outlook is available when the GetObject function tries to access it.
  * Line 11 starts error handling for the procedure, directing execution to the label ErrorHandler (in line 36) in the event of an error.
  * Line 13 assigns the active presentation to the myPresentation object variable. Lines 14 through 18 contain a With structure that works with myPresentation. Line 15 assigns the FullName property of myPresentation to strPresentationFilename.
  * Line 16 assigns to strPresentationTitle the Text property of the TextRange object in the TextFrame object in the third Shape object on the first Slide object—in other words, the text from the first placeholder shape on the first slide in the presentation. Similarly, line 17 assigns to strPresentationPresenter the text from the first shape on the first slide.
  * Line 20 assigns to myOutlook the current instance of Outlook, which it returns using the GetObject function. If Outlook isn't running at this point, error 429 ("ActiveX component cannot create object") occurs, so line 37 in the error handler checks for this error by using the constant errOutlookNotRunning. If it matches, line 38 assigns to myOutlook a new instance of Outlook that it creates by using the CreateObject function. Line 39 then uses an Err.Clear statement to clear the error, and line 40 contains a Resume Next statement to cause VBA to jump back up in the code and resume execution where it left off (at the statement after the offending statement).
  * Line 21 uses the CreateItem method of the Outlook Application object (represented by myOutlook) to create a new mail item (a new email), which it assigns to myMessage. Lines 22 through 29 contain a With structure that works with myMessage.
  * Line 23 assigns recipients by setting the To property. (_You should change this line to your own email address so you can test this code and receive the message it sends._)
  * Line 24 is a placeholder.
  * Line 25 enters text for the Subject property. Line 26 specifies that the message use HTML formatting (.BodyFormat = olFormatHTML). Line 27 assigns text to the body of the message by using the Body property. Line 28 then uses the Send method to send the message.
  * Line 31 uses the Quit method to close myOutlook.
  * Line 33 sets myMessage to Nothing, releasing the memory it occupied. Similarly, line 34 sets myOutlook to Nothing. Line 35 then exits the procedure.
  * As discussed earlier in this list, the primary function of the error handler is to launch an instance of Outlook if none is currently running. If any error other than error 429 occurs, execution branches to the Else statement in line 41, and line 42 displays a message box that gives the error number and description.

If you test this example, be sure to remember to change line 23 from my email address to your email address. When the procedure finishes execution, look in your Inbox in Outlook for the new email message.

# Using the _Shell_ Function to Run an Application

Instead of using the CreateObject function to start an application and return a reference to it, you can use the Shell function to run an application. Shell can run any executable program, and its syntax is straightforward:

    Shell(pathname[,windowstyle])

Here, pathname is the file path and program name of the program you want the Shell command to execute. Also include in the pathname any necessary command-line switches or arguments required by that program.

This example opens Internet Explorer, maximizes its window, then switches the focus to it:

    Sub OpenIE()

        Dim id

        id = Shell("c:\program files\internet explorer\iexplore.exe", vbMaximizedFocus)

    End Sub

* * *

_Shell_ Launches Executable Programs

Shell expects an executable program as its pathname argument. Unlike double-clicking a file in Windows Explorer, Shell does not consult filename-extension associations, so passing a document such as testfile.txt typically raises a runtime error rather than opening Notepad. To open a document in its associated application, pass the document to the program explicitly (for example, Shell "notepad.exe testfile.txt") or launch it through the command processor. If Shell can't find the specified program, it returns a runtime error.

* * *

windowstyle is an optional integer Variant that you use to specify the type of window in which to run the application and whether to switch focus to the newly launched application. Table 30.1 lists the constants and values for windowstyle.

* * *

Using the _Sleep_ Function to Avoid Problems with Shell's Asynchrony

The Shell function runs other programs _asynchronously_ rather than _synchronously_. In other words, Shell doesn't halt all other activity until it is finished with its job. So when VBA executes a Shell statement, it registers the statement as an action to be performed—but that action may not necessarily be finished before the next statement in your code executes.

This asynchrony can cause errors in your procedures if subsequent commands depend on the Shell statement having already been executed. If you run into this type of problem, a crude but often-effective fix is to just allow extra time for the Shell function to execute before taking any dependent action.
For example, you might run the Shell function earlier in the procedure than you otherwise would have done rather than running it right before the dependent actions. But a better solution is to use an API call (such as Sleep) to delay the execution of further statements for a few seconds so that the Shell function can finish executing. Place this declaration in the declarations section at the top of the Code window: + + Public Declare Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long) + +Then call the Sleep function at the appropriate point in your code, specifying the number of milliseconds you want the code to wait. The following statement uses Sleep to implement a 2-second delay: + + Sleep (2000) + +* * * + +Table 30.1 Constants and values for the windowstyle argument + +**Constant** | **Value** | **Window Style** +---|---|--- +vbHide | 0 | Minimized and hidden, but with focus +vbNormalFocus | 1 | Normal ("restored") with focus +vbMinimizedFocus | 2 | Minimized with focus (the default) +vbMaximizedFocus | 3 | Maximized with focus +vbNormalNoFocus | 4 | Normal ("restored") without focus +vbMinimizedNoFocus | 6 | Minimized without focus + +# Using Data Objects to Store and Retrieve Information + +As you've seen so far in this book, you can store information in many places using VBA. But what you will find uniquely useful about the _data object_ is its ability to copy information to, and retrieve information from, the Clipboard. This chapter is all about ways to communicate between applications, and the Clipboard is one such way. + +A data object is logically attached to a UserForm object in the Microsoft Forms object model, but you can use a data object by itself with no user form displayed. This is similar to the way that you can create and manipulate a hidden Access database with no visible interface displayed to the user. (This phenomenon is described in the section titled "Opening Multiple Databases at Once" in Chapter 28.) + +A data object, which is represented in VBA by the DataObject object, is used to store data. Each data object can hold multiple pieces of text information, and each piece must be in a different, defined format. You can create and use multiple data objects to store multiple pieces of data in the same format, or you can cheat and tell VBA that information is in a different format when really it's not. + +At any given time, the Clipboard can contain one text item and one item in another format, such as a graphical object. If you copy another text item to the Clipboard, that item will overwrite the previous text item, but any graphical item on the Clipboard will remain unscathed. Likewise, if you copy a graphical item to the Clipboard, it will overwrite any previous graphical item (or indeed any item in a non-text format) stored in the Clipboard, but any text item in the Clipboard won't be affected. + +The data object works in a way similar to the Clipboard. However, a data object can't store graphical information. It _can_ store multiple pieces of text information, each _defined_ as being in a different format. + +## Creating a Data Object + +To create a data object, declare an object variable of the DataObject type and then use a Set statement to assign a new DataObject object to it. 
For example, the following statements declare a DataObject variable named myDObj and assign a new DataObject to it: + + Dim myDObj As DataObject + Set myDObj = New DataObject + +## Storing Information in a Data Object + +To store information in a data object, use the SetText method, which has the following syntax: + + _object_.SetText(StoreData [,format]) + +The components of the syntax are as follows: + + * _object_ is a required argument specifying a valid object. + * StoreData is a required argument specifying the data to store in the data object. + * format is an optional argument containing an Integer value or a String specifying the format of the information in StoreData. A value of 1 indicates text format; a value other than 1 or a String indicates a user-defined format. + +For example, the following statement stores the text Sample text string in the DataObject named myDObj: + + myDObj.SetText "Sample text string" + +The following statement stores the text Sample formatted text string in the DataObject named myDObj, defining and using the custom format myFormat: + + myDObj.SetText "Sample formatted text string", "myFormat" + +Once the custom format has been defined and stored in the data object, you can access the data stored in that format by specifying the format. In this case, no formatting is actually involved—the code simply uses the format argument to create and identify a different data slot in the data object so that the new string doesn't overwrite the existing text string. It's a trick. + +## Returning Information from a Data Object + +To return information from a data object, use the GetText method of the DataObject object. The GetText method has the following syntax: + + _object_.GetText([format]) + +The components of the syntax are as follows: + + * _object_ is a required argument specifying a valid object. + * format is an optional argument containing a String or an Integer specifying the format of the data to retrieve. + +For example, the following statement displays a message box containing the plain-text string stored in the DataObject named myDObj: + + MsgBox myDObj.GetText + +The following statement assigns to the String variable strTemp the text stored with the myFormat format in the DataObject named myDObj: + + strTemp = myDObj.GetText("myFormat") + +Here's a working code example that illustrates how to create a data object and then uses it to store and retrieve information. First, choose Tools ⇒ References in the editor to ensure that the check box next to Microsoft Forms 2.0 Object Library is checked. Note that it's likely this library will not be in its correct alphabetic location in the list of libraries in the References dialog box. Instead, it will probably be already checked and, thus, found in the first 10 or so libraries at the top of the References list. + +Type this working example into an application's VBA Editor, and press F5 to see it execute: + + Sub StoreText() + + Dim myDObj As DataObject + + Set myDObj = New DataObject + + myDObj.SetText "Sample text string" + + MsgBox myDObj.GetText + + End Sub + +## Assigning Information to the Clipboard + +To assign text to the Clipboard from a data object, use the PutInClipboard method of the DataObject. 
For example, the following procedure creates a new data object named myDO, assigns to it the text Nasta Louise Gomes, and then assigns that text to the Clipboard:
+
+ Sub StoreText()
+
+ Dim myDO As DataObject
+ Set myDO = New DataObject
+ myDO.SetText "Nasta Louise Gomes"
+ myDO.PutInClipboard
+
+ End Sub
+
+Test this by pressing F5, and then press Ctrl+V to display the Clipboard contents in the Editor, or Word, or some other text application.
+
+## Returning Information from the Clipboard to a Data Object
+
+To fetch whatever text information is in the Clipboard and store it in a data object, use the GetFromClipboard method of the DataObject object. The following example creates a data object referenced by the variable aDO, assigns to it the text from the Clipboard, and then displays the text:
+
+ Sub GetClipboardText()
+
+ Dim aDO As DataObject
+ Set aDO = New DataObject
+ aDO.GetFromClipboard
+
+ MsgBox aDO.GetText
+
+ End Sub
+
+To return formatted information from the Clipboard and store it in a data object, use the GetFromClipboard method as usual and then specify the format you want when you retrieve the text with GetText. You can first check whether a given format is present by using the GetFormat method of the DataObject object.
+
+## Finding Out Whether a Data Object Contains a Given Format
+
+To find out whether a data object contains a given format, use the GetFormat method of the DataObject object. The syntax for the GetFormat method is as follows:
+
+ _object_.GetFormat(format)
+
+Here are the components of the syntax:
+
+ * _object_ is a required expression that returns a valid DataObject object.
+ * format is an Integer or String specifying the format you're looking for. If the DataObject contains the format, GetFormat returns True; if not, GetFormat returns False.
+
+For example, the following statement checks to see if the DataObject named myDO contains the format myHTML and assigns the format's contents to the string strHTMLText if it does:
+
+ If myDO.GetFormat("myHTML") = True Then _
+ strHTMLText = myDO.GetText(Format:="myHTML")
+
+# Communicating via DDE
+
+If the application with which you want to communicate doesn't support Automation, you can try Dynamic Data Exchange (DDE). DDE is a protocol that establishes a channel between two applications through which they can automatically exchange data. DDE can be tricky to set up, but once you get it working, it is usually reliable.
+
+Not all applications support DDE. Among the Office applications, Word, Excel, and Access support DDE, but PowerPoint and Outlook do not. What's more, Microsoft warns that DDE is not a secure technology. So use it only in situations where you aren't vulnerable to outside intrusion.
+
+In the following descriptions of DDE statements, I'll use the term _method_ in its more generic, non-OOP sense. Back long, long ago when DDE was introduced (in Windows 3.0!), object-oriented programming wasn't yet fashionable.
+
+A typical DDE conversation can contain the following actions:
+
+ * Using the DDEInitiate method to start a DDE connection and establish the channel on which the connection operates
+ * Using the DDERequest method to return text from the other application or the DDEPoke method to send text to the other application
+ * Using the DDEExecute method to execute a command in the other application
+ * Using the DDETerminate method to close the current DDE channel or using the DDETerminateAll method to close all the DDE channels
+
+## Using _DDEInitiate_ to Start a DDE Connection
+
+To start a DDE connection, you use the DDEInitiate method.
The DDEInitiate method employs the following syntax:
+
+ _expression_.DDEInitiate(App, Topic)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression specifying an Application object.
+ * App is a required String argument specifying the name of the application with which the DDE connection is to be started.
+ * Topic is a required String argument specifying the DDE topic (such as an open file) in the application. To discover the list of topics available for an application, you send a DDE request (via the DDERequest method, discussed in the next section) to the System object in the application.
+
+DDEInitiate returns the number of the DDE channel established. You then use this number for subsequent DDE calls.
+
+For example, the following statements declare the Long variable lngDDEChannel1 and assign to it a DDE channel established with the workbook Sales Results.xlsm in Excel:
+
+ Dim lngDDEChannel1 As Long
+ lngDDEChannel1 = DDEInitiate("Excel", "Sales Results.xlsm")
+
+## Using _DDERequest_ to Return Text from Another Application
+
+To return a string of text from another application, you use the DDERequest method. The DDERequest method has the following syntax:
+
+ _expression_.DDERequest(Channel, Item)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to use for the request.
+ * Item is a required String argument specifying the item requested.
+
+To get the list of topics available via DDE, request the Topics item from the System topic. For example, the following statements establish a DDE channel to FrontPage (by using DDEInitiate) and return the list of DDE topics, assigning the list to the String variable strDDETopics:
+
+ Dim lngDDE1 As Long
+ Dim strDDETopics As String
+ lngDDE1 = DDEInitiate(App:="FrontPage", Topic:="System")
+ strDDETopics = DDERequest(Channel:=lngDDE1, Item:="Topics")
+
+Open Excel, click the File tab on the Ribbon, and then click the New option. Click Monthly Family Meal Planner in the display of templates. Then click the Create button.
+
+Now open Word's VBA Editor and type in the following procedure. The following statements establish a DDE channel to the Monthly family meal planner1 workbook in Excel and return the contents of cell D11 (R11C4) in the String variable strResult:
+
+ Sub DDEtoExcel()
+
+ Dim lngDDEChannel1 As Long, strResult As String
+ lngDDEChannel1 = DDEInitiate("Excel", "Monthly family meal planner1")
+ strResult = DDERequest(lngDDEChannel1, "R11C4")
+ MsgBox strResult
+ DDETerminateAll
+
+ End Sub
+
+When you press F5 to test this, you should see a message box displaying "Beef and Mushroom Skillet Supper," which sounds pretty nasty.
+
+For DDE to work, you have to use the correct, full name of the target document as it appears in the title bar of the application. In this case, your target document is an Excel workbook named _Monthly family meal planner1._
+
+The previous code works only if you haven't yet saved the Monthly family meal planner1 workbook because before it's saved, a new workbook has no filename extension appended to its name. However, if you _have_ already saved this workbook, you must append whatever filename extension you employed, such as .xlsm. Here's an example:
+
+ lngDDEChannel1 = DDEInitiate("Excel", "Monthly family meal planner1.xlsm")
+
+The DDETerminateAll statement is explained shortly.
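+
+If you aren't sure whether the target workbook has been saved yet, you can simply try both forms of the name. Here's a minimal sketch of that approach (it assumes Word as the host and reuses the workbook name from the preceding example; the procedure name is just illustrative):
+
+ Sub DDEInitiateEitherName()
+
+ Dim lngChannel As Long
+ Dim strResult As String
+
+ On Error Resume Next
+ ' Try the unsaved form of the name first
+ lngChannel = DDEInitiate("Excel", "Monthly family meal planner1")
+ If Err.Number <> 0 Then
+ ' That failed, so assume the workbook has been saved
+ ' and append the filename extension
+ Err.Clear
+ lngChannel = DDEInitiate("Excel", "Monthly family meal planner1.xlsm")
+ End If
+ On Error GoTo 0
+
+ strResult = DDERequest(lngChannel, "R11C4")
+ MsgBox strResult
+ DDETerminateAll
+
+ End Sub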
+
+## Using _DDEPoke_ to Send Text to Another Application
+
+To send text to another application, use the DDEPoke method. The DDEPoke method has the following syntax:
+
+ _expression_.DDEPoke(Channel, Item, Data)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to use.
+ * Item is a required String argument specifying the item to which to send the data.
+ * Data is a required String argument specifying the data to be sent.
+
+Continuing to use the previous example, the following statements use the DDEPoke method to assign the data Potato Salad Surprise to cell R11C4 in the worksheet:
+
+ Sub DDEPokeExcel()
+
+ Dim lngDDEChannel1 As Long, strResult As String
+
+ lngDDEChannel1 = DDEInitiate("Excel", "Monthly family meal planner1")
+ strResult = DDERequest(lngDDEChannel1, "R11C4")
+
+ DDEPoke Channel:=lngDDEChannel1, Item:="R11C4", _
+ Data:="Potato Salad Surprise"
+ DDETerminateAll
+
+ End Sub
+
+Now look at the Excel worksheet and you'll see that "Beef and Mushroom Skillet Supper" has been replaced with the even more dubious-sounding "Potato Salad Surprise."
+
+## Using _DDEExecute_ to Have One Application Execute a Command in Another
+
+To execute a command in another application, use the DDEExecute method. The DDEExecute method has the following syntax:
+
+ _expression_.DDEExecute(Channel, Command)
+
+The components of the syntax are as follows:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to use.
+ * Command is a required String argument specifying the command or series of commands to execute.
+
+For example, the following statements establish a DDE channel to Excel and issue a Close command to close the active workbook:
+
+ Sub DDEExec()
+
+ Dim lngMyChannel
+ lngMyChannel = DDEInitiate(App:="Excel", Topic:="System")
+ DDEExecute lngMyChannel, Command:="[Close]"
+
+ End Sub
+
+If the workbook you're closing has unsaved data, Excel will display a message box prompting you to save it—thus preventing it from closing until the prompt is satisfied.
+
+## Using _DDETerminate_ to Close a DDE Channel
+
+When you've finished a DDE communication, use the DDETerminate method to close the DDE channel you opened. The syntax for the DDETerminate method is as follows:
+
+ _expression_.DDETerminate(Channel)
+
+Here are the components of the syntax:
+
+ * _expression_ is an optional expression that returns an Application object.
+ * Channel is a required Long argument specifying the DDE channel to close.
+
+The following statements employ the previous example, closing the DDE channel that was opened:
+
+ Dim lngMyChannel
+ lngMyChannel = DDEInitiate(App:="Excel", Topic:="System")
+ DDEExecute lngMyChannel, Command:="[Close]"
+ DDETerminate lngMyChannel
+
+## Using _DDETerminateAll_ to Close All Open DDE Channels
+
+To close all open DDE channels, use the DDETerminateAll method:
+
+ DDETerminateAll
+
+Because VBA doesn't automatically close DDE channels when a procedure ends, it's a good idea to use a DDETerminateAll statement to make sure you haven't inadvertently left any DDE channels open.
+
+# Communicating via _SendKeys_
+
+The SendKeys statement is a basic and limited form of communication between applications. You may find SendKeys useful if neither Automation nor DDE works with the target application.
But SendKeys does have shortcomings, as you'll see momentarily.
+
+SendKeys transmits specified keystrokes to the destination application. It impersonates someone typing at the keyboard.
+
+For example, to use SendKeys to send the command to create a new file in Notepad, you send the keystrokes for Alt+F, N (to execute the File ⇒ New command), and Notepad reacts as if you had pressed the keys manually. In Office 2013 applications, Alt+F opens the File tab on the Ribbon.
+
+SendKeys works only with currently running Windows applications: You can't use SendKeys to start another application running (for that you need to use Shell, as discussed earlier in this chapter), nor can you use SendKeys to communicate with DOS applications running in a virtual DOS machine under Windows.
+
+The syntax for the SendKeys statement is as follows:
+
+ SendKeys string[, wait]
+
+Here, string is a required String expression specifying the keystrokes to be sent to the destination application. wait is an optional Boolean value specifying whether to wait after sending the keystrokes until the application has executed them (True) or to immediately return control to the procedure sending the keystrokes (False, the default setting). The True setting, however, can prevent some kinds of timing problems.
+
+Typically, string consists of a series of keystrokes (rather than a single keystroke). All alphanumeric characters that appear on the regular keyboard are represented by the characters themselves: To send the letter _H_ , you specify **H** in the string, and to send the word _Hello_ , you specify **Hello** in the string. To denote the movement (arrow) and editing keys, SendKeys uses keywords enclosed within braces ({}), as described in Table 30.2.
+
+Table 30.2 SendKeys keywords for movement and editing keys
+
+**Key** | **Code**
+---|---
+Down arrow | {DOWN}
+Left arrow | {LEFT}
+Right arrow | {RIGHT}
+Up arrow | {UP}
+Backspace | {BACKSPACE}, {BS}, or {BKSP}
+Break | {BREAK}
+Caps Lock | {CAPSLOCK}
+Delete | {DELETE} or {DEL}
+End | {END}
+Enter | {ENTER}
+Esc | {ESC}
+F1, F2, etc. | {F1}, {F2}, etc. (up to {F16})
+Help | {HELP}
+Home | {HOME}
+Insert | {INSERT} or {INS}
+NumLock | {NUMLOCK}
+Page Down | {PGDN}
+Page Up | {PGUP}
+Print Screen | {PRTSC}
+Scroll Lock | {SCROLLLOCK}
+Tab | {TAB}
+
+To send Shift, Control, and Alt, use the symbols shown in Table 30.3.
+
+Table 30.3 SendKeys symbols for meta keys
+
+**Key** | **Code**
+---|---
+Shift | +
+Ctrl | ^
+Alt | %
+
+SendKeys automatically combines the meta key with the keystroke that follows it, imitating, for example, pressing and holding the Alt key while simultaneously pressing S.
+
+In other words, to send a Ctrl+O keystroke combination, you would specify **^O** , and SendKeys imitates holding down Ctrl while pressing O. Then, the next keystroke after the _O_ is considered to be struck separately. If you need to assign multiple keystrokes to the meta key, enter the keystrokes in parentheses after the meta key. For example, to send Alt+F, I, I, you'd write **%(FII)** , not **%FII**.
+
+As you can see, SendKeys has special uses for the plus sign (+), caret (^), percent sign (%), and parentheses (). The tilde (~) gets special treatment as well. To use these characters to merely represent themselves instead of their special uses, enter them within braces: {+} sends a regular plus sign, {^} a regular caret, {%} a percent sign, {~} a tilde, and {()} parentheses.
Likewise, you must enclose brackets (which have a special meaning in DDE in some applications) within braces; braces themselves also go within braces.
+
+Using SendKeys is much less complex than these details initially make it appear—but with that reassurance, there's one more trick you should know: To repeat a key, enter the key and the number of repetitions in braces. For example, to send five up-arrow keystrokes, you'd specify {UP 5}; to send 10 zeroes, you'd specify {0 10}.
+
+Listing 30.4 shows an example of how to use SendKeys to send some text to Notepad after first starting it with the Shell command.
+
+* * *
+
+Warnings about SendKeys
+
+SendKeys is an old technology and it has two serious drawbacks. First, you can run into timing issues. SendKeys was created when computers ran far more slowly than they do today. For this reason, in some circumstances, executing the code in Listing 30.4 creates a problem when it displays the Save dialog box. Execution stops, failing to complete the file-saving. Or the filename is saved as _og file_ rather than _log file_. These are timing problems. The second drawback relates to testing your code. Because SendKeys needs to activate the target application, you can't step through your code (repeatedly pressing F8) in the VBA Editor—the editor grabs the focus back at the wrong point, becomes perplexed, and the keystrokes are dumped into the Editor rather than into Notepad, the intended target. Instead, you must run the procedure either from the VBA Editor (by pressing F5) or from the host application as a macro. Technically, this second behavior—absorbing keystrokes into the Editor rather than Notepad—is a result of what SendKeys is actually doing: It's pushing keystrokes into the key buffer. Then they pop back out wherever they can.
+
+* * *
+
+**Listing 30.4**: Automating Notepad with SendKeys
+
+ 1. Sub Send_to_Notepad()
+ 2. Dim strLogDate As String
+ 3. Dim strSaveLog As String
+ 4. Dim strMsg As String
+ 5. Dim appNotepad As Variant
+ 6. strMsg = "Sample log text here."
+ 7. strLogDate = Month(Now) & "-" & Day(Now) & "-" & Year(Now)
+ 8. strSaveLog = "Log file for " & strLogDate & ".txt"
+ 9. appNotepad = Shell("notepad.exe", vbNormalFocus)
+ 10. AppActivate appNotepad
+ 11. SendKeys strMsg & "%FS" & strSaveLog & "{Enter}" & "%{F4}", True
+ 12. End Sub
+
+Here's how the code works:
+
+ * The Send_to_Notepad procedure starts by declaring (in lines 2, 3, and 4) three String variables—strLogDate, strSaveLog, and strMsg—and (in line 5) one Variant variable, appNotepad.
+ * Line 6 then assigns to strMsg a sample string of text.
+ * Line 7 assigns to strLogDate a date built of the Month, Day, and Year values for Now (which returns the current date and time). For example, if the date is July 11, 2013, Month(Now) will return 7, Day(Now) will return 11, and Year(Now) will return 2013, so the strLogDate string will contain 7-11-2013.
+ * Line 8 then assigns to the strSaveLog string (which will be used to supply the filename for the log file) text describing the file, the strLogDate string, and the .txt filename extension (to continue our example, Log file for 7-11-2013.txt).
+ * In line 9, the procedure finally gets down to business, using the Shell statement to run Notepad in a "normal" (not maximized or minimized) window with focus and storing the task ID of the Notepad session in the variable appNotepad.
+ * Line 10 then uses an AppActivate statement to activate Notepad.
+ * Line 11 uses a SendKeys statement to send to Notepad the following:
+ * The information contained in the String variable strMsg.
+ * An Alt+F keystroke (to pull down the File menu), followed by an S keystroke to choose the Save item on the menu. This keystroke displays the Save As dialog box with the File Name text box selected.
+ * The strSaveLog String variable, which is entered in the File Name text box.
+ * An Enter keystroke to choose the Save button in the Save As dialog box.
+ * An Alt+F4 keystroke to quit Notepad.
+ * Line 12 ends the procedure.
+
+When you run this procedure (again, you need to run the procedure by pressing F5 rather than stepping into it with F8), you'll see the following:
+
+1. Notepad springs to life.
+
+2. The contents of the Msg string appear in the Notepad window.
+
+3. The Save As dialog box displays itself, enters the filename in the File Name text box, and then dismisses itself.
+
+4. Notepad closes. The .txt file is saved to the currently active folder on your hard drive.
+
+Because SendKeys was historically most often employed to open an application's menus and select an option from the menus (the way that Notepad still behaves), you might think that applications since Vista—which are largely menu-free and employ the Ribbon instead—would seriously curtail the flexibility of the SendKeys technique. However, this isn't true. Many of the features of the Ribbon, for example, are accessible via key combinations. Try pressing the sequence Alt, W, Q, 2, and the Enter key in Word; it will switch to the View tab on the Ribbon, select the Zoom option, and switch to a 200% zoom. The difference here is that instead of employing the traditional approach of simultaneously pressing the Alt key while pressing other keys (such as Alt+V to open a View menu), in current Windows operating systems you press and release Alt by itself, then you press the W key to switch to the View tab on the Ribbon. At this point, additional keystrokes are possible to activate the various options on the View tab. To exit from this mode, press Esc.
+
+Here's another code example, which illustrates how to manipulate Ribbon-based applications. This time Excel, not Notepad, is the target, and the Ribbon, not a menu, is manipulated. The code sends an Alt key by itself (this activates the shortcut key feature on the Ribbon and the Quick Access Toolbar as well, displaying a variety of keys you can choose from). Then the code switches to the View tab (a W does that), and finally full-screen mode is turned on by sending an E:
+
+ Sub Send_to_Excel()
+
+ Dim appExcel As Variant
+
+ appExcel = Shell("Excel.exe", vbNormalFocus)
+ AppActivate appExcel
+
+ SendKeys "%", True 'send Alt by itself
+ SendKeys "W", True 'W for the View tab
+ SendKeys "E", True 'E for full screen mode
+
+ End Sub
+
+# Going beyond VBA
+
+VBA is not limited to its own library of functions. In this chapter you've seen how to use the Editor's Tools ⇒ References feature to make Office applications' object libraries available to VBA's built-in capabilities. But wait. There's more.
+
+VBA can also access the entire Windows API (application programming interface). This isn't as simple as adding a library via Tools ⇒ References. And the necessary code is verbose.
But if you want very complete control over Windows internals (for example, to perfectly manage timing issues such as waiting for an outside application to complete its task, among other advanced techniques), the Windows API functions are up to such jobs (and plenty more besides).
+
+Windows API programming is beyond the scope of this book, but if you're interested, copy and paste the sample code from this MSDN web page:
+
+
+
+That sample code works fine in Word's or Access's VBA Editors. And the links provided on that web page are your doorways into further, deeper study of the topic. If, like me, you have major geek tendencies, it's great fun to wander around and experiment in an immense compendium like the API. You can make Windows do things you wouldn't believe.
+
+# The Bottom Line
+
+**Use Automation to transfer information.**
+
+Automation sets up communication between two applications, designating one of them as the _server_ and the other as the _client_.
+
+Master It
+
+Of the various ways to communicate between applications, which is generally the most effective?
+
+**Use the _Shell_ function to run an application.**
+
+Although the Shell function can prove useful in a variety of inter-application communication situations, Shell can also present the programmer with a timing problem.
+
+Master It
+
+Describe the timing issues that the Shell function raises, and describe a good solution to this problem.
+
+**Use data objects to store and retrieve information.**
+
+This book has described a variety of ways to store and retrieve information when working with the VBA language. Using data objects is one of these useful techniques.
+
+Master It
+
+How is the data-object technology special as a way of storing and retrieving information; what can a data object do that's unique?
+
+**Communicate via DDE.**
+
+Dynamic Data Exchange (DDE) is a technology introduced back in May 1990 with Windows 3.0. Use it if other, more efficient communication technologies are unavailable to the applications you are working with.
+
+Master It
+
+Not all applications support DDE. Which Office 2013 applications don't support DDE communication?
+
+**Communicate via _SendKeys_.**
+
+Using SendKeys is a fairly simple but rather awkward and limited way to communicate between applications. It imitates typing in keystrokes, thereby allowing you to manipulate an application by accessing some of its features using, for example, Alt+key combinations, such as Alt+F to open the File tab on the Ribbon.
+
+Master It
+
+SendKeys was historically most often employed to open an application's menus and select an option from the menus. Since Vista, Windows applications have largely done away with traditional menus, so is SendKeys of even more limited use now than in the past?
+Chapter 31
+
+Programming the Office 2013 Ribbon
+
+VBA programmers may want to customize the Office applications' Ribbons programmatically (via macro code as opposed to the user employing the Options dialog box). Perhaps your organization wants to hide certain features in Excel, add a step-through wizard to Word, create a Ribbon that is custom-designed for working with a particular presentation, add a special tab containing capabilities relevant to your business, or otherwise automate management of this major part of the user interface.
+
+Or you might want to create dynamic Ribbon effects, such as hiding, disabling, revealing, or modifying Ribbon elements—labels, groups, controls, or whatever—based on the user's behaviors in the application or on some other criterion.
+
+This chapter explores all aspects of Ribbon customization so you'll be able to fully exploit the Ribbon's capabilities programmatically.
+
+Note that the Ribbon can be programmatically modified in two ways: The most efficient approach is to create XML code and make it interact with VBA procedures. This chapter employs this technique and describes how to customize the Ribbon in Word, Excel, and PowerPoint. A second, more complex approach requires writing COM add-ins, a technique that is beyond the scope of this book.
+
+The Access Ribbon can't be modified in the same way that you modify the Ribbon in Word, Excel, and PowerPoint. Access requires a unique approach, including creating a specialized table to hold the XML code that modifies the Ribbon. Modifying the Access Ribbon is covered at the end of this chapter.
+
+The Ribbon's contents are described in the XML language, but you don't need to know how to write XML to manipulate the Ribbon. Throughout this chapter, you can just copy and paste XML code examples, making modifications to them to suit your needs.
+
+As you'll see shortly, there's also a handy utility you can download that helps you avoid several tedious steps when modifying the Ribbon and verifies that your XML statements are "well formed" (that they follow the rules of XML and thus should work).
+
+In this chapter you will learn to do the following:
+
+ * Understand what XML is
+ * Hide a group on the Ribbon
+ * Add a new, custom group
+ * Create callbacks for event handling
+ * Manipulate the Access Ribbon
+ * Debug Ribbon programming
+
+# What Is XML?
+
+XML means _extensible markup language_. It's a way to combine data with descriptions of that data.
+
+Think of a file cabinet holding various documents, each of which is stored in a folder with a label describing the meaning of its document: Telephone Bill, Boat Insurance, Bobby's Arrest, and so on.
+
+But XML takes this a step further, becoming more granular (more finely detailed) in its marking (labeling) of data. Each paragraph, sentence, or even individual words can also be contained within descriptive "tags," like this:
+
+ <trip>
+ <date>12,1,2013</date>
+ <location>Sao Paulo, Brazil</location>
+ </trip>
+
+ <trip>
+ <date>12,14,2013</date>
+ <location>Miami Airport</location>
+ </trip>
+
+ ...
+
+You get the idea: descriptive tags, then the data contained, followed by closing tags. For example, <location> is a tag presumably containing some kind of geographical data; </location> is a tag with a slash, meaning that this is the end of the information about location. Any opening tag must be paired with a closing tag, and they thus surround the data that they describe.
+
+XML is "extensible," meaning anybody can make up their own tags. XML is a way of storing information along with descriptions of the meaning of that information. You can think of it as similar to a record in a database.
+
+Contrast this with HTML (the markup language that underlies web pages), which describes how to _display_ information and contains standardized tags, such as <i> for italic, understood by all browsers.
+
+If you want to know more about XML, you'll find a good introductory tutorial here:
+
+
+
+# Hiding the Editing Group on the Word Ribbon
+
+To get an idea of how to modify the Ribbon, let's assume that you want to remove the Editing group in the Word Ribbon's Home tab.
This group has three options: Find, Replace, and Select. However, you decide that you just don't need to display these options because you always press Ctrl+F to open the Find dialog box and Ctrl+H to open the Replace dialog box, and you select by simply dragging the mouse. To you, this Editing group is useless, just wasting valuable space on the Ribbon.
+
+To hide the Editing group on the Ribbon, follow these steps:
+
+1. First you'll want to download a free utility that makes working with the Ribbon much easier. Go to
+
+ http://openxmldeveloper.org/blog/b/openxmldeveloper/archive/2010/08/10/23248.aspx
+
+and download, then install, the Office Custom UI Editor tool. This utility can be downloaded via a link on this web page named OfficeCustomUIEditorSetup.zip. (The file is just above the comments.) When you extract the contents of this zip file, you'll have an installer (.msi) file. Just double-click it to install the Custom UI Editor.
+
+2. Start Word.
+
+3. Press Alt, F, N (or click the File tab, then click the New option). Then click the blank document template.
+
+4. Press Alt, F, A and save the document as RibbonMod.docm to your Desktop (or some other location such as C:\temp where you can easily locate it).
+
+5. Press Alt, F, C to close this document. Closing the document is necessary because if it's still open when you attempt to store your XML code in it (by choosing File ⇒ Save in the Custom UI Editor for Microsoft Office), you'll get an error message.
+
+* * *
+
+Why You Should Use Macro-Enabled File Types
+
+Note that you could also save the document as the default .docx file type, but in this chapter you'll always use the macro-enabled .docm type (and the other "m" type, such as .xlsm for macro-enabled Excel files and .pptm for PowerPoint). These types of files can include macros, and in some of the examples in this chapter, you'll need to write procedures to handle events—triggered when the user clicks a control that you've added to the Ribbon.
+
+* * *
+
+6. Run the Custom UI Editor for Microsoft Office.
+
+7. Choose File ⇒ Open.
+
+8. Browse to the RibbonMod.docm file that you saved in step 4, and open it.
+
+9. In the right pane of the Custom UI Editor, type the following XML code:
+
+ <customUI xmlns="http://schemas.microsoft.com/office/2009/07/customui">
+ <ribbon>
+ <tabs>
+ <tab idMso="TabHome">
+ <group idMso="GroupEditing" visible="false" />
+ </tab>
+ </tabs>
+ </ribbon>
+ </customUI>
+
+Identifiers (idMso), images (imageMso), and other attributes in Ribbon XML code can have an Mso suffix. Mso is short for Microsoft Office, and when appended to an attribute it means _built_ - _in_. So, a tab with an idMso attribute is one of the tabs on the Ribbon by default. A tab with a plain id attribute is a new tab you've added to the Ribbon. Likewise, an imageMso is one of the set of built-in Office 2013 icons, but an image is an icon you created by importing a graphics file (see "Creating Your Own Icons" later in this chapter).
+
+* * *
+
+**Watch Out for Special Characters**
+
+XML will choke on special characters—it expects plain vanilla text with none of those slanted quotation marks (called "smart quotes") or other fancy formatting. You used to be able to paste code into Notepad, then copy it from Notepad and paste it into the VBA Editor or the Custom UI Editor. When text was dipped into Notepad like this, all special characters were stripped off. Slanted quotation marks (which are two distinct characters, open and close quotes) turned into a single, vertical quotation-mark character. This was quite a good way to wash text. No more.
Those at Microsoft who fiddle with good tools and make them less useful decided to justify their salaries by _not leaving Notepad alone_. After all, they're getting paid to do _something_ , so they get restless. Until the latest version, Notepad had been left alone, unchanged for decades.
+
+How do you get rid of characters like smart quotes (“ and ”) that XML (and the VBA Editor) cannot work with, replacing them with straight quotes (")? There are three ways, but #3 is the best:
+
+1. Hand-edit each bad character by selecting it, then pressing the " key. If you press this key in Notepad or a code editor, it will appear as the correct " simple quotation mark (no slant).
+
+2. If you're working with a large piece of code with many quotation marks, paste it into Notepad, then press Ctrl+H to open the Replace dialog box. Paste one of the bad, open-quote (“) slanted quotation-mark characters into the Find What field, then click the Replace With field and press the " key. (Notepad by default uses the straight-quotes character.) Note that you'll have to repeat this process with the close-quote (”) slanted quotation-mark character.
+
+3. What do we do when faced with a repetitive and tedious task? Anyone?
+
+Yes. Write a macro. Here's a macro that opens a new, blank Word document, pastes in the text that needs changing, then makes the necessary replacements:
+
+ 1. Sub StraightenQuotes()
+ 2. ' Changes smart quotes (slanted) to straight quotes
+ 3.
+ 4. On Error GoTo Problem
+ 5. Dim aDO As DataObject
+ 6. Set aDO = New DataObject
+ 7. aDO.GetFromClipboard
+ 8. aDO.GetText
+ 9.
+ 10. Dim bQuotesOn As Boolean
+ 11. bQuotesOn = Options.AutoFormatAsYouTypeReplaceQuotes
+ 12.
+ 13. Options.AutoFormatAsYouTypeReplaceQuotes = False
+ 14.
+ 15. Documents.Add Template:="Normal", NewTemplate:=False, DocumentType:=0
+ 16.
+ 17. Selection.Paste
+ 18.
+ 19. Selection.WholeStory
+ 20.
+ 21. Selection.Find.ClearFormatting
+ 22. Selection.Find.Replacement.ClearFormatting
+ 23.
+ 24. With Selection.Find
+ 25. .Text = ChrW(8221)
+ 26. .Replacement.Text = """"
+ 27. .Wrap = wdFindStop
+ 28. .Forward = True
+ 29. End With
+ 30. Selection.Find.Execute Replace:=wdReplaceAll
+ 31.
+ 32. Selection.Find.ClearFormatting
+ 33. Selection.Find.Replacement.ClearFormatting
+ 34.
+ 35. With Selection.Find
+ 36. .Text = ChrW(8220)
+ 37. .Replacement.Text = """"
+ 38. .Wrap = wdFindStop
+ 39. .Forward = True
+ 40. End With
+ 41. Selection.Find.Execute Replace:=wdReplaceAll
+ 42.
+ 43. Options.AutoFormatAsYouTypeReplaceQuotes = bQuotesOn
+ 44.
+ 45. Exit Sub
+ 46.
+ 47. Problem:
+ 48. MsgBox "There was a problem. Be sure that you have copied some text into the Clipboard before executing this macro."
+ 49.
+ 50. End Sub
+
+To test this, just copy some text (that contains the unwanted slanted quotation marks) into the Windows Clipboard (select the text, then press Ctrl+C). Then run the macro. Here's what the code does:
+
+ * Line 4 says that if something goes wrong, jump down to the label named Problem at the end of the procedure. The most likely problem is that the user has a graphic in the Clipboard (they pressed PrtScn, for example) rather than text.
+ * Lines 5–8 fetch the text from the Clipboard.
+ * Lines 10 and 11 save the user's setting for smart quotes so we can restore it at the end of the macro.
+ * Line 13 turns off Word's Smart Quotes feature so when in our code the slanted quotation marks are replaced by straight quotation marks, Word will permit this. Line 15 opens a new, blank document.
This is important because you might currently also be working on a second, ordinary text document where you want smart quotes.
+ * Line 17 pastes the text from the Clipboard into the blank document.
+ * Line 19 selects all the text.
+ * Lines 21 through 41 carry out the find and replace. Remember, this code must be executed twice, once for the open-quote and a second time for the close-quote characters.
+ * Line 43 restores the user's setting for the Smart Quotes option.
+ * Line 45 exits the procedure so we don't fall into the error handler after successfully running the procedure without error.
+ * Line 47's label identifies the error-handler code.
+ * Line 48 handles the error by reminding the user that there must be text in the Clipboard for this macro to work.
+
+Yes, I used Word's Macro Recorder to help me write this code. Having been programming in BASIC and writing books on it for 25 years, I'm almost freakishly proficient in the language. But I had only a vague idea what kind of code would turn off Word's Smart Quotes feature. So, I turned on the Macro Recorder, then went to File ⇒ Options in Word and turned off Smart Quotes. VBA created this code:
+
+ Options.AutoFormatAsYouTypeReplaceQuotes = False
+
+So I just copied the code into my macro. I used the same trick to get the code that opens a new document and does the finding and replacing. Unless you're Martha Stewart and can remember everything you've ever read or done, you'll need to rely on the Macro Recorder and online code samples to write macros of even moderate complexity.
+
+* * *
+
+10. In the Custom UI Editor, click the icon with the red check mark.
+
+This tool validates your XML code (a very handy feature).
+
+If you don't now see the message "Custom UI XML is well formed," you've made a typo in the XML code or included bad special characters. Retype it (or better yet, copy and paste it from this book's web page—see this book's introduction for information on copying code).
+
+If you see an error message stating that "”" is an unexpected token, you need to fix the quotation marks in the XML code to make them straight, not "smart," quotation marks, as described in the sidebar in this chapter titled "Watch Out for Special Characters."
+
+You should always validate your XML code because if there _is_ an error of some kind, your Ribbon customization simply won't happen. You will be given no error message or other warning when executing the customization itself. It just won't work.
+
+11. Choose File ⇒ Save (which saves your Word document), then File ⇒ Exit to close the UI Editor.
+
+12. Now, to see the effect, open the RibbonMod document by clicking the File tab on Word's Ribbon and then clicking Open. In the list of recent documents, choose RibbonMod.docm (or double-click that filename in Windows Explorer).
+
+If you entered the correct XML code, you'll see a Ribbon like the one on the bottom of Figure 31.1.
+
+Figure 31.1 Word's Ribbon with (top) and without its Editing group (bottom)
+
+The key lines in the XML code are these:
+
+ <tab idMso="TabHome">
+ <group idMso="GroupEditing" visible="false" />
+
+The line of code that begins with <group is an XML _element_ (immediately following the < is the name of the element).
+
+And just as objects in ordinary programming (such as VBA) have properties and methods, XML's elements have _attributes_. So, in the following example code, a button element is defined, and it has four attributes: label, size, onAction, and imageMso:
+
+ <button label="Press Me" size="large" onAction="MyMacro" imageMso="HappyFace" />
Of the four, three are analogous to properties (qualities), with onAction similar to an object's method (a behavior). But in XML, they are all simply referred to as attributes. + +
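+
+Before we move on, it's worth seeing how an onAction attribute connects to VBA. When the user clicks the button, Office looks for a VBA procedure whose name matches the onAction value and calls it, passing in the control that was clicked. Here's a minimal sketch of such a callback, using the standard button-callback signature (the procedure name MyMacro and the message text are merely placeholders):
+
+ ' Callback for a button whose XML specifies onAction="MyMacro"
+ ' Office passes in an IRibbonControl object representing the clicked control
+ Sub MyMacro(control As IRibbonControl)
+ MsgBox "You clicked the control whose id is " & control.ID
+ End Sub
+
+This procedure goes in an ordinary module in the macro-enabled document that carries the XML.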
+
+<SELECT statement A>
+
+EXCEPT
+
+<SELECT statement B>
+
+
+If you were paying attention when we reviewed EXISTS and NOT EXISTS, you can probably translate this to its NOT EXISTS equivalent, which would logically look something like this:
+
+<SELECT statement A>
+
+WHERE NOT EXISTS
+
+(SELECT 1
+
+FROM <Table B>
+
+WHERE <Table B column> = <Table A column> [,...])
+
+We'll see this in an example in the section following the INTERSECT syntax.
+
+INTERSECT
+
+The INTERSECT operator provides all data that matches on both sides of the INTERSECT. As with EXCEPT, the syntax is straightforward and works similarly to a UNION:
+
+
+<SELECT statement A>
+
+INTERSECT
+
+<SELECT statement B>
+
+
+Again, you can translate this to an EXISTS (this time without the NOT), which would logically look something like this:
+
+<SELECT statement A>
+
+WHERE EXISTS
+
+(SELECT 1
+
+FROM <Table B>
+
+WHERE <Table B column> = <Table A column> [,...])
+
+Now that we've seen the syntax for both EXCEPT and INTERSECT, let's move on to a set of examples that show them both in action and compare them to the versions based on the EXISTS operator.
+
+Comparing EXCEPT and INTERSECT with Their EXISTS and NOT EXISTS Equivalents
+
+As I indicated when discussing the basic concepts of EXCEPT and INTERSECT, both can, in terms of end result, be replicated via appropriate use of the EXISTS or NOT EXISTS operators. Let's run an example of each form, along with a simple UNION so we can see how similar the syntax is. We'll start by populating some small test data tables, then take a look at the UNION, then move on to the EXCEPT and INTERSECT operators with their EXISTS equivalents.
+
+SET NOCOUNT ON; -- Eliminate the row counts after each query to save space
+
+\-- Create our test tables and populate them with a few relevant rows
+
+CREATE TABLE UnionTest1
+
+(
+
+idcol int IDENTITY,
+
+col2 char(3)
+
+);
+
+CREATE TABLE UnionTest2
+
+(
+
+idcol int IDENTITY,
+
+col4 char(3)
+
+);
+
+INSERT INTO UnionTest1
+
+VALUES
+
+('AAA'),
+
+('BBB'),
+
+('CCC');
+
+INSERT INTO UnionTest2
+
+VALUES
+
+('CCC'),
+
+('DDD'),
+
+('EEE');
+
+PRINT 'Source and content of both tables:';
+
+PRINT '';
+
+SELECT 1 AS SourceTable, col2 AS Value
+
+FROM UnionTest1
+
+UNION ALL
+
+SELECT 2, col4
+
+FROM UnionTest2;
+
+PRINT 'Results with classic UNION';
+
+SELECT col2
+
+FROM UnionTest1
+
+UNION
+
+SELECT col4
+
+FROM UnionTest2;
+
+PRINT 'Results with EXCEPT';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1
+
+EXCEPT
+
+SELECT col4
+
+FROM UnionTest2;
+
+PRINT 'Equivalent of EXCEPT but using NOT EXISTS';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1 ut1
+
+WHERE NOT EXISTS
+
+(SELECT col4 FROM UnionTest2 WHERE col4 = ut1.col2);
+
+PRINT 'Results with INTERSECT';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1
+
+INTERSECT
+
+SELECT col4
+
+FROM UnionTest2;
+
+PRINT 'Equivalent of INTERSECT but using EXISTS';
+
+PRINT '--------------------------';
+
+SELECT col2
+
+FROM UnionTest1 ut1
+
+WHERE EXISTS
+
+(SELECT col4 FROM UnionTest2 WHERE col4 = ut1.col2);
+
+\-- Clean up after ourselves
+
+DROP TABLE UnionTest1;
+
+DROP TABLE UnionTest2;
+
+SET NOCOUNT OFF; -- Don't forget to turn this back to the default!
+
+Let's walk through the results of this a bit at a time—focusing on the points specific to EXCEPT and INTERSECT as well as their EXISTS-related equivalents.
+
+First, let's check out the results of the EXCEPT operator and its related NOT EXISTS version:
+
+Results with EXCEPT
+
+\--------------------------
+
+col2
+
+\----
+
+AAA
+
+BBB
+
+Equivalent of EXCEPT but using NOT EXISTS
+
+\--------------------------
+
+col2
+
+\----
+
+AAA
+
+BBB
+
+As you can see, the results were the same. It is, however, worth noting that the query plans were different. For example, on my system, the cost (you can find more on this in the chapter on Performance Tuning) of the EXCEPT was more than twice that of the NOT EXISTS approach. If you're in a performance-sensitive environment, you may want to test out both methods on a realistic set of data for your application, and see what you wind up with.
+
+We'll see this same theme of the EXISTS version performing better than the EXCEPT/INTERSECT equivalent as we look at INTERSECT.
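+
+If you want to verify the cost difference on your own system, you can ask SQL Server for the estimated plan instead of executing the queries. Here's a quick sketch (run it before the DROP TABLE cleanup shown above, since it reuses the test tables; note that a SHOWPLAN setting must be the only statement in its batch):
+
+\-- Display the estimated plan as text rather than executing the queries
+
+SET SHOWPLAN_TEXT ON;
+
+GO
+
+SELECT col2
+
+FROM UnionTest1
+
+EXCEPT
+
+SELECT col4
+
+FROM UnionTest2;
+
+SELECT col2
+
+FROM UnionTest1 ut1
+
+WHERE NOT EXISTS
+
+(SELECT col4 FROM UnionTest2 WHERE col4 = ut1.col2);
+
+GO
+
+SET SHOWPLAN_TEXT OFF;
+
+GO
+
+Comparing the relative costs in the graphical estimated plan (Ctrl+L in Management Studio) works just as well.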
As of this writing, every example I've seen personally or on the web yields a plan that is either more efficient with the EXISTS approach, or is identical; never have I seen the EXCEPT/INTERSECT approach perform better.
+
+Does this mean you shouldn't use EXCEPT and INTERSECT? Well, perhaps, but I don't believe things are quite that easy to decide. For example, in your development community, which reads more easily? Which is easier to understand? If the performance you're seeing is slower, but insignificantly so or "close enough," then you may be interested in using EXCEPT and INTERSECT because they make the desired result much clearer to someone who is reviewing the code later. EXISTS and NOT EXISTS are not that hard, but they have many more possible uses, so are slightly less intuitive; the right choice is often a matter of opinion.
+
+Now let's move on to the INTERSECT results:
+
+Results with INTERSECT
+
+\--------------------------
+
+col2
+
+\----
+
+CCC
+
+Equivalent of INTERSECT but using EXISTS
+
+\--------------------------
+
+col2
+
+\----
+
+CCC
+
+The results were, again, a match; we are able to replicate the functionality of the INTERSECT by using the EXISTS operator.
+
+Much like with EXCEPT, the EXISTS version performs much better (about 30% of the cost of the INTERSECT). The result will vary somewhat depending on the amount of data you're looking at. As I will so often say, "your mileage may vary," by which I mean, make sure you've tested the impact in your environment.
+
+In general, the EXISTS approach will perform at least as well as the EXCEPT/INTERSECT approach. The latter is, however, somewhat more readable. Take your specific situation into account when choosing between the two.
+
+Common Table Expressions (CTEs)
+
+Common Table Expressions (CTEs) were first introduced back in SQL Server 2005. They provide a means to refer to a temporary result set by name, and thus utilize it as a table (albeit both temporary and virtual in nature). Perhaps the coolest thing about them is that you define them before actually using them, so you can avoid separate physical steps storing and re-referencing the table (as you would do with a temporary table—or even a table variable). This can have very favorable performance impacts since SQL Server can plan the work between the CTE and the queries that utilize it as part of one logical operation rather than as a series of separate activities. In their simplest form, CTEs are similar to views created on the fly, but a CTE can also enable other things that you can't really do with a view (for example, see the following section on recursive queries).
+
+The basic syntax for a CTE utilizes the WITH keyword followed by a name and definition:
+
+WITH <expression name> [ ( <column name> [,...n] ) ]
+
+AS
+
+( <CTE query definition> )
+
+<statement that uses the CTE>
+
+After the CTE is defined, you can refer to it by name just as if it were a table.
+
+Note that while a CTE can nest, and a CTE can refer to a parent CTE, you cannot have completely independent CTEs at the same time, nor can you reference forward in your nested CTEs. Indeed, whatever statement is going to use the CTE must immediately follow the CTE declaration.
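+
+Here's a minimal sketch of that rule in action (the CTE name and column are just illustrative):
+
+\-- Legal: the statement that uses the CTE immediately follows it
+
+WITH Nums (n)
+
+AS
+
+( SELECT 1 UNION ALL SELECT 2 )
+
+SELECT n
+
+FROM Nums;
+
+\-- Illegal: putting any other statement (even a PRINT) between the CTE
+
+\-- declaration and the query that uses it ends the CTE's scope, so a
+
+\-- later reference to Nums would fail to resolve.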
+
+So, as an example of CTE use, we could replace part of our earlier derived table with a CTE reference:
+
+USE AdventureWorks2008;
+
+WITH pumps (BusinessEntityID)
+
+AS
+
+(
+
+SELECT sc.PersonID AS BusinessEntityID
+
+FROM Sales.Customer sc
+
+JOIN Sales.SalesOrderHeader AS soh
+
+ON sc.CustomerID = soh.CustomerID
+
+JOIN Sales.SalesOrderDetail AS sod
+
+ON soh.SalesOrderID = sod.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON sod.ProductID = p.ProductID
+
+WHERE p.Name = 'Minipump'
+
+)
+
+SELECT DISTINCT pp.FirstName, pp.LastName
+
+FROM Person.Person AS pp
+
+JOIN pumps
+
+ON pp.BusinessEntityID = pumps.BusinessEntityID
+
+JOIN ( SELECT sc.PersonID AS BusinessEntityID
+
+FROM Sales.Customer sc
+
+JOIN Sales.SalesOrderHeader AS soh
+
+ON sc.CustomerID = soh.CustomerID
+
+JOIN Sales.SalesOrderDetail AS sod
+
+ON soh.SalesOrderID = sod.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON sod.ProductID = p.ProductID
+
+WHERE p.Name = 'AWC Logo Cap') caps
+
+ON pp.BusinessEntityID = caps.BusinessEntityID;
+
+Notice that I was able to cut the first derived table out entirely and replace it with the CTE reference. I cannot, however, also replace the caps derived table, as I can only make one CTE reference at a time. I can replace pumps, or I can replace caps, but not both.
+
+It's worth noting that certain constructs cannot be used within a CTE. These include:
+
+ * COMPUTE and COMPUTE BY
+ * ORDER BY
+ * INTO
+ * The FOR XML, FOR BROWSE, and OPTION query clauses
+
+CTEs may seem a bit worthless at first given all these restrictions, but they show their power as we begin to work with recursive queries (which are effectively impossible without CTEs). Having said that, let's move right into looking at those....
+
+Recursive Queries
+
+Historically, one of the more tricky things to deal with in a relational system has been hierarchical data. Microsoft has done much in the last two releases to ease the pain in this area. One of the pieces of functionality that is very powerful is the notion of a recursive query. A query or piece of code is considered to be recursive when it calls itself either directly or indirectly. We have long had the ability to have recursive stored procedures and functions, but the notion of a recursive query didn't become available until SQL Server 2005.
+
+Prior to the native hierarchical data type that is new with this release (we'll examine the new HierarchyID data type extensively in Chapter 7), most hierarchical data was stored in what is called a unary relationship—that is, a table that has a relationship where both the parent and the child columns are in the same table. A need for recursion is best seen in such unary relationships where the hierarchical data represented is "ragged" in structure. That is, the depth of each branch of the tree may vary, so you need to recurse until you find the bottom of the hierarchical structure—however deep that may be. Recursive queries make that possible.
+
+Recursive queries are made possible by using a properly constructed CTE. A recursive CTE needs to have at least two major parts: a foundation or "anchor" member, and a recursive member. The anchor member establishes the foundation to which the rest of the query data can be added. The recursive member handles the repetitive calls and provides the recursion check.
+
+As an example, let's look at a very typical ragged hierarchy—employee reporting chains.
To take a look at this, we'll create a version of the AdventureWorks2008 Employees table where the reporting structure is represented in the older schema style (the 2008 version of AdventureWorks uses the newer HierarchyID data type). We'll generate this using data from the existing Employees table, so our data will easily match that used elsewhere in the AdventureWorks2008 database. + +CREATE TABLE HumanResources.Employee2 + +( + +BusinessEntityID int NOT NULL PRIMARY KEY, + +ManagerID int NULL, + +JobTitle nvarchar(50) NULL + +); + +INSERT INTO HumanResources.Employee2 + +SELECT hre.BusinessEntityID, + +(SELECT BusinessEntityID + +FROM HumanResources.Employee hre2 + +WHERE hre.OrganizationNode.GetAncestor(1) = hre2.OrganizationNode + +) AS ManagerID, + +JobTitle + +FROM HumanResources.Employee hre; + +This should get 290 rows into a new table called HumanResources.Employee2, which we'll use for the remainder of our CTE examples. + +So, now that we have your typical mix where a few employees (your basic "C" level staff) report to the CEO, and then managers report to those executives, supervisors report to the managers, and so on, we're ready to begin. The exact depth of the managerial chain varies by individual department and group. We can use a recursive query to crawl that chain for us. + +First, we need to build the root—or "anchor"—of the hierarchy. In this case, that would obviously be the CEO (no one is higher than he is!), but the way we'll format it is to grab any record where the employee has no one that they report to: + +\-- Establish the "Anchor Member" + +\-- This essentially defines the top node of the + +\-- recursion hierarchy + +SELECT hre.ManagerID, + +hre.BusinessEntityID, + +hre.JobTitle, + +hredh.DepartmentID, + +0 AS Level + +FROM HumanResources.Employee2 AS hre + +JOIN HumanResources.EmployeeDepartmentHistory AS hredh + +ON hre.BusinessEntityID = hredh.BusinessEntityID + +AND hredh.EndDate IS NULL -- Current employees only! + +WHERE hre.ManagerID IS NULL; + +Now, we need to add to that all the various employees that report to this root node, and then recurse down the tree until we get to the bottom. We'll UNION these results to those we just got for the root: + +UNION ALL + +\-- Define the piece that actually recurses + +SELECT hre.ManagerID, + +hre.BusinessEntityID, + +hre.JobTitle, + +hredh.DepartmentID, + +r.Level + 1 + +FROM HumanResources.Employee2 AS hre + +JOIN HumanResources.EmployeeDepartmentHistory AS hredh + +ON hre.BusinessEntityID = hredh.BusinessEntityID + +AND hredh.EndDate IS NULL -- Current employees only! + +JOIN Reports AS r + +ON hre.ManagerID = r.BusinessEntityID + +Now, let's put that all together, and then create a statement to make use of our CTE. I can add a WHERE clause to the calling statement, so I can filter my data down to just the groups, departments, or positions I want the reporting information on—for example: + +USE AdventureWorks2008; + +GO + +\-- Establish the CTE foundation for the recursion + +WITH Reports (ManagerID, BusinessEntityID, JobTitle, DepartmentID, Level) + +AS + +( + +\-- Establish the "Anchor Member" + +\-- This essentially defines the top node of the + +\-- recursion hierarchy + +SELECT hre.ManagerID, + +hre.BusinessEntityID, + +hre.JobTitle, + +hredh.DepartmentID, + +0 AS Level + +FROM HumanResources.Employee2 AS hre + +JOIN HumanResources.EmployeeDepartmentHistory AS hredh + +ON hre.BusinessEntityID = hredh.BusinessEntityID + +AND hredh.EndDate IS NULL -- Current employees only! 
WHERE hre.ManagerID IS NULL

UNION ALL

-- Define the piece that actually recurses
SELECT hre.ManagerID,
hre.BusinessEntityID,
hre.JobTitle,
hredh.DepartmentID,
r.Level + 1
FROM HumanResources.Employee2 AS hre
JOIN HumanResources.EmployeeDepartmentHistory AS hredh
ON hre.BusinessEntityID = hredh.BusinessEntityID
AND hredh.EndDate IS NULL -- Current employees only!
JOIN Reports AS r
ON hre.ManagerID = r.BusinessEntityID
)
-- Code to get it all started.
SELECT ManagerID, BusinessEntityID, JobTitle, Level
FROM Reports r
JOIN HumanResources.Department AS dp
ON r.DepartmentID = dp.DepartmentID
WHERE dp.GroupName LIKE '%Admin%'
ORDER BY Level, ManagerID, JobTitle;
GO

Note that the CTE is not controlling what group names are returned; instead, that is being driven from the calling query. The WHERE clause is, however, merged into the plan prior to execution, and therefore the query will be optimized differently depending on the specific makeup of the calling query.

Let's take a look at the results:

ManagerID   BusinessEntityID JobTitle                             Level
----------- ---------------- ------------------------------------ -----------
NULL        1                Chief Executive Officer              0
1           234              Chief Financial Officer              1
1           263              Information Services Manager         1
25          227              Facilities Manager                   2
...
...
264         266              Network Administrator                3
228         229              Janitor                              4
228         230              Janitor                              4
228         231              Janitor                              4
228         232              Janitor                              4

(35 row(s) affected)

"What is the level?" you may ask. It is something that I've inserted arbitrarily here to give you a feel for the depth each row has relative to the overall hierarchy. We could just as easily have left it out.

The key thing to understand here is that recursive queries are now not only possible, but also relatively easy. The trick is to understand your root node and how to build off of that anchor.

MERGE

In previous versions of SQL Server, when you heard the word "merge" you generally thought of merge replication. With SQL Server 2008, however, we have a whole new way of thinking about the word merge and, more importantly, of thinking about DML statements.

With MERGE, we have the prospect of combining multiple DML action statements (INSERT, UPDATE, DELETE) into one overall action, improving performance (they can share many of the same physical operations) and simplifying transactions. MERGE makes use of a special USING clause that winds up working somewhat like a CTE. The result set in the USING clause can then be used to conditionally apply your INSERT, UPDATE, and DELETE statements. The basic syntax looks something like this:

MERGE <target table> [AS <alias>]
USING
(
<source query>
) AS <source alias>
ON <condition for matching source rows to target rows>
WHEN [NOT] MATCHED THEN
<DML statement>
[WHEN [NOT] MATCHED THEN
<DML statement> [...n]];

Let's use the example of receiving a shipment for inventory. We'll assume that we're keeping a special rollup table of our sales for reporting purposes. We want to run a query daily that will add any new sales to our monthly rollup. On the first night of the month, this is pretty much a no-brainer, since there are no other rollup records for the month; any sales for the day are just rolled up and inserted. On the second day, however, we have a different scenario: We need to roll up and insert new records as we did the first day, but we need to just update existing records (for products that have already sold that month).

Let's take a look at how MERGE can manage both actions in one overall step.
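To make the shape of that syntax concrete before we work through the real example, here is a minimal hedged sketch of the pattern; the dbo.SomeTarget and dbo.SomeSource tables and their columns are hypothetical stand-ins, not AdventureWorks objects:

MERGE dbo.SomeTarget AS t
USING
(
SELECT ID, Amount
FROM dbo.SomeSource
) AS s
ON (s.ID = t.ID)
WHEN MATCHED THEN -- the row already exists in the target, so accumulate
UPDATE SET t.Amount = t.Amount + s.Amount
WHEN NOT MATCHED THEN -- no such row yet, so create one
INSERT (ID, Amount)
VALUES (s.ID, s.Amount);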
Before we get going on this, however, we need to create our rollup table:

USE AdventureWorks2008;

CREATE TABLE Sales.MonthlyRollup
(
Year smallint NOT NULL,
Month tinyint NOT NULL,
ProductID int NOT NULL
FOREIGN KEY
REFERENCES Production.Product(ProductID),
QtySold int NOT NULL,
CONSTRAINT PKYearMonthProductID
PRIMARY KEY
(Year, Month, ProductID)
);

This is a pretty simple example of a monthly rollup table—making it very easy to get sales totals by product for a given year and month. To make use of this, however, we need to regularly populate it with rolled-up values from our detail table. To do this, we'll use MERGE.

First, we need to establish a result set that will figure out what rows we need to be sourcing our rollup data from. For purposes of this example, we'll focus on August of 2003, and start with our query for the first day of the month:

SELECT soh.OrderDate, sod.ProductID, SUM(sod.OrderQty) AS QtySold
FROM Sales.SalesOrderHeader soh
JOIN Sales.SalesOrderDetail sod
ON soh.SalesOrderID = sod.SalesOrderID
WHERE soh.OrderDate >= '2003-08-01'
AND soh.OrderDate < '2003-08-02'
GROUP BY soh.OrderDate, sod.ProductID;

This gets us the total sales, by ProductID, for every date in our range (our range just happens to be limited to one day).

There is a bit of a trap built into how we've done this up to this point. I've set the GROUP BY to use the OrderDate, but OrderDate is a datetime data type as opposed to a date data type. If our orders were to start coming in with actual times on them, it would mess with our assumption that all orders will group nicely into one date. If this were a production environment, we would want to cast the OrderDate to a date data type or use DATEPART to ensure that the grouping was by day rather than by time.

With this, we're ready to build our merge:

MERGE Sales.MonthlyRollup AS smr
USING
(
SELECT soh.OrderDate, sod.ProductID, SUM(sod.OrderQty) AS QtySold
FROM Sales.SalesOrderHeader soh
JOIN Sales.SalesOrderDetail sod
ON soh.SalesOrderID = sod.SalesOrderID
WHERE soh.OrderDate >= '2003-08-01' AND soh.OrderDate < '2003-08-02'
GROUP BY soh.OrderDate, sod.ProductID
) AS s
ON (s.ProductID = smr.ProductID)
WHEN MATCHED THEN
UPDATE SET smr.QtySold = smr.QtySold + s.QtySold
WHEN NOT MATCHED THEN
INSERT (Year, Month, ProductID, QtySold)
VALUES (DATEPART(yy, s.OrderDate),
DATEPART(m, s.OrderDate),
s.ProductID,
s.QtySold);

Note that the semicolon is required at the end of the MERGE statement. While the semicolon remains optional on most SQL statements for backward compatibility reasons, you'll find it working its way into more and more statements as a required delimiter of the end of the statement; this is particularly true for multipart statements such as MERGE.

When you run this, you should get 192 rows affected, assuming you haven't been altering the data in AdventureWorks2008. Since our Sales.MonthlyRollup table was empty, there wouldn't have been any matches, so all rows were inserted. We can verify that by querying our Sales.MonthlyRollup table:

SELECT *
FROM Sales.MonthlyRollup;

This gets us back the expected 192 rows:

Year   Month ProductID   QtySold
------ ----- ----------- -----------
2003   8     707         242
2003   8     708         281
2003   8     711         302
...
...
2003   8     997         43
2003   8     998         138
2003   8     999         103

(192 row(s) affected)

Every row that was in the basic SELECT that powered our MERGE wound up being inserted into our table. Let's move on, however, to the second day of the month:

MERGE Sales.MonthlyRollup AS smr
USING
(
SELECT soh.OrderDate, sod.ProductID, SUM(sod.OrderQty) AS QtySold
FROM Sales.SalesOrderHeader soh
JOIN Sales.SalesOrderDetail sod
ON soh.SalesOrderID = sod.SalesOrderID
WHERE soh.OrderDate >= '2003-08-02' AND soh.OrderDate < '2003-08-03'
GROUP BY soh.OrderDate, sod.ProductID
) AS s
ON (s.ProductID = smr.ProductID)
WHEN MATCHED THEN
UPDATE SET smr.QtySold = smr.QtySold + s.QtySold
WHEN NOT MATCHED THEN
INSERT (Year, Month, ProductID, QtySold)
VALUES (DATEPART(yy, s.OrderDate),
DATEPART(m, s.OrderDate),
s.ProductID,
s.QtySold);

We update the date range (simulating running it on the second day of the month), and running it should get us 38 rows:

(38 row(s) affected)

But something is different this time; we already had rows in the table that our new batch of sales may have matched up with. We know we affected 38 rows, but how did we affect them? Re-run the SELECT on our table:

SELECT *
FROM Sales.MonthlyRollup;

Instead of 230 rows (the 192 plus the 38), we get only 194 rows. Indeed, 36 of our 38 rows were repeat sales and were therefore treated as updates rather than insertions. Two rows (ProductIDs 882 and 928) were sales of products that had not previously sold in that month, and thus needed to be inserted as new rows—one pass over the data, but the equivalent of two statements ran.

We could perform similar actions that decide to delete rows based on matched or not matched conditions.

Using External Calls to Perform Complex Actions

We have always had the need, on occasion, to get information that is sourced outside of SQL Server. For the vast, vast majority of installations, actually getting that information from within SQL Server was out of reach. Instead, there was typically a client or middle-tier component that sorted out what was needed from SQL Server and what was needed from the external source.

In many ways, this was just fine—after all, having your database server hung up waiting on an external call seems risky at best, and deadly at worst. Who knows how long it will be before that call returns (if ever)? The risk of hung processes within your database server winds up being fairly high.

Now, I said for the majority of installations, and that implies that a few got around it—and they did. There were a few different methods available.

First, there was the idea of an extended stored procedure. These are DLLs that you can create in C using special SQL Server libraries. They run in process with SQL Server and can be (assuming you have a smart DLL writer) very fast, save for one problem—an external call. That means that we are beholden to the external process we are calling to return to us in a timely fashion. The additional issue was one of general safety. Since you're running in process to SQL Server, if your DLL crashes, then SQL Server is going to crash (if you're distributing software, I'm sure you can guess how your customers would react if your product was taking down their SQL Server installation). Last, but not least, very few had the knack for figuring out how to get these written.

Another solution was added to SQL Server in the OLE/COM era.
The sp_OACreate family of system stored procedures (sp_OACreate, sp_OAMethod, sp_OAGetProperty, and so on) allowed you to instantiate a COM object and make calls to it. These passed data back and forth using variants, and were always run out of process. They were safer, but they were clumsy at best and painfully slow.

With the advent of .NET and SQL Server becoming CLR language aware, we live in a new world. You can write your code using any .NET language and can instantiate the objects you need to get the job done. You can create user-defined functions to call external processes—such as cross-communicating with some other online system that you cannot directly link to. Imagine, for a moment, allowing SQL Server to apply information gleaned from a Web service and merge that data into the end query. Heady stuff.

The possibilities are endless; however, you need to keep your head about this. External calls are still external calls! Any time you rely on something external to your system, you are at the mercy of that external system. Be very, very careful with such calls.

External calls should be considered an extreme measure. You are taking risks in terms of security (what is the risk of someone spoofing your external source?) and also taking an extreme performance risk. Tread lightly in this area.

Performance Considerations

We've already touched on some of the macro-level "what's the best thing to do" stuff as we've gone through the chapter, but, like most things in life, it's not as easy as all that. What I want to do here is provide something of a quick reference for performance issues for your queries. I'll try to steer you toward the right kind of query for the right kind of situation.

Yes, it's time again, folks, for one of my now famous soapbox diatribes. At issue this time is the concept of blanket use of blanket rules.

What I'm going to be talking about in this section is the way that things usually work. The word usually is extremely operative here. There are very few rules in SQL that will be true 100 percent of the time. In a world full of exceptions, SQL has to be at the pinnacle of that—exceptions are a dime a dozen when you try to describe the performance world in SQL Server.

In short, you need to gauge just how important the performance of a given query is. If performance is critical, then don't take these rules too seriously—instead, use them as a starting point, and then TEST, TEST, TEST!!!

JOINs vs. Subqueries vs. ?

Deciding between joins and subqueries (and, for that matter, other options) is that area I mentioned earlier in the chapter that I had a heated debate with a coworker over. And, as you might expect when two people have such conviction in their point of view, both of us were correct up to a point (and, it follows, wrong up to a point).

The long-standing, traditional viewpoint about subqueries has always been that you are much better off using joins instead if you can. This is absolutely correct—sometimes. In reality, it depends on a large number of factors. The following table discusses some of the issues that the performance balance will depend on, and which side of the equation they favor.

Situation | Favors
---|---
The value returned from a subquery is going to be the same for every row in the outer query. | Prequery. Declaring a variable and then selecting the needed value into that variable will allow the would-be subquery to be executed just once rather than once for every record in the outer table. The Optimizer in SQL Server is actually pretty smart about this and will do the prequery for you if it detects the scenario, but do not rely on it. If you know this is the scenario, perform your own prequery just to be sure.
Both tables are relatively small (say 10,000 records or less). | Subqueries. I don't know the exact reasons, but I've run several tests on this, and it held up pretty much every time. I suspect that the issue is the lower overhead of a lookup vs. a join when all the lookup data fits on just a data page or two. The Optimizer continues to get smarter about this with every release, so you may find some scenarios where the two options return exactly the same query plan.
The match, after considering all criteria, is going to return only one value. | Subqueries. Again, there is much less overhead in going and finding just one record and substituting it than in having to join the entire table.
The match, after considering all criteria, is going to return only a relatively few values, and there is no index on the lookup column. | Subqueries. A single lookup or even a few lookups will usually take less overhead than a hash join.
The lookup table is relatively small, but the base table is large. | Nested subqueries, if applicable; joins over correlated subqueries. With a nested subquery, the lookup will happen only once and has relatively low overhead. With a correlated subquery, however, you will be cycling the lookup many times—in this case, the join would be a better choice in most cases.
Correlated subquery vs. join | Join. Internally, a correlated subquery is going to create a nested-loop situation. This can create quite a bit of overhead. It is substantially faster than cursors in most instances, but slower than other options that might be available.
Derived tables vs. whatever | Derived tables typically carry a fair amount of overhead, so proceed with caution. The thing to remember is that they are run (derived, if you will) once, and then they are in memory, so most of the overhead is in the initial creation and the lack of indexes in larger result sets. They can be fast or slow—it just depends. Think before coding on these.
EXISTS vs. whatever | EXISTS. It does not have to deal with multiple lookups for the same match—once it finds one match for that particular row, it is free to move on to the next lookup—this can seriously cut down on overhead.
Use of a CTE | A CTE is merged into the query plan of the calling query. In general, this means that a basic CTE will have no significant effect on the end performance.
MERGE vs. Multiple Statements | MERGE allows the separate action statements to be accomplished in the same pass over the data, utilizing the same locks where applicable. The result will generally be improved performance. Keep in mind, however, that, for many users, it may make for code that is more difficult to read.

These are just the highlights. The possibilities of different mixes and additional situations are positively endless.

I can't stress enough how important it is, when in doubt—heck, even when you're not in doubt but performance is everything—to make reasonable tests of competing solutions to the problem. By reasonable, I mean that your tests should cover most of the typical scenarios in which your users will execute the code. In addition, your tests should be conducted against a database and load that is somewhat equivalent to what you expect to see in production. Most of the time the blanket rules will be fine, but not always.
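As one concrete illustration of what such a test might look like, here is a hedged sketch comparing two candidate forms of the same question (which products have ever sold?) in AdventureWorks2008; STATISTICS IO and STATISTICS TIME report the reads and elapsed time for each candidate so you can judge for yourself:

USE AdventureWorks2008;

SET STATISTICS IO ON;
SET STATISTICS TIME ON;

-- Candidate 1: a join (DISTINCT collapses the duplicate product rows)
SELECT DISTINCT p.ProductID, p.Name
FROM Production.Product p
JOIN Sales.SalesOrderDetail sod
ON p.ProductID = sod.ProductID;

-- Candidate 2: an EXISTS-based subquery
SELECT p.ProductID, p.Name
FROM Production.Product p
WHERE EXISTS (SELECT 1
FROM Sales.SalesOrderDetail sod
WHERE sod.ProductID = p.ProductID);

SET STATISTICS TIME OFF;
SET STATISTICS IO OFF;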
By performing reasonable tests, you can be certain you've made the right choice.

Summary

The query basics you've learned in your experience with SQL up to this point will cover perhaps 80 percent or more of the query situations that you run into, but it's that other 20 percent that can kill you. Sometimes the issue is whether you can even find a query that will give you the answers you need. Sometimes it's that you have a particular query or sproc that has unacceptable performance. Whatever the case, you'll run across plenty of situations where simple queries and joins just won't fit the bill. You need something more, and, hopefully, the options covered in this chapter have given you a little more of an arsenal to deal with those tough situations.

4

XML Integration

Extensible Markup Language (XML)—looking back at its history is something of a funny thing to me. Part of its strength lies in its simplicity, so it would seem like it wouldn't change much. Indeed, the basic rules of it haven't changed at all—but all the things surrounding XML (such as how to access data stored in XML) have gone through many changes. Likewise, the way that SQL Server supports XML has seen some fairly big changes from the time it was first introduced.

So, to continue my "it's a funny thing" observation, I realized some time back that I used to refer to XML support as being an "extra"—what a truly silly thing for me to say. Yeah, yeah, yeah—I always tempered that "extra" comment with the notion that it's only because XML support isn't really required to have a working SQL Server, but I've come to realize in today's world that it isn't much of a working SQL Server without support for XML. It is with this in mind, and looking back at how integral XML integration has become to the product, that I've moved my coverage of XML much further forward in the book versus where I had it in prior editions (where it was more of an afterthought).

XML has, over the decade or so that it has grown into widespread use, become a fundamental consideration in the vast majority of data designs. Sure, there are many well-thought-out and well-designed systems out there that do not use so much as one line of XML code, but there are very, very few that haven't had at least a moment of "should we use XML?" consideration in them. XML is used in websites, for data exchange, and for simple storage of things such as hierarchies—if you aren't at least considering XML in your data applications, then you probably aren't giving your data applications full consideration.

So, with all that said, in this chapter we'll look at:

* The XML data type
* XML schema collections
* Methods of representing your relational data as XML
* Methods of querying data that we have stored natively in XML (XQuery; Microsoft's XDL language, a variant on XQuery; and other methods)

Some of these are actually embedded within each other, so let's get to taking a look so we can see how they mix.

This chapter assumes that you have an existing knowledge of at least basic XML rules and constructs. If you do not have that foundation knowledge, I strongly recommend picking up a copy of the latest edition of a good XML book such as Beginning XML (also available from Wrox) or another XML-specific book before getting too far into this chapter. Keep in mind, though, that other chapters may occasionally reference material introduced in this chapter.

The XML Data Type

The XML data type was first introduced in SQL Server 2005.
It was a watershed moment in the history of mixing relational and XML data. With the xml data type, SQL Server takes data that is in XML format and recognizes it as truly being XML data. In previous versions, there were an increasing number of ways to address XML data, but all of it was done from the foundation of basic character data. The XML data type recognizes XML as XML, and that opens up a host of new possibilities, from indexing to data validation.

The number of different things going on here is massive. The various things that we need to talk about when discussing the XML data type include:

* Schema Collections—A core concept of XML is the notion of allowing XML to be associated with schema documents. XML schemas define the rules that allow us to determine whether our XML is "valid" (that is, whether it meets the rules that this particular kind of XML document is supposed to meet). XML schema collections in SQL Server are a way of storing schemas and allowing SQL Server to know that is what they are—validation documents. You can associate instances of XML data (column data or variables, for example) with XML schema collections, and SQL Server will apply the schemas to each instance of that XML to determine whether it is valid XML or not.
* Enforcing Constraints—Relational data systems have always had the notion of requiring a column to meet certain criteria before we'll let it into our table, but what about XML? XML allows for multiple pieces of discrete data to be stored within just one column—how do we validate those individual pieces of data? The XML data type understands XML, and, while direct definition of constraints is not allowed, we can utilize wrapper functions (in the form of stored procedures or triggers) to define constraints for specific nodes within our XML.
* XML Data Type Methods—When referring to a column or variable that is typed XML, you can utilize several methods that are intrinsic to that data type. For example, you can test for the existence of a certain node or attribute, execute XDL (a Microsoft-defined extension to XQuery that allows for data modification), or query the value of a specific node or attribute.

Let's get more specific.

Defining a Column as Being of XML Type

We've already seen the most basic definition of an XML column. For example, if we examined the most basic definition of the Production.ProductModel table in the AdventureWorks2008 database, it would look something like this:

CREATE TABLE Production.ProductModel (
ProductModelID int IDENTITY(1,1) NOT NULL,
Name dbo.Name NOT NULL,
CatalogDescription xml NULL,
Instructions xml NULL,
rowguid uniqueidentifier ROWGUIDCOL NOT NULL,
ModifiedDate datetime NOT NULL,
CONSTRAINT PK_ProductModel_ProductModelID PRIMARY KEY CLUSTERED
(
ProductModelID ASC
)
);

So, let's ask ourselves what we have here in terms of our two XML columns:

1. We have defined them as XML, so we will have our XML data type methods available to us (more on those coming up soon).

2. We have allowed nulls, but could have just as easily chosen NOT NULL as a constraint. Note, however, that the NOT NULL would be enforced on whether the row had any data for that column, not whether that data was valid.

3. Our XML is considered "non-typed XML." That is, since we have not associated any schema with it, SQL Server doesn't really know anything about how this XML is supposed to behave to be considered "valid."
The first of these is implied in any column that is defined with the data type XML rather than just plain text. We will see much more about this in our next XML data type section.

The second goes with any data type in SQL Server—we can specify whether we allow NULL data or not for that column.

So, the real meat in terms of changes we can make at definition time has to do with whether we specify our XML column as being typed or non-typed XML. The non-typed definition we used in the preceding example means that SQL Server knows very little about any XML stored in the column and, therefore, can do little to police its validity. If we set the column up as being typed XML, then we are providing much more definition about what is considered "valid" for any XML that goes in our column.

The AdventureWorks2008 database already has schema collections that match the validation we want to place on our two XML columns, so let's look at how we would change our CREATE statement to adjust to typed XML:

CREATE TABLE Production.ProductModel (
ProductModelID int IDENTITY(1,1) NOT NULL,
Name dbo.Name NOT NULL,
CatalogDescription xml
(CONTENT Production.ProductDescriptionSchemaCollection) NULL,
Instructions xml
(CONTENT Production.ManuInstructionsSchemaCollection) NULL,
rowguid uniqueidentifier ROWGUIDCOL NOT NULL,
ModifiedDate datetime NOT NULL,
CONSTRAINT PK_ProductModel_ProductModelID PRIMARY KEY CLUSTERED
(
ProductModelID ASC
)
);

This represents the way it is defined in the actual AdventureWorks2008 sample. In order to insert a record into the Production.ProductModel table, you must either leave the CatalogDescription and Instructions fields blank or supply XML that is valid when tested against their respective schema collections.

XML Schema Collections

XML schema collections are really nothing more than named persistence of one or more schema documents into the database. The name amounts to a handle to your set of schemas. By referring to that collection, you are indicating that the XML typed column or variable must be valid when matched against all of the schemas in that collection.

We can view existing schema collections. To do this, we utilize the built-in XML_SCHEMA_NAMESPACE() function. The syntax looks like this:

XML_SCHEMA_NAMESPACE( <SQL Server schema> , <xml schema collection> [, <namespace>] )

This is just a little confusing, so let's touch on these parameters just a bit:

Parameter | Description
---|---
SQL Server schema | This is your relational database schema (not to be confused with the XML schema). For example, for the table Production.ProductModel, Production is the relational schema. For Sales.SalesOrderHeader, Sales is the relational schema.
xml schema collection | The name used when the XML schema collection was created. In our CREATE TABLE example previously, we referred to the ProductDescriptionSchemaCollection and ManuInstructionsSchemaCollection XML schema collections.
namespace | Optional name for a specific namespace within the XML schema collection. Remember that XML schema collections can contain multiple schema documents—this would return anything that fell within the specified namespace.
So, to use this for the Production.ManuInstructionsSchemaCollection schema collection, we would make a query like this:

SELECT XML_SCHEMA_NAMESPACE('Production','ManuInstructionsSchemaCollection');

This spews forth a ton of unformatted XML.

SQL Server strips out any whitespace between tags, so if you create a schema collection with all sorts of pretty indentations for readability, SQL Server will remove them for the sake of efficient storage.

Note that the default number of characters returned for text results in Management Studio is only 256 characters. If you're using text view, you will want to go to Tools⇒Options⇒Query Results⇒SQL Server⇒Results to Text and change the maximum number of characters displayed.

Creating, Altering, and Dropping XML Schema Collections

The CREATE, ALTER, and DROP notions for XML schema collections work in a manner that is mostly consistent with how other such statements have worked thus far in SQL Server. We'll run through them here, but pay particular attention to the ALTER statement, as it is the one that has a few quirks versus other ALTER statements we've worked with.

CREATE XML SCHEMA COLLECTION

Again, the CREATE is your typical CREATE syntax that we've seen throughout the book, and uses the AS keyword we've seen with stored procedures, views, and other less structured objects:

CREATE XML SCHEMA COLLECTION [<relational schema>.]<collection name>
AS { '<schema document>' | <variable containing a schema document> }

So if, for example, we wanted to create an XML schema collection that is similar to the Production.ProductDescriptionSchemaCollection collection in AdventureWorks2008, we might execute something like the following (the schema document itself is far too long to reproduce here, so only the skeleton of the statement is shown):

CREATE XML SCHEMA COLLECTION ProductDescriptionSchemaCollectionSummaryRequired
AS
'...';

Note that the URL portion of each namespace declaration in the schema document must be entered on a single line. Such URLs are often shown word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

This one happens to be just like the Production.ProductDescriptionSchemaCollection schema collection, but I've altered the schema to require the summary element rather than having it optional. Since the basic structure is the same, I utilized the same namespaces.

ALTER XML SCHEMA COLLECTION

This one is just slightly different from other ALTER statements in the sense that it is limited to just adding new pieces to the collection. The syntax looks like this:

ALTER XML SCHEMA COLLECTION [<relational schema>.]<collection name>
ADD { '<schema document>' | <variable containing a schema document> }

I would not be at all surprised if the functionality of this is boosted a bit in a later service pack, but, in the meantime, let me stress again that this is a tool for adding to your schema collection rather than changing or removing what's there.

DROP XML SCHEMA COLLECTION

This is one of those classic "does what it says" things and works just like any other DROP:

DROP XML SCHEMA COLLECTION [<relational schema>.]<collection name>

So, to get rid of our ProductDescriptionSchemaCollectionSummaryRequired schema collection we created earlier, we could execute:

DROP XML SCHEMA COLLECTION ProductDescriptionSchemaCollectionSummaryRequired;

And it's gone.
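Before we move on, a small hedged sketch of a schema collection earning its keep. Typing a variable with the built-in Production.ManuInstructionsSchemaCollection (which ships with AdventureWorks2008) means every assignment to that variable is validated:

USE AdventureWorks2008;

-- Typing the variable with a schema collection; assignments are now validated
DECLARE @instructions xml (CONTENT Production.ManuInstructionsSchemaCollection);

-- This works: the Instructions column is typed with the same collection,
-- so the XML is known to conform
SELECT @instructions = Instructions
FROM Production.ProductModel
WHERE ProductModelID = 66;

-- Something like this, by contrast, should raise a validation error,
-- since the document does not conform to the schemas in the collection:
-- SET @instructions = '<root>not valid here</root>';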
XML Data Type Methods

The XML data type carries several intrinsic methods with it. These methods are unique to the XML data type, and no other data type has anything that is at all similar. The syntax within these methods varies a bit because they are based on different, but mostly industry-standard, XML access methods. The basic syntax for calling the methods is standardized, though:

<instance of xml data>.<method>

There are a total of five methods available:

* .query—An implementation of the industry-standard XQuery language. This allows you to access your XML by running XQuery-formatted queries. XQuery allows for the prospect that you may be returning multiple pieces of data rather than a discrete value.
* .value—This one allows you to access a discrete value within a specific element or attribute.
* .modify—This is Microsoft's own extension to XQuery. Whereas XQuery is limited to requesting data (no modification language), the modify method extends XQuery to allow for data modification.
* .nodes—Used to break up XML data into individual, more relational-style rows.
* .exist—Much like the IF EXISTS clause we use extensively in standard SQL, the exist() XML data type method tests to see whether a specific kind of data exists. In the case of exist(), the test is to see whether a particular node or attribute has an entry in the instance of XML you're testing.

.query (SQL Server's Implementation of XQuery)

.query is an implementation of the industry-standard XQuery language. The result works much like a SQL query, except that the results are for matching XML data nodes rather than relational rows and columns.

.query requires a parameter that is a valid XQuery to be run against your instance of XML data. For example, if we wanted the steps out of the product documentation for ProductModelID 66, we could run the following:

SELECT ProductModelID, Instructions.query('declare namespace
PI="http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions";
/PI:root/PI:Location/PI:step') AS Steps
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

The result is rather verbose, so I've truncated the right side of it, but you can see that we've trimmed things down such that we're getting only those nodes at the step level or lower in the XML hierarchy:

ProductModelID Steps
-------------- --------------------------------------------------
66             Pinch Bolt (Product N...
               FI-620

(1 row(s) affected)

It's also worth pointing out that all the XML still came in one column in one row per data row in the database.

It bears repeating that .query cannot modify data—it is a read-only operation.

Notice, by the way, my need to declare the namespace in this. Since a namespace is declared as part of the referenced schema collection, you can see how it really expands and virtually destroys the readability of our query. We can fix that by using the WITH XMLNAMESPACES() declaration:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID, Instructions.query('/PI:root/PI:Location/PI:step') AS Steps
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line.
It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

This gives you a somewhat more readable query, but yields the same result set.

.value

The .value method is all about querying out discrete data. It uses an XPath syntax to locate a specific node and extract a scalar value. The syntax looks like this:

<instance of xml data>.value(<XQuery path expression>, <SQL Server data type>)

The trick here is making certain that the XPath specified really will return a discrete value.

If, for example, we wanted to know the value of the LaborHours attribute in the first Location element for ProductModelID 66, we might write something like:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID,
Instructions.value('(/PI:root/PI:Location/@LaborHours)[1]',
'decimal (5,2)') AS Location
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

Check the results:

ProductModelID Location
-------------- ---------------------------------------
66             1.50

(1 row(s) affected)

Note that SQL Server has extracted just the specified attribute value (in this case, the LaborHours attribute of the first Location node) as a discrete piece of data. The value must be castable into a non-XML data type in SQL Server, and the XQuery must return a scalar value—that is, you cannot have multiple rows.

.modify

Ah, here things get just a little interesting.

XQuery, left in its standard W3C form, is a read-only kind of thing—that is, it is great for selecting out data but offers no equivalents to INSERT, UPDATE, or DELETE. Bummer deal! Well, Microsoft is apparently having none of that and has done its own extension to XQuery to provide data manipulation. This extension to XQuery is called XML Data Manipulation Language, or XML DML. XML DML adds three new commands to XQuery:

* insert
* delete
* replace value of

Note that these commands, like all XML keywords, are case sensitive.

Each of these does what it implies, with replace value of taking the place of SQL's UPDATE statement.

If, for example, we wanted to increase the original 1.5 labor hours in our .value example, we might write something like:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
UPDATE Production.ProductModel
SET Instructions.modify('replace value of
(/PI:root/PI:Location/@LaborHours)[1] with 1.75')
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.
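For completeness, the other two XML DML commands follow the same .modify pattern. A minimal sketch on a throwaway variable (the steps and step element names here are invented purely for illustration):

-- DECLARE with an initializer is new to SQL Server 2008
DECLARE @x xml = '<steps><step>Attach the wheel</step></steps>';

-- insert: add a new node as the last child of the steps element
SET @x.modify('insert <step>Check the brakes</step> as last into (/steps)[1]');

-- delete: remove the first step node
SET @x.modify('delete (/steps/step)[1]');

SELECT @x;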
Now if we re-run our .value command:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID, Instructions.value('(/PI:root/PI:Location/@LaborHours)[1]',
'decimal (5,2)') AS Location
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

We get a new value:

ProductModelID Location
-------------- ---------------------------------------
66             1.75

(1 row(s) affected)

Note the way that this is essentially an UPDATE within an UPDATE. We are modifying the SQL Server row, so we must use an UPDATE statement to tell SQL Server that our row of relational data (which just happens to have XML within it) is to be updated. We must also use the replace value of keyword to specify the XML portion of the update.

.nodes

.nodes is used to take blocks of XML and separate out what would have been, were it stored in a relational form, multiple rows of data. Taking one XML document and breaking it into individual parts in this way is referred to as shredding the document.

What we are doing with .nodes is essentially breaking the instances of XML data out into their own table (with as many rows as there are instances of data meeting the XQuery criteria). As you might expect, this means we need to treat the .nodes results as a table rather than a column. The primary difference between .nodes and a typical table is that we must cross apply our .nodes results back to the specific table that we are sourcing our XML data from. So, .nodes really involves more syntax than just .nodes—think of it somewhat like a join, but using the special CROSS APPLY keyword in place of the JOIN and .nodes instead of the ON clause. It looks like this:

SELECT <column list>
FROM <source table>
CROSS APPLY <source table>.<xml column>.nodes(<XQuery>) AS <table alias>(<column alias>)
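Before we apply this to AdventureWorks data, a tiny self-contained sketch may help; the XML document and the names in it are invented purely for illustration (note that a variable needs no CROSS APPLY, since there is no table involved):

DECLARE @x xml = '<root><item id="1"/><item id="2"/><item id="3"/></root>';

-- Each item element becomes its own row; .value then extracts the attribute
SELECT i.item.value('@id', 'int') AS id
FROM @x.nodes('/root/item') AS i(item);

This returns three rows with the values 1, 2, and 3, one per item element.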
This is fairly confusing stuff, so let's look back at our .value example from earlier. We saw a query that looked for a specific entry and, therefore, got back exactly one result:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID,
Instructions.value('(/PI:root/PI:Location/@LaborHours)[1]',
'decimal (5,2)') AS Location
FROM Production.ProductModel
WHERE ProductModelID = 66;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

.value expects a scalar result, so we needed to make certain our XQuery would return just that single value per individual row of XML. .nodes tells SQL Server to use XQuery to map to a specific location and treat each entry found by that XQuery as an individual row instead.

Let's modify our .value example to return all LocationIDs and their respective labor hours. We want to be able to perform queries against the data in our XML as though it were relational data, so we need to break up our LocationID and LaborHours information into columns just as if they were in a relational table.

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT pm.ProductModelID,
pmi.Location.value('./@LocationID', 'int') AS LocationID,
pmi.Location.value('./@LaborHours', 'decimal(5,2)') AS LaborHours
FROM Production.ProductModel pm
CROSS APPLY pm.Instructions.nodes('/PI:root/PI:Location') AS pmi(Location);

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

Notice that, through the use of our .nodes method, we are essentially turning one table (ProductModel) into two tables (the source table and the .nodes results from the Instructions column within the ProductModel table). Take a look at the results:

ProductModelID LocationID  LaborHours
-------------- ----------- ---------------------------------------
7              10          2.50
7              20          1.75
7              30          1.00
7              45          0.50
7              50          3.00
7              60          4.00
10             10          2.00
10             20          1.50
10             30          1.00
10             45          1.50
10             50          3.00
10             60          4.00
43             50          3.00
44             50          3.00
47             10          1.00
47             20          1.00
47             50          3.50
48             10          1.00
48             20          1.00
48             50          3.50
53             50          0.50
66             50          1.75
67             50          1.00

(23 row(s) affected)

As you can see, we are getting back multiple rows for many of what were originally single rows in the ProductModel table. For example, ProductModelID 7 had six different instances of the Location element, so we received six rows instead of just the single row that existed in the ProductModel table.

While this is, perhaps, the most complex of the various XML data type methods, the power that it gives to transform XML data for relational use is virtually limitless.
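Because the shredded output behaves like any other result set, we can filter or aggregate it with ordinary SQL. As a quick hedged extension of the preceding query, this totals the labor hours per product model (the namespace URL must, as always, be on a single line):

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/ProductModelManuInstructions' AS PI)
SELECT pm.ProductModelID,
SUM(pmi.Location.value('./@LaborHours', 'decimal(5,2)')) AS TotalLaborHours
FROM Production.ProductModel pm
CROSS APPLY pm.Instructions.nodes('/PI:root/PI:Location') AS pmi(Location)
GROUP BY pm.ProductModelID;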
.exist

.exist works something like the EXISTS statement in SQL. It accepts an expression (in this case, an XQuery expression rather than a SQL expression) and will return a Boolean indication of whether the expression was true or not. (NULL is also a possible outcome.)

If, continuing with our product documentation examples, we wanted to show rows that contain steps that have specs elements, we could use .exist:

WITH XMLNAMESPACES ('http://schemas.microsoft.com/sqlserver/2004/07/adventure-
works/ProductModelManuInstructions' AS PI)
SELECT ProductModelID, Instructions
FROM Production.ProductModel
WHERE Instructions.exist('/PI:root/PI:Location/PI:step/PI:specs') = 1;

Note that the URL portion of the namespace declaration must be entered on a single line. It is shown here word wrapped onto multiple lines because there is a limit to how many characters we can show per line in print. Make sure you include the entire URL on a single line.

Pay particular attention to the point at which the test condition is being applied! For example, this code shows us rows where at least one step has a specs element in it—it does not necessarily require that every step have a specs element. If we wanted every element to be tested, we would either need to pull the elements out as individual rows (using .nodes) or place the test condition in the XQuery.

Enforcing Constraints beyond the Schema Collection

By the time you got to this book, you should have already become somewhat familiar with the basics of constraints in a relational database. Well, if our relational database needs constraints, it follows that our XML data does too. Indeed, we've already implemented much of that idea through the use of schema collections. But what if we want to enforce requirements that go beyond the base schema?

Retrieving Relational Data in XML Format

This is an area that SQL Server already had largely figured out prior to the 2005 release. We had a couple of different options, and we had still more options within those options—between them all, things have been pretty flexible for quite some time. Let's take a look.

The FOR XML Clause

This clause is at the root of most of the different integration models available. With the exception of XML mapping schemas (fairly advanced, but we'll touch on them briefly later in the chapter) and the use of XPath, FOR XML will serve as the way of telling SQL Server that it's XML that you want back, not the more typical result set. It is essentially just an option added onto the end of the existing T-SQL SELECT statement.

Let's look at the SELECT statement syntax:

SELECT <column list>
[FROM <source table(s)>]
[WHERE <restrictive condition>]
[GROUP BY <column name(s)>]
[HAVING <restrictive condition based on the GROUP BY results>]
[ORDER BY <column list>]
[FOR XML {RAW|AUTO|EXPLICIT|PATH}
[, XMLDATA][, ELEMENTS][, BINARY base64]]
[OPTION (<query hint>, [,...n])]

Most of this should seem pretty trivial by now—after all, this is a Professional-level title—but it's time to focus in on that FOR XML line.

FOR XML provides four different initial options for how you want your XML formatted in the results:

* RAW—This sends each row of data in your result set back as a single data element, with the element name of "row" and with each column listed as an attribute of the row element. Even if you join multiple tables, RAW outputs the results with the same number of elements as you would have rows in a standard SQL query.
* AUTO—This option labels each element with either the table name or the table name alias that the data is sourced from. If there is data output from more than one table in the query, the data from each table is split into separate, nested elements. If AUTO is used, then an additional option, ELEMENTS, is also supported if you would like column data presented as elements rather than as attributes.
* EXPLICIT—This one is certainly the most complex to format your query with, but the end result is that you have a high degree of control over what the final XML looks like. With this option, you define something of a hierarchy for the data that's being returned, and then format your query such that each piece of data belongs to a specific hierarchy level (and gets assigned a tag accordingly) as desired. This choice has largely been supplanted by the PATH option and is here for backward compatibility.
* PATH—This was added in SQL Server 2005 to try to provide the level of flexibility of EXPLICIT in a more usable format—this is generally going to be what you want to use when you need a high degree of control over the format of the output.

Note that none of these options provide the required root element. If you want the XML document to be considered "well formed," then you will need to wrap the results with a proper opening and closing tag for your root element. While this is in some ways a hassle, it is also a benefit—it means that you can build more complex XML by stringing multiple XML queries together and wrapping the different results into one XML file.

In addition to the major formatting options, there are other optional parameters that further modify the output that SQL Server provides in an XML query:

* XMLDATA—This tells SQL Server that you would like to apply an XML schema onto the front of the results. The schema will define the structure (including data types) and rules of the XML data that follows.
* ELEMENTS—This option is available when you are using the RAW or AUTO formatting options. It tells SQL Server that you want the columns in your data returned as nested elements rather than as attributes.
* BINARY BASE64—This tells SQL Server to encode any binary columns (binary, varbinary, image) in base64 format. This option is implied (SQL Server will use it even if you don't state it) if you are also using the AUTO option. It is not implied but is currently the only effective option for EXPLICIT and RAW queries—eventually, the plan is to have these two options automatically provide a URL link to the binary data (unless you say to do the base64 encoding), but this is not yet implemented.
* TYPE—Tells SQL Server to return the results as the xml data type instead of the default Unicode character type.
* ROOT—This option will have SQL Server add the root node for you so you don't have to. You can either supply a name for your root or use the default (root).

Let's explore all these options in a little more detail.

RAW

This is something of the "no fuss, no muss" option. The idea here is to just get it done—no fanfare, no special formatting at all—just the absolute minimum to translate a row of relational data into an element of XML data. The element is named "row" (creative, huh?), and each column in the select list is added as an attribute using whatever name the column would have appeared with if you had been running a more traditional SELECT statement.

One downside to the way attributes are named is that you need to make certain that every column has a name. Normally, SQL Server will just show no column heading if you perform an aggregation or other calculated column and don't provide an alias—but when doing XML queries, everything MUST have a name, so don't forget to alias calculated columns, as the quick sketch that follows shows.
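A quick sketch of that naming requirement in action; the OrderCount alias is the piece that matters here:

SELECT soh.CustomerID,
COUNT(*) AS OrderCount -- without this alias, FOR XML has no name for the attribute
FROM Sales.SalesOrderHeader soh
GROUP BY soh.CustomerID
FOR XML RAW;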
So, let's start things out with something relatively simple. Imagine that our manager has asked us to provide a query that lists a few customers' orders—say, CustomerIDs 29484 and 29485. After cruising through just the first five or so chapters of the book, you would probably say "No problem!" and supply something like this:

SELECT sc.CustomerID,
pp.LastName,
pp.FirstName,
soh.SalesOrderID,
soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485;

So, you go hand your boss the results:

29484 Achong Gustavo 44132 2001-09-01 00:00:00.000
29484 Achong Gustavo 45579 2002-03-01 00:00:00.000
...
...
29485 Abel Catherine 65157 2004-03-01 00:00:00.000
29485 Abel Catherine 71782 2004-06-01 00:00:00.000

Easy, right? Well, now the boss comes back and says, "Great—now I'll just have Billy Bob write something to turn this into XML—too bad that will probably take a day or two." This is your cue to step in and say, "Oh, why didn't you say so?" and simply add three keywords:

SELECT sc.CustomerID,
pp.LastName,
pp.FirstName,
soh.SalesOrderID,
soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
FOR XML RAW;

You have just made the boss very happy. The output is a one-to-one match versus what we would have seen in the result set had we run just a standard SQL query:

<row CustomerID="29484" LastName="Achong" FirstName="Gustavo" SalesOrderID="44132"
OrderDate="2001-09-01T00:00:00"/>
<row CustomerID="29484" LastName="Achong" FirstName="Gustavo" SalesOrderID="45579"
OrderDate="2002-03-01T00:00:00"/>
...
...
<row CustomerID="29485" LastName="Abel" FirstName="Catherine" SalesOrderID="65157"
OrderDate="2004-03-01T00:00:00"/>
<row CustomerID="29485" LastName="Abel" FirstName="Catherine" SalesOrderID="71782"
OrderDate="2004-06-01T00:00:00"/>

Let me just issue a reminder that Management Studio will truncate any column where the length exceeds the number set in the Tools⇒Options⇒Query Results⇒SQL Server⇒Results to Text node (the maximum is 8192). This issue exists in the results window (grid or text, though grid will allow larger numbers if the data is XML) and if you output directly to a file. This is an issue with the tool—not SQL Server itself. If you use another method to retrieve results (ADO.NET, for example), you shouldn't encounter an issue with this.

Also, be aware that I added carriage returns in the preceding results for clarity's sake—SQL Server just runs all the elements together to make them more compact.

We have one element in XML for each row of data our query produced. All column information, regardless of what table was the source of the data, is represented as an attribute of the row element. The downside of this is that we haven't represented the true hierarchical nature of our data—orders are placed only by customers. The upside, however, is that the XML Document Object Model (DOM)—if that's the model you're using—is going to be much less deep and, hence, will have a slightly smaller footprint in memory and perform better, depending on what you're doing.
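Incidentally, several of the optional parameters described earlier can dress RAW output up considerably. A hedged sketch (the Order and Orders names are just labels we are choosing for the row and root elements):

SELECT sc.CustomerID, soh.SalesOrderID
FROM Sales.Customer sc
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484
FOR XML RAW('Order'), ELEMENTS, ROOT('Orders');

Here RAW('Order') renames each row element, ELEMENTS returns the columns as nested elements rather than attributes, and ROOT('Orders') wraps the whole thing in a root node so the result is a well-formed document.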
AUTO

AUTO takes a somewhat different approach to our data than RAW does. AUTO tries to format things a little better for you—naming elements based on the table (or the table alias if you use one). In addition, AUTO recognizes the notion that our data probably has some underlying hierarchical nature to it that is supposed to be represented in the XML.

Let's go back to our customer orders example from the last section. This time, we'll make use of the AUTO option, so we can see the difference versus the rather plain output we got with RAW:

SELECT sc.CustomerID,
pp.LastName,
pp.FirstName,
soh.SalesOrderID,
soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
FOR XML AUTO;

The first apparent difference is that the element name has changed to be the name or alias of the table that is the source of the data—you'll want to consider this when choosing the aliases for your tables in a FOR XML AUTO query. Perhaps an even more significant difference appears when we look at the XML more thoroughly. I have again cleaned up the output a bit for clarity:

<sc CustomerID="29484">
  <pp LastName="Achong" FirstName="Gustavo">
    <soh SalesOrderID="44132" OrderDate="2001-09-01T00:00:00"/>
    <soh SalesOrderID="45579" OrderDate="2002-03-01T00:00:00"/>
    ...
  </pp>
</sc>
<sc CustomerID="29485">
  <pp LastName="Abel" FirstName="Catherine">
    <soh SalesOrderID="65157" OrderDate="2004-03-01T00:00:00"/>
    <soh SalesOrderID="71782" OrderDate="2004-06-01T00:00:00"/>
    ...
  </pp>
</sc>

Data that is sourced from our second table (as determined by the SELECT list) is nested inside the data sourced from the first table. In this case, our soh elements are nested inside our pp elements, which are in turn nested inside of our sc elements. If a column from the SalesOrderHeader table were listed first in our select list, then Person and Customer would both be nested inside SalesOrderHeader.

Pay attention to this business of the ordering of your SELECT list! Think about the primary question your XML query is meant to answer. Arrange your SELECT list such that the style it produces is fitting for the goal of your XML. Sure, you could always restyle it into a different form—but why do that if SQL Server could have just produced it for you that way in the first place?

The downside to using AUTO is that the resulting XML data model ends up being slightly more complex. The upside is that the data is more explicitly broken up into a hierarchical model. This makes life easier when the elements are more significant breaking points—such as when you have a doubly sorted report (for example, SalesOrderHeader rows sorted within Contact rows).

EXPLICIT

The word explicit is an interesting choice for this option—it loosely describes the kind of language you're likely to use while trying to create your query. The EXPLICIT option takes much more effort to prepare, but it also rewards that effort with very fine granularity of control over what's an element and what's an attribute, as well as what elements are nested in what other elements.

Much of what you can do with EXPLICIT can now be replicated using PATH. EXPLICIT does, however, give you a very fine and, as the keyword name implies, explicit level of control over your output. In general, I would point you at PATH and tell you to look at EXPLICIT when PATH doesn't seem to be meeting your needs.

EXPLICIT enables you to define each level of the hierarchy and how each level is going to look. To define the hierarchy, you create what is internally called the universal table. The universal table is, in many respects, just like any other result set you might produce in SQL Server. It is usually produced by making use of UNION statements to piece it together one level at a time, but you could, for example, build much of the data in a UDF and then make a SELECT against that to produce the final XML. The big difference between the universal table and a more traditional result set is that you must provide sufficient metadata right within your result set such that SQL Server can then transform that result set into an XML document in the schema you desire.

What do I mean by sufficient metadata?
EXPLICIT

The word explicit is an interesting choice for this option—it loosely describes the kind of language you're likely to use while trying to create your query. The EXPLICIT option takes much more effort to prepare, but it also rewards that effort with very fine granularity of control over what's an element and what's an attribute, as well as what elements are nested in what other elements.

Much of what you can do with EXPLICIT can now be replicated using PATH. EXPLICIT does, however, give you a very fine and, as the keyword name implies, explicit level of control over your output. In general, I would point you at PATH and tell you to look at EXPLICIT when PATH doesn't seem to be meeting your needs.

EXPLICIT enables you to define each level of the hierarchy and how each level is going to look. To define the hierarchy, you create what is internally called the universal table. The universal table is, in many respects, just like any other result set you might produce in SQL Server. It is usually produced by making use of UNION statements to piece it together one level at a time, but you could, for example, build much of the data in a UDF and then make a SELECT against that to produce the final XML. The big difference between the universal table and a more traditional result set is that you must provide sufficient metadata right within your result set such that SQL Server can then transform that result set into an XML document in the schema you desire.

What do I mean by sufficient metadata? Well, to give you an idea of just how complex this can be, let's look at a real universal table—one used by a code example we'll examine a little later in the section (trimmed here to a representative sample of rows):

Tag | Parent | sc!1!CustomerID | pp!2!LastName | pp!2!FirstName | soh!3!SalesOrderID | soh!3!OrderDate
---|---|---|---|---|---|---
1 | NULL | 29484 | NULL | NULL | NULL | NULL
2 | 1 | 29484 | Achong | Gustavo | NULL | NULL
3 | 2 | 29484 | Achong | Gustavo | 44132 | 2001-09-01 00:00:00.000
3 | 2 | 29484 | Achong | Gustavo | 45579 | 2002-03-01 00:00:00.000
... | ... | ... | ... | ... | ... | ...
1 | NULL | 29485 | NULL | NULL | NULL | NULL
2 | 1 | 29485 | Abel | Catherine | NULL | NULL
3 | 2 | 29485 | Abel | Catherine | 65157 | 2004-03-01 00:00:00.000

This is what the universal table we would need to build would look like in order to make our EXPLICIT query return exactly the same results that we received with our AUTO query in the last example.

Your first inclination might be to say, "Hey, if this is just producing the same thing as AUTO, why use it?" Well, this particular example happens to be producible using AUTO—I'm using this one on purpose to illustrate some functional differences compared to something you've already seen. We will, however, see later in this section that EXPLICIT will allow us to do the formatting extras that aren't possible with AUTO or RAW (but are with PATH)—so please bear with me on this one.

You should note several things about this result set:

 * It has two special metadata columns—Tag and Parent—added to it that do not, otherwise, relate to the data (they didn't come from table columns).
 * The actual column names adhere to a special format (which happens to supply additional metadata).
 * The data has been ordered based on the hierarchy.

Each of these items is critical to our end result, so, before we start working a complete example, let's look at what we need to know to build it.

Tag and Parent

XML is naturally hierarchical in nature (elements are contained within other elements, which essentially creates a parent-child relationship). Tag and Parent are columns that define the relationship of each row to the element hierarchy. Each row is assigned to a certain tag level (which will later have an element name assigned to it)—that level, as you might expect, goes in the Tag column. Parent then supplies reference information that indicates what the next highest level in the hierarchy is. When you do this, SQL Server knows at what level this row needs to be nested or assigned as an attribute (what it's going to be—element or attribute—will be figured out based on the column name—but we'll get to that in our next section). If Parent is NULL, then SQL Server knows that this row must be a top-level element or an attribute of that element.

So, if we had data that looked like this:

Tag | Parent
---|---
1 | NULL
2 | 1

then the first row would be related to a top-level element (an attribute of the outer element or the element itself), and the second would be related to an element that was nested inside the top-level element (its Parent value of 1 matches the Tag value of the first).

Column Naming

Frankly, this was the most confusing part of all when I first started looking at EXPLICIT. While Tag and Parent have nice, neat demarcation points (they are each their own column), the name takes several pieces of metadata and crams them together as one thing—the only way to tell where one stops and the next begins is that they're separated by an exclamation mark (!).

The naming format looks like this:

<ElementName>!<TagNumber>[!<AttributeName>][!{element|hide|ID|IDREF|IDREFS|xml|xmltext|cdata}]

The element name is, of course, just that—what you want to be the name of the element in the XML. For any given tag level, once you define a column with one name, any other column with that same tag must have the same name as the previous column(s) with that tag number. So, if you have a column already defined as [MyElement!2!MyCol], then another column could be named [MyElement!2!MyOtherCol], but [SomeOtherName!2!MyOtherCol] could not be.
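A quick, self-contained sketch of just the naming rules (the MyElement/MyCol names are my own, and a single tag level is legal, so this actually runs against AdventureWorks2008):

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [MyElement!1!MyCol],
       sc.AccountNumber AS [MyElement!1!MyOtherCol]
       -- sc.StoreID AS [SomeOtherName!1!StoreID] would fail: tag 1 is already named MyElement
FROM Sales.Customer sc
WHERE sc.CustomerID = 29484
FOR XML EXPLICIT;
-- Produces <MyElement MyCol="29484" MyOtherCol="..." />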
The tag relates the column to rows with a matching tag number. When SQL Server looks at the universal table, it reads the tag number and then analyzes the columns with the same tag number. So, when SQL Server sees a row like the first one in our universal table:

Tag | Parent | sc!1!CustomerID | pp!2!LastName | pp!2!FirstName | soh!3!SalesOrderID | soh!3!OrderDate
---|---|---|---|---|---|---
1 | NULL | 29484 | NULL | NULL | NULL | NULL

it can look at the tag number, see that it is 1, and know that it should process sc!1!CustomerID, but that it doesn't have to process pp!2!LastName or soh!3!SalesOrderID, for example. Likewise, it can look at the tag number in the next row, see that it is 2, and know that it should process sc!1!CustomerID, pp!2!LastName, and pp!2!FirstName, but that it doesn't have to process soh!3!SalesOrderID.

That takes us to the attribute name, which is where things start getting more complex (hey, we still have one more to go after this!). If you do not specify a directive (which comes next), then the attribute name is required and is the name of the XML attribute that this column will supply a value for. The attribute will be in the XML as part of the element specified in the column name.

If you do specify a directive, then the attribute name falls into three different camps:

 * It's Prohibited—That is, you must leave the attribute name blank (you do still use a bang (!) to mark its place, though). This is the case if you use a cdata directive.
 * It's Optional—That is, you can supply the attribute name but don't have to. What happens in this case varies depending on the directive that you've chosen.
 * It's Still Required—This is true for the element and xml directives. In this case, the name of the attribute will become the name of a totally new element that will be created as a result of the element or xml directive.

So, now that we have enough of the naming down to meet the minimum requirements for a query, let's go ahead and look at an example of what kind of query produces what kind of results.

We will start with the query to produce the same basic data that we used in our RAW and AUTO examples. You will notice that EXPLICIT has a much bigger impact on the code than we saw with RAW and AUTO. With both RAW and AUTO, we added the FOR XML clause at the end, and we were largely done. With EXPLICIT, we will quickly see that we need to entirely rethink the way our query comes together.
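Before wading into the full three-level query, it may help to see a stripped-down, two-level sketch of the pattern (my own trimmed variant, not the book's worked example): one SELECT per tag level, UNIONed together, with only the first SELECT's column aliases mattering, and an ORDER BY that floats each parent row above its children (the NULL in the child-only column sorts first):

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [Customer!1!CustomerID],
       NULL AS [Order!2!SalesOrderID]
FROM Sales.Customer sc
WHERE sc.CustomerID = 29484
UNION ALL
SELECT 2,
       1,
       sc.CustomerID,
       soh.SalesOrderID
FROM Sales.Customer sc
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484
ORDER BY [Customer!1!CustomerID], [Order!2!SalesOrderID]
FOR XML EXPLICIT;
-- Produces <Customer CustomerID="29484"><Order SalesOrderID="..." /> ... </Customer>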
The full three-level version looks like this (yuck):

USE AdventureWorks2008

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [sc!1!CustomerID],
       NULL AS [pp!2!LastName],
       NULL AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION
SELECT 2,
       1,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION ALL
SELECT 3,
       2,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       soh.SalesOrderID,
       soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
ORDER BY [sc!1!CustomerID], [pp!2!LastName], [pp!2!FirstName], [soh!3!SalesOrderID]
FOR XML EXPLICIT

Notice that we use the FOR XML clause only once—after the last query in the UNION.

I reiterate—yuck! But, ugly as it is, with just a few changes, I could shape my XML into forms that AUTO wouldn't give me.

As a fairly simple illustration, let's make a couple of small alterations to our requirements for this query. What if we decided that we wanted the LastName information to be an attribute of the soh element rather than (or, as it happens, in addition to) the pp element? With AUTO, we would need some trickery to get this (for every row, we would need to look up the Customer again using a correlated subquery—AUTO won't let you use the same value in two places). If you had multiple lookups, your code could get very complex—indeed, you might not be able to get what you're after at all. With EXPLICIT, this is all relatively easy (at least, by EXPLICIT's definition of easy).
To do this with EXPLICIT, we just need to reference the LastName in our SELECT list again, but associate the new instance of it with soh instead of pp:

USE AdventureWorks2008

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [sc!1!CustomerID],
       NULL AS [pp!2!LastName],
       NULL AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate],
       NULL AS [soh!3!LastName]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION
SELECT 2,
       1,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate],
       NULL AS [soh!3!LastName]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION ALL
SELECT 3,
       2,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       soh.SalesOrderID,
       soh.OrderDate,
       pp.LastName
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
ORDER BY [sc!1!CustomerID], [pp!2!LastName], [pp!2!FirstName], [soh!3!SalesOrderID]
FOR XML EXPLICIT

Execute this, and you get pretty much the same results as before, only this time you receive the additional attribute you were looking for in your soh element:

<sc CustomerID="29484">
  <pp LastName="Achong" FirstName="Gustavo">
    <soh SalesOrderID="44132" OrderDate="2001-09-01T00:00:00" LastName="Achong" />
    <soh SalesOrderID="45579" OrderDate="2002-03-01T00:00:00" LastName="Achong" />
    ...
  </pp>
</sc>
<sc CustomerID="29485">
  <pp LastName="Abel" FirstName="Catherine">
    ...
    <soh SalesOrderID="71782" OrderDate="2004-06-01T00:00:00" LastName="Abel" />
  </pp>
</sc>

This example is really just for starters. You can utilize directives to achieve far more flexibility—shaping and controlling both your data and your schema output (if you use the XMLDATA option).

Directives are a real pain to understand. Once you do understand them, they aren't all that bad to deal with, though they can still be confusing at times (some of them work pretty counterintuitively and behave differently in different situations). My personal opinion (and the members of the dev team I know are going to shoot me for saying this) is that someone at Microsoft had a really bad day and decided to make something that would inflict as much pain as he/she was feeling but would be so cool that people wouldn't be able to help but use it.

Altogether, there are eight possible directives you can use. Some can be used in the same level of the hierarchy—others are mutually exclusive within a given hierarchy level.

The purpose behind directives is to allow you to tweak your results. Without directives, the EXPLICIT option would have little or no value (AUTO would take care of most real things that you can do with EXPLICIT if you don't use directives, even though, as I indicated earlier, you sometimes have to get a little tricky). So, with this in mind, let's look at what directives are available.

element

This is probably the easiest of all the directives to understand. All it does is indicate that you want the column in question to be added as an element rather than an attribute. The element will be added as a child of the current tag. For example, let's say that our manager from the previous examples has indicated that he or she needs the OrderDate to be represented as its own element.
This can be accomplished as easily as adding the element directive to the end of our OrderDate field:

SELECT 1 AS Tag,
       NULL AS Parent,
       sc.CustomerID AS [sc!1!CustomerID],
       NULL AS [pp!2!LastName],
       NULL AS [pp!2!FirstName],
       NULL AS [soh!3!SalesOrderID],
       NULL AS [soh!3!OrderDate!element]
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION
SELECT 2,
       1,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       NULL,
       NULL
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
UNION ALL
SELECT 3,
       2,
       sc.CustomerID AS [sc!1!CustomerID],
       pp.LastName AS [pp!2!LastName],
       pp.FirstName AS [pp!2!FirstName],
       soh.SalesOrderID,
       soh.OrderDate
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
ORDER BY [sc!1!CustomerID], [pp!2!LastName], [pp!2!FirstName], [soh!3!SalesOrderID]
FOR XML EXPLICIT

Suddenly, we have an extra element instead of an attribute (results trimmed—customer 29484 has seven orders running from 2001-09-01 through 2003-06-01, and customer 29485 has four running from 2003-09-01 through 2004-06-01):

<sc CustomerID="29484">
  <pp LastName="Achong" FirstName="Gustavo">
    <soh SalesOrderID="44132">
      <OrderDate>2001-09-01T00:00:00</OrderDate>
    </soh>
    <soh SalesOrderID="45579">
      <OrderDate>2002-03-01T00:00:00</OrderDate>
    </soh>
    ...
  </pp>
</sc>
<sc CustomerID="29485">
  <pp LastName="Abel" FirstName="Catherine">
    ...
    <soh SalesOrderID="71782">
      <OrderDate>2004-06-01T00:00:00</OrderDate>
    </soh>
  </pp>
</sc>

xml

This directive is essentially just like the element directive. It causes the column in question to be generated as an element rather than an attribute. The difference between the xml and element directives shows up only if you have special characters that require encoding—for example, the < sign is reserved in XML. If you need to represent a literal <, then you need to encode it (for <, the encoding is &lt;). With the element directive, the content of the element is automatically encoded. With xml, the content is passed straight into the resulting XML without encoding. If you use the xml directive, no other item at this level (the tag number) can have a directive other than hide.

hide

hide is another simple one that does exactly what it says it does—it hides the results of that column.

Why in the world would you want to do that? Well, sometimes we include columns for reasons other than output. For example, in a normal query, we can perform an ORDER BY based on columns that do not appear in the SELECT list. For UNION queries, however, we can't do that—we have to specify a column in the SELECT list because it's the one thing that unites all the queries that we are performing the UNION on.

Let's use a little example of tracking some product sales. We'll say that we want a list of all of our products as well as the SalesOrderIDs of the orders they shipped on and the date that they shipped. We only want the ProductID, but we want the ProductID to be sorted such that any given product is near similar products—that means we need to sort based on the ProductSubcategoryID, but we do not want the ProductSubcategoryID to be included in the end results.
We can start out by building the query without the directive—that way, we can see that our sort is working:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID],
       p.ProductSubcategoryID AS [Product!1!ProductSubcategoryID],
       NULL AS [Order!2!OrderID],
       NULL AS [Order!2!OrderDate]
FROM Production.Product p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       p.ProductSubcategoryID,
       soh.SalesOrderID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
ORDER BY [Product!1!ProductSubcategoryID], [Product!1!ProductID], [Order!2!OrderID]
FOR XML EXPLICIT

Be sure to check out the way we dealt with the OrderDate on this one. Even though I needed to fetch that information out of the SalesOrderHeader table, it was easy (since we're using EXPLICIT anyway) to combine that information with the SalesOrderID from the SalesOrderDetail table. As it happens, I could have also just grabbed the SalesOrderID from the SalesOrderHeader table, too, but sometimes you need to mix data from multiple tables in one element, and this query is yet another demonstration of how we can do just that.

We can see from the results that we are indeed getting the sort we expected (trimmed—each Product element is sorted by its subcategory, with its Order elements nested inside):

...
<Product ProductID="766" ProductSubcategoryID="2">
  <Order OrderID="..." OrderDate="2003-03-27T00:00:00" />
</Product>
...

Now we'll add our hide directive and get rid of the subcategory information:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID],
       p.ProductSubcategoryID AS [Product!1!ProductSubcategoryID!hide],
       NULL AS [Order!2!OrderID],
       NULL AS [Order!2!OrderDate]
FROM Production.Product p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       p.ProductSubcategoryID,
       soh.SalesOrderID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
ORDER BY [Product!1!ProductSubcategoryID!hide], [Product!1!ProductID], [Order!2!OrderID]
FOR XML EXPLICIT

And we get the same results; only this time, our subcategory information is indeed hidden:

...
<Product ProductID="766">
  <Order OrderID="..." OrderDate="2003-03-27T00:00:00" />
</Product>
...

id, idref, and idrefs

None of these three has any effect whatsoever unless you also make use of the XMLDATA option (it goes after EXPLICIT in the FOR clause) or validate against some other schema that has the appropriate declarations. This makes perfect sense when you think about what they do—they add things to the schema to enforce behavior, but, without a schema, what would you modify?

You see, XML has the concept of an id. An id in XML works much the same as a primary key does in relational data—it designates a unique identifier for that element name in your XML document. For any element name, there can be no more than one attribute designated as the id.
What attribute is to serve as the id is defined in the schema for the XML. Once you have one element with a given value for your id attribute, no other element of the same name is allowed to carry the same value.

Unlike primary keys in SQL, you cannot have multiple attributes make up your id in XML (there is no concept of a composite key).

Since XML has a concept that is similar to a primary key, it probably comes as no surprise that XML also has a concept that is similar to a foreign key—that's where idref and idrefs come in. Both are used to create a reference from an attribute in one element to an id attribute in another element.

What does this do for us? Well, if we didn't have these, there would be only one way to create a relationship between two elements—nest them. By giving a certain element an id and then making reference to it from an attribute declared as being an idref or idrefs attribute, we gain the ability to link the two elements, regardless of their position in the document.

This should bring on the question, "OK—so why are there two of them?" The answer is implied in their names: idref provides for a single value that must match an existing element's id value. idrefs provides a multivalued, whitespace-separated list—again, the values must each match an existing element's id value. The result is that you use idref if you are trying to establish a one-to-many relationship (there will be only one of each id value, but potentially many elements with that value in an idref attribute). Use idrefs when you are trying to establish a many-to-many relationship (each element with an idrefs attribute can refer to many ids, and each id can be referred to by many elements).

To illustrate this one, we'll go with a slight modification of our last query. We'll start with the idref directive:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID!ID],
       p.ProductSubcategoryID AS [Product!1!ProductSubCategoryID!hide],
       NULL AS [Order!2!OrderID],
       NULL AS [Order!2!ProductID!idref],
       NULL AS [Order!2!OrderDate]
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       p.ProductSubcategoryID,
       sod.SalesOrderID,
       sod.ProductID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-27'
ORDER BY [Product!1!ProductSubCategoryID!hide], [Product!1!ProductID!ID], [Order!2!OrderID]
FOR XML EXPLICIT, XMLDATA

When we look at the results, there are really just two pieces that we are interested in—the schema and our Product element. (The schema output is trimmed here: it is an inline XDR schema that declares a Product element type whose ProductID attribute is typed as an id, and an Order element type whose ProductID attribute is typed as an idref.)

In the schema, you can see some fairly specific type information. Our Product is declared as a type of element, and you can also see that ProductID has been declared as being the id for this element type. Likewise, we have an Order element with the ProductID declared as an idref.

The next piece that we're interested in is a Product element, which now carries a reference to that inline schema—something like:

<Product xmlns="x-schema:#Schema1" ProductID="766">
  ...
</Product>

In this case, notice that SQL Server has referenced our inline schema in the Product element.
This declares that the Product element and everything within it must comply with our schema—thus ensuring that our id and idref values will be enforced.

When we try to use the idrefs directive, we have to get a little trickier. SQL Server requires that the query we use to build our idrefs list be separate from the query that builds the elements with the ids. This means we must add another query to our UNION to supply the idrefs (the list of possible ids has to be known before we can build the idrefs list—but the actual ids will come after the id list). The query to generate the idrefs must immediately precede the query that generates the ids. This makes the query look pretty convoluted:

SELECT 1 AS Tag,
       NULL AS Parent,
       p.ProductID AS [Product!1!ProductID],
       NULL AS [Product!1!OrderList!idrefs],
       NULL AS [Order!2!OrderID!id],
       NULL AS [Order!2!OrderDate]
FROM Production.Product p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-31'
UNION ALL
SELECT 1,
       NULL,
       p.ProductID,
       soh.SalesOrderID,
       NULL,
       NULL
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-31'
UNION ALL
SELECT 2,
       1,
       p.ProductID,
       soh.SalesOrderID,
       soh.SalesOrderID,
       soh.OrderDate
FROM Production.Product AS p
JOIN Sales.SalesOrderDetail AS sod
    ON p.ProductID = sod.ProductID
JOIN Sales.SalesOrderHeader AS soh
    ON sod.SalesOrderID = soh.SalesOrderID
WHERE soh.OrderDate BETWEEN '2003-03-27' AND '2003-03-31'
ORDER BY [Product!1!ProductID], [Order!2!OrderID!id], [Product!1!OrderList!idrefs]
FOR XML EXPLICIT, XMLDATA

Note that I've expanded the date range a bit to make sure that there are multiple product IDs for a given range so you see the proper many-to-many relationship.

The schema winds up looking an awful lot like the one we got for idref (again trimmed here)—the significant changes are that Product now has an OrderList attribute typed as idrefs, and Order's OrderID is typed as the id.

But the elements couldn't be much more different (trimmed to the shape of things): each Product element now carries an OrderList attribute containing a whitespace-separated list of the SalesOrderID values it appeared on, and each Order element follows along carrying its OrderID as its id.

Using id, idref, and idrefs is very complex. Still, they allow you to make your output strongly typed. For most situations, this level of control and the hassles that go with it simply aren't necessary, but, when they are, these three can be lifesavers.

xmltext

xmltext expects the content of the column to be XML and attempts to insert it as an integral part of the XML document you are creating.

While, on the surface, that may sound simple enough (okay, so they're inserting some text in the middle—big deal!), the rules of where, when, and how it inserts the data are a little strange:

 * As long as the XML you're trying to insert is well formed, the root element will be stripped out—but the attributes of that element will be retained and applied depending on the following few rules.
 * If you did not specify an attribute name when using the xmltext directive, then the retained attributes from the stripped element will be added to the element that contains the xmltext directive. The names of the retained attributes will be used in the combined element. If any attribute names from the retained attribute data conflict with other attribute information in the combined element, then the conflicting attribute is left out of the retained data.
 * Any elements nested inside the stripped element will become nested elements of the combined element.
 * If an attribute name is provided with the xmltext directive, then the retained data is placed in an element of the supplied name. The new element becomes a child of the element that issued the directive.
 * If any of the resulting XML is not well formed, there is no defined behavior. Basically, the behavior will depend on how the end result looks, but I would figure that you're going to get an error (I haven't seen an instance where you can refer to data that is not well formed and escape without an error).

cdata

The term cdata is a holdover from DTDs and SGML. (SGML is an old markup language, used in the graphics industry, that is the ancestor of both HTML and XML. DTDs are type definition documents that outline rules that your SGML [and, later, HTML and XML] documents had to live up to.) Basically, cdata stands for character data. XML acknowledges a cdata section as something of a no man's land—it completely and in all ways ignores whatever is included inside a properly marked cdata section. Since there is no validation of the data in a cdata section, no encoding of the data is necessary. You would use cdata anytime you need your data completely untouched (you can't have encoding altering the data) or, frankly, when you want to move the data but have no idea what the data is (so you can't know whether it's going to cause you problems or not).

For this one, we'll just take a simple example—the AdventureWorks2008 Production.Document table. This table has a field with an nvarchar(max) data type. The contents are basically unknown. A query to generate the document summaries into XML might look something like this:

SELECT 1 AS Tag,
       NULL AS Parent,
       DocumentNode AS [Document!1!DocumentNode],
       DocumentSummary AS [Document!1!!cdata]
FROM Production.Document Document
WHERE DocumentSummary IS NOT NULL
ORDER BY [Document!1!DocumentNode]
FOR XML EXPLICIT

The output is pretty straightforward (trimmed—each Document element wraps its DocumentSummary in a CDATA section, untouched by any encoding):

<Document DocumentNode="...">
  <![CDATA[...summary text...]]>
</Document>
...

Basically, this was a pretty easy one.

PATH

Now let's switch gears just a little bit and get down to a more "real" XML approach to getting at data.

While EXPLICIT has not been deprecated as yet, make no mistake—PATH is really meant to be a better way of doing what EXPLICIT originally was the only way of doing. PATH makes a lot of sense in a lot of ways, and it is how I recommend that you do complex XML output in most cases.

This is a more complex recommendation than it might seem. The Microsoft party line on this is that PATH is easier. Well, PATH is easier in many ways, but, as we're going to see, it has its own set of "except for this, and except for that, and except for this other thing" that can twist your brain into knots trying to understand exactly what to do. In short, in some cases, EXPLICIT is actually easier if you don't know XPath. The thing is, if you're dealing with XML, then XPath should be on your learn list anyway, so, if you're going to know it, you should find the XPath-based approach more usable.

Note, however, that if you need backward compatibility with SQL Server 2000, then you're going to need to stick with EXPLICIT.

In its most straightforward sense, the PATH option isn't that bad at all.
So, let's start by getting our feet wet by focusing on just the basics of using PATH. From there, we'll get a bit more complex and show off some of what PATH has to offer.

PATH 101

With PATH, you have a model that molds an existing standard to get at your data—XPath. XPath is an accepted standard and provides a way of pointing at specific points in your XML schema. For PATH, we're just utilizing a lot of the same rules and ideas in order to say how data should be treated in a native XML sort of way.

How PATH treats the data you refer to depends on a number of rules, including whether the column is named or unnamed (like EXPLICIT, the alias is the name if you use an alias). If the column does have a name, then a number of additional rules are applied as appropriate.

Let's look at some of the possibilities.

Unnamed Columns

Data from a column that is not named will be treated as raw text within the row's element. To demonstrate this, let's take a modified version of the example we used for XML RAW. What we're doing here is listing the two customers we're interested in and the number of orders they have placed:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID)
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

Check out the output from this:

<row><CustomerID>29484</CustomerID>7</row>
<row><CustomerID>29485</CustomerID>4</row>

What it created is a row element for each row in the query—much as you had with RAW—but notice the difference in how it treated our column data.

Since the CustomerID column was named, it was placed in its own element (we'll explore this more in the next section)—notice, however, the loose number 7 in the results. This is just loose embedded text for the row element—it isn't even associated directly with the CustomerID, since it sits outside the CustomerID element.

Remember that the exact counts (7 and 4 in this case) that come back may vary on your system depending on how much you have been playing with the data. The key thing is to see how the counts are not associated with the CustomerID but are instead just raw text associated with the row.

My personal slant on this is that the number of situations where loose text at the level of the top element is a valid way of doing things is pretty limited. The rules do say you can do it, but I believe it makes for data that is not very clear. Still, this is how it works—use it as it seems to fit the needs of your particular system.

Named Columns

This is where things get considerably more complex rather quickly. In its most simple form, named columns are just as easy as unnamed ones were—indeed, we saw one of them in our previous example. If a column is a simple named column using PATH, then it is merely added as an additional element of the row:

<row><CustomerID>29484</CustomerID>7</row>

Our CustomerID column was a simple named column.

We can, however, add special characters to our column name to indicate that we want special behaviors for this column. Let's look at a few of the most important.

@

No, that's not a typo—the @ symbol really is the heading for this section. If we add an @ sign to our column name, then SQL Server will treat that column as an attribute of its parent element (with no deeper path given, that's the row element).
Let's move the CustomerID to be an attribute of the top element for the row:

SELECT sc.CustomerID AS '@CustomerID',
       COUNT(soh.SalesOrderID)
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

Yields:

<row CustomerID="29484">7</row>
<row CustomerID="29485">4</row>

Notice that our order count remained a text element of the row—only the column that we identified as an attribute moved in. We could take this to the next step by naming our count and prefixing it to make it an attribute also:

SELECT sc.CustomerID AS '@CustomerID',
       COUNT(soh.SalesOrderID) AS '@OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

With this, we no longer have our loose text for the element:

<row CustomerID="29484" OrderCount="7" />
<row CustomerID="29485" OrderCount="4" />

Also notice that SQL Server was smart enough to realize that everything was contained in attributes—with no lower-level elements or simple text, it chose to make it a self-closing tag (see the / at the end of the element).

So, why did I indicate that this stuff was tricky? Well, there are a lot of different "it only works if..." kinds of rules here. To demonstrate this, let's make a simple modification to our original query. This one seems like it should work, but SQL Server will throw a hissy fit if you try to run it:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID) AS '@OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

What I've done here is to go back to CustomerID as its own element. What, at first glance, you would expect to happen is to get a CustomerID element with OrderCount as an attribute, but it doesn't quite work that way:

Msg 6852, Level 16, State 1, Line 1
Attribute-centric column '@OrderCount' must not come after a non-attribute-centric sibling in XML hierarchy in FOR XML PATH.

The short rendition of the answer to "What's wrong?" is that SQL Server doesn't really know what OrderCount is supposed to be an attribute of—is it an attribute of the row, or an attribute of the CustomerID?

/

Yes, a forward slash. Much like @, this special character indicates special things you want done. Essentially, you use it to define something of a path—a hierarchy that relates an element to those things that belong to it. It can exist anywhere in the column name except as the first character. To demonstrate this, we're going to take our last (failed) example and build toward what we were looking for when we got the error.

First, we need to alter the OrderCount to carry information on what element it belongs to:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID) AS 'CustomerID/OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

By adding the / and then placing CustomerID before the slash, we are telling SQL Server that OrderCount is below CustomerID in a hierarchy.
Now, there are many ways an XML hierarchy can be structured, so let's see what SQL Server does with this:

<row><CustomerID>29484<OrderCount>7</OrderCount></CustomerID></row>
<row><CustomerID>29485<OrderCount>4</OrderCount></CustomerID></row>

Now, if you recall, we wanted to make OrderCount an attribute of CustomerID, so, while we have OrderCount below CustomerID in the hierarchy, it's still not quite in the place we wanted it. To do that, we can combine / and @, but we need to fully define the hierarchy. Now, since I suspect this is a bit confusing, let's take it in two steps—first, the way we might be tempted to do it, but that will yield a similar error to the earlier example:

SELECT sc.CustomerID,
       COUNT(soh.SalesOrderID) AS 'CustomerID/@OrderCount'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

Error time:

Msg 6852, Level 16, State 1, Line 1
Attribute-centric column 'CustomerID/@OrderCount' must not come after a non-attribute-centric sibling in XML hierarchy in FOR XML PATH.

To fix this, we need to understand a bit about how things are constructed when building the XML tags. The key is that the tags are essentially built in the order you list them. So, if you want to add attributes to an element, you need to keep in mind that they are part of the element tag—that means you need to define any attributes before you define any other content of that element (subelements or raw text).

In our case, we are putting the CustomerID in as raw text, but the OrderCount as an attribute (okay, backward from what would be likely in real life, but hang with me here). This means we are telling SQL Server things backward. By the time it sees the OrderCount information, it is already done with attributes for CustomerID and can't go back.

So, to fix things, we simply need to tell it about the attributes before we tell it about any more elements or raw text:

SELECT COUNT(soh.SalesOrderID) AS 'CustomerID/@OrderCount',
       sc.CustomerID
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;

This probably seems counterintuitive, but, again, think of the order things are being written in. The attributes are written first, and then, and only then, can we write the lower-level information for the CustomerID element. Run it, and you'll see we get what we were after:

<row><CustomerID OrderCount="7">29484</CustomerID></row>
<row><CustomerID OrderCount="4">29485</CustomerID></row>

The OrderCount has now been moved into the attribute position, just as we desired, and the actual CustomerID is still raw text embedded in the element.

Follow the logic of the ordering of what you ask for a bit, because it works for most everything. So, if we wanted CustomerID to also be an attribute rather than raw text, but wanted it to come after OrderCount, we could do that—we just need to make sure that it comes after the OrderCount definition, as the sketch that follows shows.
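Here is a minimal sketch of that variation (the ID attribute name is my own choice): both values become attributes of the CustomerID element, and, because the attribute definitions come before any element content, there is no error this time:

SELECT COUNT(soh.SalesOrderID) AS 'CustomerID/@OrderCount',
       sc.CustomerID AS 'CustomerID/@ID'
FROM Person.Person pp
JOIN Sales.Customer sc
    ON pp.BusinessEntityID = sc.PersonID
JOIN Sales.SalesOrderHeader soh
    ON sc.CustomerID = soh.CustomerID
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
GROUP BY sc.CustomerID
FOR XML PATH;
-- Yields <row><CustomerID OrderCount="7" ID="29484" /></row> and so on.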
But Wait, There's More...

As I said earlier, XPath has its own complexity and is a book's worth of material to itself, but I don't want to leave you with just what I said in the preceding sections and claim that's all there is.

@ and / will give you a great deal of flexibility in building the XML output just the way you want it, and they will probably meet the need well for most simple applications. If, however, you need something more, then there is still more out there waiting for you. For example, you can:

 * "Wildcard" data such that it's all run together as text data without being treated as separate columns
 * Embed native XML data from XML data type columns
 * Use XPath node tests—these are special XPath directives that change the behavior of your data
 * Use the data() directive to allow multiple values to be run together as one data point in the XML
 * Utilize namespaces
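One closely related technique deserves a quick sketch here (it is standard FOR XML PATH usage, though not one of this chapter's worked examples): you can nest a whole correlated subquery with the TYPE directive, which embeds each customer's orders as real XML rather than as encoded text:

SELECT sc.CustomerID AS '@CustomerID',
       (SELECT soh.SalesOrderID AS '@SalesOrderID',
               soh.OrderDate AS '@OrderDate'
        FROM Sales.SalesOrderHeader soh
        WHERE soh.CustomerID = sc.CustomerID
        FOR XML PATH('Order'), TYPE)
FROM Sales.Customer sc
WHERE sc.CustomerID = 29484 OR sc.CustomerID = 29485
FOR XML PATH('Customer'), ROOT('Customers');
-- Produces <Customers><Customer CustomerID="29484"><Order ... /> ... </Customer> ... </Customers>
-- Without TYPE, the inner XML would be entity-encoded into plain text.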
OPENXML

We've spent pages and pages dealing with how to turn our relational data into XML. It seems reasonably intuitive, then, that SQL Server must also allow you to open a string of XML and represent it in the tabular format that is expected in SQL.

OPENXML is a rowset function that opens your string much as other rowset functions (such as OPENQUERY and OPENROWSET) work. This means that you can join to an XML document, or even use it as the source of input data by using an INSERT..SELECT or a SELECT INTO. The major difference is that it requires you to use a couple of system stored procedures to prepare your document and clear the memory after you're done using it.

To set up your document, you use sp_xml_preparedocument. This moves the string into memory and pre-parses it for optimal query performance. The XML document will stay in memory until you explicitly say to remove it or you terminate the connection that sp_xml_preparedocument was called on.

Let me digress a moment and say that I'm not at all a fan of letting a system clean up for you. If you instantiate something, then you should proactively clean it up when you're done (if only I could teach my youngest child this when she pulls out her toys!).

Much like Visual Basic, C#, and most other languages are supposed to clean up your objects when they go out of scope, SQL Server is supposed to clean up your prepared documents. Please do not take the lazy approach of relying on this—clean up after yourself! By explicitly deallocating the document (using sp_xml_removedocument), you are making certain the cleanup happens, clearing it from memory slightly sooner, and also making it very clear in your code that you're done with it.

The syntax is pretty simple:

sp_xml_preparedocument @hdoc = <integer variable> OUTPUT
    [, @xmltext = <XML document text>]
    [, @xpath_namespaces = <namespace declaration>]

Note that, if you are going to provide a namespace declaration, you need to wrap it in the < and > symbols at both ends (for example, <root xmlns:myns="urn:MyNamespace">).

The OPENXML function itself then looks like this:

OPENXML(<document handle>, <XPath to base node> [, <mapping flags>])
[WITH (<schema declaration>|<table name>)]

We have pretty much already discussed the handle—this is going to be an integer value that you received as an output parameter from your sp_xml_preparedocument call.

When you make your call to OPENXML, you must supply the XPath to a node that will serve as a starting point for all your queries. The schema declaration can refer to all parts of the XML document by navigating relative to the base node you set here.

Next up are the mapping flags. These assist us in deciding whether we want to favor elements or attributes in our OPENXML results. The options are:

Byte Value | Description
---|---
0 | Same as 1, except that you can't combine it with 2 or 8 (2 + 0 is still 2). This is the default.
1 | Unless combined with 2 (described next), only attributes will be used. If there is no attribute with the name specified, then a NULL is returned. This can also be added to either 2 or 8 (or both) to combine behavior, but this option takes precedence over option 2. If XPath finds both an attribute and an element with the same name, the attribute wins.
2 | Unless combined with 1 (described previously), only elements will be used. If there is no element with the name specified, then a NULL is returned. This can also be added to either 1 or 8 (or both) to combine behavior. If combined with 1, then the attribute will be mapped if it exists. If no attribute exists, then the element will be used. If no element exists, then a NULL is returned.
8 | Can be combined with 1 or 2 (described previously). Consumed data should not be copied to the overflow property @mp:xmltext (you would have to use the metaproperty schema item to retrieve this). If you're not going to use the metaproperties—and most of the time you won't be—I recommend this option. It cuts a small (okay, very small) amount of overhead out of the operation.

Finally comes the schema declaration or table. If you're defining a schema and are not familiar with XPath, this part can be a bit tricky. Fortunately, this particular use of XPath isn't very complex and should become second nature fairly quickly (it works a lot like directories do in Windows, only with a lot more power).

The schema declaration can vary somewhat in the way you declare it. The definition is declared as:

WITH (
    <column name> <data type> [<column XPath>|<metaproperty>]
    [, <column name> <data type> [<column XPath>|<metaproperty>]]
    [, ...]
)

 * The column name is just that—the name of the attribute or element you are retrieving. This will also serve as the name you refer to when you build your SELECT list, perform JOINs, and the like.
 * The data type is any valid SQL Server data type. Since XML can have data types that are not equivalents of those in SQL Server, an automatic coercion will take place if necessary, but this is usually predictable.
 * The column XPath is the XPath pattern (relative to the node you established as the starting point for your OPENXML function) that gets you to the node you want for your column—whether an element or attribute gets used is dependent on the flags parameter, as described previously. If this is left off, then SQL Server assumes you want the current node as defined as the starting point for your OPENXML statement.
 * Metaproperties are a set of special variables that you can refer to in your OPENXML queries. They describe various aspects of whatever part of the XML DOM you're interested in. To use them, just enclose them in single quotes and put them in the place of the column XPath. Available metaproperties include:
   * @mp:id—Don't confuse this with the XML id that we looked at with EXPLICIT.
While this property serves a similar function, it is a unique identifier (within the scope of the document) of the DOM node. The difference is that this value is system generated—as such, you can be sure it is there. It is guaranteed to refer to the same XML node as long as the document remains in memory. If the id is zero, it is the root node (its @mp:parentid property, as referred to next, will be NULL).
   * @mp:parentid—This is the same as the preceding, only for the parent.
   * @mp:localname—Provides the non-fully-qualified name of the node. It is used with a prefix and namespace URI (Uniform Resource Identifier—you'll usually see it starting with URN) to name element or attribute nodes.
   * @mp:parentlocalname—This is the same as the preceding, only for the parent.
   * @mp:namespaceuri—Provides the namespace URI of the current element. If the value of this attribute is NULL, no namespace is present.
   * @mp:parentnamespaceuri—This is the same as the preceding, only for the parent.
   * @mp:prefix—Stores the namespace prefix of the current element name.
   * @mp:parentprefix—This is the same as the preceding, only for the parent.
   * @mp:prev—Stores the @mp:id of the previous sibling relative to a node. Using this, you can tell something about the ordering of the elements at the current level of the hierarchy. For example, if the value of @mp:prev is NULL, then you are at the first node for this level of the tree.
   * @mp:xmltext—This metaproperty is used for processing purposes and contains the actual XML for the current element.

Of course, you can always save yourself a ton of work by bypassing all these parameters. You get to do this if you have a table that directly relates (names and data types) to the XPath starting point that you've specified in your XML. If you do have such a table, you can just name it, and SQL Server will make the translation for you!

Okay, that's a lot to handle, but we're not quite finished yet. You see, when you're all done with your XML, you need to call sp_xml_removedocument to clean up the memory where your XML document was stored. Thankfully, the syntax is incredibly easy:

sp_xml_removedocument [@hdoc =] <document handle>

Again, I can't stress enough how important it is to get in the habit of always cleaning up after yourself. I know that, in saying that, I probably sound like your mother. Well, like your mother, SQL Server will clean up after you some, but, like your mother, you can't count on SQL Server to clean up after you every time. SQL Server will clean things up when you terminate the connection, but what if you are using connection pooling? Some connections may never go away if your system is under load. It's an easy sproc to implement, so do it—every time!
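As a minimal sketch of making that cleanup hard to forget, you can wrap the work in TRY/CATCH so that sp_xml_removedocument runs even when the query in the middle fails. The document here is a made-up stand-in, not data from this chapter's examples:

DECLARE @idoc int;
EXEC sp_xml_preparedocument @idoc OUTPUT,
     N'<ROOT><Item ID="1"/><Item ID="2"/></ROOT>';  -- hypothetical stand-in document
BEGIN TRY
    SELECT * FROM OPENXML(@idoc, '/ROOT/Item', 0) WITH (ID int);
    EXEC sp_xml_removedocument @idoc;  -- normal cleanup
END TRY
BEGIN CATCH
    EXEC sp_xml_removedocument @idoc;  -- cleanup still happens on a failure
END CATCH;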
Okay, I'm sure you've been waiting for me to get to how you really make use of this—so now it's time for the all-important example.

Imagine that you are merging with another company and need to import some of their data into your system. For this example, we'll say that we're working on importing a few shipping providers that they have and our company doesn't. A sample of what our script might look like to import these from an XML document might be:

USE AdventureWorks2008;

DECLARE @idoc int;
DECLARE @xmldoc nvarchar(4000);

-- define the XML document
SET @xmldoc = '
<ROOT>
  <Shipper CompanyName="Billy Bob''s Pretty Good Shipping" Base="4.50" Rate="1.05"/>
  <Shipper CompanyName="Fred''s Freight" Base="3.95" Rate="1.29"/>
</ROOT>';

PRINT @xmldoc;

-- Load and parse the XML document in memory
EXEC sp_xml_preparedocument @idoc OUTPUT, @xmldoc;

-- List out what our shippers table looks like before the insert
SELECT * FROM Purchasing.ShipMethod;

-- See our XML data in a tabular format
SELECT * FROM OPENXML (@idoc, '/ROOT/Shipper', 0) WITH (
    CompanyName nvarchar(40),
    Base decimal(5,2),
    Rate decimal(5,2));

-- Perform an insert based on that data
INSERT INTO Purchasing.ShipMethod
    (Name, ShipBase, ShipRate)
SELECT * FROM OPENXML (@idoc, '/ROOT/Shipper', 0) WITH (
    CompanyName nvarchar(40),
    Base decimal(5,2),
    Rate decimal(5,2));

-- Now look at the Shippers table after our insert
SELECT * FROM Purchasing.ShipMethod;

-- Now clear the XML document from memory
EXEC sp_xml_removedocument @idoc;

The final result set from this looks just like what we wanted. (Note that I've snipped off the final two columns for brevity.)

ShipMethodID  Name                               ShipBase  ShipRate
------------  ---------------------------------  --------  --------
1             XRQ - TRUCK GROUND                 3.95      0.99
2             ZY - EXPRESS                       9.95      1.99
3             OVERSEAS - DELUXE                  29.95     2.99
4             OVERNIGHT J-FAST                   21.95     1.29
5             CARGO TRANSPORT 5                  8.99      1.49
6             Billy Bob's Pretty Good Shipping   4.50      1.05
7             Fred's Freight                     3.95      1.29

It isn't pretty, but it works—XML turned into relational data.

A Quick Heads-Up Regarding XML Indexes

We're going to defer discussion of XML indexes until we discuss some of the other indexing constructs in SQL Server, but I wanted to take a moment and make sure that you realized that indexes can be built over XML data. We will discuss them more fully in Chapter 7, but, for now, I want to make sure that you are taking XML indexes into consideration in your design efforts and performance expectations.

A Brief Word on Hierarchical Data

XML is naturally hierarchical. The concept of a root and then branching levels of elements and attributes pretty much says everything; one is higher in lineage than another. While XML has been index capable since SQL Server 2005, there is nothing inherent in the XML data type that allows for the handling of XML in a truly hierarchical fashion.

Beginning with SQL Server 2008, we have a new data type that is explicitly created for the purpose of dealing with hierarchical data—the HierarchyID data type. I want to make sure that you're aware of this new data type as a tool for keeping track of hierarchical data in a relational format. This has significant implications in terms of when you might want to store data in XML versus a more traditional data format.

We will defer full discussion of HierarchyID and other hierarchy design issues until Chapter 7, but keep the correlation in mind. You may well find that you want to store information on how deep your XML data is within the tree hierarchy to facilitate fast response to hierarchy questions.

Summary

The size of the XML portion of SQL Server has grown considerably since its original introduction as a "Web release" prior to SQL Server 2000, and it continues to grow. XML is one of the most important technologies to hit the industry in the last 20 or more years.
It provides a flexible, very transportable way of describing data, and SQL Server now has more and more ways of meeting your XML needs.

In this chapter, we've taken a look at how to get relational data into XML format, and how to get XML data into a relational structure. We've also seen how SQL Server can supply Web service data directly using XML-based methods.

5

Daring to Design

And so I come to another one of those things where I have to ponder how much to assume you already know. "To normalize, or not to normalize—THAT is the question!" Okay, the real question is whether or not you already understand the most basic tenets of relational database design. Since you come to this book with a degree of experience already, I'm going to take an approach that assumes you've heard of it, know it's important, and even grasp the basics of it. I'm going to assume you need the information filled in for you rather than that you are starting from scratch.

With the exception of perhaps three or four chapters, this book has an Online Transaction Processing, or OLTP, flair to its examples. Don't get me wrong; I will point out, from time to time, some of the differences between OLTP and its more analysis-oriented cousin, Online Analytical Processing (OLAP). My point is that you will, in most of the examples, be seeing a table design that is optimized for the most common kind of database—OLTP. Thus, the table examples will typically have a database layout that is, for the most part, normalized to what is called the third normal form.

What is "normal form"? We'll start off by taking a very short look at that and then move quickly on to more advanced concepts. For the moment, though, just say that it means your data has been broken out into a logical, nonrepetitive format that can easily be reassembled into the whole. In addition to normalization (which is the process of putting your database into normal form), we'll also be examining the characteristics of OLTP and OLAP databases. And, as if we didn't have enough to do between those two topics, we'll also be looking at many examples of how the constraints we've already seen are implemented in the overall solution.

Normalization 201

If you've read Beginning SQL Server 2008 Programming, then you can probably safely skip this section and move on to the more advanced concepts.

I want to start off by saying that there are six normal forms (plus or minus one or two, depending on which academician you listen to). We'll leave several of those to the academicians, though. Those in the real world usually deal with only three normal forms. Indeed, a fully normalized database is generally considered to be one that is normalized to the third normal form.

The concept of normalization has to be one of the most over-referenced yet misunderstood concepts in programming. Everyone thinks they understand it, and many do in at least its academic form. Unfortunately, it also tends to be one of those things that many database designers wear like a cross—it is somehow their symbol that they are "real" database architects. What it really is, however, is a symbol that they know what the normal forms are—and that's all. Normalization is really just one piece of a larger database design picture. Sometimes you need to normalize your data—then again, sometimes you need to deliberately de-normalize your data. Even within the normalization process, there are often many ways to achieve what is technically a normalized database.
My point is that normalization is a theory, and that's all it is. Once you choose whether or not to implement a normalized strategy, what you have is a database—hopefully the best one you could possibly design. Don't get stuck on what the books (including this one) say you're supposed to do—do what's right for the situation that you're in. As the author of this book, all I can do is relate concepts to you—I can't implement them for you, and neither can any other author (at least not with the written word). You need to pick and choose between these concepts in order to achieve the best fit and the best solution.

By this point in your database development background, I would expect that you already understand how to create a primary key and some of the reasons for using one in your tables—if we want to be able to act on just one row, then we need to be able to uniquely identify that row. The concepts of normalization are highly dependent on issues surrounding the definition of the primary key and what columns are dependent on it. One phrase you might hear frequently in normalization is:

The key, the whole key, and nothing but the key.

The somewhat fun addition to this is:

The key, the whole key, and nothing but the key, so help me Codd!

This is a super-brief summarization of what normalization is about, out to the third normal form (for those who don't know, Codd is considered the father of relational design). When you can say that all your columns are dependent only on the whole key and nothing more or less, then you are at third normal form.

Now let's review the various normal forms and what each does for you.

Where to Begin

The concepts of relational database design are founded on the notion of entities and relations. If you're familiar with object-oriented programming, then you can liken most top-level entities to objects in an object model. Much as a parent object might contain other objects that further describe it, tables may have a child or other table that further describes the rows in the original table.

An entity will generally tie to one "parent" table. That table will usually have one and only one row per instance of the entity you're describing (for example, a table that is the top table for tracking orders in a system will have only one row per individual order). The one entity may, however, require multiple tables to provide additional descriptive information (for example, a details or line-item table to carry a list of all the things that were purchased on that particular order).

A relation is a representation of how two entities relate to each other logically. For example, a customer is a different entity from an order, but they are related. You cannot have so much as one order without at least one customer. Furthermore, your order relates to only one customer.

As you start the process of "normalizing" these entities and relations into tables, some things about your data are assumed even before you get to the first of the normal forms:

 * The table should describe one and only one entity. (No trying to shortcut and combine things!)
 * All rows must be unique, and there must be a primary key.
 * The column and row order must not matter.

As you gain experience, this will become less of a "process" and more of the natural starting point for your tables. You will find that creating a normalized set of tables will be the way things flow from your mind to start with rather than anything special that you have to do.
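As a grounding sketch of that parent/child shape (hypothetical table names, not part of AdventureWorks), the order entity described above might break into a parent table and a line-item child table like this:

CREATE TABLE Orders (
    OrderID    int IDENTITY PRIMARY KEY,  -- one row per order (the entity)
    CustomerID int NOT NULL,
    OrderDate  datetime NOT NULL
);

CREATE TABLE OrderDetails (
    OrderID    int NOT NULL REFERENCES Orders (OrderID),  -- ties each line to its parent order
    LineNumber int NOT NULL,
    ProductID  int NOT NULL,
    Quantity   int NOT NULL,
    PRIMARY KEY (OrderID, LineNumber)
);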
+
+Getting to Third Normal Form
+
+As I indicated earlier, there are, from a practical point of view, three normal forms:
+
+ * The First Normal Form (1NF) is all about eliminating repeating groups of data and guaranteeing atomicity (each column holds a single, self-contained piece of data). At a high level, it works by creating a primary key (which you already have), then moving any repeating data groups into new tables, creating new keys for those tables, and so on. In addition, you break out any columns that combine data into separate rows for each piece of data.
+ * Second Normal Form (2NF) further reduces the incidence of repeated data (not necessarily groups). Second normal form has two rules to it:
+ * The table must meet the rules for first normal form. (Normalization is a building block kind of process—you can't stack the third block on if you don't have the first two there already.)
+ * Each column must depend on the whole key.
+ * Third Normal Form (3NF) deals with the issue of having all the columns in your table not just be dependent on something—but the right thing. Third normal form has just three rules to it:
+ * The table must be in 2NF (I told you this was a building block thing).
+ * No column can have any dependency on any other non-key column.
+ * You cannot have derived data (that is, data that can be inferred from other data in your tables).
+
+Other Normal Forms
+
+There are a few other forms out there that are considered, at least by academics, to be part of the normalization model. These include:
+
+ * Boyce-Codd (considered to really just be a variation on third normal form)—This one tries to address situations where you have multiple overlapping candidate keys. This can only happen if:
+
+a. All the candidate keys are composite keys (that is, it takes more than one column to make up the key).
+
+b. There is more than one candidate key.
+
+c. The candidate keys each have at least one column that is in common with another candidate key.
+
+This is typically a situation where any number of solutions works, and almost never gets thought of outside the academic community (and I think I'll stop thinking about it right now....).
+
+ * Fourth Normal Form—This one tries to deal with issues surrounding multi-valued dependence. This is the situation where every column depends on the whole primary key and nothing else (that is, the table already meets third normal form), yet there can be rather odd situations where one column in the primary key can depend separately on other columns in the primary key. These are rare and don't usually cause any real problem. Thus, they are largely ignored in the database world, and we will not address them any further here.
+ * Fifth Normal Form—Deals with non-loss and loss decompositions. Essentially, there are certain situations where you can decompose a relationship such that you cannot logically recompose it into its original form. Again, these are rare, largely academic, and, again, we won't deal with them any further here.
+
+This is, of course, just a really quick look at these—and that's deliberate on my part. The main reason you need to know these in the real world is either to impress your friends (or prove to them you're a "know it all") or to avoid sounding like an idiot when some database guru comes to town and starts talking about them. However you choose to use this knowledge, I do recommend against using it to get dates.
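+
+Before moving on, it's worth seeing what a third normal form violation actually looks like in DDL. The following is a minimal sketch—the table and column names are invented for illustration—showing a line item table that breaks 3NF twice (a column dependent on a non-key column, plus derived data), followed by a normalized version:
+
+-- Breaks 3NF: ProductName depends on ProductNo (a non-key column),
+-- and TotalPrice is derived data (Qty * UnitPrice).
+CREATE TABLE LineItemsDenormalized (
+    OrderNo     int         NOT NULL,
+    LineNo      int         NOT NULL,
+    ProductNo   int         NOT NULL,
+    ProductName varchar(50) NOT NULL,
+    Qty         int         NOT NULL,
+    UnitPrice   money       NOT NULL,
+    TotalPrice  money       NOT NULL,
+    CONSTRAINT PKLineItemsDenormalized PRIMARY KEY (OrderNo, LineNo)
+);
+
+-- The 3NF version: product attributes move to their own table, and the
+-- derived column simply goes away (compute it at query time instead).
+CREATE TABLE ProductList (
+    ProductNo   int         NOT NULL PRIMARY KEY,
+    ProductName varchar(50) NOT NULL
+);
+
+CREATE TABLE LineItems (
+    OrderNo   int   NOT NULL,
+    LineNo    int   NOT NULL,
+    ProductNo int   NOT NULL REFERENCES ProductList (ProductNo),
+    Qty       int   NOT NULL,
+    UnitPrice money NOT NULL,
+    CONSTRAINT PKLineItems PRIMARY KEY (OrderNo, LineNo)
+);
+
+Every non-key column in LineItems now depends on the key, the whole key, and nothing but the key.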
+
+Relationships
+
+Well, I've always heard from women that men immediately leave the room if you even mention the word "relationship." With that in mind, I hope that I didn't just lose about half my readers.
+
+I am, of course, kidding—but not by as much as you might think. Experts say the key to successful relationships is that you know the role of both parties and that everyone understands the boundaries and rules of the relationship that they are in. I could be talking about database relationships with that statement every bit as much as people relationships.
+
+There are three major kinds of relationships:
+
+ * One-to-one—This is exactly what it says it is. A one-to-one relationship is one where the fact that you have a record in one table means that you have exactly one matching record in another table.
+ * One-to-many—This is one form of your run-of-the-mill, average, everyday foreign key kind of relationship. Usually, this is found in some form of header/detail relationship, and generally implements some idea of a parent to child hierarchy. For example, for every one customer, you might have several orders.
+ * Many-to-many—In this type of relationship, both sides of the relationship may have several records that match. An example of this would be the relationship of products to orders—an order may contain several products, and, likewise, a product will appear on many orders. SQL Server has no way of physically establishing a direct many-to-many relationship, so you cheat by having an intermediate table to organize the relationship (there's a DDL sketch of this coming up shortly).
+
+Each of these has some variations depending on whether one side of the relationship is nullable or not. For example, instead of a one-to-one relationship, you might have a zero- or one-to-one relationship.
+
+Diagramming
+
+Entity-relationship diagrams (ERDs) are an important tool in good database design. Small databases can usually be easily created from a few scripts and implemented directly without drawing things out at all. The larger your database gets, however, the more problematic it becomes to just do things "in your head." ERDs solve a ton of problems because they allow you to quickly visualize and understand both the entities and their relationships.
+
+For this book, I've decided to do things somewhat in reverse of how I've done things before. SQL Server includes a very basic diagramming tool that you can use as a starting point for building rudimentary ERDs. Unfortunately, it employs a proprietary diagramming methodology that does not look remotely like any standard I'm aware of out there. In addition, it does not allow for the use of logical modeling—something I consider a rather important concept. Therefore, I'm going to start off talking about the more standard diagramming methodologies first—later in the chapter we'll look at SQL Server's built-in tools and how to use them.
+
+There are two reasonably common diagramming paradigms—IE and IDEF1X. You'll find both of these in widespread use, but I'm going to limit things here to a once-over of the basics of IE (also called Information Engineering). For the record, IDEF1X is a perfectly good diagramming paradigm, and was first put forth by the U.S. Air Force. IE (again, Information Engineering—not Internet Explorer) is, however, the method I use personally, and I do so for just one reason—it is far more intuitive for the inexperienced reviewer of your diagrams. I also find it to be the far more common of the two.
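+
+Before we head into diagram-land, it may help to pin down the relationship types themselves in actual DDL. Here is a minimal sketch (the table and column names are invented for illustration, though we'll meet some very similar ones later in the chapter):
+
+-- One-to-many: one customer, many orders (an everyday foreign key).
+CREATE TABLE Customers (
+    CustomerNo int NOT NULL PRIMARY KEY
+);
+
+CREATE TABLE Orders (
+    OrderID    int NOT NULL PRIMARY KEY,
+    CustomerNo int NOT NULL REFERENCES Customers (CustomerNo)
+);
+
+-- Many-to-many: orders to products, "cheated" via an intermediate table.
+-- Each side has a simple one-to-many relationship with OrderDetails.
+CREATE TABLE Products (
+    PartNo int NOT NULL PRIMARY KEY
+);
+
+CREATE TABLE OrderDetails (
+    OrderID int NOT NULL REFERENCES Orders (OrderID),   -- part of this table's key
+    LineNo  int NOT NULL,
+    PartNo  int NOT NULL REFERENCES Products (PartNo),  -- just an ordinary column
+    Qty     int NOT NULL,
+    CONSTRAINT PKOrderDetails PRIMARY KEY (OrderID, LineNo)
+);
+
+Notice that OrderDetails borrows OrderID into its own primary key, while PartNo stays an ordinary column. File that distinction away—it is exactly what the upcoming discussion of identifying versus non-identifying relationships is about.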
+
+I can't say enough about the importance of having the right tools. While the built-in tools at least give you "something," they are a long way from "what you need."
+
+ER tools are anything but cheap—running from somewhere over $1,000 to just under $5,000 (that's per seat!). They are also something of a language unto themselves. Don't plan on just sitting down and going to work with any of the major ER tools—you had better figure on some spin-up time to get the tool to do what you expect.
+
+Don't let the high price of these tools keep you from building a logical model. While Visio continues to fall somewhat short in terms of answering the world's database design problems, it does do okay in a pinch for light logical modeling and can do some degree of synchronization and physical modeling. That said, if you're serious about database design, and going to be doing a lot of it, you really need to find the budget for a real ER tool.
+
+Expense aside, there is no comparison between the productivity possible in the third-party tools out there and the built-in tools. Depending on the ER tool you select, they give you the capability to do things like:
+
+ * Create logical models, and then switch back and forth between the logical and physical model.
+ * Work on the diagram offline—then propagate all your changes to the physical database at one time (when you're ready, as opposed to when you need to log off).
+ * Reverse engineer your database from any one of a number of mainstream RDBMS systems (even some ISAM databases), and then forward engineer them to a completely different RDBMS.
+ * Create your physical model on numerous different systems.
+
+This really just scratches the surface.
+
+A Couple of Relationship Types
+
+Before you get going too far into more diagramming concepts, I want to explore two types of relationships: identifying and non-identifying.
+
+Identifying Relationships
+
+For some of you, I'm sure the term identifying relationship brings back memories of some boyfriend or girlfriend you've had in the past who got just a little too possessive—this is not that kind of relationship. Instead, you're dealing with the relationships that are defined by foreign keys.
+
+An identifying relationship is one where the column or columns (remember, there can be more than one) being referenced (in the parent table) are used as all or part of the referencing (child) table's primary key. Since a primary key serves as the identity for the rows in a table, and all or part of the primary key for the child table is dependent on the parent table, the child table can be said to, at least in part, be "identified" by the parent table.
+
+Non-Identifying Relationships
+
+Non-identifying relationships are those that are created when you establish a foreign key that does not serve as part of the referencing (child) table's primary key. This is extremely common in situations where you are referencing a domain table—where essentially the sole purpose of the referenced table is to limit the referencing field to a set list of possible choices.
+
+The Entity Box
+
+One of the many big differences you'll see in both IE and IDEF1X versus SQL Server's own brand of diagramming comes in the entity box. The entity box, depending on whether you're dealing with logical or physical models, equates roughly to a table. By looking over the entity box, you should be able to easily identify the entity's name, primary key, and any attributes (effectively columns) that entity has.
In addition, the diagram may expose other information such as the attribute's data type or whether it has a foreign key defined for it. As an example, consider the entity box in Figure 5.1.
+
+Figure 5.1
+
+The name of our entity is kept on the top outside the box. Then, in the top area of the overall box, but in a separate box of its own, you have the primary key (you'll look at an example with more than one column in the primary key shortly), and last, but not least, come the attributes of the entity.
+
+Take a look at a slightly different entity (Figure 5.2).
+
+Figure 5.2
+
+Several new things appear:
+
+ * The data types (I've turned on the appropriate option).
+ * Foreign keys (if any—again I've turned on the option to make this show).
+ * You have multiple columns in the primary key (everything above the line is part of the primary key).
+ * This time, the entity is rounded on the corners. This tells you that this table is identified (remember identifying relationships?) by at least one other table.
+
+Depending on the ER tool, the data types can be defined right within the ER diagram. Also, as you draw the lines that form your relationships (you'll look at those shortly), you are able to define foreign keys, which can also be shown. For most available ER tools, you can even tell the tool to automatically define the referenced field(s) in the foreign key relationship as being part (or possibly all) of the primary key in the referencing table.
+
+The Relationship Line
+
+There are two kinds of relationship lines, and they match 100 percent with our relationship types:
+
+A solid line indicates an identifying relationship:
+
+______________________________
+
+A broken or dashed line indicates a non-identifying relationship:
+
+------------------------------
+
+Again, an identifying relationship is one where the column that is referencing another table serves as all or part of the primary key of the referencing table. In a non-identifying relationship, the foreign key column has nothing to do with the primary key in the referencing table.
+
+Terminators
+
+Ahh, this is where things become slightly more interesting. The terminators we're talking about here are, of course, not the kind you'd see Arnold Schwarzenegger play in a movie—they are the end caps that we put on our relationship lines.
+
+The terminators on our lines will communicate as much about the nature of our database as the entities themselves will—maybe more. They are the thing that will tell you the most information about the true nature of the relationship, including the cardinality of the relationship.
+
+Cardinality is, in its most basic form, the number of records on both sides of the relationship. When you say it is a one-to-many relationship, then you are indicating cardinality. Cardinality can, however, be much more specific than the zero, one, or many naming convention that you use more generically. Cardinality can address specifics, and is often augmented in a diagram with two numbers and a colon, such as:
+
+ * 1:M
+ * 1:6 (which, while meeting a one-to-many criterion, is more specific and says there is a maximum of 6 records on that side of the relationship).
+
+Let's walk through a couple of the parts of a terminator and examine what they mean.
+
+Just as a reminder, the terminators that follow are the ones from the IE diagramming methodology. As I have indicated, there is another diagramming standard that is in widespread use (though I see it much less than IE) called IDEF1X.
While its entity boxes are much like IE's, its terminators on the relationship lines are entirely different.
+
+The top half of the terminator shown in Figure 5.3 indicates the first half of our relationship. In this case, we have a zero. The bottom half indicates the second half of our relationship—in this case, a many. In this example, then, we have a zero, one, or many side of a relationship.
+
+Figure 5.3
+
+In Figure 5.4, you're not allowing a zero at this end of the relationship—this is a one or many end to a relationship.
+
+Figure 5.4
+
+In Figure 5.5, you're back to allowing a zero at this end of the relationship, but you are now allowing a maximum of one. This is a zero or one side of a relationship.
+
+Figure 5.5
+
+And last, but not least, you have Figure 5.6. This one is pretty restrictive—it's simply a "one" (no more, no less) side of a relationship.
+
+Figure 5.6
+
+Since it's probably pretty confusing to look at these just by themselves, take a look at a couple of example tables and relationships (Figure 5.7).
+
+Figure 5.7
+
+Figure 5.7 is a diagram that shows two tables that support the notion of just one logical entity—an order. You have an Orders table to keep track of information that is global to the order (this has just a CustomerNo, but it might also contain things like a shipping address, a date of the order, a due date, and so on). You also have an OrderDetails table to track the individual line items on this order. The diagram depicts not only your Orders and OrderDetails tables but also the one (the Orders side) to zero, one, or many (the OrderDetails side) relationship between the two tables. The relationship is an identifying relationship (solid, rather than dashed line), and the relationship is called OrderHasDetails.
+
+In Figure 5.8, you add in a Products table.
+
+Figure 5.8
+
+This new relationship is very similar to the relationship that you already looked at. It is again a one (Products this time) to zero, one, or many (OrderDetails again) relationship, but this one is non-identifying (as represented by the broken line). The IE diagram indicates that, for this table, PartNo is an Inversion Entry, or an index that is not associated with anything other than a foreign key. The Inversion Entry has been added because it usually makes sense to have an index on a field that is a foreign key (since it is a frequent target of lookups).
+
+By looking at all three together, you can see that there is a many-to-many relationship between Orders and Products by virtue of their relationship through the OrderDetails table.
+
+Note that an Inversion Entry does not have to be associated with anything at all—it just happened to be associated with a foreign key in this particular case. An Inversion Entry is essentially any index that is not unique or associated with a primary key.
+
+As I've indicated before, you are still really only scratching the surface of the different information that your ER diagrams can convey. Still, as you look later in the chapter at the SQL Server diagramming tools, you will be able to see that the more accepted methodologies out there have an awful lot more information to convey than the included tools do. In addition, just the nature of how tables are displayed makes information such as keys more visible and easier to read.
+
+Logical versus Physical Design
+
+In your database work, you may have already heard about the concepts of logical versus physical models.
In this section, we'll be exploring the differences between the two.
+
+The physical model is one that's probably pretty easy to grasp. It is essentially what you have been working with up to this point in the book. You can think of anything that you can perform a CREATE statement on as being part of the physical model. Indeed, if you can run any statement in SQL Server against it at all, then it must be part of the physical model.
+
+That being said, a logical model is a means to a number of different things—the physical model in particular. This means that, as you work on the logical model, you are working your way toward being able to generate DDL (Data Definition Language—or things like CREATE, ALTER, and DROP statements). Think of the logical model as being like the planning stages for an artist. The artist figures out what to paint, gets out the paints and brushes, and picks out an appropriately sized canvas, but he hasn't painted anything yet. The physical model is the actual painting. The painting is, of course, what everyone sees and notices, but the painting couldn't exist without the decision of what to paint and the gathering of the paints and other supplies needed. Likewise, the best physical models are generally put together as a progression from a solid logical model.
+
+Purpose of a Logical Model
+
+The first thing to understand about logical models is that they have somewhat different goals than physical models do. A logical model does several things for you:
+
+ * Allows you to begin to build abstracts of complex, data-related business issues as well as provide a high-level effort at identifying your entities
+ * Allows you to use these abstracts to effectively communicate business rules and content as they relate to data
+ * Represents the purest form of the data (before you start introducing the realities of what will really work)
+ * Serves as a major piece of documentation in the data requirements portion of your project
+
+Because logical models aren't strictly rooted in the exact syntax to create the database, they give you a flexibility that you can't obtain from a physical model. You can attach dialog and rules to the logical model regardless of whether your particular RDBMS will support those rules or not. In short, it allows you to squeeze in all the facts before you start paring down your design to a specific implementation.
+
+What's nice about this is that logical models allow you to capture all of your data rules in one place regardless of where each rule will actually be implemented. You will frequently run into situations where you cannot sensibly implement your rules in the database. The rules in question may be data related, but due to some constraint or requirement, you need to implement them using more procedural code in your client or in some form of middle tier. With logical models, you go ahead and include the data-related rules anyway.
+
+Regardless of its source, you include all data-related information in a logical design to create one or more abstracts of the data in your system. These abstracts can then be used as a representation to your customer of what you really intend to store and what rules you believe you have captured. Using such a representation early (and often) can save valuable time and money in your projects by opening extra doors of communication. Even a customer who is not very data savvy can often look at the highest level diagrams and say things like "Where are the purchase requisitions?"
Usually, you have some handy-dandy explanation of why you called them something else and you can point to them on the diagram—other times, however, you find yourself uttering that most fearsome of words—"Oops!" I don't know about you, but I'd rather utter that word in the first weeks of a project than in the first weeks of deployment. Logical modeling, when properly shared with the customer, can help avoid those deployment-time Oops statements.
+
+I can't stress enough the importance of sharing your logical design (there had better be one!) with your customer both early and often. With a little education of the customer in how to read your logical model (this should also include good documentation on the cause and purpose of the entities and relationships of the model), you can save a fortune in both time and money.
+
+I haven't met a developer with any real experience who hasn't, at least once (and probably far more often than that), learned the hard way about the cost of late changes to your system. Changing code is very expensive, but that typically doesn't even begin to touch what happens when you need to change your database late in a project. If you haven't done a good job of abstracting your database, then every change you make to your database is going to cascade through tons of code. In other words, one little change in your database can potentially cost several hundred or even thousands of changes (depending on the size of the system) in the code that accesses the database.
+
+In short, communication is everything, and logical modeling should be a huge part of your tool set for communicating with your customer.
+
+Parts of a Logical Model
+
+A logical model contains three major parts:
+
+ * Structure
+ * Constraints
+ * Rules
+
+The combination of these three should completely describe the requirements of the data in your system, but they may not translate entirely to the physical model. Some of the issues identified in the logical model may need to be implemented in some procedural form (such as in a middle-tier component). Other times, the entire logical model can be implemented through the various features of your RDBMS.
+
+This is a really important point, and I want to stress it again—just because it's in your logical model doesn't mean that it will be in your physical database. A logical model should take into account all of your data requirements—even those that are not possible to implement in your RDBMS (for example, data that you might be retrieving from a third-party source—perhaps in an XML document or some other storage medium). Having everything in your logical model allows you to plan the physical design in such a way that you can be sure that you have addressed all data issues—not just those that will physically reside in the database.
+
+Structure
+
+Structure is that part of the logical design that deals with the concept of actually storing the data. When you deal with the structure of the database, you're talking about entities—most of which will translate to tables that will store your data—and the particular columns you are going to need to maintain the atomicity of your data.
+
+Constraints
+
+Constraints, from a logical model standpoint, are a bit broader than the way that you've used the word constraint up until now. Prior to now, when you used the word constraint, you were talking about a specific set of features to limit data to certain values.
From a logical standpoint, a constraint is anything that defines the "what" question for our data—that is, what data is valid. A logical model includes constraints, which is to say that it includes things like:
+
+ * Data types (notice that this is really a separate thought from the notion that a column needs to exist or what the name of that column should be).
+ * Constraints in the form you're used to up until now—that is, CHECK constraints, foreign keys, or even primary keys and UNIQUE constraints (alternate keys). Each of these provides a logical definition of what data can exist in our database. This area would also include things like domain tables (which you would reference using foreign keys)—which restrict the values in a column to a particular "domain" list.
+
+Rules
+
+If constraints were the "what" in our data, then rules are the "when and how much" in our data.
+
+When we define logical rules, we're defining things like "Do we require a value on this one?" (which equates to "Do we allow nulls?") and "How many of these do we allow?" (which defines the cardinality of our data—do we accept one or many?).
+
+It's worth noting yet again that any of these parts may not be implemented in the physical part of your database—we may decide that the restrictions we want to place on things will be handled entirely at the client. Regardless of where the requirement is implemented, it should still be part of our comprehensive logical data model. It is only when we achieve this complete modeling of our data that we can really know that we have addressed all the issues (regardless of where we addressed them).
+
+Dealing with File-Based Information Via Classic BLOBs
+
+BLOBs. You probably haven't seen enough of them to hate them yet. Whether that's a "yet" or not largely depends on whether or not you need to support backward compatibility.
+
+Back in SQL Server 2005, Microsoft added some new data types (varchar(max), nvarchar(max), and varbinary(max)) that greatly simplify dealing with binary large objects—or BLOBs. SQL Server 2008 adds yet another option to the mix in the form of supporting a special file-level storage option called filestreams (these are a lot more complex, and require a very cohesive design effort with your client-side coders as well as special network considerations). Other than a quick glance at them, we'll largely defer the discussion of filestreams to the next chapter (advanced data structures) and our more advanced performance design chapter (Chapter 21).
+
+When used with a compatible data access model (ADO.NET 2.0 or higher), you can access BLOB data through the more standard methods (that is, without using filestreams) as though it were the same as its smaller base data type (varchar, nvarchar, or varbinary). For those of you still needing to deal with backward compatibility issues, you'll have to use the older (and even slower) "chunking" method to access your data. Regardless of which access method you're using, BLOBs are slow—very slow and big. Using the new access methods can really help BLOB handling performance though, so let me encourage you to migrate as soon as possible to at least SQL Server 2005 as your bottom level of support.
+
+The oldest version of SQL Server you're supporting is the critical factor—not the data access method—when using the newer BLOB data types. SQL Server will automatically translate the newer data types to appear like the old ones when dealing with the older connectivity methods.
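+
+To see just how unexceptional the newer types look in practice, here's a minimal sketch (the table and column names are invented for illustration). The max types declare like their smaller cousins but hold up to roughly 2GB, and ordinary INSERT and SELECT statements work against them directly:
+
+CREATE TABLE DocumentStore (
+    DocumentID int            NOT NULL PRIMARY KEY,
+    DocText    nvarchar(max)  NULL,   -- large Unicode text
+    DocImage   varbinary(max) NULL    -- arbitrary binary data
+);
+
+INSERT INTO DocumentStore (DocumentID, DocText)
+VALUES (1, N'A document body of essentially any length goes here...');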
+
+Note, however, that use of filestreams does require very specific client-side code.
+
+BLOBs are nice in the sense that they let you break the 8K barrier on row size (BLOBs can be up to about 2GB in size). The first problem is that they can be clumsy to use under the old data types and access methods. Perhaps the larger problem, however, is that they are painfully slow (I know, I'm repeating myself, but I suspect I'm also making a point here). In the race between the BLOB and the tortoise (the sequel to the tortoise and the hare), the BLOB won only after the tortoise stopped for a nap.
+
+Okay, okay, so I've beaten the slow thing into the ground. Indeed, there have been substantial performance improvements in BLOB handling over the years, and the difference is not what it used to be, but at the risk of mentioning it one too many times, BLOBs are still relatively slow.
+
+All right, so now you've heard me say BLOBs are slow and you still need to store large blocks of text or binary information. Normally, you'd do that using a BLOB—and, with the recent performance improvements in BLOB handling, that's probably best—but you do have the option of doing it another way. You can go around the problem by storing things as files instead.
+
+Okay, so by now some of you have to be asking the question of "isn't a database going to be a faster way of accessing data than the file system?" My answer is quite simply—"Usually not."
+
+There are two ways to do this without going to filestreams. We'll start with the method that has traditionally been implemented, and then we'll talk about another potential way to do it in the .NET era (it requires very application-specific design on your part).
+
+I'm going to warn you right up front that, in order to pull off the typical way of doing this, you need to be planning for it in your client—this isn't a database server–only kind of thing to do. Indeed, you'll be removing most of the work from the database server and putting it into your middle tier and file system. You can start by looking at what you need to do on the server's file system side. The only thing that you need to do is make sure that you have at least one directory to store the information in. Depending on the nature of your application, you may also need to have logic in a middle-tier object that will allow it to create additional directories as needed.
+
+All Windows operating systems have limits on the number of files they can store in one directory. With the 64-bit operating systems out, the maximum number of files per directory has increased such that the maximum isn't so much the issue as raw performance is. (Windows still tends to get very slow in file access as the number of files in a particular directory rises.) As such, you still need to think about how many files you're going to be storing. If it will be many (say, over 500), then you'll want to create a mechanism in the object that stores your BLOB so that it can create new directories either on an as-needed basis, or based on some other logical criteria.
+
+Your business component will be in charge of copying the BLOB information to the file you're going to store it in. If it is already in some defined file format, you're on easy street—just run your language's equivalent to a copy command (with a twist we'll go over shortly), and you're in business. If it is streamed data, then you'll need to put the logic in your component to store the information in a logical format for later retrieval.
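+
+The database side of this design can stay quite simple—the real work lives in the middle tier. As a rough sketch (the table and column names are invented, and the sample path is purely illustrative), the row holds everything relational about the item plus the path the component stored the file under:
+
+CREATE TABLE FileBasedDocuments (
+    DocumentID int          NOT NULL PRIMARY KEY,
+    CustomerNo int          NOT NULL,
+    FilePath   varchar(260) NOT NULL  -- e.g. '\\fileserver\docs\00001.dat'
+);
+
+-- The application retrieves the path, then reads the file itself
+-- from the file system rather than from SQL Server.
+SELECT FilePath
+FROM FileBasedDocuments
+WHERE DocumentID = 1;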
+
+One big issue with this implementation is that of security. Since you're storing the information in a file that's outside of SQL Server's realm, it is also outside of SQL Server's security protection. Instead, you have to rely on your network security.
+
+There are several "Wow, that's scary!" things that should come to mind for you here. First, if someone's going to read data out of the directory that you're storing all this in, doesn't that mean they can see other files that are stored in there? Yes, it does (if you wanted to get really tricky, you could get around this by changing the Windows security for each file, but it would be very tedious indeed—in the case of a Web application, you would need to do something like implementing a DLL on your Web server). Second, since you'd have to give people rights to copy the file into the directory, wouldn't there be a risk of someone altering the file directly rather than using the database (potentially causing your database to be out of sync with the file)? Absolutely.
+
+The answer to these and the many other questions that you could probably come up with lies in your data access layer. (I'm assuming an n-tier approach here.) You can, for example, have the access component run under a different security context than the end user. This means that you can create a situation where the users can access their data—but only when they are using the data access component to do it (they don't have any rights to the directory themselves—indeed, they probably don't even know where the files are stored).
+
+So then, where does SQL Server come into play in all this? It keeps track of where you stored the information in question. Theoretically, the reason why you were trying to store this information in the database in the first place is that it relates to some other information in the row you were going to store it as part of. But instead of saving the actual data in the row in the form of a BLOB, you will now store a path to the file that you saved. The process will look something like this:
+
+1. Determine the name you're going to store it as.
+
+2. Copy the file to the location where you're going to store it.
+
+3. Save the full name and path in a varchar along with the rest of the data for that row.
+
+4. To retrieve the data, run your query much as you would have if you were going to retrieve the data directly from the table, only this time, retrieve the path to where the actual BLOB data is stored.
+
+5. Retrieve the data from the file system.
+
+In general, this approach will run somewhat faster than if you were using BLOBs. There are, however, some exceptions to the rule when using this approach:
+
+ * The BLOBs you are saving are consistently small (less than 8K) in size.
+ * The data is text or some format that MS Search has a filter for, and you want to be able to perform full-text searches against it.
+
+If the size of your BLOBs is consistently less than 8K, then the data may be able to fit entirely on one data page. This significantly minimizes the overhead in dealing with your BLOB. While the file system approach may still be faster, the benefits will be sharply reduced such that it doesn't make as much sense. If you're in this scenario, and speed is everything, then all I can suggest is to experiment.
+
+If you want to perform full-text searches, you're probably going to be better off going ahead and storing the large blocks of text as a TEXT data type (which is a BLOB) in SQL Server.
If the text is stored in a binary format that has an MS Search filter available (or you could write your own if you're desperate enough), then you can store the file in an image data type and MS Search will automatically use the filter to build the full-text index. Don't get me wrong; it's still very possible to do full-text searches against the text in the file, but you're going to have to do substantially more coding to keep your relationships intact if you want non-BLOB data from the same functional row. In addition, you're most likely going to wind up having to program your middle tier to make use of Index Server.
+
+If push comes to shove, and you need to make a full-text search against file system–based information, you could take a look at accessing Index Server via a query directly. SQL Server can issue remote queries such that you can potentially access any OLE DB (a Microsoft-originated data access API—we'll see a bit more about it in Chapter 25) data source. The MS Search service has an OLE DB provider and can be used as the target of a linked server or in an OPENQUERY. The bad news, however, is that performing an Index Server query against an Index Server that is not on the same physical box as your SQL Server really doesn't work. (Feel free to e-mail me if you've found a workaround to this.) The only workaround is to have an Index Server on the system local to SQL Server, but have it catalog files stored on another system. The problem with this is the network chatter during the cataloging process and the fact that it doesn't let you offload the cataloging work (which hurts scalability).
+
+Okay, so that was way #1 (you may recall I said there were two). The second leverages the .NET assembly architecture that was added back in SQL Server 2005. We haven't really gotten to a discussion of .NET integration yet, so we'll keep this fairly high level.
+
+This approach actually leverages many of the same concepts that were used in the middle-tier file access approach. The only real change is in what server or component takes charge of the file access.
+
+With the advent of Common Language Runtime (CLR) integration, we have the ability to create user-defined functions far more complex than those previously possible. As part of that, we have the ability to define table-valued functions that can retrieve data from nearly any base source. Indeed, in Chapter 10 we will take a look at how we can enumerate files in a directory and return them as a table-valued function, but we could just as easily return a varbinary(max) column that contains the file. Under this model, all file access would be performed under whatever network security context we establish for that assembly to run under, but it would only be performed as part of the table-valued function.
+
+It is important to note that the file system–based method mentioned earlier can be considered something of a predecessor to the filestream feature introduced with this release. Filestreams implement a somewhat advanced version of this approach—one that includes coordinated backups among other things. That said, filestreams also add substantial complexity over even this approach—which is why I have deferred detailed discussion of them to the advanced data structures and performance chapters.
+
+Subcategories
+
+Subcategories are a logical construct that provides you another type of relationship (sometimes called a "Supertype" or "Subtype" relationship) to work with.
On the physical side of the model, a subcategory is implemented using a mix of the types of relationships that I've already talked about (you'll see the specifics of that before you're done).
+
+A subcategory deals with the situation where you have a number of what may first seem like different entities but which share some, although not all, things in common.
+
+I think the best way to get across the concept of a subcategory is to show you one. To do this, we'll take the example of a document in a company.
+
+A document has a number of attributes that are common to any kind of document. For example:
+
+ * Title
+ * Author
+ * Date created
+ * Date last modified
+ * Storage location
+
+I'm sure there are more. Note that I'm not saying that every document has the same title, rather that every document has a title. Every document has an author (possibly more than one actually, but, for this example, we'll assume a limit of one). Every document was created on some date. You get the picture—you're dealing with the attributes of the concept of a document, not any particular instance of a document.
+
+But there are lots of different kinds of documents. From things like legal forms (say your mortgage documents) to office memos, to report cards—there are lots of document types. Still, each of these can be considered to be a document—or a subcategory of a document. Consider a few examples.
+
+For our first example, we'll look at a lease. A lease has all the characteristics that we expect to find in our documents category, but it also has information that is particular to a lease. A lease has things like:
+
+ * Lessor
+ * Lessee
+ * Term (how long the lease is for)
+ * Rate (how much per month or week)
+ * Security deposit
+ * Start date
+ * Expiration date
+ * Option (which usually offers an extension at a set price for a set additional term)
+
+The fact that a lease has all of these attributes does not preclude the fact that it is still a document.
+
+We can come up with a few more examples, and I'll stay with my legal document trend—start with a divorce document. It has attributes such as:
+
+ * Petitioner (the person suing for a divorce)
+ * Respondent (the petitioner's spouse)
+ * Separation date
+ * Date the petitioner files for the divorce
+ * Date the divorce was considered "final"
+ * Alimony (if any)
+ * Child support (if any)
+
+We could also have a bill of sale—our bill of sale might include attributes such as:
+
+ * Date of sale
+ * Amount of the sale
+ * Seller
+ * Purchaser
+ * Warranty period (if any)
+
+Again, the fact that divorces and bills of sale both have their own attributes does not change the fact that they are documents.
+
+In each case—leases, divorces, and bills of sale—we have what is really a subcategory of the category of "documents." A document really has little or no meaning without also belonging to a subcategory. Likewise, any instance of a subcategory has little meaning without the parent information that is found only in the supercategory—documents.
+
+Types of Subcategories
+
+Subcategories fall into two separate classifications of their own—exclusive and non-exclusive.
+
+When you refer to a subcategory as simply a "subcategory," then you are usually referring to a subcategory arrangement where you have a record in a table that represents the supercategory (a document in our previous example), and a matching record in at least one of the subcategories.
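+
+In DDL terms, the supertype/subtype arrangement might be sketched like this (a minimal sketch—the names and column choices are invented, and only one subcategory table is shown):
+
+-- Every document, regardless of kind, gets exactly one row here.
+CREATE TABLE Documents (
+    DocumentID  int          NOT NULL PRIMARY KEY,
+    Title       varchar(100) NOT NULL,
+    Author      varchar(50)  NOT NULL,
+    DateCreated datetime     NOT NULL
+);
+
+-- A subcategory table carries only what is particular to its type, and it
+-- borrows its primary key from the supercategory (an identifying relationship).
+CREATE TABLE Leases (
+    DocumentID int         NOT NULL PRIMARY KEY
+        REFERENCES Documents (DocumentID),
+    Lessor     varchar(50) NOT NULL,
+    Lessee     varchar(50) NOT NULL,
+    Rate       money       NOT NULL
+);
+
+-- Divorces and Sales would follow the same pattern as Leases.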
+
+This kind of subcategory is represented with a symbol that appears rather odd as compared to those you've seen thus far (Figure 5.9).
+
+Figure 5.9
+
+Even though there are three subcategories depicted both here and in the document example, don't misconstrue this as being any kind of official limit to the number of subcategories—there isn't one. You could have a single subcategory or 10 of them—it doesn't really make any difference.
+
+Far more common is the situation where you have an exclusive subcategory. An exclusive subcategory works exactly as a plain subcategory does, with only one exception—for every record in the supercategory, there is only one matching record in any of the subcategories. Each subcategory is deemed to be mutually exclusive, so a record to match the supercategory exists as exactly one row in exactly one of the subcategory tables.
+
+The diagramming for an exclusive subtype looks even a little odder yet (Figure 5.10).
+
+Figure 5.10
+
+Keeping Track of What's What—Implementing Subcategories
+
+The thing that's really cool about subcategories is that they allow you to store all of a similar construct in one place. Before learning this concept, you would have taken one of two approaches to implement our document model:
+
+ * Add all of the attributes into one table and just leave the columns null for the information that doesn't fit the specific type of document you're interested in for a given record.
+ * Have separate tables for each type of document. The columns that are essentially the same between document types would be repeated for each table (each table stores its own copy of the document information as it applies to the records in that particular table).
+
+Using the notion of a subcategory, you can now store all documents, regardless of type, such that they all begin in one place. Any query that you have that is looking for information about all the documents in your system can now run against just one table instead of having to do something like using the UNION operator on three (maybe more, maybe fewer) different tables. It probably goes without saying, then, that implementing this kind of situation using a subcategory can provide a serious performance enhancement over the other options.
+
+There is a catch though (you knew there would be, right?)—you need to provide some mechanism to point to the rest of the information for that document. Your query of all documents may provide the base information on the specific document that you're looking for, but when you want the rest of the information for that document (the things that are unique to that document type), then how does your application know which of the subcategory tables to search for the matching record in? To do this, just add a field to your supercategory that indicates what the subcategory is for that record. In our example, you would probably implement another column in our Documents table called "DocumentType." From that type, you would know which of your other tables to look through for the matching record with more information. Furthermore, you might implement this using a domain table—a table to limit the values in your DocumentType column to just those types that you have subcategories for—and a foreign key to that table.
+
+Keep in mind that while what I'm talking about here is the physical storage and retrieval of the data, there is no reason why you couldn't abstract this using either a sproc or a series of views (or both).
For example, you could have a stored procedure call that would pull together the information from the Documents table and then join to the appropriate subcategory.
+
+Oh—for those of you who are thinking, "Wait, didn't that other text that I read about n-tier architecture say to never use sprocs?" Well, that's a garbage recommendation in my not so humble opinion (you'll look more at sprocs in Chapter 10). It's foolish not to use the performance tools available—just remember to access them only through your data access layer—don't allow middle-tier or client components to even know your sprocs exist. Follow this advice, and you'll get better performance, improved overall encapsulation, shorter dev times, and, even with all that, still live within the real theory of a separate data access layer that is so fundamental to n-tier design.
+
+In addition to establishing a pointer to the type of document, you also need to determine whether you're dealing with a plain subcategory or an exclusive subcategory. In our document example, you have what should be designed as an exclusive subcategory. You may have lots of documents, but you do not have documents that are both a lease and a divorce (a non-exclusive subcategory would allow any mix of our subcategories). Even if you had a lease with a purchase option, the bill of sale would be a separate document created at the time the lease option was exercised.
+
+Figure 5.11 shows an implementation of our logical model.
+
+Okay, so you have an entity called Documents. These documents are of a specific type, and that type is limited to a domain—the boundaries of that domain are set by DocumentType. In addition, each of the types is represented by its own entity—or subcategory. The symbol in the middle of it all (the half-circle with an "X" through it) tells you that the three subcategories are exclusive in nature (you have one, and only one, for each instance of a document).
+
+Figure 5.11
+
+This is an excellent place to step back and reflect on what your logical model can do for you. As I discussed earlier in the chapter, our logical model, among other things, provides you with a way to communicate the business rules and requirements of our data. In this case, with a little explanation, someone (a customer perhaps?) can look at this and recognize the concept that you are saying that Leases, Divorces, and Sales are all variations on a theme—that they are really the same thing. This gives the viewer the chance to say, "Wait—no, those aren't really the same thing." Or perhaps something like, "Oh, I see—you know, you also have will and power-of-attorney documents—they are pretty much the same, aren't they?" These are little pieces of information that can save you a bundle of time and money later.
+
+Getting Physical—The Physical Implementation of Subcategories
+
+On the physical side of things, there's nothing quite as neat and clean as it looks in the logical model. Indeed, all you do for the physical side is implement a series of one-to-zero or -one relationships. You do, however, draw them out as being part of a single, multi-table relationship (Figure 5.12).
+
+Figure 5.12
+
+The only real trick in the game occurs if you have an exclusive subcategory (which is actually the case much more often than not). In this case, you also need to put some logic into the subcategory tables (in the form of triggers) to ensure that, if any row is to be inserted, there is not already another matching row in one of the other subcategories.
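+
+As a rough sketch (building on the invented tables from earlier, and assuming Divorces and Sales have been created along the same lines as Leases), such a trigger might look like this:
+
+CREATE TRIGGER trLeasesExclusive
+ON Leases
+FOR INSERT
+AS
+BEGIN
+    -- Reject the insert if the document already lives in another subcategory.
+    IF EXISTS (SELECT 1 FROM Divorces d
+                   JOIN inserted i ON d.DocumentID = i.DocumentID)
+       OR EXISTS (SELECT 1 FROM Sales s
+                   JOIN inserted i ON s.DocumentID = i.DocumentID)
+    BEGIN
+        RAISERROR('Document already exists in another subcategory.', 16, 1);
+        ROLLBACK TRANSACTION;
+    END;
+END;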
+
+That is exactly what the sketch above does for Leases: it queries the Divorces and Sales tables for records with the same DocumentID and, if one is found, rejects the inserted record with an appropriate error message and a ROLLBACK. Divorces and Sales would each need an equivalent trigger of their own.
+
+Adding to Extensibility with Subcategories
+
+Subcategories are one of those concepts that can make a huge difference in the success of your database design. Used when appropriate, they can cut significant time off your queries and significantly simplify pulling together aggregate information for related but different pieces of information. Yet these aren't the only benefits to subcategories.
+
+Subcategories can provide a pathway to making your database more extensible. If you need to add another subcategory, the only queries you need to deal with are those that are specific to your new subcategory. Any of your queries that worked only with the parent table will still work fine—what's more, they'll pick up the information on your new subcategory without any changes!
+
+In short, you're picking up two major scalability benefits:
+
+ * The information for your supercategory (documents in the example) can be scanned from just one table rather than using a UNION operator. This means fewer joins and faster relative query performance—especially as your tables grow larger or you have more and more subcategories.
+ * Adding new subcategories often does not take as much development time as it would have if you were developing the framework for those categories from scratch.
+
+Now, just as with most things, you do need to keep in mind one downside—subcategories can create a bottleneck at the parent table. Every query that you run against all the tables and data involved in the overall set of categories is probably going to need to access the parent table. Think about the locking implications there. (If you're new to locking considerations, they are discussed in full in Chapter 12.) If you are not careful about your index and query strategies, this can lead to some very bad blocking and/or deadlocking problems. That said, with intelligent planning and query writing, this is usually not a problem. Also, if the sheer size of the parent table becomes a problem, SQL Server now gives us the option of using partitioned tables to scale to larger sizes.
+
+Database Reuse
+
+This is almost never thought of, but you can create databases that facilitate reusability. Why do I say that it's almost never thought of? Well, just trust me on this—developers think of things like reusable components. Things such as objects to validate credit cards, distribute mail, and stream binary information in and out are all things that you would immediately think about placing in a repository and using over and over again. For whatever reason, however, databases just don't seem to get thought of in that way.
+
+Perhaps one reason for this is that databases, by definition, store data. Data is normally thought of as being unique to one company or industry and, most of all, as being private. I'm guessing that you then automatically think of the storage container for that data as also being personal—who knows?
+
+Contrary to popular belief, however, databases can be built to be reusable. Surprisingly, to do this you apply a lot of the same concepts that make code components reusable—most of all compartmentalization and the use of common interfaces.
+
+Just remember to make sure you have a really good fit before you try to reuse an existing database structure.
As with most reuse I've seen in programming, it's very possible for your reuse to turn into a situation where you're using the wrong tool for the job, and things can actually become even more expensive than they would have been if you had written things from scratch to begin with.
+
+Candidates for Reusable Databases
+
+The databases that have the best chance at being reusable are those that can be broken up into separate subject areas (much as components are usually broken up into functional groupings). Each subject area is kept as generic as is feasible. An example would be something like an accounting database. You could have separate subject areas that match up with the functional areas in accounting:
+
+ * Purchasing
+ * Accounts receivable (which in turn may be broken up into invoicing and cash receipts)
+ * Inventory
+ * Accounts payable
+ * General ledger
+ * Cash management
+
+The list could go on. You can also take the approach down to a more granular level and create many, many databases, down to the level of things like persons, commercial entities (ever noticed how similar customers are to vendors?), orders—there are lots of things that have base constructs that are used repeatedly. You can roll these up into their own "mini-database," and then plug them into a larger logical model (tied together using sprocs, views, or other components of your data access layer).
+
+How to Break Things Up
+
+This is where the logical versus physical modeling really starts to show its stuff. When you're dealing with databases that you're trying to make reusable, you often have one logical database (containing all the different subject areas) that spans many physical databases. Sometimes you'll choose to implement your logical design by referencing each of the physical implementations directly. Other times you may choose an approach that does a better job of hiding the way that you've implemented the database—you can create what amounts to a "virtual" database in that it holds nothing but views that reference the data from the appropriate physical database.
+
+Let me digress long enough to point out that this process is essentially just like encapsulation in object-oriented programming. By using the views, you are hiding the actual implementation of your database from the users of the view. This means that you can remove one subject area in your database and replace it with an entirely different design—the only trick in doing this is to map the new design to your views—from that point on, the client application and users are oblivious to the change in implementation.
+
+Breaking things up into separate physical databases and/or virtualizing the database places certain restrictions on you, and many of these restrictions contribute to the idea of being able to separate one subject area from the whole, and reuse it in another environment.
+
+Some of the things to do include:
+
+ * Minimize or eliminate direct references to other functional areas. If you've implemented the view approach, connect each physically separate piece of the database to the logical whole only through the views.
+ * Don't use foreign key constraints—where necessary, use triggers instead. Triggers can span databases; foreign key constraints can't.
+
+The High Price of Reusability
+
+All this reuse comes at a price. Many of the adjustments that you make to your design in order to facilitate reuse have negative performance impacts.
Some of these include:
+
+ * Foreign key constraints are faster than triggers overall, but triggers are the only way to enforce referential integrity that crosses database boundaries.
+ * Using views means two levels of optimization run on all your queries (one to get at the underlying query and mesh that into your original query, another to sort out the best way to provide the end result)—that's more overhead, and it slows things down.
+ * If you're not using the virtual database approach (one database that has views that map to all the other databases), maintaining user rights across many databases can be problematic.
+
+In short, don't expect things to run as fast unless the payoff comes from splitting the data across more servers than you could use with the single-database model.
+
+Reusing your database can make lots of sense in terms of reduced development time and cost, but you need to balance those benefits against the fact that you may suffer to some degree in the performance category.
+
+De-Normalization
+
+I'm going to keep this relatively short, since this tends to get into fairly advanced concepts, but remember not to get carried away with the normalization of your data.
+
+As I stated early in this chapter, normalization is one of those things that database designers sometimes wear like a cross. It's somehow turned into a religion for them, and they begin normalizing data for the sake of normalization rather than for the good things it does to their database. Here are a couple of things to think about in this regard:
+
+ * If declaring a computed column or storing some derived data is going to allow you to run a report more effectively, then by all means put it in. Just remember to take into account the benefit versus the risk. (For example, what if your "summary" data gets out of sync with the data it can be derived from? How will you determine that it happened, and how will you fix it if it does happen?)
+ * Sometimes, by including just one (or more) de-normalized column in a table, you can eliminate or significantly cut down the number of joins necessary to retrieve information. Watch for these scenarios—they actually come up reasonably frequently. I've dealt with situations where adding one column to one commonly used base table cut a nine-table join down to just three, and cut the query time by about 90 percent in the process.
+ * If you are keeping historical data—data that will largely go unchanged and is just used for reporting—then the integrity issue becomes a much smaller consideration. Once the data is written to a read-only area and verified, you can be reasonably certain that you won't have the kind of "out of sync" problems that are one of the major things that data normalization addresses. At that point, it may be much nicer (and faster) to just "flatten" (de-normalize) the data out into a few tables, and speed things up.
+ * The fewer tables that have to be joined, the happier your users who do their own reports are going to be. The user base out there continues to get more and more savvy with the tools they are using. Increasingly, users are coming to their DBA and asking for direct access to the database to be able to do their own custom reporting. For these users, a highly normalized database can look like a maze and become virtually useless. De-normalizing your data can make life much easier for these users.
+
+All that said, if in doubt, normalize things. There is a reason why that is the way relational systems are typically designed.
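+
+As for that first bullet, a persisted computed column is often the safest way to get the reporting win without the out-of-sync risk, since SQL Server maintains the value for you. A minimal sketch (the names are invented for illustration):
+
+CREATE TABLE OrderTotals (
+    OrderID    int          NOT NULL PRIMARY KEY,
+    SubTotal   money        NOT NULL,
+    TaxRate    decimal(4,3) NOT NULL,
+    -- Derived data, but the engine keeps it in sync automatically.
+    GrandTotal AS (SubTotal * (1 + TaxRate)) PERSISTED
+);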
When you err on the side of normalizing, you are erring on the side of better data integrity, and on the side of better performance in a transactional environment.
+
Partitioning for Scalability
+
Beginning with SQL Server 2000, SQL Server picked up the marvelous ability to create one logical table from multiple physical tables—partitioned views. That is, the data from one logical table is partitioned such that it is stored in a separate, well-defined set of physical tables. But the notion of partitioning your data has been around a lot longer than partitioned views have been. Indeed, keeping your main accounting system on one server and your order entry and inventory systems on another is a form of partitioning—you are making sure that the load of handling the two activities is spread across multiple servers. SQL Server 2005 took an additional step by adding what are called partitioned tables.
+
Partitioned tables are a bit different from partitioned views in a way that is implied in their name—they truly remain a table throughout. Whereas a partitioned view could not support some of the functionality found in tables (constraints, defaults, identity columns, and so on), a partitioned table supports all of these.
+
There is, of course, a catch—partitioned tables are limited to just one server (it is a means of separating a table across multiple filegroups and, therefore, drive volumes). Note that the limitation to one server doesn't mean you're limited to one physical storage device—there is nothing stopping you from linking multiple storage devices (including multiple SANs) to the one SQL Server.
+
Partitioned tables do not allow unique indexes on columns that are not part of the partitioning key—this can be critical when the column you want to partition on is not the one you want to use as a primary key, or when you have other columns that need unique constraint enforcement.
+
Partitioned views are still an option when the load is such that you need to span multiple servers. For purposes of this chapter, we're going to stick with the basic notions of partitioning that apply to both the view and table models.
+
Regardless of which partitioning method you're using, the concepts are pretty much the same. You utilize one or more columns in the logical table as a divider to physically separate your data. This allows you to use multiple I/O pathways and even multiple servers to process your query for you. The question of just how to partition your data should be a very big one. The tendency is going to be to take the hyper-simplistic approach and just divide things up equally based on the possible values in a partitioning column. This approach may work fine, but it is also a little shortsighted for two big reasons:
+
 * Data rarely falls into nice, evenly distributed piles. Often, predicting the distribution requires a lot of research and sampling up front.
+
 * It fails to take into account the way the data will actually be used once stored.
+
The way that you partition your data does a lot more than determine the volume of data that each partition will receive—much more importantly, it makes a positively huge difference in how well your overall system is going to perform. Keep in mind:
+
 * Tables rarely live in a bubble. Most of the time you are going to be joining data from any given table with other data in the system—is the way the "other" data is partitioned compatible (from a performance perspective)?
+
 * Network bandwidth tends to be a huge bottleneck in overall system performance—how are you taking that into account when designing your partitions? This is not that big of a deal if you're dealing with a partitioned table scheme (which will be local to just one server) but can be huge for a partitioned view model.
+
So, with all this in mind, here are a couple of rules for you:
+
 * If using partitioned views to spread data across servers, keep data that will be used together stored together. That is, if certain tables are going to be used together frequently in queries, then try to partition those tables such that data that is likely to be returned as part of a query will most likely reside on the same server. Obviously, you won't be able to make that happen 100 percent of the time, but, with careful thought and recognition of how your data gets used, you should find that you can arrange things so that most queries will happen local to just one server. For example, for a given order, all the related order detail rows will be on the same server.
+
 * When you design your application, you should ideally make it partition aware—that is, you should code the routines that execute the queries such that they know which server most likely has their data. The data may be broken out across multiple machines—wouldn't it be nice if the database server your application made the request to was the right one from the start, and there was no need for the request to be forwarded to another server?
+
If you've gotten as far as deciding that you need to go with a partitioned system, then you must really have one heck of a load you're planning on dealing with. How you partition your data is going to have a huge impact on how well your system deals with that load. Remember to take the time to fully plan out your partitioning scheme. After you think you've decided what you're going to do—Test! Test! Test!
+
The SQL Server Diagramming Tools
+
You can open up SQL Server's built-in tools by navigating to the Diagrams node of the database you want to build a diagram for (expand your server first, then the database). Some of what you are going to see you'll find familiar—some of the dialogs are the same as you saw in Chapter 4 when you were creating tables. The SQL Server diagramming tools don't give you all that many options, so you'll find that you'll get to know them fairly quickly.
+
You can start by creating your first diagram: right-click the Diagrams node underneath the AdventureWorks database and choose the New Database Diagram option.
+
You may (if it's the first time you've tried to create a diagram) see a dialog come up warning you that some of the objects needed to support diagramming aren't in the database and asking if you want to create them—choose Yes.
+
SQL Server starts you out with an Add Table dialog (see Figure 5.13) that lists the available tables you can add to your diagram.
+
Figure 5.13
+
Select the following tables (remember to hold down the control key to select more than one table):
+
 * Address
+
 * Customer
+
 * CustomerAddress
+
 * SalesOrderHeader
+
 * SalesOrderDetail
+
Then click Add. After a brief pause while SQL Server draws all the tables you selected, click the Close button. SQL Server has added our tables to the diagram, as shown in Figure 5.14.
+
I've rearranged my layout slightly from what SQL Server came up with by default to make more of it fit into this book.
Depending on your screen resolution, it may be difficult to see the entire diagram at once due to the zoom. To pull more of the tables into view, change the zoom setting in the toolbar. + +SQL Server enumerates through each table you have said you want to add and analyzes what other objects are associated with those tables. The various other items you see beyond the table itself are some of the many other objects that tie into tables—primary keys, foreign keys. + +So, having gotten a start, I'll use this diagram as a launching point for explaining how the diagramming tool works and building a few tables here and there. + +Figure 5.14 + +Tables + +Each table has its own window you can move around. The primary key is shown with the little symbol of a key in the column to the left of the name like the one next to the CustomerID. This is just the default view for the table; you can select from several others that allow you to edit the very make-up of the table. To check out your options for views of a table, right-click the table that you're interested in. The default is column names only, but you should also take an interest in the choice of Custom; this or "standard" is what you would use when you want to edit the table from right within the diagram (very nice!). + +Adding Tables + +You can add a new table to the diagram in one of two ways: + + * If you have a table that already exists in the database (but not in the diagram), but now you want to add it to your diagram, you simply click the Add Table button on the diagramming window's toolbar, or right-click anywhere in the diagram and choose Add Table. You'll be presented with a list of all the tables in the database; just choose the one that you want to add, and it will appear along with any relationships it has to other tables in the diagram. + * If you want to add a completely new table, click the New Table button on the diagramming window's toolbar or right-click in the diagram and choose New Table. You'll be asked for a name for the new table, and the table will be added to the diagram in Column Properties view. Simply edit the properties to have the column names, data types, and so on that you want, and you have a new table in the database. + +Let me take a moment to point out a couple of gotchas in this process. + +First, don't forget to add a primary key to your table. SQL Server does not automatically do this, nor does it even prompt you. This is a somewhat less than intuitive process. To add a primary key, you must select the columns that you want to have in the key. Then right-click and choose Set Primary Key. + +Next, be aware that your new table is not actually added to the database until you choose to save—this is also true of any edits that you make along the way. + +Go ahead and quickly add a table to see how this works and set you up for some later examples. + +First, right-click anywhere in the diagramming pane, and choose New Table. You'll be prompted for a table name—call this one CustomerNotes. Now add just three columns as shown in Figure 5.15. + +Figure 5.15 + +Notice the asterisk in the title bar for the table—that means there are unsaved changes to this table (specifically, the table has yet to be saved at all). Go ahead and save the diagram, and that will also create the table in the physical database. You now have a table with three NOT NULL columns. There is not, as yet, any primary key for this table. (We'll deal with that in our section on adding constraints.) 
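+
Incidentally, if you'd rather see what that save just did under the covers, the equivalent T-SQL looks something like the following sketch. The table name and the NOT NULL settings come straight from the example; the column names and data types are my own guesses, since the actual ones appear only in Figure 5.15:
+
CREATE TABLE CustomerNotes
(
   CustomerID   int            NOT NULL,   -- matches the CustomerID used elsewhere in AdventureWorks
   NoteDate     datetime       NOT NULL,   -- assumed name and type
   Note         nvarchar(255)  NOT NULL    -- assumed name and length
);
+
The point is simply that the diagram's save generates ordinary DDL like this for you behind the scenes.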
+
Dropping Tables from Either the Database or Diagram
+
Dropping tables is a bit confusing since there is a subtle distinction between deleting them from the diagram versus deleting them from the database. You can drop a table from the diagram in either of two ways:
+
 * Select the table and press your Delete key.
+
 * Select the table and choose the Remove from Diagram button on the toolbar.
+
To entirely drop the table from the database, you have three choices:
+
 * Select the table, and choose Edit⇒Delete Tables from Database
+
 * Select the table, and click the Delete Tables from Database icon on the toolbar
+
 * Right-click the table header, and choose Delete Tables from Database
+
Note that, while deleting a table from the diagram does not become permanent until you save the diagram, deleting it from the database happens immediately after you confirm the deletion.
+
Dealing with Constraints
+
If you're using the diagram tools at all, you'll want to do more than create just the basic table—you'll want to be able to establish constraints as well. The diagramming tools make these relatively easy.
+
Primary Keys
+
This really couldn't be much simpler. To create a primary key, just select the column(s) you want to participate in the key (again, hold down the control key if you need to select multiple columns), right-click and select Set Primary Key, as shown in Figure 5.16.
+
Figure 5.16
+
I'm adding a primary key to the CustomerNotes table we created in the previous section. As you choose the Set Primary Key option, you'll see a key icon added to each of the columns that participate in your key. To change the primary key, just select a new set of columns and again choose Set Primary Key. To remove it, just choose Remove Primary Key from the same menu. (It does not show in my figure, because no primary key had been set yet.)
+
Foreign Keys
+
Foreign keys are nearly as easy as primary keys were—they use a simple drag-and-drop model.
+
In our CustomerNotes example, you'll notice I used CustomerID—this is intended to be the same CustomerID that is used elsewhere in the AdventureWorks database, so it makes sense that you would want a foreign key to the base table for CustomerIDs (Customer). To do this, simply click the CustomerID column in the Customer table, and drag it onto the CustomerNotes table. Management Studio will then give you the dialog in Figure 5.17 to confirm the foreign key you're after.
+
Figure 5.17
+
From here, you can change what the columns are in both the referenced and referencing tables, and even add additional columns if you need to. Click OK, and you move on to the dialog in Figure 5.18, which allows you to set the other properties that go with a foreign key definition, including such things as cascading actions and whether this foreign key should be propagated to any replicated databases you have out there.
+
Figure 5.18
+
To edit the foreign key after you've created it, select it (by clicking it), and you will see its properties in the pane on the right-hand side of the screen.
+
Note that the properties pane is a dockable window, so it's possible you have moved it away from the default right-hand side.
+
To delete a foreign key, simply right-click the relationship and choose Delete Relationships from Database.
+
CHECK Constraints
+
To work on the CHECK constraints for your table, simply right-click the table and choose Check Constraints.
This brings up a dialog that allows you to either create a new constraint or to select from those already defined for the table. After you create a new one or select one of the existing ones, Management Studio brings up a dialog that is not all that different from that used for foreign keys. + +As when you created tables, you can see the asterisk next to the CK_CustomerNotes name—this lets you know that there are unsaved changes. The primary thing you want to focus on in this dialog is the Expression field; this is where you would enter in the conditions of your constraint. + +Do not confuse the Identity box in this dialog with an IDENTITY column—this section of the dialog is only there for providing the name and, optionally, a description of the constraint. + +To edit an existing constraint, just change the properties as desired. To remove it, just select the constraint you're interested in and click the Delete button. + +Regarding Date Columns + +Normally I wouldn't spend much time on specific data types, but with SQL Server 2008 the new data types require some special attention. Of particular issue is how the new Date and Time data types alter things. We'll hold off on the performance and space ramifications for our designing for performance chapter (Chapter 21), but the new Date data type in particular deserves a brief moment of discussion. + +Previous versions of SQL Server supported only the notions of date and time as one combined data type. The datetime data type takes up a whopping 8 bytes, and the combination often creates hassles in development—among these are: + + * Wasted space when there is no need to track a specific time (or when time of day is all you need). + * Hassles in comparing dates when there is time also attached. (You want to see if it's on the same day, but they don't compare equally due to different times of day; you can get around this, but it's a hassle and muddles your code). + * Occasional compatibility hassles when interacting with client data types that expect just the date or just the time. + +The new date and time data types address these issues by making date and time data discrete and adding flexibility to each type (you can even set precision). Dates are now easily compared to other dates, and times are not only easily compared to other times, but also precision settable to either save space or capture time down to the nanosecond (we were limited to roughly 3 milliseconds previously). + +In addition, we have new data types that are meant to deal with the increasing need to standardize time. Allowances have been made to keep track of time offsets versus Coordinated Universal Time, or UTC, which is an abbreviation for the French, Temps Universel Coordonné. This means you can accept times submitted from all around the world and easily reconcile them for more genuine time comparisons. + +We will touch on these new data types more as we continue through the book, but given the legacy of the datetime data type, it is important to recognize these new data types and plan for how they will affect your applications moving forward. + +Summary + +Database design is a huge concept, and one that has many excellent books dedicated to it as their sole subject. It is essentially impossible to get across every database design notion in just a chapter or two. + +In this chapter, you have, however, gotten off to a solid start. You've gotten a bit of review of normalization. 
You have, however, also seen that normalization is not always the right answer—strategic de-normalization of our data can simplify the database for users and speed reporting performance. Finally, you've looked at some non-normalization-related concepts in database design, plus how to make use of the diagramming tools to design our database.
+
In the next chapter, you will be taking a very close look at how SQL Server stores information and how to make the best use of indexes.
+
6
+
Core Storage and Index Structure
+
Indexes. They may well be the most important part of your database planning and system maintenance, second only to the tables themselves. Why is it then that they are, all too often, an afterthought in many designs?
+
Think about it for a minute. Most database systems are based on the notion of fast and efficient data retrieval and maintenance. Indexes provide your database system with additional ways to look up data and take shortcuts to that data's physical location. The right index can cut huge percentages of time off your query executions. So, if efficient data retrieval and maintenance are why we build databases, and indexes are critical to the efficient access and maintenance of the data in databases, why is it that so many software architects move straight from determining a table layout to stored procedures or client code? Silly.
+
Now, don't get me wrong: thinking about stored procedures, client code, and other non-table elements is important, and most developers aren't going to leave a database with zero indexes. Indeed, at least a few indexes will show up in your database without you having to specify them. (Creating a primary key or unique constraint creates an implied index required to enforce those constraints.) It is, however, amazing just how often indexes are applied based on only a few minutes' worth of guesses or purely to address a specific performance bug that showed up in QA (or worse, as a patch to a released product). In still other scenarios, developers will take an "index everything" approach, failing to realize the additional storage required or how too many poorly planned indexes can actually increase the time it takes for your query to run.
+
In this chapter, we will be focusing on the core index structures in SQL Server from both a developer's and an administrator's point of view. We will also look at how data is stored in SQL Server so that we may better understand how SQL Server makes optimization choices, and, from that, what indexes make sense in what situations.
+
SQL Server Storage
+
Storage is an area that has undergone some minor changes in SQL Server 2008. (Well, technically they showed up in a service pack for SQL Server 2005.) These changes, primarily centered around the compression of fixed-length storage types, are discussed in the next chapter.
+
Data in SQL Server can be thought of as existing in something of a hierarchy of structures. The hierarchy is pretty simple. Some of the objects within the hierarchy are things that you will deal with directly and will therefore know easily. A few others exist under the covers, and while they can be directly addressed in some cases, they usually are not. Take a look at them one by one.
+
The Database
+
Okay—this one is easy. I can just hear people out there saying, "Duh! I knew that." Yes, you probably did, but I point it out as a unique entity here because it is the highest level of the definition of storage (for a given server).
This is the highest level at which a lock can be established, although you cannot explicitly create a database-level lock.
+
A lock is something of both a hold and a place marker that is used by the system. We will be looking into locking extensively in Chapter 11, but we will see the lockability of objects within SQL Server discussed in passing as we look at storage.
+
The File
+
By default, your database has two files associated with it:
+
 * The first is the primary physical database file—that's where your data is ultimately stored. This file should be named with an *.mdf extension (this is a recommendation, not a requirement—but I think you'll find doing it in other ways will become confusing over time). "Secondary" files can be added (and should use an *.ndf extension), and do not need to be on the same physical drive as the primary (which means you can use them to distribute I/O load—we will explore these further in Chapter 21).
+
 * The second is something of an offshoot to the database file—the log. We'll dive into the log quite a bit when we deal with transactions and locks in Chapter 11, but you should be aware that it resides in its own file (which should end with an *.ldf extension), and that your database will not operate without it. The log is the serial recording of what's happened to your database since the last time that data was "committed" to the database. The database isn't really your complete set of data. The log isn't your complete set of data. Instead, if you start with the database and "apply" (add in all the activities from the last point the two synched up) the log, you have your complete set of data.
+
There is no restriction about where these files are located relative to each other. It is possible (actually, it's even quite desirable) to place each file on a separate physical device. This not only allows for the activity in one file not to interfere with that in the other file, but it also creates a situation where losing the file with the database does not cause you to lose your work—you can restore a backup and then reapply the log (that was safe on the other drive). Likewise, if you lose the drive with the log, you'll still have a valid database up through the time of the last checkpoint (checkpoints are fully covered in Chapter 11).
+
The Extent
+
An extent is the basic unit of storage used to allocate space for tables and indexes within a given file. It is 64KB in size, made up of eight contiguous 8KB data pages.
+
The concept of allocating space based on extents, rather than actual space used, can be somewhat difficult to understand for people used to operating system storage principles. The important points about an extent include:
+
 * Once an extent is full, the next record will take up not just the size of the record but the size of a whole new extent. Many people who are new to SQL Server get tripped up in their space estimations in part due to the allocation of an extent at a time rather than a record at a time.
+
 * By pre-allocating this space, SQL Server saves the time of allocating new space with each record.
+
It may seem like a waste that a whole extent is taken up just because one too many rows were added to fit on the currently allocated extent(s), but the amount of space wasted this way is typically not that much as a percentage of the entire database. Still, it can add up—particularly in a highly fragmented environment—so it's definitely something you should keep in mind.
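+
If you're curious to see this allocation at work, the sp_spaceused system stored procedure reports both the space a table has reserved and the space it is actually using—the gap between the two is largely this extent-level pre-allocation. A quick sketch (any table will do; here I'm assuming the AdventureWorks order detail table we use elsewhere in this book):
+
-- "reserved" is the space allocated (in extents); "data" plus "index_size"
-- is what's actually in use; "unused" is the allocated-but-empty remainder.
EXEC sp_spaceused N'Sales.SalesOrderDetail';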
+ +The good news in taking up all this space is that SQL Server skips some of the allocation time overhead. Instead of worrying about allocation issues every time it writes a row, SQL Server deals with additional space allocation only when a new extent is needed. + +Don't confuse the space that an extent is taking up with the space that a database takes up. Whatever space is allocated to the database is what you'll see disappear from your disk drive's available space number. An extent is merely how things are, in turn, allocated within the total space reserved by an individual database file. + +The Page + +Much like an extent is a unit of allocation within the database, a page is the unit of allocation within a specific extent. There are eight pages to every extent. + +A page is the last level you reach before you are at the actual data row. Whereas the number of pages per extent is fixed, the number of rows per page is not—that depends entirely on the size of the row, which can vary. You can think of a page as being something of a container for both table and index row data. A row is not allowed to be split between pages. + +Figure 6.1 illustrates how data gets put into a page. Notice how, for every row you insert, you have to place the row offset down at the end of the page to indicate where in the page that particular row's data begins. + +Figure 6.1 + +There are a number of different page types. For purposes of this book, the types we care about are: + + * Data + * Index + * Binary Large Object (BLOB) (for Image, most Text and Ntext data, and varchar(max)/nvarchar(max) data that is larger than about 8k) + * Global and Shared Global Allocation Map (GAM, or SGAM) + * Page Free Space (PFS) + * Index Allocation Map (IAM) + * Bulk Changed Map + * Differential Changed Map + +Data Pages + +Data pages are pretty self-explanatory—they are the actual data in your table, with the exception of any BLOB data that is not stored "in row" (more on this in the BLOB pages section). In the case of a row that has a column that contains BLOB data, the regular data is stored in a data page, and the BLOB data may be stored in page (if small enough to fit). If the BLOB data can't fit on the page, then a 16-byte pointer is used to show where to find the BLOB page that contains the start of the BLOB. + +Index Pages + +Index pages are also pretty straightforward: They hold both the non-leaf and leaf level pages (we'll examine what these are later in the chapter) of a non-clustered index, as well as the non-leaf level pages of a clustered index. These index types will become much clearer as we continue through this chapter. + +BLOB Pages + +BLOB pages are for storing Binary Large Objects. For SQL Server, these amount to data stored in varbinary(max), varchar(max), or nvarchar(max) columns. BLOB pages are special as far as data storage pages go, in that they don't have any rows as such. Since a BLOB can be as large as 2GB, they have to be able to go on more than one page—for this portion of things it doesn't matter what the version is. SQL Server will allocate as many pages as it needs in order to store the entire BLOB, but there is no guarantee that these pages will be contiguous—the pages could be located anywhere within the database file(s). + +As mentioned before, the connection between the non-BLOB data for a row and any BLOB-related to that row comes in the form of a pointer. The nature of that pointer and how SQL Server navigates to the BLOB data was changed for version 7.0 of SQL Server. 
In version 6.5 and before, the BLOB pages were put together in a chain—similar to a linked list. In order to find any page that was part of the BLOB, you needed to start at the beginning and navigate through the BLOB page by page. If you were trying to perform some form of text or binary search, this kind of arrangement would be deadly, given that you were forced into a serial scan of the data. Beginning with version 7.0, however, the pages were changed to be organized into a B-Tree structure (which I will discuss fully a little later in the chapter). B-Trees provide more of a branching structure, and, therefore, a more direct path for larger BLOBs. This has made quite a difference in how quickly text operations can be performed. + +Even with the significant improvements made across several versions over the years, BLOBs are very slow performance-wise, so we will talk about alternative storage methods when we look at advanced design issues later on. + +Global Allocation Map, Shared Global Allocation Map, and Page Free Space Pages + +Global Allocation Map (GAM), Shared Global Allocation Map (SGAM), and Page Free Space (PFS) page types are involved with figuring out which extents and pages are in use, and which are not. Essentially, these pages store records that indicate where there is space available. Understanding these page types is not really necessary to do high-quality development or systems administration, and is beyond the scope of this book. If, however, you're just dying to know about them (or you're having problems with insomnia), then you can find more information on them in the Books Online—just look up GAM in the index. + +Bulk Changed Map + +Hmmm. How to address this one, since we haven't addressed bulk operations yet.... + +SQL Server has the concept of "bulk operations." Bulk operations are very high-speed changes to the database (usually a mass import of data or a truncation of a table). Part of this speed comes from the idea that they don't "log" every single thing they do. The log is a critical part of the backup and recovery system, and bulk operations mean that unlogged activity (well, it logs that it did an operation, but not the specifics, and so the log cannot reconstruct what you did) has occurred in your database. + +The Bulk Changed Map—or BCM—is a set of pages that track what extents have been altered via bulk operations. It cares nothing about the specifics of the changes—merely that you messed with that particular extent. Since it knows you altered that extent, it provides more options when you back up your database. More specifically, when backing up the log, you can supplement the log backup with backing up of the physical data in the extents that were affected by bulk operations. + +Differential Changed Map + +This is nearly the same thing as the Bulk Changed Map, but, instead of tracking only those extents changed by bulk operations, it instead tracks any extents that were changed since the last full backup of your database. + +When you request a differential backup, the Differential Changed Map—or DCM—supplies information about what extents need to be backed up. You wind up with a much smaller and faster running (albeit only partial) backup as only those extents that have changed since the prior backups are included. + +Page Splits + +When a page becomes full, it splits. This means more than just a new page being allocated—it also means that approximately half the data from the existing page is moved to the new page. 
+
The exception to this process is when a clustered index is in use. If there is a clustered index, and the next inserted row would be physically located as the last record in the table, then a new page is created, and the new row is added to the new page without relocating any of the existing data. We will see much more on page splits as we investigate indexes.
+
Rows
+
You will hear much about "Row Level Locking," so it shouldn't be a surprise to hear this term. A row can typically be up to 8KB.
+
In addition to the limit of 8,060 bytes, there is also a maximum of 1,024 columns. In practice, you'll find it very unusual to run into a situation where you run out of columns before you run into the 8,060-byte limit. 1,024 columns gives you an average column width of just under 8 bytes. For most uses, you'll easily exceed that. The exception to this tends to be in measurement and statistical information—where you have a large number of different things that you are storing numeric samples of. Still, even those applications will find it a rare day when they bump into the 1,024-column limit.
+
I did, as you may have noted, use the term typically when I mentioned the 8KB limit. This limit is based on a row being limited to a single page, and the page having an 8KB size, but it can be exceeded in a few circumstances—specifically, with varchar(max) or varbinary(max) as well as traditional BLOB data types like image and text. If a row contains too much data in one of these types to fit within the single page, then these special data types know how to make your data span multiple pages (up to 2GB in a single row). In this case, the original row is used to keep track of where the actual data for that column is stored (all other columns are still stored in the original row).
+
Full-Text Catalogs
+
Prior to SQL Server 2008, these were a separate storage mechanism outside of your normal database. While you could associate a full-text catalog as being the default for a given database, and even back up your full-text catalogs together with your database (in 2005—prior to that even the backups were decoupled), they were stored completely separately. With SQL Server 2008, the Full-Text Catalog no longer has relevance as a storage unit—instead, it is merely a logical grouping of full-text indexes. I mention them here solely for historical reference. (We discuss full text in Chapter 18.)
+
Coordinated backups between full-text index files and the core database did not exist prior to SQL Server 2005. Keep this in mind if you have backward compatibility concerns with prior versions.
+
File Streams
+
File streams are a special storage method meant to address the performance issues with the storage of very large BLOBs. Instead of storing the file's stream in a set of BLOB pages, the file is stored in an NT File System (NTFS) directory that is created explicitly for use by the particular SQL Server database you're storing data in. Unlike client-controlled systems that store binary data in the file system and a pointer in the database, SQL Server coordinates file versioning for you—allowing the file stream to participate in transactions and backups.
+
File streams are something of a niche area, but a rather important one. We will explore their structure more fully in the next chapter, and further still in our chapter on designing for performance (Chapter 21).
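+
Just to give you a feel for where this is headed, a file stream column is declared right in an otherwise ordinary table definition. Here's a minimal sketch—assuming FILESTREAM has already been enabled on the instance and a FILESTREAM filegroup has been added to the database (we'll walk through those prerequisites in the next chapter); the table and column names are my own, for illustration only:
+
CREATE TABLE Documents
(
   DocumentID   uniqueidentifier ROWGUIDCOL NOT NULL
                DEFAULT NEWID() UNIQUE,     -- FILESTREAM tables require a ROWGUIDCOL column
   DocumentBody varbinary(max) FILESTREAM   -- stored as a file in the NTFS directory, not in BLOB pages
);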
+ +Understanding Indexes + +Webster's dictionary defines an index as: + +A list (as of bibliographical information or citations to a body of literature) arranged usually in alphabetical order of some specified datum (as author, subject, or keyword). + +I'll take a simpler approach in the context of databases and say it's a way of potentially getting to data a heck of a lot quicker. Still, the Webster's definition isn't too bad—even for our specific purposes. + +Perhaps the key thing to point out in the Webster's definition is the word usually that's in there. The definition of "alphabetical order" changes depending on a number of rules. For example, in SQL Server, we have a number of different collation options available to us. Among these options are: + + * Binary: Sorts by the numeric representation of the character (for example, in ASCII, a space is represented by the number 32, the letter D is 68, but the letter d is 100). Because everything is numeric, this is the fastest option; unfortunately, it's also not at all the way in which people think, and can also really wreak havoc with comparisons in your WHERE clause. + * Dictionary Order: This sorts things just as you would expect to see in a dictionary, with a twist; you can set a number of different additional options to determine sensitivity to case, accent, and character set. Keep in mind that every language can add its own notion of what constitutes dictionary order, so, if you choose a collation that's oriented around a non-English language, you may see sort order altered somewhat. + +It's fairly easy to understand that, if we tell SQL Server to pay attention to case, then A is not going to be equal to a. Likewise, if we tell it to be case insensitive, then A will be equal to a. Things get a bit more confusing when we add accent sensitivity—that is, SQL Server pays attention to diacritical marks, and therefore a is different from á, which is different from à. Where many people get even more confused is in how collation order affects not only the equality of data but also the sort order (and, therefore, the way it is stored in indexes). + +By way of example, let's look at the equality of a couple of collation options in the following table, and what they do to our sort order and equality information: + +Collation Order | Comparison Values | Index Storage Order +---|---|--- +Dictionary order, case insensitive, accent insensitive (the default) | A = a = à = á = â = Ä = ä = Å = å | a, A, à, â, á, Ä, ä, Å, å +Dictionary order, case insensitive, accent insensitive, uppercase preference | A = a = à = á = â = Ä = ä = Å = å | A, a, à, â, á, Ä, ä, Å, å +Dictionary order, case sensitive | A ≠ a, Ä ≠ ä, Å ≠ å, a ≠ à ≠ á ≠ â ≠ ä ≠ å, A ≠ Ä ≠ Å | A, a, à, á, â, Ä, ä, Å, å + +The point here is that what happens in your indexes depends on the collation information you have established for your data. Collation can be set at the database and column level, so you have a fairly fine granularity in your level of control. If you're going to assume that your server is case insensitive, then you need to be sure that the documentation for your system deals with this or you had better plan on a lot of tech support calls—particularly if you're selling outside of the United States. 
Imagine you're an independent software vendor (ISV) and you sell your product to a customer who installs it on an existing server (which is going to seem like an entirely reasonable thing to the customer), but that existing server happens to be an older server that's set up as case sensitive. You're going to get a support call from one very unhappy customer. + +Once the collation order has been set, changing it is very non-trivial (but possible), so be certain of the collation order you want before you set it. + +To "B," or Not to "B": B-Trees + +The concept of a Balanced Tree, or B-Tree, is certainly not one that was created with SQL Server. Indeed, B-Trees are used in a very large number of indexing systems both in and out of the database world. + +A B-Tree simply attempts to provide a consistent and relatively low-cost method of finding your way to a particular piece of information. The Balanced in the name is pretty much self-descriptive—a B-Tree is, with the odd exception, self-balancing, meaning that every time the tree branches, approximately half the data is on one side, and half on the other side. The Tree in the name is also probably pretty obvious at this point. (Hint: tree, branch—see a trend here?) It's there because, when you draw the structure, then turn it upside down, it has the general form of a tree. + +A B-Tree starts at the root node (another stab at the tree analogy there, but not the last). This root node can, if there is a small amount of data, point directly to the actual location of the data. In such a case, you would end up with a structure that looked something like Figure 6.2. + +Figure 6.2 + +So, we start at the root and look through the records until we find the last page that starts with a value less than what we're looking for. We then obtain a pointer to that node, and look through it until we find the row that we want. + +In most situations though, there is too much data to reference from the root node, so the root node points at intermediate nodes—or what are called non-leaf level nodes. Non-leaf level nodes are nodes that are somewhere in between the root and the node that tells you where the data is physically stored. Non-leaf level nodes can then point to other non-leaf level nodes, or to leaf level nodes (last tree analogy reference—I promise). Leaf level nodes are the nodes where you obtain the real reference to the actual physical data. Much like the leaf is the end of the line for navigating the tree, the node we get to at the leaf level is the end of the line for our index—from here, we can go straight to the actual data node that has our data on it. + +As you can see in Figure 6.3, we start with the root node just as before, then move to the node that starts with the highest value that is equal to or less than what we're looking for and is also in the next level down. We then repeat the process—look for the node that has the highest starting value at or below the value for which we're looking. We keep doing this, level by level down the tree, until we get to the leaf level—from there we know the physical location of the data and can quickly navigate to it. + +Figure 6.3 + +Page Splits—A Deeper Look + +All of this works quite nicely on the read side of the equation; it's the insert that gets a little tricky. Recall that the B in B-Tree stands for balanced. You may also recall that I mentioned that a B-Tree is balanced because about half the data is on either side every time you run into a branch in the tree. 
B-Trees are sometimes referred to as self-balancing because the way new data is added to the tree generally prevents them from becoming lopsided.
+
When data is added to the tree, a node will eventually become full and will need to split. Because, in SQL Server, a node equates to a page, this is called a page split, illustrated in Figure 6.4.
+
When a page split occurs, data is automatically moved around to keep things balanced. The first half of the data is left on the old page, and the rest of the data is added to a new page—thus you have about a 50–50 split, and your tree remains balanced.
+
Figure 6.4
+
If you think about this splitting process a bit, you'll realize that it adds a substantial amount of overhead at the time of the split. Instead of a simple insert of one row, you are:
+
 * Creating a new page
+
 * Migrating rows from the existing page to the new page
+
 * Adding your new row to one of the pages
+
 * Adding another entry in the parent node
+
But the overhead doesn't stop there. Since you're in a tree arrangement, you have the possibility for something of a cascading action. When you create the new page (because of the split), you need to make another entry in the parent node. This entry in the parent node also has the potential to cause a page split at that level, and the process starts all over again. Indeed, this possibility extends all the way up to and can even affect the root node.
+
If the root node splits, then you actually end up creating two additional pages. Because there can be only one root node, the page that was formerly the root node is split into two pages, and becomes a new intermediate level of the tree. An entirely new root node is then created, and will have two entries (one to the old root node, one to the split page).
+
Needless to say, page splits can have a very negative impact on system performance and are characterized by behavior where your process on the server seems to just pause for a few seconds (while the pages are being split and rewritten).
+
We will talk about page-split prevention before we're done with this chapter.
+
While page splits at the leaf level are a common fact of life, page splits at intermediate nodes happen far less frequently. As your table grows, every layer of the index will experience page splits, but, because the intermediate nodes have only one entry for several entries on the next lower node, the number of page splits gets less and less frequent as you move further up the tree. Still, for a split to occur above the leaf level, there must have already been a split at the next lowest level—this means that page splits up the tree are cumulative (and expensive performance-wise) in nature.
+
SQL Server has a number of different types of indexes (which we will discuss shortly), but they all make use of this B-Tree approach in some way or another. Indeed, they are all very similar in structure thanks to the flexible nature of a B-Tree. Still, we shall see that there are indeed some significant differences, and these can have an impact on the performance of our system.
+
For a SQL Server index, the nodes of the tree come in the form of pages, but you can actually apply this concept of a root node, the non-leaf level, the leaf level, and the tree structure to more than just SQL Server or even just databases.
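+
As a small preview of that page-split-prevention discussion (the full CREATE INDEX syntax comes later in this chapter), the most common mitigation is the FILLFACTOR option—deliberately leaving free space in each leaf page when an index is built or rebuilt, so new rows have room to land without forcing a split. A sketch, using a hypothetical Customers table of my own invention:
+
-- Build the index with leaf pages only 80 percent full, leaving roughly
-- 20 percent free space to absorb new rows before a page split is needed.
CREATE NONCLUSTERED INDEX IX_Customers_LastName
   ON Customers (LastName)
   WITH (FILLFACTOR = 80);
+
The trade-off is more pages (and therefore more reads) for the same data, so a low fill factor is something you choose deliberately for write-heavy tables, not as a blanket default.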
+ +How Data Is Accessed in SQL Server + +In the broadest sense, there are only two ways in which SQL Server retrieves the data you request: + + * Using a table scan + * Using an index + +Which method SQL Server will use to run your particular query will depend on what indexes are available, what columns you are asking about, what kind of joins you are doing, and the size of your tables. + +Use of Table Scans + +A table scan is a pretty straightforward process. When a table scan is performed, SQL Server starts at the physical beginning of the table looking through every row in the table. As it finds rows that match the criteria of your query, it includes them in the result set. + +You may hear lots of bad things about table scans, and in general, they will be true. However, table scans can actually be the fastest method of access in some instances. Typically, this is the case when retrieving data from rather small tables. The exact size where this becomes the case will vary widely according to the width of your table and what the specific nature of the query is. + +See if you can spot why the use of EXISTS in the WHERE clause of your queries has so much to offer performance-wise where it fits the problem. When you use the EXISTS operator, SQL Server stops as soon as it finds one record that matches the criteria. If you had a million record table, and it found a matching record on the third record, then use of the EXISTS option would have saved you the reading of 999,997 records! NOT EXISTS works in much the same way. + +Use of Indexes + +When SQL Server decides to use an index, the process actually works somewhat similarly to a table scan, but with a few shortcuts. + +During the query optimization process, the Optimizer takes a look at all the available indexes and chooses the best one (this is primarily based on the information you specify in your joins and WHERE clause, combined with statistical information SQL Server keeps on index make-up). Once that index is chosen, SQL Server navigates the tree structure to the point of data that matches your criteria and again extracts only the records it needs. The difference is that, since the data is sorted, the query engine knows when it has reached the end of the current range it is looking for. It can then end the query, or move on to the next range of data as necessary. + +If you ponder the query topics you've worked with and studied thus far, you may notice some striking resemblances to how the EXISTS option works. The EXISTS keyword allows a query to quit running the instant that it finds a match. The performance gains of using an index are similar or even better since the process of searching for data can work in a similar fashion—that is, the server is able to know when there is nothing left that's relevant, and can stop things right there. Even better, however, is that by using an index, you don't have to limit yourself to Boolean situations (does the piece of data I was after exist—yes or no?). You can apply this same notion to both the beginning and end of a range—you are able to gather ranges of data with essentially the same benefits that using an index gives to finding data. What's more, you can do a very fast lookup (called a SEEK) of your data rather than hunting through the entire table. + +Don't get the impression from my comparing what indexes do for you to the EXISTS operator that indexes replace the EXISTS operator altogether (or vice versa). The two are not mutually exclusive; they can be used together, and often are. 
I mention them here together only because they have the similarity of being able to tell when their work is done, and quit before getting to the physical end of the table. + +Index Types and Index Navigation + +Although there are nominally two types of indexes in SQL Server (clustered and non-clustered), there are actually, internally speaking, three different types: + + * Clustered indexes + * Non-clustered indexes—which comprise: + * Non-clustered indexes on a heap + * Non-clustered indexes on a clustered index + +The way the physical data is stored varies between clustered and non-clustered indexes. The way SQL Server traverses the B-Tree to get to the end data varies between all three index types. + +All SQL Server indexes have leaf level and non-leaf level pages. As I mentioned when I discussed B-Trees, the leaf level is the level that holds the "key" to identifying the record, and the non-leaf level pages are guides to the leaf level. + +The indexes are built over either a clustered table (if the table has a clustered index) or what is called a heap (what's used for a table without a clustered index). + + * A clustered table is any table that has a clustered index on it. Clustered indexes are discussed in detail shortly, but what they mean to the table is that the data is physically stored in a designated order. Individual rows are uniquely identified through the use of the cluster key—the columns that define the clustered index. + +This should bring to mind the question, "What if the clustered index is not unique?" That is, how can a clustered index be used to uniquely identify a row if the index is not a unique index? The answer lies under the covers—SQL Server forces any clustered indexes to be unique—even if you don't define it that way. Fortunately, it does this in a way that doesn't change how you use the index. You can still insert duplicate rows if you wish, but SQL Server will add a suffix to the key internally to ensure that the row has a unique identifier. + + * A heap is any table that does not have a clustered index on it. In this case, a unique identifier, or row ID (RID) is created based on a combination of the extent, pages, and row offset (places from the top of the page) for that row. A RID is necessary only if there is no cluster key available (no clustered index). + +Clustered Indexes + +A clustered index is unique for any given table—you can have only one per table. You don't have to have a clustered index, but you'll find it to be one of the most commonly chosen types as the first index, for a variety of reasons that will become apparent as you look at your index types. + +What makes a clustered index special is that the leaf level of a clustered index is the actual data—that is, the data is resorted to be stored in the physical order defined in the index or related key command. This means that once you get to the leaf level of the index, you're done—you're at the data. Any new record is inserted according to its correct physical order in the clustered index. How new pages are created changes depending on where the record needs to be inserted. + +In the case of a new record that needs to be inserted into the middle of the index structure, a normal page split occurs. The last half of the records from the old page are moved to the new page and the new record is inserted into the new or old page as appropriate. 
+
In the case of a new record that is logically at the end of the index structure, a new page is created, but only the new record is added to the new page, as shown in Figure 6.5.
+
Navigating the Tree
+
As I've indicated previously, even the indexes in SQL Server are stored in a B-Tree. Theoretically, a B-Tree always has half of the remaining information in each possible direction as the tree branches. Take a look at a visualization of what a B-Tree looks like for a clustered index (Figure 6.6).
+
Figure 6.5
+
Figure 6.6
+
As you can see, it looks essentially identical to the more generic B-Trees we discussed earlier in the chapter. In this case, we're doing a range search (something clustered indexes are particularly good at) for numbers 158–400. All we have to do is the following:
+
Navigate to the first record, and include all remaining records on that page—we know we need the rest of that page because the information from one node up lets us know that we'll also need data from a few other pages. Because this is an ordered list, we can be sure it's continuous—that means if the next page has records that should be included, then the rest of this page must be included. We can just start spewing out data from those pages without having to do the verification side of things.
+
We start by navigating to the root node. SQL Server is able to locate the root node based on an entry that is kept in a system table. You can look at the logical content of that table by querying sys.indexes.
+
Every index in your database has an entry in sys.indexes. This system view is part of your database (as opposed to being in the master database) and shows the stored location information for all the indexes in your database as well as which columns they are based on.
+
In older versions of SQL Server, you could query against the underlying table (technically you still can, but I highly recommend against such direct queries at this point), which is called sysindexes.
+
By looking through the page that serves as the root node, we can figure out what the next page we need to examine is (the second page on the second level as we have it drawn here). We then continue the process. With each step we take down the tree, we are getting to smaller and smaller subsets of data.
+
Eventually, we will get to the leaf level of the index. In the case of our clustered index, getting to the leaf level of the index means that we are also at our desired row(s) and our desired data.
+
I can't stress enough the importance of the distinction that, with a clustered index, when you've fully navigated the index, you've fully navigated to your data. How much of a performance difference this can make will really show its head as you look at non-clustered indexes—particularly when the non-clustered index is built over a clustered index.
+
Non-Clustered Indexes on a Heap
+
Non-clustered indexes on a heap work very similarly to clustered indexes in most ways. They do, however, have a few notable differences:
+
The leaf level is not the data—instead, it is the level at which you are able to obtain a pointer to that data. This pointer comes in the form of the RID, which, as described earlier in the chapter, is made up of the extent, page, and row offset for the particular row being pointed to by the index.
Even though the leaf level is not the actual data (instead, it has the RID), you have only one more step than with a clustered index—because the RID has the full information on the location of the row, you can go directly to the data. + +Don't, however, misunderstand this "one more step" to mean that there's only a small amount of overhead difference, and that non-clustered indexes on a heap will run close to as fast as a clustered index. With a clustered index, the data is physically in the order of the index. That means, for a range of data, when you find the row that has the beginning of your data on it, there's a good chance that the other rows are on that page with it (that is, you're already physically almost to the next record since they are stored together). With a heap, the data is not linked together in any way other than through the index. From a physical standpoint, there is absolutely no sorting of any kind. This means that, from a physical read standpoint, your system may have to retrieve records from all over the file. Indeed, it's quite possible (possibly even probable) that you will wind up fetching data from the same page several separate times. SQL Server has no way of knowing it will have to come back to that physical location because there was no link between the data. With the clustered index, it knows that's the physical sort, and can therefore grab it all in just one visit to the page. + +Just to be fair to the non-clustered index on a heap here versus the clustered index, the odds are extremely high that any page that was already read once will still be in the memory cache, and, thus, will be retrieved extremely quickly. Still, it does add some additional logical operations to retrieve the data. + +Figure 6.7 shows the same search you did with the clustered index, only with a non-clustered index on a heap this time. + +Figure 6.7 + +Through most of the index navigation, things work exactly as they did before. You start out at the same root node, and you traverse the tree dealing with more and more focused pages until you get to the leaf level of your index. This is where you run into the difference. With a clustered index, you could have stopped right here, but, with a non-clustered index, you have more work to do. If the non-clustered index is on a heap, then you have just one more level to go. You take the Row ID from the leaf-level page, and navigate to it—it is not until that point that you are at your actual data. + +Non-Clustered Indexes on a Clustered Table + +With non-clustered indexes on a clustered table, the similarities continue—but so do the differences. Just as with non-clustered indexes on a heap, the non-leaf level of the index looks pretty much as it did for a clustered index. The difference does not come until you get to the leaf level. + +At the leaf level, you have a rather sharp difference from what you've seen with the other two index structures—you have yet another index to look over. With clustered indexes, when you got to the leaf level, you found the actual data. With non-clustered indexes on a heap, you didn't find the actual data, but did find an identifier that let you go right to the data (you were just one step away). With non-clustered indexes on a clustered table, you find the cluster key. That is, you find enough information to go and make use of the clustered index. + +You end up with something that looks like Figure 6.8. + +What you end up with is two entirely different kinds of lookups. 
In the example from your diagram, you start off with a ranged search—you do one single lookup in your index and are able to look through the non-clustered index to find a continuous range of data that meets your criterion (LIKE 'T%'). This kind of lookup, where you can go right to a particular spot in the index, is called a seek.

The second kind of lookup then starts—the lookup using the clustered index. This second lookup is very fast; the problem lies in the fact that it must happen multiple times. You see, SQL Server retrieved a list from the first index lookup (a list of all the names that start with "T"), but that list doesn't logically match up with the cluster key in any continuous fashion—each record needs to be looked up individually, as shown in Figure 6.9.

Figure 6.8

Figure 6.9

Needless to say, this multiple lookup situation introduces more overhead than if you had just been able to use the clustered index from the beginning. The first index search—the one through your non-clustered index—is going to require very few logical reads.

For example, if I have a table with 1,000 bytes per row and I do a lookup similar to the one in our drawing (say, something that would return 5 or 6 rows), it would take only something on the order of 8–10 logical reads to get the information from the non-clustered index. However, that gets me only as far as being ready to look up the rows in the clustered index. Those lookups would cost approximately 3–4 logical reads each, or 15–24 additional reads. That probably doesn't seem like that big a deal at first, but look at it this way:

Logical reads went from 3 minimum to 24 maximum—that's roughly eight times the amount of work that had to be done.

Now expand this thought out to something where the range of values from the non-clustered index wasn't just five or six rows, but five or six thousand, or five or six hundred thousand rows—that's going to be a huge impact.

Don't let the extra overhead versus a clustered index scare you. The point isn't meant to scare you away from using indexes, but rather to point out that a non-clustered index is not going to be as efficient as a clustered index from a read perspective (it can, in some instances, actually be a better choice at insertion time). An index of any kind is usually (there are exceptions) the fastest way to do a lookup. I'll explain what index to use and why later in the chapter.

Creating, Altering, and Dropping Indexes

These work much as they do on other objects such as tables. Take a look at each, starting with CREATE.

Indexes can be created in two ways:

 * Through an explicit CREATE INDEX command
 * As an implied object when a constraint is created

Each of these has its own quirks about what it can and can't do, so take a look at each of them individually.

The CREATE INDEX Statement

The CREATE INDEX statement does exactly what it sounds like; it creates an index on the specified table or view based on the stated columns.

The syntax to create an index is somewhat drawn out and introduces several items that I haven't really talked about up to this point:

CREATE [UNIQUE] [CLUSTERED|NONCLUSTERED]
    INDEX <index name> ON <table or view name>
        (<column name> [ASC|DESC] [,...n])
    INCLUDE (<column name> [,...n])
    [WITH
        [PAD_INDEX = { ON | OFF }]
        [[,] FILLFACTOR = <fillfactor>]
        [[,] IGNORE_DUP_KEY = { ON | OFF }]
        [[,] DROP_EXISTING = { ON | OFF }]
        [[,] STATISTICS_NORECOMPUTE = { ON | OFF }]
        [[,] SORT_IN_TEMPDB = { ON | OFF }]
        [[,] ONLINE = { ON | OFF }]
        [[,] ALLOW_ROW_LOCKS = { ON | OFF }]
        [[,] ALLOW_PAGE_LOCKS = { ON | OFF }]
        [[,] MAXDOP = <maximum degree of parallelism>]
    ]
    [ON {<filegroup> | <partition scheme name> | DEFAULT }]

There is legacy syntax available for many of these options, and so you may see that syntax put into use to support prior versions of SQL Server. That syntax is, however, considered deprecated and will be removed at some point. I highly recommend that you stay with the newer syntax where possible.

There is a similar but sufficiently different syntax for creating XML and spatial indexes. These will be handled separately in the next chapter.

Loosely speaking, this statement follows the same CREATE syntax that you've seen plenty of already (and will see even more of). The primary hitch in things is that you have a few intervening parameters that you haven't seen elsewhere.

Just as you'll see with views in Chapter 8, you do have to add an extra clause onto your CREATE statement to deal with the fact that an index isn't really a standalone kind of object. It has to go together with a table or view, and you need to state the table that your column(s) are "ON."

After the ON <table or view name>
(<column name(s)>) clause, everything is optional. You can mix and match these options. Many of them are seldom used, but some (such as FILLFACTOR) can have a significant impact on system performance and behavior, so take a look at them one by one.

ASC/DESC

These two allow you to choose between an ascending and a descending sort order for your index. The default is ASC, which is, as you might guess, ascending order.

A question that might come to mind is why ascending versus descending matters; after all, SQL Server can just read an index backward if it needs the reverse sort order. Life is not, however, always quite so simple. Looking at the index in reverse order works just fine if you're dealing with only one column, or if your sort is always the same for all columns, but what if you needed to mix sort orders within an index? That is, what if you need one column to be sorted ascending, but the other descending? Since the indexed columns are stored together, reversing the way you look at the index for one column would also reverse the order for the additional columns. If you explicitly state that one column is ascending and the other is descending, then you invert the second column right within the physical data. There is suddenly no reason to change the way that you access your data.

As a quick example, imagine a reporting scenario where you want to order your employee list by the hire date, beginning with the most recent (a descending order), but you also want to order by their last name (an ascending order). In previous versions, SQL Server would have to do two operations: one for the first column and one for the second. By having control over the physical sort order of your data, you gain flexibility in the way you combine columns.

Generally speaking, you'll want to leave this one alone (again, remember backward compatibility). Some likely exceptions are:

 * You need to mix ascending and descending order across multiple columns.
 * Backward compatibility is not an issue.

INCLUDE

This was first added with SQL Server 2005. Its purpose is to provide better support for what are called covered queries. A query is considered to be "covered" when all of the data the query needs is contained in the index that is being used. If all the data needed is already in the index, then there is no need to go to the actual data page; as soon as SQL Server has gotten to the leaf level of the index, it has all it needs and can stop there (saving a bunch of I/O operations).

When you INCLUDE columns as opposed to placing them in the ON list, SQL Server adds them only at the leaf level of the index. Because each row at the leaf level of an index corresponds to a data row, what you're doing is essentially including more of just the raw data in the leaf level of your index. If you think about this, you can probably guess that INCLUDE really applies only to non-clustered indexes. (Clustered indexes already are the data at the leaf level, so there would be no point.)

Why does this matter? Well, as we'll discuss further as the book goes on, SQL Server stops working as soon as it has what it actually needs. So, if while traversing the index it can find all the data that it needs without continuing on to the actual data row, then it won't bother going to the data row (what would be the point?). By including a particular column in the index, you may "cover" a query that utilizes that particular index at the leaf level and save the I/O associated with using that index pointer to go to the data page.
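To make this concrete, here is a minimal sketch of a covering index that also uses the mixed ASC/DESC sorting discussed a moment ago. (The table and column names are hypothetical; they aren't from our sample database.)

-- Hypothetical example: a query such as
--   SELECT FirstName, HireDate FROM dbo.Employee WHERE LastName = 'Blake'
-- can be answered entirely from the leaf level of this index,
-- never touching the actual data pages.
CREATE NONCLUSTERED INDEX ix_Employee_LastName_HireDate
    ON dbo.Employee (LastName ASC, HireDate DESC)
    INCLUDE (FirstName);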
Careful not to abuse this one! When you INCLUDE columns, you are enlarging the size of the leaf level of your index pages. That means fewer rows will fit per page, and, therefore, more I/O may be required to see the same number of rows. The result may be that your effort to speed up one query may slow down others. To quote an old film from the eighties, "Balance, Daniel-san—balance!" Think about the effects on all parts of your system, not just the particular query you're working on that moment.

WITH

WITH is an easy one—it just tells SQL Server that you will indeed be supplying one or more of the options that follow.

PAD_INDEX

In the syntax list, this one comes first—but that will seem odd when you understand what PAD_INDEX does. In short, it determines just how full the non-leaf level pages of your index are going to be (as a percentage) when the index is first created. You don't state a percentage on PAD_INDEX because it will use whatever percentage is specified in the FILLFACTOR option that follows. Setting PAD_INDEX = ON would be meaningless without a FILLFACTOR (which is why it seems odd that it comes first).

FILLFACTOR

When SQL Server first creates an index, the pages are, by default, filled as full as they can be, minus two records. You can set the FILLFACTOR to be any value between 1 and 100. This number will be how full your pages are, as a percentage, after index construction is completed. Keep in mind, however, that as your pages split, your data will still be distributed 50–50 between the two pages. You cannot control the fill percentage on an ongoing basis other than regularly rebuilding the indexes (something you should do—setting up a maintenance schedule for this is covered in Chapter 23).

You use a FILLFACTOR when you need to adjust the page densities. Think about things this way:

 * If it's an OLTP system, you want the FILLFACTOR to be low.
 * If it's an OLAP or other very stable (in terms of changes—very few additions and deletions) system, you want the FILLFACTOR to be as high as possible.
 * If you have something that has a medium transaction rate and a lot of report-type queries against it, then you probably want something in the middle (not too low, not too high).

If you don't provide a value, then SQL Server will fill your pages to two rows short of full, with a minimum of one row per page. (For example, if your row is 8,000 characters wide, you can fit only one row per page, so leaving things two rows short wouldn't work.)

IGNORE_DUP_KEY

The IGNORE_DUP_KEY option is a way of doing little more than circumventing the system. In short, it causes a UNIQUE constraint to have a slightly different action from that which it would otherwise have.

Normally, a unique constraint, or unique index, does not allow duplicates of any kind. If a transaction tried to create a duplicate based on a column that is defined as unique, then that transaction would be rolled back and rejected. Once you set the IGNORE_DUP_KEY option, however, you'll get mixed behavior. You will still receive an error message, but the error will be only of a warning level. The record is still not inserted.

This last line—the record is still not inserted—is a critical concept from an IGNORE_DUP_KEY standpoint. A rollback isn't issued for the transaction (the error is a warning error rather than a critical error), but the duplicate row will have been rejected.

Why would you do this?
Well, it's a way of storing unique values, but not disturbing a transaction that tries to insert a duplicate. For whatever process is inserting the would-be duplicate, it may not matter at all that it's a duplicate row (no logical error from it). Instead, that process may have an attitude that's more along the lines of, "Well, as long as I know there's one row like that in there, I'm happy. I don't care whether it's the specific row that I tried to insert or not."

DROP_EXISTING

If you specify the DROP_EXISTING option, any existing index with the name in question will be dropped prior to construction of the new index. This option is much more efficient than simply dropping and re-creating an existing index when you use it with a clustered index. If you rebuild an exact match of the existing index, SQL Server knows that it need not touch the non-clustered indexes, while an explicit drop and create would involve rebuilding all of the non-clustered indexes twice in order to accommodate the different row locations. If you change the structure of the index using DROP_EXISTING, the non-clustered indexes are rebuilt only once instead of twice. Furthermore, you cannot simply drop and re-create an index created by a constraint, for example, to implement a certain fill factor. DROP_EXISTING is a workaround to this.

STATISTICS_NORECOMPUTE

By default, SQL Server attempts to automate the process of updating the statistics on your tables and indexes. By selecting the STATISTICS_NORECOMPUTE option, you are saying that you will take responsibility for the updating of the statistics. To turn this option off, you need to run the UPDATE STATISTICS command, but not use the NORECOMPUTE option.

I strongly recommend against using this option. Why? Well, the statistics on your index are what the Query Optimizer uses to figure out just how helpful your index is going to be for a given query. The statistics on an index are changing constantly as the data in your table goes up and down in volume and as the specific values in a column change. When you combine these two facts, you should be able to see that not updating your statistics means that the Query Optimizer is going to be running your queries based on out-of-date information. Leaving the automatic statistics feature on means that the statistics will be updated regularly. (Just how often depends on the nature and frequency of your updates to the table.) Conversely, turning automatic statistics off means that you will either be out of date or you will need to set up a schedule to manually run the UPDATE STATISTICS command.

SORT_IN_TEMPDB

This option makes sense only when your tempdb is stored on a physically separate drive from the database that is to contain the new index. This is largely an administrative function, so I'm not going to linger on this topic for more than a brief overview of what it is and why it makes sense only when tempdb is on a separate physical device.

When SQL Server builds an index, it has to perform multiple reads to take care of the various index construction steps:

1. Read through all the data, constructing a leaf row corresponding to each row of actual data. Just like the actual data and final index, these go into pages for interim storage. These intermediate pages are not the final index pages, but rather a holding place to temporarily store things every time the sort buffers fill up.

2. A separate run is made through these intermediate pages to merge them into the final leaf pages of the index.
3. Non-leaf pages are built as the leaf pages are being populated.

If the SORT_IN_TEMPDB option is not used, then the intermediate pages are written out to the same physical files that the database is stored in. This means that the reads of the actual data have to compete with the writes of the build process. The two cause the disk heads to move to different places from those the other (read versus write) needs. The result is that the disk heads are constantly moving back and forth; this takes time.

If, on the other hand, SORT_IN_TEMPDB is used, then the intermediate pages will be written to tempdb rather than the database's own file. If they are on separate physical drives, this means that there is no competition between the read and write operations of the index build. Keep in mind, however, that this works only if tempdb is on a separate physical drive from your database file; otherwise, the change is only in name, and the competition for I/O is still a factor.

If you're going to use SORT_IN_TEMPDB, make sure that there is enough space in tempdb for large operations.

ONLINE

If you set this to ON, it forces the table to remain available for general access and does not create any locks that block users from the index and/or table. By default, a full index operation will grab the locks (eventually a table lock) it needs to have full and efficient access to the table. The side effect, however, is that your users are blocked out. (Yeah, it's a paradox; you're likely building an index to make the database more usable, but you essentially make the table unusable while you do it.)

Now, you're probably thinking something like: "Oh, that sounds like a good idea. I'll do that every time so my users are unaffected." Poor thinking. Keep in mind that any index construction like that is probably a very highly I/O-intensive operation, so it is affecting your users one way or the other. Now, add that there is a lot of additional overhead required in the index build for it to make sure that it doesn't step on the toes of any of your users. If you let SQL Server have free rein over the table while it's building the index, then the index will be built much faster, and the overall time that the build is affecting your system will be much smaller.

ONLINE index operations are supported only in the Enterprise Edition of SQL Server. You can execute the index command with the ONLINE directive in other editions, but it will be ignored, so don't be surprised if you use ONLINE and find your users still being blocked out by the index operation if you're using a lesser edition of SQL Server.

ALLOW ROW/PAGE LOCKS

This is a longer-term directive than ONLINE is, and is a very, very advanced topic. For purposes of this book, and given how much we've introduced so far on locking, I want to stick with a pretty simple explanation.

Through much of the book thus far, I have repeatedly used the term lock. As explained early on, this is something of a placeholder to avoid conflicts in data integrity. The ALLOW settings you're looking at here are setting directives regarding whether this index will allow those styles of locks or not. This falls under the heading of extreme performance tweak.

MAXDOP

This is overriding the system setting for the maximum degree of parallelism for purposes of building this index. Parallelism is not something I talk about in this book, so I'll give you a mini-dose of it here.
In short, the degree of parallelism is how many processes are put to use for one database operation (in this case, the construction of an index). There is a system setting called the max degree of parallelism that allows you to set a limit on how many processes can run in parallel per logical operation. The MAXDOP option in the index creation options allows you to set the degree of parallelism to be either higher or lower than the base system setting as you deem appropriate.

ON

SQL Server gives you the option of storing your indexes separately from the data by using the ON option. This can be nice from a couple of perspectives:

 * The space that is required for the indexes can be spread across other drives.
 * The I/O for index operations does not burden the physical data retrieval.

There's more to this, but this is highly advanced stuff. It is very data- and use-dependent, and so we'll consider it out of the scope of this book.

Implied Indexes Created with Constraints

I guess I call this one "index by accident." It's not that the index shouldn't be there. It has to be there if you want the constraint that created the index. It's just that I've seen an awful lot of situations where the only indexes on the system were those created in this fashion. Usually, this implies that the administrators and/or designers of the system are virtually oblivious to the concept of indexes.

However, you'll also find another bizarre twist on this one—the situation where the administrator or designer knows how to create indexes but doesn't really know how to tell what indexes are already on the system and what they are doing. This kind of situation is typified by duplicate indexes. As long as they have different names, SQL Server will be more than happy to create them for you.

Implied indexes are created when one of two constraints is added to a table:

 * A PRIMARY KEY
 * A UNIQUE constraint (a.k.a. an alternate key)

You've seen plenty of the CREATE syntax up to this point, so I won't belabor it; however, it should be noted that, other than {CLUSTERED|NONCLUSTERED} and FILLFACTOR, none of the options are available when an index is created implicitly to support a constraint.

ALTER INDEX

The command ALTER INDEX is somewhat deceptive. Up until now, ALTER commands have always been about changing the definition of your object. You ALTER tables to add or disable constraints and columns, for example. ALTER INDEX is different. It is all about maintenance and zero about structure. If you need to change the makeup of your index, you still need either to DROP and CREATE it or to CREATE and use the index with the DROP_EXISTING=ON option.

As you saw earlier in the chapter, SQL Server gives you an option for controlling just how full your leaf-level pages are and, if you choose, another option to deal with non-leaf-level pages. Unfortunately, these are proactive options. They are applied once, and then you need to reapply them as necessary by rebuilding your indexes and reapplying the options.

In the upcoming section on maintenance, you'll learn more on the wheres and whys of utilizing this command, but for now take it on faith that you'll use maintenance commands like ALTER INDEX as part of your regular maintenance routine.

The ALTER INDEX syntax looks like this:

ALTER INDEX { <index name> | ALL }
    ON <table or view name>
    { REBUILD
        [ [ WITH (
            [ PAD_INDEX = { ON | OFF } ]
            | [[,] FILLFACTOR = <fillfactor> ]
            | [[,] SORT_IN_TEMPDB = { ON | OFF } ]
            | [[,] IGNORE_DUP_KEY = { ON | OFF } ]
            | [[,] STATISTICS_NORECOMPUTE = { ON | OFF } ]
            | [[,] ONLINE = { ON | OFF } ]
            | [[,] ALLOW_ROW_LOCKS = { ON | OFF } ]
            | [[,] ALLOW_PAGE_LOCKS = { ON | OFF } ]
            | [[,] MAXDOP = <max degree of parallelism> ]
        ) ]
        | [ PARTITION = <partition number>
            [ WITH ( <partition rebuild index option> [ ,...n ] ) ] ] ]
    | DISABLE
    | REORGANIZE
        [ PARTITION = <partition number> ]
        [ WITH ( LOB_COMPACTION = { ON | OFF } ) ]
    | SET ( [ ALLOW_ROW_LOCKS = { ON | OFF } ]
        | [[,] ALLOW_PAGE_LOCKS = { ON | OFF } ]
        | [[,] IGNORE_DUP_KEY = { ON | OFF } ]
        | [[,] STATISTICS_NORECOMPUTE = { ON | OFF } ]
        )
    } [ ; ]

Several of the options are common to the CREATE INDEX command, so I will skip redefining those particular ones here. Beyond that, a fair amount of the ALTER-specific options are fairly detailed and relate to dealing with things like fragmentation (you'll get to fragmentation and maintenance shortly) or are more DBA oriented and usually used on an ad hoc basis to deal with very specific problems. The core elements here should, however, be part of your regular maintenance planning.

You'll start by looking at a couple of top parameters and then look at the options that are part of your larger maintenance planning needs.

Index Name

You can name a specific index if you want to maintain one specific index, or use ALL to indicate that you want to perform this maintenance on every index associated with the named table.

Table or View Name

Pretty much just what it sounds like—the name of the specific object (table or view) that you want to perform the maintenance on. Note that it needs to be one specific table. (You can't feed it a list and say, "do all of these please!")

REBUILD

This is the "industrial-strength" approach to fixing an index. If you run ALTER INDEX with this option, the old index is completely thrown away and a new one reconstructed from scratch. The result is a truly optimized index, where every page in both the leaf and non-leaf levels of the index has been reconstructed as you have defined it (either with the defaults, or using switches to change things like the fill factor). If the index in question is a clustered index, then the physical data is also reorganized.

By default, the pages will be reconstituted to be full minus two records. Just as with the CREATE INDEX syntax, you can set the FILLFACTOR to be any value between 0 and 100. This number will be the percent full that your pages are once the database reorganization is complete. Remember, though, that as your pages split, your data will still be distributed 50–50 between the two pages. You cannot control the fill percentage on an ongoing basis other than regularly rebuilding the indexes.

Careful on this one. As soon as you kick off a REBUILD, the index you are working on is essentially gone until the rebuild is complete. Any queries that relied on that index may become exceptionally slow (potentially by orders of magnitude). This is the sort of thing you want to test on an offline system first to have an idea how long it's going to take, and then schedule to run in off hours (preferably with someone monitoring it to be sure it's back online when peak hours come along).

This one can have major side effects while it runs, and thus it falls squarely in the domain of the database administrator, in my not-so-humble opinion.

DISABLE

This one does what it says, only in somewhat drastic fashion.
It would be nice if all this command did was take your index offline until you decided further what you want to do, but instead it essentially marks the index as unusable. Once an index has been disabled, it must be rebuilt (not reorganized, but rebuilt) before it will be active again.

This is one you're very, very rarely going to do yourself (you would more likely just drop the index)—it is far more likely to happen during a SQL Server upgrade or some other oddball situation.

Yet another BE CAREFUL!!! warning on this one. If you disable the clustered index for your table, it has the effect of disabling the table. The data will remain, but will be inaccessible by all indexes (since they all depend on the clustered index) until you rebuild the clustered index.

REORGANIZE

BINGO!!! from the developer perspective. With REORGANIZE, you hit much more of a happy medium in life. When you reorganize your index, you get a slightly less complete optimization than you get with a full rebuild, but one that occurs online. (Users can still utilize the index.)

This should, if you're paying attention, bring about the question "What exactly do you mean by 'slightly less complete'?" Well, REORGANIZE works only on the leaf level of your index; non-leaf levels of the index go untouched. This means that you're not quite getting a full optimization, but, for the lion's share of indexes, that is not where your real cost of fragmentation is (though it can happen, and your mileage may vary).

Given its much lower impact on users, this is usually the tool you'll want to use as part of your regular maintenance plan. We'll look into this a bit more later when talking about fragmentation.

DROP INDEX

This one returns to most of the simplicity of prior DROP statements. The only real trick to it is that, since an index is not a standalone object (it is essentially contained within the definition of a table), you must name not only the index but also the table it belongs to. The syntax looks like this:

DROP INDEX <table or view name>.<index name>
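For example, dropping a hypothetical index (the index name here is made up purely for illustration) would look like this:

-- ix_Example is a made-up index name, standing in for one you own
DROP INDEX Sales.SalesOrderDetail.ix_Example;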
As you can see, there's not really much to it. You can use full four-part naming (I guess it turns into five-part if you include the index) if you need to.

Choosing Wisely: Deciding What Index Goes Where and When

By now, you're probably thinking to yourself, "Gee, I'm always going to create clustered indexes!" There are plenty of good reasons to think that way. Just keep in mind that there are also some reasons not to.

Choosing which indexes to include and which not to include can be a tough process, and, in case that wasn't enough, you have to make some decisions about what type you want them to be. The latter decision is made simultaneously easier and harder by the fact that you can have only one clustered index. It means that you have to choose wisely to get the most out of it.

Selectivity

Indexes, particularly non-clustered indexes, are primarily beneficial when there is a reasonably high level of selectivity within the index. By selectivity, I'm referring to the percentage of values in the column that are unique. The higher the percentage of unique values within a column, the higher the selectivity is said to be, and the greater the benefit of indexing.

If you think back to the sections on non-clustered indexes—particularly the section on non-clustered indexes over a clustered index—you will recall that the lookup in the non-clustered index is really only the beginning. You still need to make another loop through the clustered index to find the real data. Even with the non-clustered index on a heap, you still end up with multiple physically separate reads to perform.

If one lookup in your non-clustered index is going to generate multiple additional lookups in a clustered index, then you are probably better off with the table scan. The exponential effect that's possible here is actually quite amazing. Consider that the looping process created by the non-clustered index is not worth it if you don't have somewhere in the area of 90–95 percent uniqueness in the indexed column.

Clustered indexes are substantially less affected by this because, once you're at the start of your range of data—unique or not—you're there. There are no additional index pages to read. Still, more than likely, there are other columns your clustered index could be put to greater use on.

One other exception to the rule of selectivity has to do with foreign keys. If your table has a column that is a foreign key, then, in all likelihood, you're going to benefit from having an index on that column. Why foreign keys and not other columns? Well, foreign keys are frequently the target of joins with the table they reference. Indexes, regardless of selectivity, can be very instrumental in join performance because they allow what is called a merge join. A merge join obtains a row from each table and compares them to see if they match the join criteria (what you're joining on). Since there are indexes on the related columns in both tables, the seek for both rows is very fast.

The point here is that selectivity is not everything, but it is a big issue to consider. If the column in question is not in a foreign key situation, then it is almost certainly second only to the "How often will this be used?" question in terms of issues you need to consider.

Watching Costs: When Less Is More

Remember that, while indexes speed up performance when reading data, they are actually very costly when modifying data. Indexes are not maintained by magic.
Every time that you make a modification to your data, any indexes related to that data also need to be updated.

When you insert a new row, a new entry must be made into every index on your table. Remember, too, that when you update a row, this is handled as a delete and insert—again, your indexes have to be updated. But wait! There's more! (Feeling like a late-night infomercial here.) When you delete records, again, you must update all the indexes, not just the data. For every index that you create, you are creating one more block of entries that has to be updated.

Notice, by the way, that I said entries plural—not just one. Remember that a B-Tree has multiple levels to it. Every time that you make a modification to the leaf level, there is a chance that a page split will occur, and that one or more non-leaf level pages must also be modified to have the reference to the proper leaf page.

Sometimes—quite often actually—not creating that extra index is the thing to do. Sometimes, the best thing to do is choose your indexes based on the transactions that are critical to your system and use the table in question. Does the code for the transaction have a WHERE clause in it? What column(s) does it use? Is there a sorting required?

Choosing That Clustered Index

Remember that you can have only one, so you need to choose it wisely.

By default, your primary key is created with a clustered index. This is often the best place to have it, but not always (indeed, it can seriously hurt you in some situations), and if you leave things this way, you won't be able to use a clustered index anywhere else. The point here is don't just accept the default. Think about it when you are defining your primary key: Do you really want it to be a clustered index?

If you decide that you indeed want to change things—that is, you don't want to declare things as being clustered—just add the NONCLUSTERED keyword when you create your table. For example:

CREATE TABLE MyTableKeyExample
(
    Column1 int IDENTITY
        PRIMARY KEY NONCLUSTERED,
    Column2 int
)

Once the index is created, the only way to change it is to drop and rebuild it, so you want to get it set correctly up front.

Keep in mind that, if you change which column(s) your clustered index is on, SQL Server will need to do a complete resorting of your entire table. (Remember, for a clustered index, the table sort order and the index order are the same.) Now, consider a table you have that is 5,000 characters wide and has a million rows in it. That is an awful lot of data that has to be reordered. Several questions should come to mind from this:

 * How long will it take? It could be a long time, and there really isn't a good way to estimate that time.
 * Do I have enough space? Figure that, in order to do a resort on a clustered index, you will, on average, need an additional 1.2 times (the working space plus the new index) the amount of space your table is already taking up. This can turn out to be a very significant amount of space if you're dealing with a large table. Make sure you have the room to do it in. All this activity will, by the way, happen in the database itself, so this will also be affected by how you have your maximum size and growth options set for your database.
 * Should I use the SORT_IN_TEMPDB option? If tempdb is on a separate physical array from your main database and it has enough room, then the answer is probably yes (there's a short sketch of what such a move looks like right after this list).
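Purely as an illustration, here is roughly what moving the clustered index might look like, reusing the MyTableKeyExample table from above (whose primary key was deliberately declared NONCLUSTERED, leaving the one allowed clustered index free):

-- Building this will physically re-sort the table by Column2.
-- SORT_IN_TEMPDB pushes the intermediate sort pages into tempdb,
-- a win only if tempdb is on a separate physical drive.
CREATE CLUSTERED INDEX CIX_MyTableKeyExample
    ON MyTableKeyExample (Column2)
    WITH (SORT_IN_TEMPDB = ON);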
The Pros

Clustered indexes are best for queries when the column(s) in question will frequently be the subject of a ranged query. This kind of query is typified by use of the BETWEEN statement or the < or > symbols. Queries that use a GROUP BY and make use of the MAX, MIN, and COUNT aggregators are also great examples of queries that use ranges and love clustered indexes. Clustering works well here because the search can go straight to a particular point in the physical data, keep reading until it gets to the end of the range, and then stop. It is extremely efficient.

Clusters can also be excellent when you want your data sorted (using ORDER BY) based on the cluster key.

The Cons

There are two situations where you don't want to create that clustered index. The first is fairly obvious—when there's a better place to use it. I know I'm sounding repetitive here, but don't use a clustered index on a column just because it seems like the thing to do. (Primary keys are the common culprit here.) Be sure that you don't have another column that it's better suited to first.

Perhaps the much bigger no-no use for clustered indexes, however, is when you are going to be doing a lot of inserts in a non-sequential order. Remember that concept of page splits? Well, here's where it can come back and haunt you big time.

Imagine this scenario: You are creating an accounting system. You would like to make use of the concept of a transaction number for your primary key in your transaction files, but you would also like those transaction numbers to be somewhat indicative of what kind of transaction it is. (It really helps your accountants with troubleshooting.) So, you come up with something of a scheme: You'll place a prefix on all the transactions indicating what sub-system they come out of. They will look something like this:

ARXXXXXX Accounts Receivable Transactions
GLXXXXXX General Ledger Transactions
APXXXXXX Accounts Payable Transactions

where XXXXXX will be a sequential numeric value.

This seems like a great idea, so you implement it, leaving the default of the clustered index going on the primary key.

At first look, everything about this setup looks fine. You're going to have unique values, and the accountants will love the fact that they can infer where something came from based on the transaction number. The clustered index seems to make sense since they will often be querying for ranges of transaction IDs.

Ah, if only it were that simple. Think about your inserts for a bit. With a clustered index, you originally had a nice mechanism to avoid much of the overhead of page splits. When a new record was inserted that was to go after the last record in the table, then, even if there was a page split, only that record would go to the new page; SQL Server wouldn't try to move around any of the old data. Now you've messed things up, though.

New records inserted from the General Ledger will wind up going on the end of the file just fine. (GL is last alphabetically, and the numbers will be sequential.) The AR and AP transactions have a major problem, though; they are going to be doing non-sequential inserts. When AP000025 gets inserted and there isn't room on the page, SQL Server is going to see AR000001 in the table and know that it's not a sequential insert. Half the records from the old page will be copied to a new page before AP000025 is inserted.

The overhead of this can be staggering.
Remember that you're dealing with a clustered index, and that the clustered index is the data. The data is in index order. This means that, when you move the index to a new page, you are also moving the data. Now imagine that you're running this accounting system in a typical OLTP environment (you don't get much more OLTP-like than an accounting system) with a bunch of data-entry people keying in vendor invoices or customer orders as fast as they can. You're going to have page splits occurring constantly, and every time you do, you're going to see a brief hesitation for users of that table while the system moves data around.

Fortunately, there are a couple of ways to avoid this scenario:

 * Choose a cluster key that is going to be sequential in its inserting. You can either create an identity column for this or you may have another column that logically is sequential to any transaction entered regardless of system.
 * Choose not to use a clustered index on this table. This is often the best option in a situation like that in this example, since an insert into a non-clustered index on a heap is usually faster than one on a cluster key.

Even though I've told you to lean toward sequential cluster keys to avoid page splits, you also have to realize that there's a cost there. Among the downsides of sequential cluster keys are concurrency issues (two or more people trying to get to the same object at the same time). It's all about balancing what you want, what you're doing, and what it's going to cost you elsewhere.

This is perhaps one of the best examples of why I have gone into so much depth as to how things work. You need to think through how things are actually going to get done before you have a good feel for what the right index to use (or not to use) is.

Column Order Matters

Just because an index has two columns, it doesn't mean that the index is useful for any query that refers to either column.

An index is considered for use only if the first column listed in the index is used in the query. The bright side is that there doesn't have to be an exact one-for-one match to every column—just the first. Naturally, the more columns that match (in order), the better, but only the first creates a definite do-not-use situation.

Think about things this way. Imagine that you are using a phone book. Everything is indexed by last name and then first name. Does this sorting do you any good if all you know is that the person you want to call is named Fred? On the other hand, if all you know is that his last name is Blake, the index will still serve to narrow the field for you.

One of the more common mistakes that I see in index construction is to think that one index that includes all the columns is going to be helpful for all situations. Indeed, what you're really doing is storing all the data a second time. The index will be totally ignored if the first column of the index isn't mentioned in the JOIN, ORDER BY, or WHERE clauses of the query.

Dropping Indexes

If you're constantly re-analyzing the situation and adding indexes, don't forget to drop indexes, too. Remember the overhead on inserts. It doesn't make much sense to look at the indexes that you need and not also think about which indexes you do not need. Always ask yourself: "Can I get rid of any of these?"

The syntax to drop an index is pretty much the same as that for dropping a table. The only hitch is that you need to qualify the index name with the table or view it is attached to:

DROP INDEX <table or view name>.<index name>
And it's gone.

Use the Database Engine Tuning Advisor

It would be my hope that you'll learn enough about indexes not to need the Database Engine Tuning Advisor, but it still can be quite handy. It works by taking a workload file, which you generate using the SQL Server Profiler (discussed in Chapter 22), and looking over that information for what indexes will work best on your system.

The Database Engine Tuning Advisor is found as part of the Tools menu of the SQL Server Management Studio. It can also be reached as a separate program item in the Start Menu of Windows (under Microsoft SQL Server 2008 ⇒ Performance Tools). As with most tuning tools, I don't recommend using this tool as the sole way you decide what indexes to build, but it can be quite handy in terms of making some suggestions that you may not have thought of.

Maintaining Your Indexes

As developers, we often tend to forget about our product after it goes out the door. For many kinds of software, that's something you can get away with just fine. You ship it and then you move on to the next product or next release. However, with database-driven projects, it's virtually impossible to get away with. You need to take responsibility for the product well beyond the delivery date.

Please don't take me to mean that you have to go serve a stint in the tech support department. I'm actually talking about something even more important: maintenance planning.

There are really two issues to be dealt with in terms of the maintenance of indexes:

 * Page splits
 * Fragmentation

Both are related to page density and, while the symptoms are substantially different, the troubleshooting tool is the same, as is the cure.

Fragmentation

We've already talked about page splits quite a bit, but we haven't really touched on fragmentation. I'm not talking about the fragmentation that you may have heard of with your O/S files and the defrag tool you use, because that won't help with database fragmentation.

Fragmentation happens when your database grows, pages split, and then data is eventually deleted. While the B-Tree mechanism is really not that bad at keeping things balanced from a growth point of view, it doesn't really have a whole lot to offer as you delete data. Eventually, you may get down to a situation where you have one record on this page, a few records on that page—a situation where many of your data pages are holding only a small fraction of the amount of data that they could hold.

The first problem with this is probably the first you would think about—wasted space. Remember that SQL Server allocates an extent of space at a time. If only one page has one record on it, then that extent is still allocated. In the case of the empty pages in the extent, SQL Server will see those pages as available for reuse in the same table or index, but if, for example, that table or index is decreasing in size, the free pages in the extent will remain unused.

The second problem is the one that is more likely to cause you grief: Records that are spread all over the place cause additional overhead in data retrieval. Instead of just loading up one page and grabbing the 10 rows it requires, SQL Server may have to load 10 separate pages in order to get that same information. It isn't just reading the row that causes effort; SQL Server has to read that page in first. More pages equals more work on reads.

That being said, database fragmentation does have its good side. OLTP systems positively love fragmentation.
Any guesses as to why? Page splits. Pages that don't have much data in them can have data inserted with little or no fear of page splits.

So, high fragmentation equates to poor read performance, but it also equates to excellent insert performance. As you might expect, this means that OLAP systems really don't like fragmentation, but OLTP systems do.

Identifying Fragmentation

SQL Server has always had commands to help you identify just how full the pages and extents in your database are. In SQL Server 2005, Microsoft greatly expanded the options and, in particular, the usability of management tools for indexes, and those increased options continue to become more mainstream as we move into the SQL Server 2008 era and slowly become less concerned about compatibility with SQL Server 2000 and earlier. We can use the information provided by these commands and tools to make some decisions about what we want to do to maintain our database.

sys.dm_db_index_physical_stats

The sys.dm_db_index_physical_stats function is one of several metadata functions that were added back in SQL Server 2005. (There is a discussion of these in Appendix [CHECK].) The idea behind these and similar metadata functions is to allow developers and administrators alike more flexible access to data on the condition of our server, the database, and the tables and indexes within. Whereas before we were stuck with different functions within the Database Consistency Checker (DBCC), which gave free-form output that was difficult to use programmatically (you were pretty much stuck parsing the results to find what you need), we now have both scalar and table-valued functions, as appropriate, that return usable data to us that we can build conditions around, grab values to use in variables, and otherwise manipulate as discrete pieces of data. When talking about indexes, the metadata function we're most likely interested in is sys.dm_db_index_physical_stats. It is a table-valued function that requires several parameters, and the syntax looks like this:

sys.dm_db_index_physical_stats (
    { <database id> | NULL | 0 | DEFAULT }
    , { <object id> | NULL | 0 | DEFAULT }
    , { <index id> | NULL | 0 | -1 | DEFAULT }
    , { <partition number> | NULL | 0 | DEFAULT }
    , { LIMITED | SAMPLED | DETAILED | NULL | DEFAULT }
)

Again, this is a table-valued function, so you need to use it in conjunction with a SELECT statement. Let's look at the input parameters individually.

Parameter | Description
---|---
Database ID | SQL Server's internal identifier for the database containing the tables and indexes you want physical statistics for. Use the DB_ID() function to easily retrieve the database id for your database. The default for this parameter is NULL (technically the same as 0 in use), which means supply information for all databases.
Object ID | The internal identifier for the particular object you want physical statistical information on. Use the OBJECT_ID() function to easily retrieve the object id for the table or view you're interested in. The default is NULL (again, functionally the equivalent of 0) and implies that you want data for all objects in the database(s) you've indicated.
Index ID | The internal identifier for the particular index you're interested in physical statistics for. Fetching a particular index identifier is more of a challenge, as there is no system function to retrieve it. (You would need to query it from sys.indexes using the name and the object id it belongs to.) As with the other parameters so far, this one has a default—in this case, -1.
Unlike previous parameters, this is not functionally equivalent to 0 (which is only valid if the table is built on a heap, and then indicates you want data on the heap itself).
Partition number | For the vast majority of tables, there is only going to be one partition (and its number will be 1). The default is NULL, which returns all partitions and is functionally equivalent to 0.
Mode | Determines the level of scanning performed to establish the statistics returned. Scan modes include LIMITED, SAMPLED, and DETAILED, in order of increasing accuracy but also increasing overhead (and slower response).

What is returned is a very wide table with an array of different physical statistics on your index or table. We won't address every one of them here, but let's take a look at some of the highlights:

Column | Description
---|---
index_type_desc | Indicates the nature of the index this row relates to. If the result is HEAP or CLUSTERED INDEX, then it relates to the physical data for the table. Other possible results include NONCLUSTERED INDEX, PRIMARY XML INDEX, XML INDEX, and SPATIAL INDEX.
index_depth | Number of levels to the index. If it's a heap or a set of LOB pages, then this will always be 1; otherwise, it will represent how many levels there are in the index. (For example, back in Figure 6.7, there are three levels to the non-clustered index.)
index_level | This one is somewhat counterintuitive in that it counts from the bottom of the index up. The leaf level of the index will be zero (also zero for a heap or LOB), and the number will go up as one navigates backwards up the tree. This value is only supplied when the mode is DETAILED.
avg_fragmentation_in_percent | This indicates the degree of fragmentation in the index tree based on pages or extents that are out of order (the pointer to the logical next page is not the same as the physical next page). You're usually looking for a low number here, though how low depends on the specifics of your row makeup and the purpose of the index.
avg_record_size_in_bytes | Just what it says it is. The average size of a record in the index. This can be a highly useful number when doing space planning (if I add another 100,000 rows, how much space will it take up?).
record_count | Another somewhat tricky one. This value will generally match what you would get from a SELECT COUNT(*). The exception is when dealing with a heap that has forwarding records. Forwarding records occur when a record is written onto a page and is later updated such that the row no longer fits on that page (so SQL Server stores a pointer to where the data now is instead).

Let's take a look at a quick example of using this system function. Imagine for a moment that we want to see the fragmentation on the clustered index for the Sales.SalesOrderDetail table. We could get key pieces of information with the following query:

SELECT index_type_desc AS Type,
    index_id,
    avg_fragmentation_in_percent,
    forwarded_record_count
FROM sys.dm_db_index_physical_stats(
    DB_ID(),
    OBJECT_ID('Sales.SalesOrderDetail'),
    DEFAULT,
    DEFAULT,
    'DETAILED' )
WHERE index_id = 1
    AND index_level = 0;

Which yields a fairly straightforward result set:

Type             index_id    avg_fragmentation_in_percent  forwarded_record_count
---------------- ----------- ----------------------------- ----------------------
CLUSTERED INDEX  1           0.0810372771474878            NULL

(1 row(s) affected)

Note that I've used index_id = 1 in my WHERE clause to force it to be the clustered index.
(I would choose zero had this been on a heap.) I've chosen index_level = 0 to force it to give me information on just the leaf level of the index.

By placing an additional WHERE condition on the fragmentation percentage, I could use the information provided here to build a list of indexes that I thought required maintenance (more on that in Chapter 23).

Backward Compatibility

So we've now seen the metadata way of getting information, but what about when we're working with older releases (prior to SQL Server 2005)? The "old standby" command is actually an option for the DBCC. This is the command you're likely to find utilized in some fashion in virtually every installation today and for years to come. This is the pre-2005 way of doing things, and any pre-2005 database installation that had any maintenance going at all utilized it. What's more, there continue to be tons and tons of articles and "how-tos" on the Web that show you how to use this tool.

Before I get too far into extolling the virtues of DBCC SHOWCONTIG, let me remind you that this is the "old," and, dare I say, "inflexible" way of doing things. The system views give us many more possibilities in terms of being able to more specifically query data and manage indexes on a more global level. We explore much more of that functionality in Appendix [CHECK] at the end of this book. With that said, DBCC has done the job for years, it is the thing to use if you are monitoring indexes in a server environment that contains pre–SQL Server 2005 installations, and it is what you will likely find in much of the existing management code out there.

The syntax is pretty simple:
DBCC SHOWCONTIG
    [({<table name> | <table id> | <view name> | <view id>}
        [, <index name> | <index id>])]
    [WITH { [ ALL_INDEXES ]
        | [, FAST ]
        | [, TABLERESULTS ]
        | [, ALL_LEVELS ] }
        | [, NO_INFOMSGS ]
    ]

Some of this is self-describing (such as the table name), but I want to address the items beyond the names:

table id/view id/index id | This is the internal object id for the table, view, or index. In prior versions of SQL Server, DBCC SHOWCONTIG operated solely off this identifier, so you had to look it up using the OBJECT_ID() function prior to making your DBCC call.
---|---
ALL_INDEXES | This is one of those "what it sounds like" things. If you specify this option, you can skip providing a specific index, as all indexes will be analyzed and data returned.
FAST | This is about getting a return as fast as possible, and it therefore skips analyzing the actual pages of the index and will output only minimal information.
TABLERESULTS | A very cool feature—this one returns the results as a table rather than text. This means it's much easier to parse the results and take automated actions.
ALL_LEVELS | This really has only one relevance in SQL Server 2005: backward compatibility. What it used to do is now ignored. Basically, you can include this option and the command will still run, but it won't be any different.
NO_INFOMSGS | This just trims out informational-only messages. Basically, if you have any significant errors in your table (error level 11 or higher), then messages will still come through, but error level 10 and lower will be excluded.

As an example, to again get the information from the PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID index in the Sales.SalesOrderDetail table, we could run:

USE AdventureWorks2008;
GO

DBCC SHOWCONTIG ('Sales.SalesOrderDetail',
    PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID);

Notice the single quotation marks around the table name. These are only required because I'm using two-part naming; if I had only specified the name of the table (SalesOrderDetail), then the quotation marks would not have been required. The problem here is that, depending on how your user is set up for use of different schemas or the existence of other tables with the same name in a different schema, leaving out the schema name may generate an error or perform the operation on a different table than you expected.

The output is not really all that self-describing:

DBCC SHOWCONTIG scanning 'SalesOrderDetail' table...
Table: 'SalesOrderDetail' (898102240); index ID: 1, database ID: 7
TABLE level scan performed.
- Pages Scanned................................: 1234
- Extents Scanned..............................: 155
- Extent Switches..............................: 154
- Avg. Pages per Extent........................: 8.0
- Scan Density [Best Count:Actual Count].......: 100.00% [155:155]
- Logical Scan Fragmentation ..................: 0.08%
- Extent Scan Fragmentation ...................: 3.23%
- Avg. Bytes Free per Page.....................: 28.5
- Avg. Page Density (full).....................: 99.65%
DBCC execution completed. If DBCC printed error messages, contact your
system administrator.

Some of this is probably pretty straightforward, but the following table will walk you through what everything means:

Stat | What It Means
---|---
Pages Scanned | The number of pages in the table (for a clustered index) or index.
Extents Scanned | The number of extents in the table or index.
As an example, to again get the information from the PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID index in the Sales.SalesOrderDetail table, we could run:

USE AdventureWorks2008;
GO

DBCC SHOWCONTIG ('Sales.SalesOrderDetail',
PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID);

Notice the single quotation marks around the table name. These are only required because I'm using two-part naming; if I had specified only the name of the table (SalesOrderDetail), the quotation marks would not have been required. The problem is that, depending on how your user is set up for use of different schemas, or on the existence of other tables with the same name in a different schema, leaving out the schema name may generate an error or perform the operation on a different table than you expected.

The output is not really all that self-describing:

DBCC SHOWCONTIG scanning 'SalesOrderDetail' table...
Table: 'SalesOrderDetail' (898102240); index ID: 1, database ID: 7
TABLE level scan performed.
- Pages Scanned................................: 1234
- Extents Scanned..............................: 155
- Extent Switches..............................: 154
- Avg. Pages per Extent........................: 8.0
- Scan Density [Best Count:Actual Count].......: 100.00% [155:155]
- Logical Scan Fragmentation ..................: 0.08%
- Extent Scan Fragmentation ...................: 3.23%
- Avg. Bytes Free per Page.....................: 28.5
- Avg. Page Density (full).....................: 99.65%
DBCC execution completed. If DBCC printed error messages, contact your
system administrator.

Some of this is probably pretty straightforward, but the following table walks you through what everything means:

Stat | What It Means
---|---
Pages Scanned | The number of pages in the table (for a clustered index) or index.
Extents Scanned | The number of extents in the table or index. This will be a minimum of the number of pages divided by 8 and then rounded up. The more extents for the same number of pages, the higher the fragmentation.
Extent Switches | The number of times DBCC moved from one extent to another as it traversed the pages of the table or index. This is another fragmentation indicator—the more switches it has to make to see the same number of pages, the more fragmented you are.
Avg. Pages per Extent | The average number of pages per extent. A fully populated extent would have eight.
Scan Density [Best Count: Actual Count] | The best count is the ideal number of extent changes if everything is perfectly linked. The actual count is the actual number of extent changes. Scan density is the percentage found by dividing the best count by the actual count.
Logical Scan Fragmentation | The percentage of pages that are out of order as checked by scanning the leaf pages of an index. Only relevant to scans related to a clustered table. An out-of-order page is one for which the next page indicated in the index allocation map (IAM) is different from that pointed to by the next-page pointer in the leaf page.
Extent Scan Fragmentation | This one tells you whether an extent is physically located next to the extent that it is logically adjacent to. If not, it means that the leaf pages of your index are not physically in order (though they can still be logically in order), and this stat reports what percentage of the extents the problem pertains to.
Avg. Bytes Free per Page | The average number of free bytes on the pages scanned. This number can get artificially high if you have large row sizes. For example, if your row size was 4,040 bytes, then every page could hold only one row, and you would always have an average of about 4,020 free bytes per page. That would seem like a lot, but, given your row size, it can't be any less than that.
Avg. Page Density (full) | Average page density (as a percentage). This value takes row size into account and is, therefore, a more accurate indication of how full your pages are. The higher the percentage, the better.

Now, the question is how we use this information once we have it. The answer is, of course, that it depends.

Using the output from our SHOWCONTIG, we have a decent idea of whether our database is full, fragmented, or somewhere in between (the latter is, most likely, what we want to see). If we're running an OLAP system, then seeing our pages full would be great; fragmentation would bring on depression. For an OLTP system, we would want much the opposite (although only to a point).
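If you're on SQL Server 2005 or later, the same decision can be scripted against the metadata view we used earlier. A minimal sketch of a fragmentation "to do" list (the 30 percent threshold is a common rule of thumb, not a requirement; run this from the AdventureWorks2008 database):

SELECT OBJECT_NAME(ps.object_id) AS TableName,
       i.name AS IndexName,
       ps.avg_fragmentation_in_percent
FROM sys.dm_db_index_physical_stats
         (DB_ID('AdventureWorks2008'), NULL, NULL, NULL, 'LIMITED') ps
JOIN sys.indexes i
    ON i.object_id = ps.object_id
    AND i.index_id = ps.index_id
WHERE ps.avg_fragmentation_in_percent > 30.0
  AND ps.index_id > 0;  -- skip heaps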
So, how do we take care of the problem? To answer that, we need to look into the concept of index rebuilding and fillfactors.

DBREINDEX—That Other Way of Maintaining Your Indexes

Earlier in the chapter, we looked at the ALTER INDEX command. This should be your first-line command for performing index reorganization and managing your fragmentation levels. While I highly recommend the use of ALTER INDEX moving forward, DBREINDEX is the way things have been done in the past, and, much like DBCC SHOWCONTIG, there is far, far too much code and use out there already for me to just skip it.

DBREINDEX is another DBCC command, and the syntax looks like this:

DBCC DBREINDEX ( <'database.owner.table_name'>
[ , <index name> [ , <fillfactor> ] ] ) [ WITH NO_INFOMSGS ]

Executing this command completely rebuilds the requested index. If you supply a table name with no index name, then it rebuilds all the indexes for the requested table. There is no single command to rebuild all the indexes in a database.

Rebuilding your indexes restructures all the information in those indexes and reestablishes a base percentage to which your pages are full. If the index in question is a clustered index, then the physical data is also reorganized.

As with ALTER INDEX, the pages will, by default, be reconstituted to be full minus two records. Just as with the CREATE TABLE syntax, you can set the FILLFACTOR to be any value between 0 and 100. This number will be the percent full that your pages are once the reorganization is complete. Remember, though, that as your pages split, your data will still be distributed 50–50 between the two pages. You cannot control the fill percentage on an ongoing basis other than by regularly rebuilding the indexes.

There is something of an exception to the number matching the percent full if you use zero as your percentage: It will go to full minus two rows (it's a little deceiving—don't you think?).

We use a FILLFACTOR when we need to adjust the page densities. As we've already discussed, lower page densities (and therefore lower FILLFACTORs) are ideal for OLTP systems where there are a lot of insertions; this helps prevent page splits. Higher page densities are desirable for OLAP systems (fewer pages to read, but no real risk of page splitting due to few to no inserts).

If we wanted to rebuild the index that serves as the primary key for the Sales.SalesOrderDetail table we were looking at earlier with a fill factor of 65, we would issue a DBCC command as follows:

DBCC DBREINDEX ('Sales.SalesOrderDetail',
PK_SalesOrderDetail_SalesOrderID_SalesOrderDetailID, 65);

We can then re-run DBCC SHOWCONTIG to see the effect:

DBCC SHOWCONTIG scanning 'SalesOrderDetail' table...
Table: 'SalesOrderDetail' (898102240); index ID: 1, database ID: 7
TABLE level scan performed.
- Pages Scanned................................: 1883
- Extents Scanned..............................: 236
- Extent Switches..............................: 235
- Avg. Pages per Extent........................: 8.0
- Scan Density [Best Count:Actual Count].......: 100.00% [236:236]
- Logical Scan Fragmentation ..................: 0.05%
- Extent Scan Fragmentation ...................: 1.27%
- Avg. Bytes Free per Page.....................: 2809.1
- Avg. Page Density (full).....................: 65.29%
DBCC execution completed. If DBCC printed error messages, contact your
system administrator.

The big one to notice here is the change in Avg. Page Density. The number didn't quite reach 65 percent because SQL Server has to deal with page and row sizing, but it gets as close as it can.

Several things to note about DBREINDEX and FILLFACTOR:

 * If a FILLFACTOR isn't provided, then DBREINDEX will use whatever setting was used to build the index previously. If one has never been specified, then the fill factor will make the page full less two records (which is too full for most situations).
 * If a FILLFACTOR is provided, then that value becomes the default FILLFACTOR for that index.
 * While DBREINDEX can be run live, I strongly recommend against it. It locks resources and can cause a host of problems. At the very least, look at doing it at non-peak hours. Better still, if you're going to do it online, use ALTER INDEX instead and just do a REORGANIZE rather than a rebuild.
 * I've said it before, but it bears repeating: DBREINDEX is now considered deprecated, and you should avoid it in situations where you do not need that backward compatibility. (Use ALTER INDEX instead.)

Summary

Indexes are sort of a cornerstone topic in SQL Server or any other database environment and are not something to be taken lightly. They can drive your performance successes, but they can also drive your performance failures.

Top-level things to think about with indexes:

 * Clustered indexes are usually faster than non-clustered indexes (one could come very close to saying always, but there are exceptions).
 * Only place non-clustered indexes on columns where you are going to get a high level of selectivity (that is, 95 percent or more of the rows are unique).
 * All Data Manipulation Language (DML: INSERT, UPDATE, DELETE, SELECT) statements can benefit from indexes, but inserts, deletes, and updates (remember, they use a delete and insert approach) are slowed by indexes. The lookup part of a query is helped by the index, but anything that modifies data will have extra work to do (to maintain the index in addition to the actual data).
 * Indexes take up space.
 * Indexes are used only if the first column in the index is relevant to your query.
 * Indexes can hurt as much as they help—know why you're building the index, and don't build indexes you don't need.
 * Indexes can provide structured data performance to your unstructured XML data, but keep in mind that, like other indexes, there is overhead involved.

When you're thinking about indexes, ask yourself these questions:

Question | Response
---|---
Are there a lot of inserts or modifications to this table? | If yes, keep indexes to a minimum. This kind of table usually has modifications done through single-record lookups of the primary key—usually, this is the only index you want on the table. If the inserts are non-sequential, think about not having a clustered index.
Is this a reporting table? That is, relatively few inserts, but reports that run many different ways? | More indexes are fine. Target the clustered index to frequently used information that is likely to be extracted in ranges. OLAP installations will often have many times the number of indexes seen in an OLTP environment.
Is there a high level of selectivity on the data? | If yes, and it is frequently the target of a WHERE clause, then add that index.
Have I dropped the indexes I no longer need? | If not, why not?
Do I have a maintenance strategy established? | If not, why not?

7

More Advanced Index Structures

Alright, so we've walked through the basics of design. Heck, we've even walked through the advanced stages of traditional indexing. There are, however, some even more advanced things to think about in indexing and other storage. Among these are some of the atypical index and storage structures, including:

 * XML indexes
 * Spatial data and their associated indexes
 * User-defined data types
 * Filestreams
 * Table compression
 * Hierarchical data

In this chapter, we'll take a look at each of these. Some of it will build on things you already know (like the XML data type and methods we've already talked about extensively), and some will likely be totally new. (Indeed, the remaining items are new with SQL Server 2008.)

The choice to group these particular items into one chapter may seem a bit crazy (even to me), but the thing they have in common is pretty simple.
They are all somewhat out of the mainstream and require a bit of extra thinking to see how they work.

XML Indexes

XML indexes first appeared in SQL Server 2005, and I have to admit that I continue to be mildly amazed that Microsoft pulled it off. I've known some of that team for a very long time now, and I have a lot of confidence in them, but the indexing of something as unstructured as XML is a problem that many have tried to address, and few have done so with any real success. Kudos to the SQL Server team for pulling this one off. Enough gushing, though. I want to get down to the business of what XML indexes are all about.

Perhaps the most amazing thing about XML indexes is that they are really not all that different from indexes of more typical relational data. Indeed, the XML CREATE syntax supports all the same options you saw in the previous chapter for the CREATE INDEX statement, with the exception of IGNORE_DUP_KEY and ONLINE. Why is this such a big deal? Well, while an index would seem to be a basic structure that could support anything, the nature of what's being indexed can have a significant impact on how well traditional indexes support the underlying data. Unlike the relational data that you may be more accustomed to, XML tends to be very unstructured. It utilizes tags to identify data and can be far more variable in nature than typical relational data. The unstructured nature of XML requires the notion of "navigating" or "path" information to find a data "node" in an XML document. Indexes, on the other hand, try to provide very specific structure and order to data. This poses something of a conflict.

You can create indexes on columns in SQL Server that are of type XML. The requirements for doing this are:

 * The table containing the XML you want to index must have a clustered index on it, and that clustered index must be on the table's primary key; furthermore, the primary key cannot include more than 15 columns.
 * A "primary" XML index must exist on the XML data column before you can create "secondary" indexes (more on this in a moment).
 * XML indexes can be created only on columns of XML type (and an XML index is the only kind of index you can create on columns of that type).
 * The XML column must be part of a base table. You cannot create the index on a view, table variable, or table user-defined data type.

Creating one or more XML indexes on a table also implies an important restriction on your table: You cannot modify the primary key or (as a result) the clustered index while any XML indexes exist on the table. If you need to modify the primary key, you must first drop all the XML indexes. (You can rebuild them after the modification to the primary key is complete.)

The Primary XML Index

The first index you create on an XML column must be declared as a "primary" index. When you create a primary index, SQL Server "shreds" the XML (converting it to tabular form) and creates a new clustered index that combines the clustered index of the base table with data from whatever XML node you specify. In addition to the cluster key information, the primary XML index will also store:

 * The tag name of the node being indexed (its element or attribute name)
 * The value of the node
 * The type of the node (element, attribute, or text)
 * An internal node identifier (order information)
 * The path from the node to the document root

All this is the result of shredding the XML out into an internal table.
This internal table is how the XML data is persisted in a form that allows the traditional index model to work. You can get a look at what internal tables are being stored in your system by querying sys.internal_tables (which also shows other types of internal tables) or sys.xml_indexes. For example, we can check out the XML indexes in the AdventureWorks2008 database:

SELECT * FROM sys.xml_indexes;

This yields us several primary XML indexes and a few secondary XML indexes. (We'll look at secondary XML indexes shortly.)

object_id name index_id type
----------- ------------------------------------------------- ----------- ----
162099618 PXML_ProductModel_CatalogDescription 256000 3...
162099618 PXML_ProductModel_Instructions 256001 3...
270624007 PXML_Store_Demographics 256000 3...
1509580416 PXML_Person_AddContact 256000 3...
1509580416 PXML_Person_Demographics 256001 3...
1509580416 XMLPATH_Person_Demographics 256002 3...
1509580416 XMLPROPERTY_Person_Demographics 256003 3...
1509580416 XMLVALUE_Person_Demographics 256004 3...

(8 row(s) affected)

The result here has been truncated on the right side to allow it to fit in the book, but if you run the query for yourself, you'll see a wealth of additional information about the nature of each XML index listed.

We'll defer discussion of the shredding process for a bit and move, for the moment, to secondary XML indexes and how they differ from primary indexes.

Secondary XML Indexes

Much as non-clustered indexes point to the cluster key of the clustered index, secondary XML indexes point at the various columns that are part of the internal table of the primary XML index. Secondary XML indexes are separate, far more specialized indexes than the primary XML index they depend on (or any other index, for that matter). You can have up to 248 secondary XML indexes against a given column.

Secondary XML indexes are special in the sense that they come in three different sub-types:

 * PATH: This secondary index type focuses on providing fast access based on path-based search criteria. This index is based on the reverse path of the internal table, plus the value.
 * VALUE: As the name suggests, this index type provides an index oriented around searching for a specific node value. It can be considered the inverse of the PATH secondary index type, indexing first on the value, and then on the reverse path.
 * PROPERTY: Similar to VALUE, but oriented around multivalued scenarios.

It follows, then, that the key thing to understand with secondary XML indexes is that your index choice is targeted not just at what data you're indexing, but also at the specific types of queries you'll be issuing against that data.

Let's take a look at each of the three types.

PATH XML Indexes

The first of the secondary XML index types is targeted toward queries that search based on a specific path. If most of your queries will include a specific path in your WHERE clause, then the PATH-style secondary index is for you. While the primary XML index will greatly aid in the search for a specific path (likely via .exist()), it carries with it the overhead of the identifying information for the blob (the node information we discussed earlier). As a secondary index, the PATH-based index focuses solely on the path information and is, therefore, more compact (and therefore more efficient when simply searching).
The key to using a PATH index's efficiency is making sure that a particular path is specified. Your XPath designation of the path can also include a value (if you so choose), but including a path is what will cause this kind of index to be used.

For example, let's look at the Person.Person table in the AdventureWorks2008 database. We can issue a relatively straightforward XPath-oriented query against the table's Demographics column:

WITH XMLNAMESPACES
('http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey' AS "IS")
SELECT Demographics.query('
/IS:IndividualSurvey/IS:TotalPurchaseYTD
') AS Result
FROM Person.Person
WHERE Demographics.exist ('/IS:IndividualSurvey/IS:TotalPurchaseYTD') = 1;

The search for a specific path creates a situation that is optimal for the PATH secondary index type. To see that SQL Server is indeed using it, check out the query plan, as shown in Figure 7.1.

Figure 7.1

Indeed, we can see that the XMLPATH_Person_Demographics index is in use.

VALUE XML Indexes

This one is all about ordering by—wait for it...—value. You knew it was coming, right?

The columns used for this index are based on the primary node value and the path. The type of the value is not important. The important thing to remember when considering this index is that you may not know the entire path. In particular, you may know only the element and/or attribute that actually contains the value.

Since the index is primarily focused on the value, it finds a match there first, and then concerns itself with whether or not the path matches. The path is actually stored in reverse order, which allows you to find a match to the leaf portion of a path regardless of what is the parent to the partial path you supply.

PROPERTY XML Indexes

PROPERTY indexes are meant to combine values from two different kinds of columns—whatever the primary key is and, of course, the XML column. PROPERTY indexes are first oriented around the primary key of the row, and then around the path (again, stored in reverse) and value of individual XML nodes. As you might surmise from the first value being the primary key for the row, this index is useful only for situations where the primary key is known. After that, it acts somewhat like the PATH secondary index type.

Creating XML Indexes

So, now that we have all the different types of XML indexes figured out, we're probably set to see how to create them. Much of the syntax isn't that different from creating standard indexes, but there are a few twists. The overall syntax looks like this:

CREATE [ PRIMARY ] XML INDEX <index name>
ON <object> ( <xml column name> )
[ USING XML INDEX <primary xml index name>
[ FOR { PATH | VALUE | PROPERTY } ] ]
[ WITH ( PAD_INDEX = { ON | OFF }
| FILLFACTOR = <fillfactor>
| SORT_IN_TEMPDB = { ON | OFF }
| IGNORE_DUP_KEY = OFF
| STATISTICS_NORECOMPUTE = { ON | OFF }
| DROP_EXISTING = { ON | OFF }
| ONLINE = OFF
| ALLOW_ROW_LOCKS = { ON | OFF }
| ALLOW_PAGE_LOCKS = { ON | OFF }
| MAXDOP = <max degree of parallelism>
[ ,...n ]
) ][ ; ]

Notice that both the IGNORE_DUP_KEY and ONLINE options have only one valid setting. I honestly can't tell you why Microsoft decided to keep them in there at all (I suspect just to keep it more in line with the basic CREATE INDEX statement, but it still seems odd), but they are there for now. (Perhaps they will have additional options later.) As you can see, most of the other options are the same, so let's focus on the main syntax items.

First, XML indexes must be explicitly called out in the CREATE INDEX line via the XML keyword. The PRIMARY keyword is necessary only for primary XML indexes; the XML index is otherwise assumed to be a secondary index.

Moving on, notice that we do not have the option of supplying multiple columns. Instead, we just name the one column of type xml we plan on indexing.

The USING clause is mutually exclusive with the PRIMARY keyword and applies only to (and, in that case, is required for) secondary indexes. Use this clause along with the FOR keyword to indicate the type of secondary index you want to create (PATH, VALUE, or PROPERTY).

So, were we to put this to use, we might create a primary XML index on the Production.ProductModel table:

CREATE PRIMARY XML INDEX PXProductModelInstructions
ON Production.ProductModel (Instructions)
WITH (PAD_INDEX = OFF,
SORT_IN_TEMPDB = OFF,
DROP_EXISTING = OFF,
ALLOW_ROW_LOCKS = ON,
ALLOW_PAGE_LOCKS = ON
);

Note that, if you want to actually run the previous script, you would need to drop the existing XML index that came with the AdventureWorks2008 sample.

Or, to create a secondary index utilizing the primary we just created, we would do something like:

CREATE XML INDEX SXProductModelInstructionsPATH
ON Production.ProductModel (Instructions)
USING XML INDEX PXProductModelInstructions
FOR PATH
WITH (PAD_INDEX = OFF,
SORT_IN_TEMPDB = OFF,
DROP_EXISTING = OFF,
ALLOW_ROW_LOCKS = ON,
ALLOW_PAGE_LOCKS = ON
);

Note that either of the preceding CREATE XML INDEX statements will fail in AdventureWorks2008 because the sample already has a default primary XML index. The second of the two examples will run if you change the USING clause to reference the existing primary XML index (PXML_ProductModel_Instructions).

Again, the syntax differences versus standard indexes are relatively subtle in nature.
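As a quick sanity check after creating secondary indexes, sys.xml_indexes will show each index along with its sub-type. A small sketch (assuming the example index names above were actually created):

SELECT xi.name,
       xi.secondary_type_desc  -- PATH, VALUE, or PROPERTY; NULL for the primary XML index
FROM sys.xml_indexes xi
WHERE xi.object_id = OBJECT_ID('Production.ProductModel');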
User-Defined Data Types

Ah, the awesome potential of user-defined data types, or UDTs.

This is a little bit of the classic "What came first, the chicken or the egg?" thing. You see, part of what has made UDTs interesting since SQL Server 2005 is the addition of .NET objects as a possible source for a UDT. For various reasons, however, I'd prefer to hold off on the .NET side of things until we're talking about the procedural side in Chapter 10.

With this in mind, I've decided to compromise a bit. We'll start our discussion of UDTs here and finish the .NET side of it in Chapter 10 (call it a little bit of both worlds...).

So, with the organizational stuff disposed of, let's address the issue of what exactly a user-defined data type is. If you're a true SQL Server "Pro," then the fundamentals of a UDT may well be old news. After all, UDTs have been part of SQL Server since long before anyone had even thought of .NET. Then again, until the .NET era, they had only minimal value. Even with the advent of .NET, the flexibility of .NET data types brings an exorbitant amount of complexity and requires you to turn on some things in the server configuration that may violate some security policies. (Many DBAs see a great deal of risk in turning on SQLCLR and .NET for SQL Server.) You may well have just ignored UDTs altogether, but given the changes in UDTs for SQL Server 2008, it's worth a look at UDTs as they are today.

Classic UDTs

The classic UDT is founded on existing data types. Indeed, it can be considered nothing more than an alias for the base types already found in SQL Server. Historically, it has been used primarily to aid consistency in a frequently used attribute, or in conjunction with rules and defaults (which can be bound directly to a UDT and apply anywhere the classic UDT is used).

Let's start with a fairly basic example—an account number. The AdventureWorks2008 database makes use of a user-defined data type called AccountNumber that is created from the base type of nvarchar—in this case, an nvarchar(15). Using the AccountNumber UDT rather than directly using the nvarchar(15) base type ensures consistency across all instances where you want to make use of the account number concept.

The syntax for creating classic UDTs (UDTs that source from built-in data types and are not tabular in nature) is pretty simple:

CREATE TYPE [<schema name>.]<type name>
FROM <base type>
[ ( precision [ , scale ] ) ]
[ NULL | NOT NULL ]

So, the AccountNumber data type used in AdventureWorks2008 would look like this:

CREATE TYPE dbo.AccountNumber
FROM nvarchar(15) NULL;

As you can see, there isn't a whole lot to creating what amounts to a simple alias to an existing type. You may see these relatively simple types extended via the use of rules and defaults, but I recommend against this, as Microsoft has said for the last four releases that rules and defaults are considered deprecated and will be removed from the product at some point.

It's probably worth noting that I've been told by members on the team that Microsoft is getting more serious about "truly" following up on removing deprecated features. When SQL Server 11 (currently code-named Kilimanjaro) eventually ships, expect to see some long-deprecated features finally disappear from the product.
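Back to our AccountNumber example for a moment: once created, the alias type can be used anywhere the built-in type could be. A minimal sketch (the table itself is hypothetical, not part of AdventureWorks2008):

CREATE TABLE dbo.CustomerAccount
(
    CustomerID    int NOT NULL PRIMARY KEY,
    AccountNumber dbo.AccountNumber  -- resolves to nvarchar(15) NULL
);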
.NET UDTs

I'm going to defer most of our discussion of .NET-based UDTs until Chapter 10, when we will fully explore .NET-based development, but at least a cursory look at .NET-based UDTs has to be included here, if for no other reason than context.

As you might expect, .NET UDTs make use of a .NET assembly to implement a custom data type. These have been around since .NET first appeared in SQL Server as part of SQL Server 2005, and they can implement some very complex custom types. How complex? Well, complex enough that the new geospatial data types discussed a bit later in this chapter were implemented using .NET. Indeed, they are essentially the same as any .NET data type you might develop and deploy yourself, save that they are flagged as a system type and do not require explicitly enabling .NET in order to be used.

Just to make sure we have a copy of our syntax examples in the same place, here's the syntax for .NET UDTs:

CREATE TYPE [<schema name>.]<type name>
EXTERNAL NAME <assembly name>[.<class name>]

So, as a preview of the example we'll use in Chapter 10 (don't actually execute this code—we'll get to it in due time!), we could add a .NET assembly called ComplexNumber with code such as:

CREATE TYPE ComplexNumber
EXTERNAL NAME [ComplexNumber].[Microsoft.Samples.SqlServer.ComplexNumber];

Again, we will more fully explore .NET-based UDTs in Chapter 10, including creating our own type.

Tabular UDTs

These are new in SQL Server 2008, and they are the start of something big that I suspect will evolve over the next few releases.

What are they? Well, largely what they sound like: a user-defined data type that accepts tabular data. You create them with a syntax that mostly matches the syntax used for table-valued variables or in the CREATE TABLE command. After creation, you can then utilize them in scripts or, perhaps more importantly, as a table-valued parameter in a stored procedure.

Note that I did not mention using them as a type you can use within a table. Unlike other user-defined data types, tabular UDTs cannot be embedded within other tabular objects such as a table variable or table object.

As of this writing, Microsoft has not yet made a commitment about how far they are going to take tabular UDTs. Right now, it would appear that we are on a path taking us closer and closer to a more fully functioning tabular UDT similar to that found in competitive products—such as Oracle—where you have long been able to embed a table within a table.

For now, we're going to focus on how exactly we create tabular UDTs. In Chapter 10, we'll examine the most likely use for tabular UDTs: table-valued parameters for stored procedures.
Creating a Table User-Defined Data Type

Creating a table user-defined data type works as something of a combination of the classic CREATE TYPE and table variable syntax. The tabular CREATE TYPE syntax looks like this:

CREATE TYPE [<schema name>.]<type name>
AS TABLE
( { <column name> <data type>
[ COLLATE <collation name> ]
[ NULL | NOT NULL ]
[ DEFAULT <constant expression> ]
[ IDENTITY [ ( <seed>, <increment> ) ] ]
[ ROWGUIDCOL ]
[ <column constraint> [ ...n ] ]
| <table constraint> }
[ ,...n ]
)[;]

As an example, we're going to create a user-defined table type that will represent addresses. Later in the book (in Chapter 10), we'll see how we can pass an instance of this data type into a stored procedure or function for further processing.

Long ago, it seemed one address was generally enough for most people. The majority of systems out there stored a single address for most business entities they worked with. Today, however, one doesn't seem to be enough. Between dealing with companies that have multiple locations, and even individuals deciding to receive bills at one location but ship to a different location, many business entities we work with have multiple addresses. The AdventureWorks2008 database represents this by separating addresses out into their own table (Person.Address). We've decided that we want to represent this notion of an address in a consistent way across our systems, so we create our custom type:

USE AdventureWorks2008;
GO

CREATE TYPE Person.Address
AS TABLE(
AddressID int NULL,
AddressLine1 nvarchar(60) NOT NULL,
AddressLine2 nvarchar(60) NULL,
City nvarchar(30) NOT NULL,
StateProvinceID int NOT NULL,
PostalCode nvarchar(15) NOT NULL,
SpatialLocation geography NULL
);

There are a host of items to notice about this script:

 * I used the exact name of an existing object in the database (there is a table called Person.Address). The type can be considered much like the difference between a class and an object—that is, a type is a definition, and a table is an actual instance of something (though the table is not an instance of the type definition the way an object is an instance of a class).
 * The syntax for creating the actual definition is very similar to the CREATE TABLE syntax.
 * The layout maps very closely to the Person.Address table in order to support moving data between the two relatively easily.

Note that I created my user-defined type with the same name as a table just to prove the point. I would not recommend duplicating names in practice, as it is likely to lead to far more confusion than it is worth.

With my type now created, I can reference it as a valid data type for variable declarations, or for function or sproc parameters (more on the latter two in Chapter 10).

Let's further our example just a bit by utilizing our new type:

DECLARE @Address Person.Address;

INSERT INTO @Address
(AddressID,
AddressLine1,
City,
StateProvinceID,
PostalCode
)
VALUES
(
1,
'My first address',
'MyTown',
1,
'21212'
),
(
1,
'My second address',
'OtherTown',
5,
'43434'
),
(
1,
'My third address',
'MyTown',
1,
'21214'
);

SELECT *
FROM @Address;

Notice that, with a simple declaration of our Person.Address user-defined type, we gained access to all the columns for that tabular type. We're able to insert rows and select them back out:

(3 row(s) affected)

AddressID AddressLine1
----------- --------------------
1 My first address...
1 My second address...
1 My third address...

(3 row(s) affected)

Again, we'll take a further look at uses for this in Chapter 10 as part of our table-valued parameter discussion.
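Incidentally, table types get their own catalog view, so you can confirm what you've created. A quick sketch:

SELECT SCHEMA_NAME(schema_id) AS SchemaName,
       name,
       type_table_object_id  -- the internal object backing the table type
FROM sys.table_types;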
Dropping a User-Defined Type

It's fairly intuitive, but just to make sure we've addressed the point: All varieties of UDTs are dropped using the very common DROP syntax:

DROP TYPE [<schema name>.]<type name>[;]

Hierarchical Data

This area is somewhat more revolutionary than most of the other items we discuss in this chapter. While spatial data has been around in other products for some time now (and the lack of spatial support had been something SQL Server was often derided for), the addition of the new HierarchyID data type and its embedded functions brings to the database a new realm of functionality that was somewhat unexpected.

So what is HierarchyID? Simply put, it is a special data type that is optimized for representing a single node in a hierarchical structure (usually a tree). The real horsepower here is in the idea that it is able to analyze the concepts of hierarchical ancestry (parent/child relationships) as well as understand the notion of depth and siblings (for example, all departmental managers versus operational staff or executives).

A given instance of HierarchyID data does not represent a tree. Instead, it is merely information about the properties of a single node of a tree, including that node's ancestry. Only by making use of a collection of related nodes can one represent a true hierarchy tree.

The need for hierarchical representation of data is not new. Indeed, the version of AdventureWorks that shipped back in SQL Server 2005 included a fairly typical modeling of one of the more common hierarchical problems: employee reporting structures. (Indeed, we created a similar mapping when we created the Employee2 table to show off CTEs back in Chapter 3.) The typical solution was what is called a unary relationship—that is, a table that has a foreign key to itself. "Kits" are another common hierarchical problem. (For example, a part that is nothing more than a collection of other parts, with some of those, perhaps, being kits with other parts.) XML is naturally hierarchical and has also been a frequent solution for storing hierarchies, even for non-XML applications.

Let's take a look at how it works, and then we'll explore some of the functionality that comes with the data type and its associated methods.

Understanding Depth Versus Fanout

Before getting too much into the structure of the HierarchyID type and the methods and index type that support it, it is important to understand the concept of depth (or level) versus the idea of what is called fanout (for the moment, think of fanout as being horizontal).

The depth—or levels deep—of a hierarchy node is based on the number of direct and indirect ancestor nodes. Note that this yields us a zero-based set—that is, the root node of a hierarchy has a level of zero, its direct descendants have a level of 1, and so on. So, for example, the node labeled E in Figure 7.1 has a level of 2, while the root node labeled A has a level of zero.

The HierarchyID type gives us a special method call (not surprisingly called GetLevel) to tell us what level a given node is at within a hierarchy. Levels are used primarily for comparisons with siblings, and they will become important later in the chapter when we discuss breadth-first indexes on HierarchyID columns.

The fanout of a hierarchy refers to how many children a given parent node has. In a tree representation such as the one in Figure 7.1, you can think of the fanout as governing the width of the hierarchy. In Figure 7.1, the E node has a fanout of 3, the B node has a fanout of 4, and the A node has a fanout of 11. In our next section, we'll take a look at how we store all this information.
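First, though, since fanout is just "the number of direct children," it is easy to compute with the methods we'll meet shortly. A small sketch against the employee hierarchy (this query is my own, not from the text; GetAncestor(1) returns a node's parent):

SELECT e.OrganizationNode.ToString() AS Node,
       (SELECT COUNT(*)
        FROM HumanResources.Employee c
        WHERE c.OrganizationNode.GetAncestor(1) = e.OrganizationNode) AS Fanout
FROM HumanResources.Employee e
WHERE e.OrganizationLevel <= 1;  -- keep the output manageable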
HierarchyID Type Structure

The HierarchyID data type is stored internally as a variable-length binary representation of a node. Indeed, if we retrieve an instance of HierarchyID type data, it will come back in a hexadecimal representation. So, for example, if we execute:

SELECT e.OrganizationNode
FROM HumanResources.Employee e;

it gives us back some numbers in hex:

OrganizationNode
---------------------------
0x
0x58
0x68
...
...
0x85EBA6
0x85EBAA
0x85EBAE

(290 row(s) affected)

You can use the ToString method (we'll explore the various method calls in the next section) to render it a bit more human-readable:

SELECT e.OrganizationNode.ToString() AS OrganizationNode
FROM HumanResources.Employee e;

This gets us back something that, at first blush, probably doesn't seem all that much more readable:

OrganizationNode
/
/1/
/2/
/3/
/4/
/5/
/6/
/1/1/
/2/1/
/2/2/
...
...
/4/3/1/9/
/4/3/1/10/
/4/3/1/11/

(290 row(s) affected)

There are several items of note in this string representation:

 * Each forward slash (/) separates a representation of a node in the current node's lineage.
 * The numbers are largely arbitrary. You can assign them yourself, or SQL Server can find a place to insert them for you. If you have SQL Server generate the number for you, then the number will be the next available whole number unless you explicitly state that you want the new value to fall between existing nodes, in which case you need to supply the points you want the value to lie between. (More on this when we look at the GetDescendant method in the next section.) Only by explicitly managing the values for a given level can you provide any form of ranking within that level.
 * The numeric order does not matter within a specific node; only the position in the series matters. Each set of numbers matters only within that particular level. (Notice that the first 1 in /1/1/ is a different item than the second 1 is. The number sequencing is maintained separately within each level of the hierarchy.)
 * The solitary forward slash (/) represents what is being seen as the root node. (The equivalent in hex was 0x.) This is, however, also arbitrary, as nothing prevents you from having multiple root nodes.

There is nothing inherent in the HierarchyID data type that ensures you have only a single root node. Indeed, there is no guarantee of uniqueness at all unless you explicitly enforce one (via a primary key or unique constraint).

As indicated earlier, the inner workings of the HierarchyID type represent the node in a variable-length bit field (thus all the hex output). Unlike other variable-length data types, you do not explicitly define the length. Instead, SQL Server adjusts the length as required to address the depth and fanout found in the various nodes. Microsoft hasn't really said much about the specifics of how each bit is manipulated, but based on what Microsoft has said publicly, you can figure that most installations are going to average 5–6 bytes per node.
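You can see both representations (and check the storage size claim) for yourself with a quick sketch; DATALENGTH() reports the actual bytes used:

DECLARE @node hierarchyid = hierarchyid::Parse('/4/3/1/11/');

SELECT @node              AS BinaryForm,  -- hex representation
       @node.ToString()   AS StringForm,  -- '/4/3/1/11/'
       DATALENGTH(@node)  AS BytesUsed;   -- varies with depth and fanout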
Working with HierarchyID Values—HierarchyID Methods

So, all these concepts and theories are great, but anyone who has read much from my books knows that I'm more of a fan of showing specific examples. With that in mind, we're going to use this next section to cover the various methods that are supported by the type. These work much like the methods we used with the XML data. We'll address each method based on the function it helps us perform (inserts, positions, grafting, and the like), but just to get them all into one place for reference, here is the quick list of methods and what they are used for:

 * GetAncestor(n): This fetches the node value for the ancestor node that is n levels up the tree. So, for example, GetAncestor(1) would fetch the immediate parent.
 * GetDescendant(<child1>, <child2>): This varies in behavior depending on the specific values provided for the child arguments, but the name is a bit misleading. Contrary to what you might expect, GetDescendant() is not used to fetch a specific child node, which would be hard since there may be multiple children at any level of the hierarchy. Instead, it is used to calculate a value to use in inserting a new node into the hierarchy.
 * GetLevel(): Returns the level of the current node, where the root node is considered level 0 and each child level below the root adds one to the reported level.
 * GetReparentedValue(<old root>, <new root>): This is another deceptively named one. Despite using the Get moniker in the name, GetReparentedValue() actually performs a task—that is, pruning a given node or set of child nodes from a given parent (old root) and grafting them to a new parent (new root). Do not let the use of the term "root" confuse you here. This does not need to be the primary root for the entire hierarchy, but rather the common parent that all the grafted nodes share.
 * GetRoot(): Supplies the constant value of the root node of a hierarchy (which is always 0x). Unlike most of the other methods discussed here, GetRoot() is a static method and thus callable only against the base type, not an individual node instance. We'll explore the specifics of this a little later when we discuss fetching the root.
 * IsDescendantOf(): Provides a true/false indication as to whether or not the current node is a descendant of the provided node. Note that a node is considered a descendant of itself (so if you perform an IsDescendantOf() on a given node referencing itself, the result will be true).
 * Parse(): Loosely speaking, this can be considered the opposite of the ToString() method. It receives a string-based representation of a node and converts it to the internal binary representation. Like GetRoot(), this is a static method and can be called only against the base type (for example, HierarchyID::Parse()).
 * Read(): This is a CLR or client-language-only function (it is not callable from within T-SQL) and is used to receive a stream of a HierarchyID instance in its native binary representation. In general, the database developer would utilize this only while doing extremely complex CLR programming or manipulating the hierarchy in a client language.
 * ToString(): This does what it says—that is, it converts the binary representation into a more human-readable string.
 * Write(): This is the functional opposite of Read(). Like Read(), it is CLR/.NET only and cannot be called from T-SQL. It is used to take a client-side binary representation of a HierarchyID instance and write it directly back to SQL Server without the need for a string conversion.

So, with the introductions done, let's take a look at things from a more functional standpoint and discuss the many things we might want to do with an instance of the HierarchyID data type.
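Before we work through them in earnest, here's a tiny sketch of the two static methods alongside a couple of instance methods (the values are purely illustrative):

SELECT hierarchyid::GetRoot().ToString()                     AS RootNode,  -- '/'
       hierarchyid::Parse('/1/2/').GetLevel()                AS NodeLevel, -- 2
       hierarchyid::Parse('/1/2/').GetAncestor(1).ToString() AS Parent;    -- '/1/'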
Methods Related to Retrieving a Given Level of Hierarchy Data

There are a few examples of the HierarchyID data type in the AdventureWorks2008 database (one each in the Address, Document, Employee, and ProductDocument tables). We'll focus on the Employee table here, as it is the easiest to grasp, but each of the other HierarchyID usages provides a further example of a potential hierarchy.

Let's start out with retrieving a simple user-readable selection of the Employee table. For this, we use the ToString() method we saw earlier in the section. ToString() takes no arguments and is used relative to an instance of data (usually a row or variable) of type HierarchyID. So, to formalize the syntax, it would look like this:

<instance of HierarchyID>.ToString()

To keep things manageable, we're going to limit the results using the OrganizationLevel column:

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.OrganizationLevel BETWEEN 1 AND 2;

This gives us a list of a set of parents and their respective children:

BusinessEntityID Name Hierarchy
---------------- ------------------------------ ----------
2 Duffy, Terri /1/
16 Bradley, David /2/
25 Hamilton, James /3/
234 Norman, Laura /4/
263 Trenary, Jean /5/
273 Welcker, Brian /6/
3 Tamburello, Roberto /1/1/
17 Brown, Kevin /2/1/
18 Wood, John /2/2/
19 Dempsey, Mary /2/3/
20 Benshoof, Wanida /2/4/
21 Eminhizer, Terry /2/5/
22 Harnpadoungsataya, Sariya /2/6/
23 Gibson, Mary /2/7/
24 Williams, Jill /2/8/
26 Krebs, Peter /3/1/
211 Abolrous, Hazem /3/2/
222 Wright, A. Scott /3/3/
227 Altman, Gary /3/4/
235 Barreto de Mattos, Paula /4/1/
241 Liu, David /4/2/
249 Kahn, Wendy /4/3/
262 Barber, David /4/4/
264 Conroy, Stephanie /5/1/
267 Berg, Karen /5/2/
268 Meyyappan, Ramesh /5/3/
269 Bacon, Dan /5/4/
270 Ajenstat, François /5/5/
271 Wilson, Dan /5/6/
272 Bueno, Janaina /5/7/
274 Jiang, Stephen /6/1/
285 Abbas, Syed /6/2/
287 Alberts, Amy /6/3/

(33 row(s) affected)

Notice that nothing about the numbers used in the HierarchyID column has anything to do with the other columns. BusinessEntityID is the primary key for the table, but it is not utilized in the hierarchy representation at all. Taking a look at Roberto Tamburello, we can see that he reports to Terri Duffy. The number "1" is reused at each level of the hierarchy and implies no relationship to how it might be used in other levels of the hierarchy. The number sequences we see here happen to be sequential at each level, but that is an arbitrary fact of this particular data set. There is no requirement that it be this way. (Decimals can and will occur, as can negative numbers.)

Next, take note of the OrganizationLevel column that we used in the previous query. If you look at the definition of this column in the database, you'll see that it is a computed column. Indeed, it utilizes the next method we want to look at: GetLevel().

GetLevel() takes no arguments. (It is assumed to be operating on the instance of hierarchy data you used the method with, and it passes back just how deep that node is in the hierarchy, with the root node considered to be zero, the first level of children of the root being level 1, their children being 2, and so on.)
So, the syntax looks like this:

<instance of HierarchyID>.GetLevel()

So, if we wanted to compare the OrganizationLevel we used in our previous query to what we would see using GetLevel() directly, we could rewrite it as:

SELECT e.OrganizationNode.ToString() AS Hierarchy,
OrganizationLevel,
e.OrganizationNode.GetLevel() AS ComputedLevel
FROM HumanResources.Employee e
WHERE e.OrganizationLevel BETWEEN 1 AND 2;

This would, as expected, yield identical values for OrganizationLevel and our use of GetLevel():

Hierarchy OrganizationLevel ComputedLevel
---------- ----------------- -------------
/1/ 1 1
/2/ 1 1
/3/ 1 1
...
...
/6/1/ 2 2
/6/2/ 2 2
/6/3/ 2 2

(33 row(s) affected)

We can use this in a wide variety of ways, but the most notable would be:

 * Returning all rows of data related to a certain level in a hierarchy. For example, all CxO-level employees might be found by looking for level 1 or 2, or a regional manager might be at level 3. It just depends on how you set up your hierarchy.
 * Indexing for horizontal comparisons.

Methods Related to Retrieving Parent or Child Hierarchy Data

Looking at the information for a specific level or node of a hierarchy is all well and good, but it doesn't really show off the horsepower of the HierarchyID data type. For that, you need to expand more fully out to the parent/child relationships that are the cornerstone of what hierarchical data is all about. The real centerpiece of this functionality is the pair of methods GetAncestor() and IsDescendantOf().

Let's start with the syntax for GetAncestor(), which takes a single argument:

<instance of HierarchyID>.GetAncestor(n)

The method is assumed to be operating against the instance of hierarchical data it was called as a method of, and it uses the single argument to indicate how many levels up the tree you want to go.

The value returned by GetAncestor() is of type HierarchyID, which means you can further extend the GetAncestor() call with other HierarchyID methods.

Let's see what we get if we fetch a few different ancestor levels for the employee named Roberto Tamburello, whom we saw in one of our first hierarchy example queries. You may recall his hierarchy node looked like this:

/1/1/

So let's run a few instances of the GetAncestor() method to see what gets returned:

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy,
e.OrganizationNode.GetAncestor(0).ToString() AS Self,
e.OrganizationNode.GetAncestor(1).ToString() AS OneUp,
e.OrganizationNode.GetAncestor(2).ToString() AS TwoUp,
e.OrganizationNode.GetAncestor(3).ToString() AS TooFar
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.BusinessEntityID = 3;

If you look at this closely, you'll see that I'm fetching the same node several times but, with each separate column, I'm stepping further up the hierarchy until I've stepped beyond the level that I happen to know this particular piece of data lies at. Run this, and we get back a single row:

BusinessEntityID Name Hierarchy Self OneUp TwoUp TooFar
---------------- -------------------- --------- ----- ----- ----- ------
3 Tamburello, Roberto /1/1/ /1/1/ /1/ / NULL

(1 row(s) affected)

Several things are of note in this result:

 * Although it provides little value, zero was a valid argument (it returns the calling node).
 * Each increase in the argument to GetAncestor() moved us further up the hierarchy tree.
 * Using a value that goes beyond the root of the hierarchy returns a NULL.
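GetAncestor(1) is also the natural join condition for "who reports to whom" questions. A quick sketch (my own query, matching the tables above) that pairs each employee with his or her direct manager:

SELECT p.LastName + ', ' + p.FirstName  AS Employee,
       mp.LastName + ', ' + mp.FirstName AS Manager
FROM HumanResources.Employee e
JOIN HumanResources.Employee m
    ON m.OrganizationNode = e.OrganizationNode.GetAncestor(1)  -- manager's node = employee's parent
JOIN Person.Person p
    ON p.BusinessEntityID = e.BusinessEntityID
JOIN Person.Person mp
    ON mp.BusinessEntityID = m.BusinessEntityID;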
This is great for going up the hierarchy tree, but what if we want to return the children, or simply know whether a specific child has a given parent anywhere in its ancestry? For that, the right answer depends on whether we know how far down the chain we want to go (all reports or only direct reports). If it is all reports, we have IsDescendantOf(). This one takes a single node as an argument and returns a Boolean result that is, as you might expect, a simple true/false as to whether the node you pass into the method has the node you are calling the method from as a child (directly or indirectly). The syntax looks like this:

<instance of HierarchyID>.IsDescendantOf(<node>)

For this, let's look at how it can be used in either direction. For example, let's say we want to return all superiors of Mr. Tamburello. This translates to us wanting to return any row with a node that considers Mr. Tamburello's node to be a descendant. For example:

DECLARE @ChildNode HierarchyID;

SELECT @ChildNode = OrganizationNode
FROM HumanResources.Employee e
WHERE e.BusinessEntityID = 3;

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE @ChildNode.IsDescendantOf(e.OrganizationNode) = 1;

First, note that we were able to move a node into a variable of type HierarchyID and still make a method call from that variable. Why use a query like this one instead of using GetAncestor()? If you think about it for a moment, I suspect you'll see that it has to do with how open-ended the question was. GetAncestor() really expects you to know how many ancestors you have. You could figure that out using GetLevel() or rig up some test for NULL values, but that is far more complicated than simply returning all rows where IsDescendantOf() is true.

BusinessEntityID Name Hierarchy
---------------- ------------------------------ ----------
1 Sánchez, Ken /
2 Duffy, Terri /1/
3 Tamburello, Roberto /1/1/

(3 row(s) affected)

Much as a node can be considered its own ancestor (with a level input of zero), a node is also considered its own descendant.

That showed us how to check what ancestors are above us, but what about the children below us? For that, we can ask an even more open-ended question. For example, listing all people who report directly or indirectly to Mr. Tamburello requires a simple reversal of the WHERE condition in our previous query:

DECLARE @ChildNode HierarchyID;

SELECT @ChildNode = OrganizationNode
FROM HumanResources.Employee e
WHERE e.BusinessEntityID = 3;

SELECT e.BusinessEntityID,
p.LastName + ', ' + p.FirstName AS Name,
e.OrganizationNode.ToString() AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.OrganizationNode.IsDescendantOf(@ChildNode) = 1;

And just that quick, we have all of Mr. Tamburello's reports:

BusinessEntityID Name Hierarchy
---------------- ------------------------------ ----------
3 Tamburello, Roberto /1/1/
4 Walters, Rob /1/1/1/
5 Erickson, Gail /1/1/2/
6 Goldberg, Jossef /1/1/3/
7 Miller, Dylan /1/1/4/
8 Margheim, Diane /1/1/4/1/
9 Matthew, Gigi /1/1/4/2/
10 Raheem, Michael /1/1/4/3/
11 Cracium, Ovidiu /1/1/5/
12 D'Hers, Thierry /1/1/5/1/
13 Galvin, Janice /1/1/5/2/
14 Sullivan, Michael /1/1/6/
15 Salavaria, Sharon /1/1/7/

(13 row(s) affected)

To get his direct reports, we use pretty much the same query but return to the GetAncestor() method:

DECLARE @ChildNode HierarchyID;

SELECT @ChildNode = OrganizationNode
FROM HumanResources.Employee e
WHERE e.BusinessEntityID = 3;

SELECT e.BusinessEntityID,
LEFT((p.LastName + ', ' + p.FirstName), 30) AS Name,
LEFT(e.OrganizationNode.ToString(), 10) AS Hierarchy
FROM HumanResources.Employee e
JOIN Person.Person p
ON e.BusinessEntityID = p.BusinessEntityID
WHERE e.OrganizationNode.GetAncestor(1) = @ChildNode;

This limits us to just the specific level below us (or, as GetAncestor() looks at it, the level that we are currently 1 above).
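Because a node counts as its own descendant, aggregations built on IsDescendantOf() need a small adjustment. A sketch (mine, not from the text) that counts each manager's total direct and indirect reports:

SELECT m.BusinessEntityID AS ManagerID,
       COUNT(*) - 1 AS TotalReports  -- minus 1: a node is its own descendant
FROM HumanResources.Employee m
JOIN HumanResources.Employee e
    ON e.OrganizationNode.IsDescendantOf(m.OrganizationNode) = 1
GROUP BY m.BusinessEntityID;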
Inserting New Hierarchical Data

At its most basic level, inserting new hierarchical data isn't unlike inserting any other data in SQL Server. The real trick to inserting new hierarchy nodes lies in understanding what the representation should look like for the new row.

Remember that SQL Server has no preconceived notions about your hierarchy. Indeed, SQL Server doesn't necessarily even look at it as a tree, or insist that a given node be unique. So, while SQL Server can't build your hierarchy for you, it can help you generate values based on information you provide. The functionality for this is provided by the GetDescendant() method.

GetDescendant() would probably have been more accurately named had they called it something like "GenerateHierarchyNodeRepresentation()." Its purpose is to generate a valid representation of a hierarchy node that falls between two optionally set parameters. The syntax looks like this:

<instance of HierarchyID>.GetDescendant({ <low child node> | NULL }, { <high child node> | NULL })

The low and high child nodes specify a range that the generated value must fall between (it is non-inclusive). The generated value may contain decimals or even be a negative value, as long as it falls within the specified range. While both arguments are required, you can explicitly specify NULL as the value for either, effectively putting no bound on that side of the generation.

 * If the parent is NULL, returns NULL.
 * If the parent is not NULL, and both the low and high children are NULL, returns a child of the parent.
 * If the parent and the low child are not NULL, and the high child is NULL, returns a child of the parent greater than the low child.
 * If the parent and the high child are not NULL, and the low child is NULL, returns a child of the parent less than the high child.
 * If the parent, the low child, and the high child are all not NULL, returns a child of the parent greater than the low child and less than the high child.
 * If the low child is not NULL and not a child of the parent, an exception is raised.
 * If the high child is not NULL and not a child of the parent, an exception is raised.
 * If the low child is equal to or greater than the high child, an exception is raised.
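These rules are easy to see for yourself with a few literal values. A tiny sketch (the expected outputs in the comments follow directly from the rules above):

DECLARE @parent hierarchyid = hierarchyid::GetRoot();

SELECT @parent.GetDescendant(NULL, NULL).ToString() AS FirstChild,          -- '/1/'
       @parent.GetDescendant(hierarchyid::Parse('/1/'), NULL).ToString() AS AfterOne,  -- '/2/'
       @parent.GetDescendant(hierarchyid::Parse('/1/'),
                             hierarchyid::Parse('/2/')).ToString() AS BetweenOneAndTwo; -- '/1.1/'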
For this particular method call, we'll build a somewhat more custom example using the following script:

CREATE TABLE NodeTest
(
NodeID int NOT NULL IDENTITY PRIMARY KEY,
Node hierarchyid NOT NULL,
NodeLevel AS Node.GetLevel(),
Name varchar(50) NOT NULL
);

INSERT NodeTest
VALUES
('/', 'Manager');

DECLARE @Manager hierarchyid;

SELECT @Manager = Node
FROM NodeTest
WHERE NodeID = 1;

INSERT NodeTest
VALUES
(@Manager.GetDescendant(NULL, NULL), 'ReportAAA'),
(@Manager.GetDescendant(NULL, NULL), 'ReportBBB'),
(@Manager.GetDescendant(NULL, '/1000/'), 'ReportCCC'),
(@Manager.GetDescendant(NULL, '/1000/'), 'ReportDDD'),
(@Manager.GetDescendant('/1000/', NULL), 'ReportEEE'),
('/547/', 'ReportFFF'),
(@Manager.GetDescendant('/3/', '/547/'), 'ReportGGG'),
(@Manager.GetDescendant('/1/', '/2/'), 'ReportHHH'),
(@Manager.GetDescendant('/-10/', '/-1/'), 'ReportIII'),
('/547/345/', 'SecondLevelAA'),
('/547/346/', 'SecondLevelBB'),
('/547/345/1/', 'ThirdLevelAA'),
('/785/294/386/925/','RandomEntry');

SELECT NodeID,
       Node.ToString(),
       Name
FROM NodeTest;

With this script, we've stuck a wide variety of data in, but the output may surprise you in several places:

NodeID                           Name
----------- -------------------- ------------------------------
1           /                    Manager
2           /1/                  ReportAAA
3           /1/                  ReportBBB
4           /999/                ReportCCC
5           /999/                ReportDDD
6           /1001/               ReportEEE
7           /547/                ReportFFF
8           /4/                  ReportGGG
9           /1.1/                ReportHHH
10          /-9/                 ReportIII
11          /547/345/            SecondLevelAA
12          /547/346/            SecondLevelBB
13          /547/345/1/          ThirdLevelAA
14          /785/294/386/925/    RandomEntry

(14 row(s) affected)

Note that we were able to insert data randomly. For example, we have a fourth-level node called RandomEntry that is just that—random. It has no parent. SQL Server does nothing to enforce a tree representation or the validity of your hierarchy; it only provides the tools for making nodes work together in a way you are most likely to use to create hierarchy trees.

Next, note that we inserted decimal-based values. Our ninth entry was inserted between /1/ and /2/, so there was no way to squeeze it in there without going to decimals (and so that's exactly what SQL Server did).

Continuing on, we have negative values. Again, we provided SQL Server no real choice, as our low and high children were both negative.

Finally, we inserted duplicate rows. HierarchyID columns are not any more inherently unique than any other data type. If you want to avoid duplicate node values, you'll need to utilize a unique or primary key constraint. Note also that given a specific high and low child, GetDescendant() will generate the same value over and over again without regard to whether or not there is a duplicate (and regardless of whether there is a unique or primary key constraint). You need to plan for the values you're going to insert. For the vast majority of hierarchies, horizontal position is not important, so you can usually use whatever the max node is for the level you're inserting into (see the sketch that follows).
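Here's what that usually looks like in practice—a sketch of the "append after the current max child" pattern against our NodeTest table (the new name is mine; in production you would typically wrap this in a serializable transaction so two sessions can't generate the same node):

DECLARE @Parent HierarchyID, @MaxChild HierarchyID;

SELECT @Parent = Node FROM NodeTest WHERE NodeID = 1;

-- Find the current right-most child of the parent...
SELECT @MaxChild = MAX(Node)
FROM NodeTest
WHERE Node.GetAncestor(1) = @Parent;

-- ...and generate the next node after it (a NULL max just means "first child")
INSERT NodeTest
VALUES (@Parent.GetDescendant(@MaxChild, NULL), 'ReportJJJ');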
Moving Sub-Trees Between Parents

The HierarchyID data type also provides for the concept of a coordinated prune and graft of a node and its children to a new parent using the GetReparentedValue() method. Like GetDescendant(), the name of GetReparentedValue() seems to imply that its main function is getting data back. While returning data is indeed technically what it does, GetReparentedValue() is largely about moving data around. It requires two arguments: the old "root" and the new "root." So the basic syntax looks like this:

<node>.GetReparentedValue(<old root>, <new root>)

Note that "root" in this case doesn't mean the top-level root of the entire hierarchy. Instead, it is just the root of the particular sub-tree you're wanting to move.

On its own, GetReparentedValue() does not actually make the represented move; it is merely a way to show a "what if?" scenario. Only when used with an UPDATE statement does it perform the actual move.

Let's go back to the NodeTest table we created in the previous example. We want to see what things would look like if we took the children of node /547/ and moved them to node /1001/. We can do this by combining our GetReparentedValue() method with the IsDescendantOf() method:

SELECT NodeID,
       Node.GetReparentedValue('/547/', '/1001/').ToString() AS New,
       Node.ToString() AS Old,
       Name
FROM NodeTest
WHERE Node.IsDescendantOf('/547/') = 1;

This code shows what things would look like if we pruned the /547/ sub-tree (including the /547/ node itself) and grafted all related nodes to the /1001/ node. Let's take a look at the results:

NodeID      New                  Old                  Name
----------- -------------------- -------------------- ---------------------
7           /1001/               /547/                ReportFFF
11          /1001/345/           /547/345/            SecondLevelAA
12          /1001/346/           /547/346/            SecondLevelBB
13          /1001/345/1/         /547/345/1/          ThirdLevelAA

(4 row(s) affected)

At first blush, this looks perfect, but there is one potential problem: the actual /547/ node. In our original data, we already have a /1001/ node. If we are OK with duplicates (and thus the nodes appearing to have two parents), then there is no problem here. Most of the time, however, a node is going to have one and only one parent. To change things so that we only move the children of /547/, we just need to exclude it from the result set using the WHERE clause:

SELECT NodeID,
       Node.GetReparentedValue('/547/', '/1001/').ToString() AS New,
       Node.ToString() AS Old,
       Name
FROM NodeTest
WHERE Node.IsDescendantOf('/547/') = 1
  AND Node.ToString() != '/547/';

And we've quickly cleaned our errant node out of the results:

NodeID      New                  Old                  Name
----------- -------------------- -------------------- ---------------------
11          /1001/345/           /547/345/            SecondLevelAA
12          /1001/346/           /547/346/            SecondLevelBB
13          /1001/345/1/         /547/345/1/          ThirdLevelAA

(3 row(s) affected)

With that all figured out, we're ready to actually move our data around using an UPDATE statement:

UPDATE NodeTest
SET Node = Node.GetReparentedValue('/547/', '/1001/')
WHERE Node.IsDescendantOf('/547/') = 1
  AND Node.ToString() != '/547/';

Execute this, and then reselect all the data from our NodeTest table (using the same SELECT as before):

NodeID                           Name
----------- -------------------- ------------------------------
1           /                    Manager
2           /1/                  ReportAAA
3           /1/                  ReportBBB
4           /999/                ReportCCC
5           /999/                ReportDDD
6           /1001/               ReportEEE
7           /547/                ReportFFF
8           /4/                  ReportGGG
9           /1.1/                ReportHHH
10          /-9/                 ReportIII
11          /1001/345/           SecondLevelAA
12          /1001/346/           SecondLevelBB
13          /1001/345/1/         ThirdLevelAA
14          /785/294/386/925/    RandomEntry

(14 row(s) affected)

As planned, all of the nodes that were previously descendants of /547/ have been moved under /1001/, while /547/ itself has been left in its original state.

Getting the Root of a Hierarchy

Well, it deserves mentioning I guess, but it's probably going to be a bit anti-climactic.
The last method we're going to cover here (I'm limiting myself to those that are T-SQL addressable) is for retrieving the root of a hierarchy. The odd thing about this method is that it returns a constant. Since it is a static member of the HierarchyID type, you reference it using the HierarchyID type rather than a specific instance. You can, if you so choose, skip this, as the value will always be the same ("/" if you do a ToString() on it). The syntax is straightforward, and does not vary by specific implementation:

HierarchyID::GetRoot()

As I said, there is no real magic to this one. You can always select it to see:

SELECT HierarchyID::GetRoot().ToString();

which will yield you the now familiar simple forward slash:

--------------
/

(1 row(s) affected)

Indexing Hierarchy Data

There are two likely ways for you to want your hierarchical data indexed:

 * Vertically (also referred to as "Depth First"): This is what is inherent to the base indexing of a HierarchyID column. It starts at the highest node it can find (the root node, assuming you have one), and drills downward into the tree. As shown in Figure 7.2, when it reaches a bottom node it indexes everything at that level, and then returns to the lowest not-yet-indexed node of the same general branch and starts downward again. Creating the index uses the standard index syntax we covered in the previous chapter. If you index with a HierarchyID column as your first column, then you're sure to get a depth-first traversal index.

Figure 7.2

 * Horizontally (usually referred to as a "Breadth-First" index): Creating a breadth-first index requires a little extra effort, but, before we worry about that, let's focus on what exactly it does. A breadth-first index stores siblings close together (as shown in Figure 7.3). This arrangement suits comparisons that are oriented around things like the GetAncestor() method. To create an index with this traversal order, you need to create a computed column based on the GetLevel() method, just as the AdventureWorks2008 database has for the OrganizationLevel column of the Employee table (and as we created with NodeLevel in our NodeTest table). You can then index the level column followed by the HierarchyID column to have a breadth-first index, as sketched in code below.

Figure 7.3

Other than considering the difference in depth versus breadth on first traversal, HierarchyID indexes work much like any other index in SQL Server.
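To make that concrete, here is a sketch of both flavors against the NodeTest table from earlier (the index names are mine):

-- Depth-first: the HierarchyID column leads the key
CREATE INDEX ixNodeTestDepth ON NodeTest (Node);

-- Breadth-first: the computed level column leads, keeping siblings together
CREATE INDEX ixNodeTestBreadth ON NodeTest (NodeLevel, Node);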
Performance Considerations

In general, the HierarchyID approach is going to give you the best overall performance and functionality for hierarchical data. However, as is the case with so many things in software development, there are other approaches and exceptions to the best performance rule. We discussed some of the alternatives at the beginning of our hierarchy discussion, but let's quickly explore some of the performance ramifications of each choice in table form.

I'm told that people love it when I build the "Best performance by the numbers" and "If this, then that" tables I occasionally have in my books. While I do put these things forward based on experience or other research, keep in mind that they are "best guess" suggestions as to approach. In short, they are what works for the listed situations "most of the time." Your mileage may vary, and you really should, as I say all too often, test, test, test!

Don't confuse the number of squares indicating parent/child as a good fit with an endorsement of parent/child as the likely best solution. Treat each case individually, and realize that sub-tree and ancestry queries are generally very common in hierarchies, and such queries are where the HierarchyID data type excels.

Spatial Data

The addition of spatial data handling has been one of the most touted features of SQL Server 2008. Perhaps the most interesting thing to me is that such a feature can be touted to an audience who mostly has no basis for understanding what the feature is even about.

What am I trying to get at here? Well, the geospatial data types that are new to SQL Server have been a highly requested feature for perhaps the last ten years or so. (It is one of the things often focused on by the Oracle crowd, since Oracle has handled geospatial data for some time now.) While very powerful, the feature addresses an area that many database developers don't even realize they may need, let alone actually understand.

The geospatial data types require a grasp of a style of data that is much different from other forms of data we deal with. For example, when dealing with the new HierarchyID type that we looked at in the previous section, we were working with a style of data most developers already have some concept of. (We've dealt with hierarchies such as org charts for years.) So the new thing was simply the way we went about manipulating the data; we already understood the data's nature. With geospatial data, however, many developers will find themselves asking what geospatial data is all about. For example:

 * Is this just defining a specific location (for example, an address)?
 * Is it defining the boundaries of a property?
 * Is it mapping a road?
 * How many of my customers live within 5 miles of this point?
 * How many bridges are there in Madison County?

The reality is a bit larger than any one of these questions. Indeed, it encompasses all of the concepts just listed and more. How would we have designed for these kinds of questions in the past? For some of them we could have taken a relatively simple (and low-power) approach, such as including a simple address. We might even have passed the address to an external application that kept geospatial data and utilized feedback from that application to ask bigger questions. Today's end users, however, expect more. It is, for example, nearly impossible to find a retail or restaurant chain website that does not include a "find a store near you" feature. They use geospatial functionality to supply that.

With some of these needs in mind, let's explore the two types of geospatial data (planar and geodetic) and the functionality supporting each.

Spatial Concepts

To figure out the peculiarities of the specific type of geospatial data you need to work with, we are going to first get a bit of grounding in the more commonly accepted methods of representing spatial data. As you might imagine, there are standards surrounding how spatial data should be represented. Unfortunately, there isn't just one standard (indeed, SQL Server supports several "models").

To begin understanding geospatial data, we must first grasp that there are two major models of representing geospatial data: planar (flat earth) and geodetic (round earth). Both have the same basic goal: to represent space via a set of data points (points, lines, curves).
Planar representations are generally more simplistic and, therefore, easier to grasp and manage. Planar data is often used for relatively "local" data—that is, data that does not need to cover a particularly large area and does not need to have precision adjustments for the curvature of the earth's surface. Geodetic representations offer a more "real world" depiction, and are generally used when you need to represent a larger area that is more likely to be affected by the curvature of the earth.

Planar (Flat Earth) Data

Planar data is known by several names, such as geoplanar, geometric, or flat earth. You can think of this as mapping reasonably well to the Euclidean geometry that you likely studied in high school. With planar data, everything is represented on, as you might guess, a plane or series of planes. The space being represented is assumed to be flat. This is, for smaller areas, a very practical method of looking at spatial data, as it is easy to visualize and most functionality does not require particularly complex math (for example, distance is the same as a straight line). Planar data can be represented using the sort of x, y, z data points you might have used back in geometry class, mixing collections of point data into lines and polygonal shapes; basic geometry can then represent complex shapes and still handle things like overlapping objects.

No matter how well we draw our planar mapping though, we are often representing something that is not truly flat by using points on a flat surface. This can introduce some problems. There are a number of approaches to minimizing the effects of a flat representation of a round earth. Planar representations of the earth make use of the concept of a "projection"—that is, the round earth gets projected onto a flat surface. Figures 7.4 and 7.5 are examples of some common projections of the earth.

Figure 7.4

Figure 7.5

As it turns out, these projections are generally "good enough" for many applications of spatial data. Indeed, most local maps for government tracking of properties, roads, and other needs are done using planar models such as latitude and longitude.

Be careful with your assumptions regarding latitude and longitude. While these may seem like well understood and agreed-on concepts, there are actually multiple mappings of latitude and longitude used in the world today. For example, the longitude used in the Global Positioning System (GPS) is a noticeable distance (100 meters or more, depending on what part of the earth you're standing on when you measure) different from most other representations of longitude (which are generally based on the Royal Observatory's definition of zero longitude).

Planar data is supported in SQL Server by the GEOMETRY data type (which will serve as the core type for most of our upcoming examples).

There are multiple accepted models of the earth. Make certain when supplying or receiving spatial data that the models being used are compatible or that you know how to adjust for differences between the two.

Geodetic (Round Earth) Data

Geodetic data, as shown in Figure 7.5, represents the more realistic (far more complex) model based on a round earth. Geodetic representation of data is supported by the GEOGRAPHY data type.

Under the planar data model, it is assumed that the surface of the earth is flat.
This works just fine for areas measured in relatively small distances (say, as much as several miles), but begins to fall apart as the distances grow larger. For example, when measuring the distance between Portland, Oregon and Beijing, China, the straight line used in a planar model would understate the distance by many miles. Why? Well, under the flat model, the distance is a straight line rather than the more appropriate arc (which would follow the curvature of the earth's surface). Indeed, the issue can get even more complex, as the earth is not a perfect sphere (it bulges in places), with the circumference varying by dozens of miles depending on which direction you're measuring. Geodetic data models the curve of the earth, and is supported in SQL Server via the GEOGRAPHY data type.

It is important to note that SQL Server can only represent geographic data that resides within a single hemisphere. A hemisphere can be considered as any half of a sphere—regardless of what plane you cut the sphere along.

Representing Spatial Data

There are several key notions that are common to representing both planar and geodetic data and work together to allow you to represent a given type of data in different ways. The Open Geospatial Consortium (OGC)—an organization specializing in geometric data standards—defines several formats that you can utilize to represent spatial data. SQL Server 2008 implements three of these:

 * Well Known Text (WKT): This format looks like very plain text: it simply names a series of objects (such as a point or a line) in sequence, each followed by its coordinate information.
 * Well Known Binary (WKB): Implementing the same general notion as WKT, this representation encodes the same kind of information in a binary stream rather than plain text.
 * Geography Markup Language (GML): An XML schema designed to represent geometric data. GML leverages the self-defining nature of XML data to allow additional (non-coordinate) information to be encoded along with the coordinate data. Examples of the kind of extended information that might be included with GML data would be things like a description of what is found at the location or, perhaps, sensor information (say, an ozone measurement at a specific point in Los Angeles, CA versus a similar measurement taken in Lisbon, Portugal).

We will utilize WKT for the examples in this book, but this is largely a readability decision, and does not imply that WKT is a better choice in general use (the right choice will vary by situation).

Regardless of which data representation is being utilized, the general objects required will be the same. Each format recognizes a set of three base objects that can be used individually or as a collection to represent spatial data. The objects are:

 * Point: This is a specific point in space. It has no length, no width, and no height. It is the equivalent of the spot you mark with a thumb tack on a map to represent a place you are or have been. A point requires a simple X, Y notation.
 * Line: In each of the formats SQL Server recognizes, a line is represented using a LINESTRING object. Note the relevance of the embedded term STRING. This recognizes that a line is represented as a series of two or more points. The use of multiple points in the line definition allows for the idea that the line may not be straight.
Since each segment of the line string is the shortest path between the two points, increasing the number of points representing the same conceptual line will increase the accuracy of that line's representation.
 * A line is considered "simple" if it does not cross over itself, and is considered to be a one-dimensional object even if it is curved or forms a ring (a line that has the same ending and starting point).

Note that a ring does not mean that a polygon is round—only that it creates some form of enclosed space.

 * Polygon: A polygon is defined by one or more rings (again, a line with the same starting and ending point), but declaring what would individually be a ring-forming linestring as a polygon changes how it is treated. Unlike the base ring definition, which is one dimensional and has no area, a polygon does have area. In addition, the ring that defines the outer boundary of the polygon can contain additional polygons that define areas within the outer polygon that are hollow. The space defined by these inner hollow polygons is not considered to be part of the area of the parent polygon.
 * Collection: This is a collection of the other three objects (point, line, polygon).

Regardless of which spatial data type you're using within SQL Server—GEOMETRY or GEOGRAPHY—all three of these base objects (or a collection of them) are available and can be used in any mix within a given table. For example, a table of world landmarks might store a complex polygon to represent Yellowstone Park, a line to represent the equator, and a simple point to indicate the highest point on the earth. Each of these (or a collection of them) could be represented within the same column in the same table.

In addition to these base concepts, the OGC defines a set of methods that should be supported to work with our spatial data. We'll explore some of these that are supported by SQL Server as we go through the examples, but it's worth noting that many of the methods exist for both types of spatial data (using the same name or just a slight name change) and have the same general functionality between the types. The OGC functions all start with a prefix of ST followed by a verb that indicates what the function does. They are implemented as a method for each instance of spatial data. Key examples are discussed in the following table.

Note that, for each ST method call, the spatial reference id—or SRID—must match in order to perform a valid comparison. The SRID indicates what recognized (by the European Petroleum Survey Group) spatial model this particular spatial instance is referencing. If the SRIDs of two instances do not match, then any comparison will return NULL.

Method | Use
---|---
.STArea() | Calculates the area of a spatial instance that is a polygon and accounts for hollow spaces created by contained polygons.
.STContains() | Returns a bit indicating whether the supplied instance is entirely contained within the calling instance.
.STDistance() | Provides a numeric value indicating the distance between the supplied instance and the calling instance.
.STEquals() | Returns a bit indicating whether the supplied and calling instance are qualitatively equal. Note that this does not require them to be defined in exactly the same way, but, rather, to wind up with the same result (for example, defining a square with 8 line segments, and another with 4, but with the same resulting side lengths and position would return a 1).
.STIntersects() | Returns a bit indicating whether the supplied instance crosses the calling instance at any point.
.STOverlaps() | Returns a bit indicating whether the supplied instance overlaps the calling instance (for example, a line starting within a polygon, and then ending outside of it).
.STTouches() | Returns a bit indicating whether the supplied instance touches the calling instance in any way.
.STWithin() | Returns a bit indicating whether the supplied instance lies entirely within the calling instance; if any portion of the supplied instance falls outside of the calling instance, then STWithin will return a zero.

The OGC function list is actually much, much longer and does vary somewhat between the GEOMETRY and GEOGRAPHY types, but these provide a taste of what's available and, among other things, include those supported for indexes against spatial data. You can find a more complete list in the Books Online by looking under each spatial type (GEOMETRY and GEOGRAPHY).
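For a quick taste of how these read in practice, here is a minimal sketch using two ad hoc GEOMETRY instances (the shapes and values are mine; both instances share the GEOMETRY default SRID of 0):

DECLARE @Square GEOMETRY, @Inside GEOMETRY, @Outside GEOMETRY;

SET @Square  = 'POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))';
SET @Inside  = 'POINT(5 5)';
SET @Outside = 'POINT(5 15)';

SELECT @Square.STContains(@Inside);  -- 1: the point falls inside the square
SELECT @Square.STContains(@Outside); -- 0: this one does not
SELECT @Inside.STDistance(@Outside); -- 10: measured in the units of the coordinates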
Implementing Planar Data Representations—The GEOMETRY Data Type

As previously mentioned, the data type that implements the concept of planar, or flat earth, data is called GEOMETRY. Using the GEOMETRY data type not only provides a means to contain the types of geometric object definitions (point, linestring, polygon) we discussed earlier, but also a series of methods that can be utilized against that data. Like the HierarchyID data type we discussed earlier in the chapter (and the GEOGRAPHY type we'll discuss next), GEOMETRY is implemented via a CLR user-defined type (then flagged as system so it doesn't require the security considerations that true CLR UDTs require). Like other .NET classes, you can make use of a number of properties and static members of the class.

The GEOMETRY type can accept any of the geometric types we just discussed. Let's check this out with a quick example that not only instantiates a geometric data type, but loads it with data.

Note that SQL Server will attempt to render spatial data visually in Management Studio. The rendering will, however, be visible only when you are in the Results to Grid mode in the Query Editor window.

We'll start by examining the way to get our WKT data into our data type:

DECLARE @MyGeometry GEOMETRY;

SET @MyGeometry = Geometry::STGeomFromText('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)', 0)
SET @MyGeometry = Geometry::Parse('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)')
SET @MyGeometry = 'LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)'

SELECT @MyGeometry;

SET @MyGeometry = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT @MyGeometry;

In this code, we've declared an instance of geometric data in a variable called @MyGeometry. We then assign linestring data to our variable in three different ways. These are all functionally the same, with the final assignment using the Parse function implicitly.

We then select out our newly assigned line. When this is executed, Management Studio shows us not only a binary representation, but also the visual representation shown in Figure 7.6.

Figure 7.6

Note that, in order to see the spatial data tab, you must be using the "Results to Grid" mode in the Query Editor window.

We then go on to repeat the assignment and selection, but this time for a polygon instead of a linestring. This winds up yielding us slightly different results (shown in Figure 7.7).

Figure 7.7

Notice the slightly different representation of two objects based on the same series of points. Why are they different? Well, recall that a linestring is always considered one dimensional. Although linestrings can curve and even cross over themselves, they are still considered to lack area (which requires two dimensions). SQL Server represents the linestring—even though it forms a ring—as hollow to represent the lack of area. For the polygon, however, SQL Server fills in the square to represent the enclosure of two-dimensional space. SQL Server is aware that the two, though based on the same series of points, have a fundamental difference distinguishing them. This difference will become more apparent later on, as various methods of the GEOMETRY data type are only relevant to specific object types (for example, the method that calculates area only makes sense on polygons, not on lines).

Our polygons are, of course, not limited to squares or even rectangles. Indeed, they can be virtually any shape as long as they eventually are enclosed into a ring by ending at the same point they started at. (A linestring simply crossing itself is not enough to form a ring, and, therefore, a polygon. It must start and end at the same point.) In addition, we can use polygons embedded inside other polygons to represent hollow space. Let's check all these concepts out.

First, we need a few different instances of the GEOMETRY data type to compare against each other. We'll also go ahead and establish a simple square again, but this time we'll call the STArea() method of the GEOMETRY type to get the area of our square:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))';

SELECT 'First polygon area: ', @First.STArea();
SELECT @First;

The STArea() method is an example of a method that is part of the OGC list of spatial data methods. Execute this code, and we get a representation of our square (the same as we showed in Figure 7.7), but we also get a calculated area of 36.

Moving on, let's expand our script to add another polygon, but this time we'll add something that has a slightly more complex linestring:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT 'First polygon area:', @First.STArea();
SELECT @First;

SET @Second =
'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Second;

SET @Second = @Second.MakeValid();

SELECT 'Second polygon area: ', @Second.STArea();

As the more complex linestring in @Second would imply, we are shown a more complex shape: an octagon (shown in Figure 7.8).

Figure 7.8

Note also, though, that we had to perform an additional action on our polygon to make it valid—that done, we are able to call the area calculation and receive our result (3.28000022888182).
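If you want to see that validity check for yourself, STIsValid() is the method to ask. A quick sketch (the result comments are my expectation based on the behavior just described):

DECLARE @Shape GEOMETRY;

SET @Shape = 'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Shape.STIsValid(); -- 0: the ring crosses itself as written

SET @Shape = @Shape.MakeValid();

SELECT @Shape.STIsValid(); -- 1: now safe for STArea() and friends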
Continuing the example, we can build a polygon that utilizes both linestrings, with the second becoming a hollow area in the first:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT 'First polygon area: ', @First.STArea();
SELECT @First;

SET @Second =
'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Second;

SET @Second = @Second.MakeValid();

SELECT 'Second polygon area: ', @Second.STArea();

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Merged;

SET @Merged = @Merged.MakeValid();

SELECT 'Merged polygon area: ', @Merged.STArea();

This time SQL Server shows both polygons—inverting the color fill to show the hollow space (shown in Figure 7.9).

Figure 7.9

The calculated area for the merged polygon has properly taken into account the hollow area (that is, it subtracts it from the larger polygon) and gives us the correct area of 32.7200009155276.

Let's make one last addition to this script, this time adding yet another polygon into the mix to see how SQL Server handles overlapping areas. We'll add another octagon to the merged polygon:

DECLARE @First GEOMETRY,
        @Second GEOMETRY,
        @Merged GEOMETRY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))'

SELECT 'First polygon area: ', @First.STArea();
SELECT @First;

SET @Second =
'POLYGON((-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))';

SELECT @Second;

SET @Second = @Second.MakeValid();

SELECT 'Second polygon area: ', @Second.STArea();

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))'

SET @Merged = @Merged.MakeValid();

SELECT 'Merged polygon area: ', @Merged.STArea();

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4),
(-2.5 .4, -1.9 1, -1.1 1, -.5 .4, -.5 -.4, -1.1 -1, -2.5 -1, -1.9 -1, -2.5 -.4, -2.5 .4))'

SELECT @Merged;

SET @Merged = @Merged.MakeValid();

SELECT 'Second Merged polygon area: ', @Merged.STArea();

Pay attention to both the third figure (shown in Figure 7.10) and the area of 30.4900010681158. Note that both polygons are shown (including their overlap area), and that the area result subtracted the hollow area only once—that is, the area that overlaps between the two inner polygons was only removed once.

Figure 7.10

Last, but not least, let's take a quick look at the ToString() method.
For this, we'll use the same merged GEOMETRY, activate the MakeValid() method, and then output the slightly modified result:

DECLARE @Merged GEOMETRY;

SET @Merged = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3),
(-1 .4, -.4 1, .4 1, 1 .4, 1 -.4, .4 -1, -1 -1, -.4 -1, -1 -.4, -1 .4))'

SET @Merged = @Merged.MakeValid();

SELECT 'Merged polygon area: ', @Merged.STArea();
SELECT @Merged;
SELECT @Merged.ToString()

Notice the changes to the output:

POLYGON ((-3 -3, 3 -3, 3 3, -3 3, -3 -3), (-0.39999961853027344 -1,
-1 -0.39999961853027344, -1 0.39999961853027344, -0.39999961853027344 1,
0.39999961853027344 1, 1 0.39999961853027344, 1 -0.39999961853027344,
0.39999961853027344 -1, -0.39999961853027344 -1))

The change away from our relatively round numbers is a byproduct of the MakeValid() call, but, other than that, we got back almost exactly the layout we put in.

Implementing Geodetic Representations—The GEOGRAPHY Type

The type that implements the concept of geodetic, or round earth, data is called GEOGRAPHY. The GEOGRAPHY data type works, in most ways, just like the GEOMETRY type did. (Indeed, they share many of the same functions.) Like the last two data types we've discussed, GEOGRAPHY is implemented via a CLR user-defined type.

The GEOGRAPHY type can also accept any of the geometric types we discussed earlier in the section, but it also applies the notion of a hemisphere.

While the geometric data type applies a default SRID to spatial instances (the default is zero), the GEOGRAPHY data type does not generally have a default value (some individual geography methods do assume a SRID of 4326), and an SRID must be supplied each time you redefine a geographic instance.

Let's start by utilizing a near duplicate of our first geometry example, only using the GEOGRAPHY type this time:

DECLARE @First GEOGRAPHY;

SET @First = GEOGRAPHY::STGeomFromText('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)', 4326)
SET @First = GEOGRAPHY::Parse('LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)')
SET @First = 'LINESTRING(-3 3, 3 3, 3 -3, -3 -3, -3 3)'

SELECT @First;

This all works fine, with only the STGeomFromText() function working differently than its geometric counterpart (and, even then, the only difference is that it requires a recognized geodetic SRID—4326 here—rather than the geometric default of zero).

Things get a bit more interesting when we get to a polygon though, as we must fit within a given hemisphere. A hemisphere is, just as in the dictionary definition, half of a sphere. The starting and stopping points of each hemisphere vary depending on what SRID you're referencing, but, regardless of which you've chosen, all polygons, lines, and points referenced for a given spatial instance must fit within that hemisphere.

I would imagine this to provoke the question of "Why?" I know it did for me. The issue has to do with eliminating ambiguity on what is considered "inside" versus "outside" a polygon. There are functions that look to see if something is contained within a spatial instance, but how do you know if something is inside an object if you don't know which side of the defining ring is considered inside versus outside?

There is, of course, more than one way to address the inside versus outside problem with spatial data in general, but the SQL Server team had to pick one, and they went with an approach that requires you to stay within a single hemisphere.
If you need to map an object that crosses a hemisphere boundary, consider mapping it as two adjacent objects (sharing the hemisphere border), and utilizing them as a pair.

To check this out, we'll continue to run through what is largely the same example as we used for geometry, but mapped to the curve-aware data type (GEOGRAPHY):

DECLARE @First GEOGRAPHY;

SET @First = 'POLYGON((-3 3, 3 3, 3 -3, -3 -3, -3 3))';

SELECT @First;

But when you try to execute this, you run into trouble that you didn't have under the GEOMETRY data type:

Msg 6522, Level 16, State 1, Line 3
A .NET Framework error occurred during execution of user-defined routine or
aggregate "geography":
Microsoft.SqlServer.Types.GLArgumentException: 24205: The specified input does
not represent a valid geography instance because it exceeds a single
hemisphere. Each geography instance must fit inside a single hemisphere. A
common reason for this error is that a polygon has the wrong ring orientation.
Microsoft.SqlServer.Types.GLArgumentException:
at Microsoft.SqlServer.Types.GLNativeMethods.ThrowExceptionForHr(GL_HResult
errorCode)
at Microsoft.SqlServer.Types.GLNativeMethods.GeodeticIsValid(GeoData g)
at Microsoft.SqlServer.Types.SqlGeography.IsValidExpensive()
at Microsoft.SqlServer.Types.SqlGeography.ConstructGeographyFromUserInput(GeoData
g, Int32 srid)
at Microsoft.SqlServer.Types.SqlGeography.GeographyFromText(OpenGisType
type, SqlChars taggedText, Int32 srid)
at Microsoft.SqlServer.Types.SqlGeography.Parse(SqlString s)
.

(1 row(s) affected)

The extra stack of error lines is a result of the .NET implementation that is behind all of the new data types that are covered in this chapter. The key item, however, is the GLArgumentException line; we are in more than one hemisphere.

When I first started learning about the hemisphere issue, my assumption was that it must have to do with negative and positive numbers—not so. Instead, the issue is more of a simplistic test of whether the "inside" of our polygon fits inside a hemisphere. We've defined a box that seems fairly straightforward and small here, so it's easy to see why one might be confused at how it is in more than one hemisphere. The problem is, however, also fairly simple: our inside and outside are backwards. That is, what you likely perceive as being "outside" the square is considered to be inside as we've defined the box to SQL Server.

To address this issue, we have to think of the polygon in terms of the ring that draws it—that is, as a series of connected lines that eventually ends where it started. The "inside" is always deemed to be the side that is on the left of the line as you draw it. In general, this means that, when you draw an object, you'll want to lay out the lines that enclose it in a counterclockwise direction. In our example, we were going clockwise, so we created a situation where the "outside" was the area that was bounded by our line, and the inside was unbounded. We can fix our error by simply reversing the order we draw the polygon in:

DECLARE @First GEOGRAPHY;

SET @First = 'POLYGON((-3 3, -3 -3, 3 -3, 3 3, -3 3))';

SELECT @First;

Now if we execute it, things return and look pretty much as they did when we were working with the GEOMETRY type.

The sets of methods implemented by the GEOMETRY and GEOGRAPHY types overlap significantly, but are not identical.
(All the ones we've seen in this chapter, except for MakeValid(), are implemented in both types.) Spatial data is its own area of study, so I recommend exploring information well outside the SQL Server–specific community to understand what is expected in each implementation.

Filestreams

This is something of a "high-octane" feature that is new in SQL Server 2008. Indeed, it is relatively fringe in nature, and even requires you to take special steps to enable it. (It is not enabled in the default installation.) Still, while I consider this feature to still be solidly in its infancy, it has started a path to something that is potentially very special. So that should bring about the question: "OK then, what exactly do filestreams do?" Glad you asked.

There has long been a series of problems in the database realm regarding what to do with storage of unstructured data files (for example, images, documents, spreadsheets, movies, and so on). The files are often an integral part of a larger piece of data we are storing in a database (let's say something like photos of a crash and a scanned image of a claim form on an insurance claim).

With this in mind, we would like to:

 * Store all that data together and in a space-efficient manner
 * Read and write the data with maximum performance
 * Utilize transactions
 * Secure the data effectively and under one model
 * Have consistent state on the data when backing up and restoring

The methods of addressing these problems have varied depending on which of these issues were considered the priority for a particular installation. The balancing act has gone something like this:

 * Performance is key: The data was generally kept in individual files at the file system level.
 * Consistent state is key: The data was generally stored as binary large objects (blobs) in the database. Often the blobs were kept on a separate drive array through the use of filegroups.

The specifics vary by installation, but, while SQL Server's performance in blob handling has improved substantially over the years, it was still slow enough that the most common approach was to store files at the file system level and just store the path to the file in SQL Server. This has several risks, including:

 * Files can get moved without the database knowing, breaking the link to the data with no history that might allow recovery.
 * Updates to the files are made without the database being directly aware of the change, making auditing ineffective at best.
 * There was no means of co-enrolling data changes in the same transaction. This means you can overwrite a file, but have the associated database changes rolled back (or vice versa), destroying the proper state of your data.
 * The lack of coordinated transactions created a time latency between changes in the file system and backup/recovery work in the database.

Other installations did go the SQL Server blob route, fixing the preceding issues, but creating other problems:

 * Storage was inefficient, with space loss due to SQL Server's page storage model overhead as it relates to blob data.
 * Performance suffered. In general, this performance hit occurred in a manner that affected all the data being accessed, not just the blob.
 * Accessing blob data from the database required special handling versus other data in the database—adding complexity. What's more, the access model was generally seen as more complex than the relatively simple stream handling of files from the file system.
Filestreams in SQL Server address virtually all of these problems by coordinating storage between the database and the file system into one cohesive solution, with both systems doing what they do best (SQL Server coordinating the transactions and storing the structured data, and the file system storing the unstructured data).

Under a filestream model, SQL Server integrates with NTFS (the file system used in Windows). For tables and columns that are configured to do so, data for columns defined as type varbinary(max) is redirected to the file system. Access from within SQL Server is relatively transparent, and standard T-SQL statements will work against the data. Client languages, however, can utilize a special SqlFileStream object that is derived from the Stream class in .NET, making much of the functionality very familiar to client developers who are already used to the Stream object for file handling and other stream access. Through this integration of the best parts of SQL Server and NTFS, several key problems are solved:

 * Security is coordinated between SQL Server and NTFS: The directory used to store the SQL Server filestream data can only be accessed within a SQL Server granted context. This means that those who do not have appropriate access to the varbinary(max) column in SQL Server cannot gain access to the underlying file in NTFS.
 * Transactions are fully supported: Stream updates are fully enrolled in any active transaction (indeed, clients using a filestream are required to enroll in a transaction context in order to gain access to the data at all), and will honor commits and rollbacks as appropriate. This means updates to an existing file will be rolled back as appropriate, restoring the file to its original state if the transaction did not complete.
 * Backups are also coordinated: This means that backups of the database include the NTFS handled files in a state consistent with the rest of the backup data.
 * Access to the file information is handled through virtually identical means as it would have been had the file been stored within NTFS directly: Only minimal coordination overhead is incurred, so the performance difference versus direct NTFS storage is negligible.

The ramifications of this bode very well for the future of unstructured data in otherwise structured environments. Let's take a quick look at what's involved from a development perspective.

Enabling Filestreaming

By default, filestream access is turned off when you install SQL Server. There is an option to set this up during the installation process, and I recommend using that option if you remember it. If, however, you forget (or just didn't, at the time, think you needed it), you can enable filestreaming for the server by using the SQL Server Configuration Manager. Go to the SQL Server Services node, and right-click the SQL Server service for your instance (the default instance is labeled MSSQLSERVER). This should bring up the dialog shown in Figure 7.11. (Notice that I've changed to the FILESTREAM tab.)

Installation of the AdventureWorks2008 database requires that filestream be turned on for the server you install it on, so, if you've made it this far in the book, you're certain to have filestream access turned on for the server you've been working examples with. That said, you may want to play around with a system that doesn't have filestream turned on so you can understand what's involved in turning it on after the fact.

Figure 7.11

In this dialog, you can define the level of access you want the filestream exposed to. Be sure to note that what you are setting up here is for the server, and your database(s) will need additional configuration to be able to store stream data.
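The dialog covers the Windows service side of things; the instance-level access setting can also be flipped from T-SQL. A quick sketch (level 2 allows both T-SQL and Win32 streaming access; the Windows-level pieces still come from Configuration Manager):

EXEC sp_configure 'filestream access level', 2;
RECONFIGURE;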
Enabling a Database for Filestreams

To enable filestreaming for a database, you just need to create a filegroup using the CONTAINS FILESTREAM option. This will set the path that you want to place under SQL Server access control and enable tables to be configured for filestream access. Let's try this out by creating a database we'll use for examples in this section:

CREATE DATABASE FileStreamDB
ON
PRIMARY ( NAME = FSDBPrimary, FILENAME = 'C:\FSDB\DB\fsdb.mdf'),
FILEGROUP FSDBStream CONTAINS FILESTREAM
( NAME = FSDBStream, FILENAME = 'C:\FSDB\STREAM')
LOG ON ( NAME = FSDBLog, FILENAME = 'C:\FSDB\fsdb.ldf')
GO

Note that, unlike the data and log file paths, which must exist when you run the CREATE DATABASE statement, the directory you're pointing the filestream filegroup at must not yet exist. SQL Server creates the directory as part of the database creation, coordinating with NTFS regarding permissions and ownership of the directory.

Run this (changing the file paths to something that works on your particular system) and you should get a confirmation that your database has been created.

You can use the ALTER DATABASE command to add a filestream filegroup if you need to enable an existing database for filestream access.
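That path looks something like the following sketch (the database, filegroup, and directory names here are hypothetical):

ALTER DATABASE ExistingDB
ADD FILEGROUP ExistingDBStream CONTAINS FILESTREAM;

ALTER DATABASE ExistingDB
ADD FILE ( NAME = ExistingDBStream, FILENAME = 'C:\FSDB\EXISTINGSTREAM')
TO FILEGROUP ExistingDBStream;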
Creating a Filestream-Enabled Table

There are no special settings required to enable a table for filestream. Instead, you just need to make sure that your table has a uniquely constrained column flagged as the ROWGUIDCOL (a property applied to a uniqueidentifier column that marks it as the row identifier for SQL Server). After that, filestream access is defined on a per-column basis based on options for any varbinary(max) columns in the table.

Again, let's try this out by creating a table we'll use later to store an object on our SQL Server:

CREATE TABLE FSTable
(
FileKey int NOT NULL IDENTITY PRIMARY KEY,
rowguid uniqueidentifier ROWGUIDCOL NOT NULL UNIQUE,
filedata varbinary(max) FILESTREAM
);

Again, this should get you a simple confirmation that the command ran successfully, but, with this created, we should be ready to manipulate stream data.

Using T-SQL with Filestreams

Filestream data is relatively transparent to T-SQL access. We can, for example, run a simple INSERT statement just as we would any other row that had binary data:

DECLARE @Ident int

INSERT FSTable
VALUES
(NEWID(), 0x0A);

SET @Ident = @@IDENTITY;

SELECT FileKey, filedata
FROM FSTable
WHERE FileKey = @Ident;

UPDATE FSTable
SET filedata = 0x49276D206C6561726E696E672066696C6573747265616D73
WHERE FileKey = @Ident;

SELECT FileKey, filedata
FROM FSTable
WHERE FileKey = @Ident;

DELETE FSTable
WHERE FileKey = @Ident;

SELECT FileKey, filedata
FROM FSTable
WHERE FileKey = @Ident;

This exercises all the main data manipulation statements of SQL:

(1 row(s) affected)

FileKey     filedata
----------- ------------------------------------------------------------------------
1           0x0A

(1 row(s) affected)

(1 row(s) affected)

FileKey     filedata
----------- ------------------------------------------------------------------------
1           0x49276D206C6561726E696E672066696C6573747265616D73

(1 row(s) affected)

(1 row(s) affected)

FileKey     filedata
----------- ------------------------------------------------------------------------

(0 row(s) affected)

As you can see, there really isn't a lot to it from a T-SQL perspective. Indeed, all the major statements work pretty much as they would with non-filestream data. There is a small amount of additional functionality, such as the PathName() method that becomes available on a varbinary(max) column when filestream is enabled, for example:

DECLARE @Ident int;

INSERT FSTable
VALUES
(NEWID(), 0x0A);

SET @Ident = @@IDENTITY;

SELECT rowguid, fs.filedata.PathName() AS Path
FROM FSTable fs
WHERE FileKey = @Ident;

Run this, and you should see a single row back. (It is, unfortunately, too wide to fit gracefully in this book.) First, notice the rowguid column. Now compare it with the final portion of the Path column, and you should see a match.

As you can see, the column we identify as the ROWGUIDCOL is critically important in terms of setting a unique path for our stored filestreams.

Using Filestreams with .NET

I'm going to defer much of our discussion of .NET with filestreams until we discuss connectivity in Chapter 25 (which is a web-only release, so don't skip right to the back of the book!). However, I think it important to understand some key points early, as they have design ramifications that you may not otherwise think of before you get in the middle of some .NET code.

Any work with a filestream requires a transaction context. Even if you're just reading data, you need the transaction context from the SQL Server side to govern issues of concurrency and consistency of your data. Unfortunately, you cannot make use of the T-SQL keyword BEGIN TRANSACTION (there are some rules for multiple active result set—or MARS—enabled connections that BEGIN TRANSACTION does not live up to), so you must use your client's data access API's method of enlisting transactions prior to accessing data via a filestream.
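From the T-SQL side, the token the client needs comes from GET_FILESTREAM_TRANSACTION_CONTEXT(). Here's a sketch of the query a client API would issue from within its own enlisted transaction (an explicit transaction is shown here purely so the function returns a non-NULL token when you try it in Management Studio; a real client enlists through its API instead):

BEGIN TRANSACTION;

SELECT filedata.PathName() AS Path,
       GET_FILESTREAM_TRANSACTION_CONTEXT() AS TxContext
FROM FSTable;

COMMIT TRANSACTION;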
Other than that, the primary difference between handling a SQL Server related filestream and the more generic Stream object in .NET is mostly one of what you instantiate. (For SQL Server filestreams, a SqlFileStream object takes care of most of the differences transparently.)

Again, we will look at an example filestream connection in Chapter 25.

Again, as an important reminder, Chapter 25 is a web release chapter, and one I hope to occasionally update during the life of this book to keep it somewhat in line with the ever-changing world of connectivity.

Table Compression

It is important to note that, as of this writing, the data compression features in SQL Server 2008 are limited to the Enterprise edition.

This one is, again, new with SQL Server 2008, but some early indications of what was to come first appeared in a SQL Server 2005 service pack. From a programming standpoint, there is actually relatively little to be done here. (It's largely about table settings.) But it's worth a visit in this "advanced" data structures chapter for three simple reasons:

 * Planning: The compression feature fundamentally alters the page/row storage format of data on disk, and can significantly reduce the footprint of your data. This is done on a table-by-table basis (again, it is a table-level setting), and therefore requires an adjustment to how you plan for the required storage volume and growth in your database.
 * Performance: There is a performance trade-off when you deal with table compression that can work for or against you. It depends on the particular scenario. There is extra overhead to managing the compression, but the compression may also sharply reduce I/O requirements, and thus gain back any performance lost to the compression overhead.
 * Structure Knowledge: I went so far as to tell you about the traditional page/row storage methods, so anything that fundamentally alters those default storage methods probably deserves something of a look.

Enabling Compression

In the previous chapter, we took a look at the CREATE INDEX syntax. This, along with CREATE TABLE, is where the DATA_COMPRESSION option is available. The CREATE INDEX version is shown in the following code (it works the same in the CREATE TABLE statement):

CREATE [UNIQUE] [CLUSTERED|NONCLUSTERED]
INDEX <index name> ON <table or view name>
    (<column name> [ASC|DESC] [, ...n])
INCLUDE (<column name> [, ...n])
[WITH
    [PAD_INDEX = { ON | OFF }]
    [[,] FILLFACTOR = <fillfactor>]
    [[,] IGNORE_DUP_KEY = { ON | OFF }]
    [[,] DROP_EXISTING = { ON | OFF }]
    [[,] STATISTICS_NORECOMPUTE = { ON | OFF }]
    [[,] SORT_IN_TEMPDB = { ON | OFF }]
    [[,] ONLINE = { ON | OFF }]
    [[,] ALLOW_ROW_LOCKS = { ON | OFF }]
    [[,] ALLOW_PAGE_LOCKS = { ON | OFF }]
    [[,] DATA_COMPRESSION = { NONE | ROW | PAGE }
        [ ON PARTITIONS ( { <partition number> | <range> } [, ...n] ) ]]
    [[,] MAXDOP = <maximum degree of parallelism>]
]
[ON { <filegroup> | <partition scheme> | DEFAULT }]

As mentioned before, you can turn on data compression as part of the CREATE TABLE statement by adding an identical line to that used in the CREATE INDEX statement.
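To make that concrete, here is a sketch (the table and column names are mine). The sp_estimate_data_compression_savings procedure can tell you whether the trade-off looks worthwhile before you commit to it:

-- Estimate the savings first (schema, table, index id, partition, setting)
EXEC sp_estimate_data_compression_savings 'Sales', 'SalesOrderDetail', NULL, NULL, 'PAGE';

-- Compress a new table at creation time...
CREATE TABLE CompressedOrders
(
OrderID int NOT NULL,
OrderDate datetime NOT NULL
)
WITH (DATA_COMPRESSION = PAGE);

-- ...or rebuild an existing table with a different compression setting
ALTER TABLE CompressedOrders REBUILD WITH (DATA_COMPRESSION = ROW);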
Summary

Virtually everything seen in this chapter is new with SQL Server 2008 (XML indexes being the notable exception). Most of it is highly specialized, but each does what it does very well, with data structures that have been optimized for that specific task.

If you're dealing with XML data, choose your indexes carefully, experiment, and realize that they can greatly speed XML queries. For hierarchical data, consider the new HierarchyID data type. Not only does it include hierarchy-specific methods, but, for many developers, the notion that a given node knows its entire lineage is going to be much easier to grasp than the recursive calls that are generally required for the parent-child approach to hierarchies.

Spatial data is finally here, but brings SQL Server developers into a realm that they have likely not been in before. There is support for both flat and round earth models, and the ability to recognize proximity, irregular shapes, intersections, and similar spatial-specific concepts is a huge boon for many who didn't realize they had a spatial need—let alone conceived of a way to address that need.

Filestreams address a long-standing need in SQL Server. Most of the functionality supported by filestreams has been supported in some other fashion for a long time, but filestreams integrate that functionality in a manner that allows for more coordinated backup processes and, perhaps more important, transaction-based handling of large binary files. While filestream access is largely a client application–only process, it requires substantial design and security consideration by the database architect.

Data compression is finally here at the database level. While the compression is largely transparent to the application, compression can affect performance in both good and bad ways, and needs to be weighed carefully before you activate the feature.

In our next chapter, we'll explore an old mainstay of SQL Server—views.

8

Views

Since we're assuming, in this book, that you already know something about SQL Server, I am going to minimize the discussion of the basics and focus primarily on the more meaty uses of views. That said, we'll touch ever so briefly on view basics before moving on.

Views have a tendency to be used either too much, or not enough—rarely just right. When we're done with this chapter, you should:

 * Be more comfortable with view basics
 * Be able to add additional indexing to your database to speed query performance—even when you're not using the view the index is based on
 * Understand and utilize the notion of partitioned views and federated servers

A view is, at its core, really nothing more than a stored query.
+
+Summary
+
+Virtually everything seen in this chapter is new with SQL Server 2008 (XML indexes being the notable exception). Most of it is highly specialized, but each does what it does very well with data structures that have been optimized for that specific task.
+
+If you're dealing with XML data, consider your indexing carefully, but experiment with indexes and realize that they can greatly speed XML queries. For hierarchical data, consider the new HierarchyID data type. Not only does it include hierarchy-specific methods, but, for many developers, the notion that a given node knows its entire lineage is going to be much easier to grasp than the recursive calls that are generally required for the parent-child approach to hierarchies.
+
+Spatial data is finally here, but brings SQL Server developers into a realm that they have likely not been in before. There is support for both flat and round earth models, and the ability to recognize proximity, irregular shapes, intersections, and similar spatial-specific concepts is a huge boon for many who didn't realize they had a special need—let alone conceive of a way to address that need.
+
+Filestreams address a long-standing need in SQL Server. Most of the functionality supported by filestreams has been supported in some other fashion for a long time, but filestreams integrate that functionality in a manner that allows for more coordinated backup processes and, perhaps more important, transaction-based handling of large binary files. While filestream access is largely a client application–only process, it requires substantial design and security consideration by the database architect.
+
+Data compression is finally here at the database level. While the compression is largely transparent to the application, compression can affect performance in both good and bad ways, and needs to be carefully considered prior to activating the compression feature.
+
+In our next chapter, we'll explore an old mainstay of SQL Server—views.
+8
+
+Views
+
+Since we're assuming, in this book, that you already know something about SQL Server, I am going to minimize the discussion of the basics and focus primarily on the more meaty uses of views. That said, we'll touch ever so briefly on view basics before moving on.
+
+Views have a tendency to be used either too much, or not enough—rarely just right. When we're done with this chapter, you should be able to:
+
+ * Be more comfortable with view basics
+ * Add additional indexing to your database to speed query performance—even when you're not using the view the index is based on
+ * Understand and utilize the notion of partitioned views and federated servers
+
+A view is, at its core, really nothing more than a stored query. You can create a simple query that selects from only one table and leaves some columns out, or you can create a complex query that joins several tables and makes them appear as one.
+
+Reviewing View Syntax
+
+The most basic syntax for a view looks something like this:
+
+CREATE VIEW <view name>
+
+AS
+
+<SELECT statement>
+
+[WITH CHECK OPTION]
+
+So, an extremely simple view on the Person.Person table in the AdventureWorks2008 database might look something like:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW Person.PersonView
+
+AS
+
+SELECT FirstName, MiddleName, LastName
+
+FROM Person.Person;
+
+So, when you run:
+
+SELECT * FROM Person.PersonView;
+
+You get back exactly the same thing as:
+
+SELECT FirstName, MiddleName, LastName
+
+FROM Person.Person;
+
+You are essentially saying to SQL Server: "Give me all of the rows and columns you get when you run the statement SELECT FirstName, MiddleName, LastName FROM Person.Person."
+
+We've created something of a pass-through situation—that is, our view hasn't really changed anything, but rather just "passed through" a filtered version of the data it was accessing. Think about the uses for this a bit, and you should be able to see how this concept can be utilized to do things like simplify the data for inexperienced users (show them only the columns they care about to keep from confusing them) or to proactively hide sensitive data (such as profit or salary numbers) by granting the user rights to a view that doesn't include that data, but not giving them rights to the underlying table.
+
+Be aware that, by default, there is nothing special done for a view. The view runs just as if it were a query run from the command line—there is no pre-optimization of any kind. This means that you are adding one more layer of overhead between the request for data and the data being delivered, and that a view is never going to run as fast as if you had just run the underlying SELECT statement directly. That said, views exist for a reason—be it security or simplification for the user—so balance your need against the overhead as fits your particular situation.
+
+How much overhead? Well, it depends both on how complex the view is and on the calling code. It can range from milliseconds to much longer impacts (though usually the former) depending on the specifics.
+
+Let's take this one step further.
+
+You've already seen how to create a simple view—you just use any SELECT statement. How do you filter the results of your queries? With a WHERE clause. Views are no different.
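+
+By way of a minimal sketch (the view name here is my own invention, not part of AdventureWorks2008), adding a WHERE clause to the underlying SELECT is all it takes:
+
+USE AdventureWorks2008;
+
+GO
+
+-- A pass-through view that also filters: only people with a middle name
+CREATE VIEW Person.PersonsWithMiddleNames_vw
+
+AS
+
+SELECT FirstName, MiddleName, LastName
+
+FROM Person.Person
+
+WHERE MiddleName IS NOT NULL;
+
+Anyone querying this view sees only the filtered rows, exactly as if the WHERE clause had been part of his or her own query.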
+
+More Complex Views
+
+Perhaps one of the most common uses of views is to flatten data—that is, the removal of complexity that we outlined at the beginning of the chapter. Imagine that we are providing a view for management to make it easier to check on sales information. No offense to managers who are reading this book, but managers who write their own complex queries are still a rather rare breed—even in the information age.
+
+For an example, our manager would like to be able to do simple queries that will tell him or her what orders have been placed for what items, how many were sold on each order, and the related pricing information. So, we create a view that he or she can perform very simple queries on:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW CustomerOrders_vw
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID;
+
+Now do a SELECT:
+
+SELECT *
+
+FROM CustomerOrders_vw;
+
+You wind up with a bunch of rows—over 100,000—but you also wind up with information that is far simpler for the average manager to comprehend and sort out. What's more, with not that much training, the manager (or whoever the user might be) can get right to the heart of what he or she is looking for:
+
+SELECT ProductID, OrderQty, LineTotal
+
+FROM CustomerOrders_vw
+
+WHERE OrderDate = '5/15/2003';
+
+The user didn't need to know how to do a three-table join—that was hidden in the view. Instead, he or she needs only limited skill (and limited imagination for that matter) in order to get the job done.
+
+ProductID OrderQty LineTotal
+
+----------- -------- ---------------------------------------
+
+791 1 2443.350000
+
+781 1 2071.419600
+
+794 1 2181.562500
+
+798 1 1000.437500
+
+783 1 2049.098200
+
+801 1 1000.437500
+
+784 1 2049.098200
+
+779 1 2071.419600
+
+797 1 1000.437500
+
+(9 row(s) affected)
+
+However, we could make our query even more targeted. Let's say that we want our view to return only yesterday's sales. We'll make only slight changes to our query:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW YesterdaysCustomerOrders_vw
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID
+
+WHERE CONVERT(varchar(12),o.OrderDate,101) =
+
+CONVERT(varchar(12),DATEADD(day,-1,GETDATE()),101);
+
+All the dates in the AdventureWorks2008 database are old enough that this view wouldn't return any data, so let's add a row to test it. Execute the following script all at one time:
+
+USE AdventureWorks2008;
+
+DECLARE @Ident int;
+
+INSERT INTO Sales.SalesOrderHeader
+
+(
+
+CustomerID,
+
+OrderDate,
+
+DueDate,
+
+BillToAddressID,
+
+ShipToAddressID,
+
+ShipMethodID
+
+)
+
+VALUES
+
+(
+
+1, -- CustomerID
+
+DATEADD(day,-1,GETDATE()), -- OrderDate (yesterday)
+
+GETDATE(), -- DueDate (today)
+
+1, -- BillToAddressID
+
+1, -- ShipToAddressID
+
+1 -- ShipMethodID
+
+);
+
+SELECT @Ident = @@IDENTITY;
+
+INSERT INTO Sales.SalesOrderDetail
+
+(SalesOrderID,
+
+OrderQty,
+
+ProductID,
+
+SpecialOfferID,
+
+UnitPrice,
+
+UnitPriceDiscount)
+
+VALUES
+
+(@Ident, 4, 765, 1, 50, 0);
+
+SELECT 'The OrderID of the INSERTed row is ' + CONVERT(varchar(8),@Ident);
+
+Most of what's going on in this script shouldn't be a big mystery for non-beginners, but I'll be explaining all of what is going on here in Chapter 9. For now, just trust me that we'll need to run all of this in order for us to have a value in AdventureWorks2008 that will come up for our view. You should see a result from the Management Studio that looks something like this:
+
+(1 row(s) affected)
+
+(1 row(s) affected)
+
+-------------------------------------------
+
+The OrderID of the INSERTed row is 75124
+
+(1 row(s) affected)
+
+Be aware that some of the messages shown in the preceding code will appear only on the Messages tab if you are using the Management Studio's Results In Grid mode. Also remember that your particular OrderID may be different from mine depending on what experimenting you've already been doing in the AdventureWorks2008 database.
+
+The SalesOrderID might vary, but the rest should hold pretty true.
+
+Now let's run a query against our view and see what we get:
+
+SELECT SalesOrderID, OrderDate FROM YesterdaysCustomerOrders_vw;
+
+You can see that the 75124 does indeed show up:
+
+SalesOrderID OrderDate
+
+------------ -----------------------
+
+75124 2008-12-31 01:00:00.000
+
+(1 row(s) affected)
+
+Don't get stuck on the notion that your SalesOrderID numbers are going to be the same as mine—these are set by the system (since SalesOrderID is an identity column) and are dependent on just how many rows have already been inserted into the table. As such, your numbers will vary.
+
+Using a View to Change Data—Before INSTEAD OF Triggers
+
+As we've said before, a view works mostly like a table does from an in-use perspective (obviously, creating them works quite a bit differently). Now, however, we're going to come across some differences.
+
+It's surprising to many, but you can run INSERT, UPDATE, and DELETE statements against a view successfully. There are several things, however, that you need to keep in mind when changing data through a view:
+
+ * If the view contains a join, you won't, in most cases, be able to INSERT or DELETE data unless you make use of an INSTEAD OF trigger. An UPDATE can, in some cases (as long as you are only updating columns that are sourced from a single table), work without INSTEAD OF triggers, but it requires some planning, or you'll bump into problems very quickly.
+ * If your view references only a single table, then you can INSERT data using a view without the use of an INSTEAD OF trigger provided all the required fields in the table are exposed in the view or have defaults. Even for single-table views, if there is a column not represented in the view that does not have a default value, then you must use an INSTEAD OF trigger if you want to allow an INSERT.
+ * You can, to a limited extent, restrict what is and isn't inserted or updated in a view.
+
+Now, I've already mentioned INSTEAD OF triggers several times. INSTEAD OF triggers are a special, fairly complex kind of trigger that we will look at extensively in Chapter 12. The problem here is that we haven't discussed triggers to any significant extent yet. As is often the case with SQL Server, we have something of the old chicken versus egg thing going ("Which came first?"). I need to discuss INSTEAD OF triggers because of their relevance to views, but we're also not ready to talk about INSTEAD OF triggers unless we understand both of the objects (tables and views) that they can be created against.
+
+The way we are going to handle things for this chapter is to address views the way they used to be—before there was such a thing as INSTEAD OF triggers. While we won't deal with the specifics of INSTEAD OF triggers in this chapter, we'll make sure we understand when they must be used. We'll then come back and address these issues more fully when we look at INSTEAD OF triggers in Chapter 12.
+
+Having said that, I will provide this bit of context—an INSTEAD OF trigger is a special kind of trigger that essentially runs "instead" of whatever statement caused the trigger to fire. The result is that it can see what your statement would have done, and then make decisions right in the trigger about how to resolve any conflicts or other issues that might have come up. It's very powerful but also fairly complex stuff, which is why we defer it for now.
+
+Dealing with Changes in Views with Joined Data
+
+If the view has more than one table, then using a view to modify data is, in many cases, out—sort of, anyway—unless you use an INSTEAD OF trigger. Since multiple tables create some ambiguities in the key arrangements, Microsoft locks you out by default when there are multiple tables. To resolve this, you can use an INSTEAD OF trigger to examine the altered data and explicitly tell SQL Server what you want to do with it.
+
+Required Fields Must Appear in the View or Have a Default Value
+
+By default, if you are using a view to insert data (the underlying query must reference a single table, or at least the insert must be limited to affecting just one table with all required columns represented), then you must be able to supply some value for all required fields (fields that don't allow NULLs). Note that by "supply some value" I don't mean that it has to be in the SELECT list—a default covers the bill rather nicely. Just be aware that any columns that do not have defaults and do not accept NULL values will need to appear in the view in order to perform INSERTs through the view. The only way to get around this is—you guessed it—with an INSTEAD OF trigger.
+
+Limit What's Inserted into Views—WITH CHECK OPTION
+
+The WITH CHECK OPTION is one of those lesser-known to almost completely unknown features in SQL Server. The rules are simple—in order to update or insert data using the view, the resulting row must qualify to appear in the view results. Restated, the inserted or updated row must meet any WHERE criterion that's used in the SELECT statement that underlies your view.
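+
+To see that rule in action, here is a minimal sketch using a throwaway table (both the table and view names are mine, purely for illustration):
+
+CREATE TABLE CheckOptionExample
+
+(
+
+RowID int NOT NULL PRIMARY KEY,
+
+Region varchar(10) NOT NULL
+
+);
+
+GO
+
+CREATE VIEW WestRows_vw
+
+AS
+
+SELECT RowID, Region
+
+FROM CheckOptionExample
+
+WHERE Region = 'West'
+
+WITH CHECK OPTION;
+
+GO
+
+-- This INSERT succeeds; the new row qualifies to appear in the view
+INSERT INTO WestRows_vw VALUES (1, 'West');
+
+-- This one fails (you should see error 550 or similar); the new row
+-- would never be visible through the view
+INSERT INTO WestRows_vw VALUES (2, 'East');
+
+Without the WITH CHECK OPTION, that second INSERT would have gone through just fine; the row simply would never have shown up when querying the view.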
+
+Editing Views with T-SQL
+
+The main thing to remember when you edit views with T-SQL is that you are completely replacing the existing view. The only differences between using the ALTER VIEW statement and the CREATE VIEW statement are:
+
+ * ALTER VIEW expects to find an existing view, whereas CREATE doesn't.
+ * ALTER VIEW retains any permissions that have been established for the view.
+ * ALTER VIEW retains any dependency information.
+
+The second of these is the biggie. If you perform a DROP and then use a CREATE, you have almost the same effect as using an ALTER VIEW statement. The problem is that you will need to entirely reestablish your permissions on who can and can't use the view.
+
+Dropping Views
+
+It doesn't get much easier than this:
+
+DROP VIEW <view name> [, <view name> [, ...n]]
+
+And it (or they) is gone.
+
+Auditing: Displaying Existing Code
+
+What do you do when you have a view, but you're not sure what it does? The first option should be easy at this point—just go into the Management Studio as if you're going to edit the view. Go to the Views sub-node, select the view you want to edit, right-click, and either choose Design or Script View As and then choose the specific type of script you want. Either way, you'll see the code behind the view complete with color-coding.
+
+Note that the Design feature brings up a special view builder utility. While the view builder is fabulous for those with little SQL experience (it works much like a similar tool in Access), I find it to be overly invasive about the way I want my view formatted, and it inevitably leaves me with a view that is much more wordy (and therefore harder to read) than I would like; therefore, I usually stick to using the scripting tool and my own SQL writing skills.
+
+Unfortunately, we don't always have the option of having the Management Studio around to hold our hand through this stuff (we may be using a lighter-weight tool of some sort, or we may need to build the actual requests into our own application). The bright side is that we have a few ways of getting at the actual view definition:
+
+ * sp_helptext
+ * The OBJECT_DEFINITION() system function
+ * The sys.syscomments system view
+
+Let's look at the first of these by running sp_helptext against one of the supplied views in the AdventureWorks2008 database—vStateProvinceCountryRegion:
+
+EXEC sp_helptext 'Person.vStateProvinceCountryRegion';
+
+Note the quotes. This is because this stored proc expects only one argument, and the period is a delimiter of sorts—if you pass Person.vStateProvinceCountryRegion in without the quotes, it sees the period, isn't sure what to do with it, and therefore errors out. If the view were in our default schema, we could supply just the view name (no schema) and would not need to wrap it in quotes.
+
+SQL Server obliges us with the code for the view:
+
+Text
+
+------------------------------------------------------------------------------
+
+CREATE VIEW [Person].[vStateProvinceCountryRegion]
+
+WITH SCHEMABINDING
+
+AS
+
+SELECT
+
+sp.[StateProvinceID]
+
+,sp.[StateProvinceCode]
+
+,sp.[IsOnlyStateProvinceFlag]
+
+,sp.[Name] AS [StateProvinceName]
+
+,sp.[TerritoryID]
+
+,cr.[CountryRegionCode]
+
+,cr.[Name] AS [CountryRegionName]
+
+FROM [Person].[StateProvince] sp
+
+INNER JOIN [Person].[CountryRegion] cr
+
+ON sp.[CountryRegionCode] = cr.[CountryRegionCode];
+
+Now, sp_helptext is great, but I would classify it as somewhat antiquated at this point. Why? Well, since sp_helptext is a stored procedure, you can't easily include the result set as part of a more complex data operation. Fortunately, Microsoft has given us OBJECT_DEFINITION() to deal with that issue.
+
+OBJECT_DEFINITION() should be your preferred choice for a couple of reasons:
+
+ * When new releases come out, it will automatically be updated for changes to the system tables (so you don't have to worry about such things).
+ * The value returned can easily be used within a broader query (for example, as one column, with the source code for many objects being returned).
+
+The syntax looks like this:
+
+OBJECT_DEFINITION(<object id>)
+
+The negative in this is that we rarely know what our object's id is without doing a special lookup. Fortunately, SQL Server provides us a simple way to look up an object's id by using the OBJECT_ID() function. For example, if we wanted to use OBJECT_DEFINITION() to get the code for the same view we looked at earlier, we could write:
+
+SELECT OBJECT_DEFINITION (OBJECT_ID(N'Person.vStateProvinceCountryRegion'));
+
+Object IDs are SQL Server's internal way of keeping track of things. They are integer values rather than the names that you're used to for your objects. In general, they are outside the scope of this book, but it is good to realize they are there, as you will find them used by scripts you may copy from other people or just bump into them later in your SQL endeavors.
+
+Try it, and you'll see the result is nearly identical to when we used sp_helptext (it just doesn't name the column for us unless we provide an alias in our query definition).
+
+We can take this a bit further and easily return the code for every view in our database:
+
+SELECT '------------------------', OBJECT_DEFINITION(so.object_id)
+
+FROM sys.objects so
+
+WHERE so.type = 'V';
+
+I've omitted the results here in the book lest thousands of trees die needlessly—it's that lengthy. That said, running the previous query should give you all of the views in the AdventureWorks2008 database.
+
+We couldn't have done that with sp_helptext without utilizing a cursor—making it easy to see the usefulness of system functions such as OBJECT_DEFINITION() versus the system stored procedure objects we had in earlier versions of SQL Server.
+
+Now let's try it the last of our ways—using sys.syscomments.
+
+You may see sys.syscomments (a system view) used interchangeably with the older, and far less desirable, syscomments (a system table). syscomments is one of many system tables that gave us most of our system information in SQL Server versions of old. Microsoft has been trying to move us away from direct calls to system tables for years, and they have finally given us the set of tools that allows us to comply with their wish.
+
+Even when system tables were the only directly queryable way to get system information, their use was somewhat risky, as Microsoft has always warned that system tables can change at any time (even in service packs, though I've never seen that actually happen). Now that Microsoft has given us the views in the sys schema and a wide variety of table valued functions for metadata (see Appendix [CHECK] for more on those), it is downright silly to go directly against the system tables. I highly recommend that you migrate old code that may be accessing the system tables directly to utilize the equivalent view (usually easily found by just prefixing the old system table name with "sys.").
+
+sys.syscomments provides an actual view of your underlying source code, and thus provides something you can join directly to if you so choose. Like OBJECT_DEFINITION(), any use of sys.syscomments is going to require you to know your object's id. You can either join to the sys.objects system view, much as I did in the previous example, or utilize the OBJECT_ID() function as I did in the example before that. Note, however, that, when using sys.objects, you need to treat the object name and schema name separately (which means that you also need to involve the sys.schemas system view). For example:
+
+SELECT sc.text
+
+FROM sys.syscomments sc
+
+JOIN sys.objects so
+
+ON sc.id = so.object_id
+
+JOIN sys.schemas ss
+
+ON so.schema_id = ss.schema_id
+
+WHERE so.name = 'vStateProvinceCountryRegion'
+
+AND ss.name = 'Person';
+
+Again, you get the same block of code we saw in the previous two methods.
+
+Protecting Code: Encrypting Views
+
+If you're building any kind of commercial software product, odds are that you're interested in protecting your source code. All you have to do to encrypt your view (and most other forms of server stored code) is use the WITH ENCRYPTION option. This one has a couple of tricks to it if you're used to the WITH CHECK OPTION clause:
+
+ * WITH ENCRYPTION goes after the name of the view, but before the AS keyword.
+ * WITH ENCRYPTION does not use the OPTION keyword.
+
+In addition, remember that if you use an ALTER VIEW statement, you are entirely replacing the existing view except for access rights. This means that the encryption is also replaced. If you want the altered view to be encrypted, then you must use the WITH ENCRYPTION clause in the ALTER VIEW statement.
+
+Let's do an ALTER VIEW on the CustomerOrders_vw view that we created earlier in the chapter. If you haven't yet created the CustomerOrders_vw view, then just change the ALTER to CREATE (don't forget to run this against AdventureWorks2008):
+
+ALTER VIEW CustomerOrders_vw
+
+WITH ENCRYPTION
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID;
+
+Now do an sp_helptext on our CustomerOrders_vw:
+
+EXEC sp_helptext CustomerOrders_vw;
+
+SQL Server promptly tells us that it can't do what we're asking:
+
+The text for object 'CustomerOrders_vw' is encrypted.
+
+The heck you say, and promptly go to the sys.syscomments view:
+
+SELECT sc.text
+
+FROM sys.syscomments sc
+
+JOIN sys.objects so
+
+ON sc.id = so.object_id
+
+JOIN sys.schemas ss
+
+ON so.schema_id = ss.schema_id
+
+WHERE so.name = 'CustomerOrders_vw'
+
+AND ss.name = 'dbo';
+
+But that doesn't get you very far either—SQL Server recognizes that the view was encrypted and will give you a NULL result.
+
+In short—your code is safe and sound. Even if you pull it up in other viewers (such as Management Studio, which actually won't even give you the Design option on an encrypted view), you'll find it useless.
+
+Make sure you store your source code somewhere before using the WITH ENCRYPTION option. Once it's been encrypted, there is no way to get it back. If you haven't stored your code away somewhere and you need to change it, then you may find yourself rewriting it from scratch.
+
+About Schema Binding
+
+Schema binding essentially takes the things that your view is dependent upon (tables or other views), and "binds" them to that view. The significance of this is that no one can make alterations to those objects (ALTER, DROP) unless they drop the schema-bound view first.
+
+Why would you want to do this? Well, there are a few reasons why this can come in handy:
+
+ * It prevents your view from becoming "orphaned" by alterations in underlying objects. Imagine, for a moment, that someone performs a DROP or makes some other change (even deleting a column could cause your view grief) but doesn't pay attention to your view. Oops. If the view is schema bound, then this is prevented from happening.
+ * To allow indexed views. If you want an index on your view, you must create it using the SCHEMABINDING option. (We'll look at indexed views just a few paragraphs from now.)
+ * If you are going to create a schema-bound user-defined function (and there are instances where your UDF must be schema bound) that references your view, then your view must also be schema bound.
+
+Keep these in mind as you are building your views.
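+
+Here is a minimal sketch of schema binding doing its job (the table and view names are mine, created purely for this illustration):
+
+CREATE TABLE BoundExample
+
+(
+
+BoundID int NOT NULL PRIMARY KEY,
+
+BoundValue varchar(50) NOT NULL
+
+);
+
+GO
+
+CREATE VIEW BoundExample_vw
+
+WITH SCHEMABINDING
+
+AS
+
+SELECT BoundID, BoundValue
+
+FROM dbo.BoundExample;
+
+GO
+
+-- Both of these now fail with an error along the lines of "Cannot
+-- ALTER/DROP ... because it is being referenced by object 'BoundExample_vw'"
+ALTER TABLE dbo.BoundExample DROP COLUMN BoundValue;
+
+DROP TABLE dbo.BoundExample;
+
+Notice the two-part name (dbo.BoundExample) in the view's SELECT; as we're about to see with indexed views, schema binding insists on it.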
+
+Making Your View Look Like a Table with VIEW_METADATA
+
+This option has the effect of making your view look very much like an actual table to DB-LIB, ODBC, and OLE-DB clients. Without this option, the metadata passed back to the client API is that of the base table(s) that your view relies on.
+
+Providing this metadata information is required to allow for any client-side cursors (cursors your client application manages) to be updatable. Note that, if you want to support such cursors, you're also going to need to use an INSTEAD OF trigger.
+
+Indexed (Materialized) Views
+
+In SQL Server 2000, this one was supported only in the Enterprise Edition (okay, the Developer and Evaluation Editions also supported it, but you aren't allowed to use test and development editions in production systems). It is, however, supported in all editions since SQL Server 2005.
+
+When a view is referred to, the logic in the query that makes up the view is essentially incorporated into the calling query. Unfortunately, this means that the calling query just gets that much more complex. The extra overhead of figuring out the impact of the view (and what data it represents) on the fly can actually get very high. What's more, you're often adding additional joins into your query in the form of the tables that are joined in the view. Indexed views give you a way of taking care of some of this impact before the query is ever run.
+
+An indexed view is essentially a view that has had a set of unique values "materialized" into the form of a clustered index. The advantage of this is that it provides a very quick lookup in terms of pulling the information behind a view together. After the first index (which must be a clustered index against a unique set of values), SQL Server can also build additional indexes on the view using the cluster key from the first index as a reference point. That said, nothing comes for free—there are some restrictions about when you can and can't build indexes on views (I hope you're ready for this one—it's an awfully long list!):
+
+ * The view must use the SCHEMABINDING option.
+ * If it references any user-defined functions (more on these later in the book), then these must also be schema bound.
+ * The view must not reference any other views—just tables and UDFs.
+ * All tables and UDFs referenced in the view must utilize a two-part naming convention (three-part and four-part names are not allowed)—for example, dbo.Customers or BillyBob.SomeUDF—and must also have the same owner as the view.
+ * The view must be in the same database as all objects referenced by the view.
+ * The ANSI_NULLS and QUOTED_IDENTIFIER options must have been turned on (using the SET command) at the time the view and all underlying tables were created.
+ * Any functions referenced by the view must be deterministic.
+
+To create an example indexed view, let's start by reviewing the CustomerOrders_vw object that we created earlier in the chapter. I'm showing this using the ALTER statement we used in the section on encryption, but, really, it could just as easily be the original version we created very early in the chapter as long as the WITH SCHEMABINDING is properly added.
+
+ALTER VIEW CustomerOrders_vw
+
+WITH SCHEMABINDING
+
+AS
+
+SELECT o.SalesOrderID,
+
+o.OrderDate,
+
+od.ProductID,
+
+p.Name,
+
+od.OrderQty,
+
+od.UnitPrice,
+
+od.LineTotal
+
+FROM Sales.SalesOrderHeader AS o
+
+JOIN Sales.SalesOrderDetail AS od
+
+ON o.SalesOrderID = od.SalesOrderID
+
+JOIN Production.Product AS p
+
+ON od.ProductID = p.ProductID;
+
+Some important things to notice here are:
+
+ * We had to make our view use the SCHEMABINDING option.
+ * In order to utilize the SCHEMABINDING option, we must have two-part naming for the objects (in this case, all tables) that we reference (in this case, we did anyway, but not all views you come across will already be configured that way).
+
+This is really just the beginning—we don't have an indexed view as yet. Instead, what we have is a view that can be indexed. When we create the index, the first index created on the view must be both clustered and unique.
+
+CREATE UNIQUE CLUSTERED INDEX ivCustomerOrders
+
+ON CustomerOrders_vw(SalesOrderID, ProductID, Name);
+
+Once this command has executed, we have an indexed view. We also, however, have a small problem that will become clear in just a moment.
+
+Let's test our view by running a simple SELECT against it:
+
+SELECT * FROM CustomerOrders_vw;
+
+If you execute this, you'll see that the graphical showplan, as shown in Figure 8.1, shows us using our new index. (Display Estimated Execution Plan is the tooltip for this; you'll find it toward the center of the toolbar, or in the menus at Query ⇒ Display Estimated Execution Plan.)
+
+Figure 8.1
+
+The index supporting an indexed view may be utilized by SQL Server even if you do not explicitly use the view. For example, if you are performing joins that are similar to those the index is supporting for the view, SQL Server may recognize this and utilize the index.
+
+Partitioned Views
+
+These have been around since SQL Server 2000, but Microsoft has, since 2005, considered partitioned tables to be the preferred partitioning method. I bring partitioned views up here because they were one of the leading scalability options put forth by Microsoft for many years, and you need to know how they work in case you run into them in legacy code. In addition, there are some partitioning problems that are difficult, if not impossible, to solve utilizing partitioned tables, so it's good to know and understand another option.
+
+A partitioned view is a view that unifies multiple identical (in terms of structure—not actual data) tables and makes them appear to be a single table. At first, this seems like an easy thing to do with simple UNION clauses, but the concept actually becomes somewhat tricky when you go to handle insert and update scenarios.
+
+With partitioned views, we define a constraint on one of the tables in our view. We then define a similar, but mutually exclusive, constraint on a second (and possibly many more) table. When you build the view that unifies these mutually exclusive tables, SQL Server is able to sort out the exclusive nature of the tables in a logical manner. By doing this, SQL Server can determine exactly which table is to get the new data (by determining which table can accept the data—if you created them as mutually exclusive as you should have, then the data will be able to get into only one table and there is no conflict). The only catch is that the so-called "partitioning column" must participate in the primary key. Let's see how this works by building our own little mini-sample.
+
+Imagine for a moment that you are running a very large Internet site, and you are taking in thousands of orders daily. Your Orders table is getting to be huge, and your purge job (to delete older records) is causing blocking issues while the DELETE statement is running.
+
+By utilizing partitioned views (or, as we'll learn later, partitioned tables), we can essentially silo our data such that we can spread the data out physically (by using different filegroups, or even different servers for each table) and have SQL Server sort out where everything is supposed to go.
+
+Here's what a two-month set of data might look like:
+
+CREATE TABLE OrderPartitionJan08
+
+(OrderID int NOT NULL,
+
+OrderDate date NOT NULL
+
+CONSTRAINT CKIsJanOrder
+
+CHECK (OrderDate >= '2008/01/01'
+
+AND OrderDate < '2008/02/01'),
+
+CustomerID int NOT NULL,
+
+CONSTRAINT PKOrderIDOrderDateJan
+
+PRIMARY KEY (OrderID, OrderDate)
+
+);
+
+CREATE TABLE OrderPartitionFeb08
+
+(OrderID int NOT NULL,
+
+OrderDate date NOT NULL
+
+CONSTRAINT CKIsFebOrder
+
+CHECK (OrderDate >= '2008/02/01'
+
+AND OrderDate < '2008/03/01'),
+
+CustomerID int NOT NULL,
+
+CONSTRAINT PKOrderIDOrderDateFeb
+
+PRIMARY KEY (OrderID, OrderDate)
+
+);
+
+GO
+
+CREATE VIEW Orders
+
+AS
+
+SELECT *
+
+FROM OrderPartitionJan08
+
+UNION ALL
+
+SELECT *
+
+FROM OrderPartitionFeb08;
+
+Once we have created these tables along with the view that unites them into a partitioned view, we're ready to insert a few rows of data:
+
+INSERT INTO Orders
+
+VALUES
+
+(1, '2008-01-15', 1),
+
+(2, '2008-02-15', 1);
+
+Orders is a view, and therefore has no data of its own—so where does the data go? Under the covers, SQL Server analyzes the data being inserted and figures out that, based on the constraints in our tables, the data can, in each case, go to one and only one table. Let's check that out with a few queries:
+
+SELECT * FROM Orders;
+
+SELECT * FROM OrderPartitionJan08;
+
+SELECT * FROM OrderPartitionFeb08;
+
+This gets us, in order, both rows we inserted, then the row from January, then the one from February.
+
+OrderID OrderDate CustomerID
+
+----------- ---------- -----------
+
+1 2008-01-15 1
+
+2 2008-02-15 1
+
+(2 row(s) affected)
+
+OrderID OrderDate CustomerID
+
+----------- ---------- -----------
+
+1 2008-01-15 1
+
+(1 row(s) affected)
+
+OrderID OrderDate CustomerID
+
+----------- ---------- -----------
+
+2 2008-02-15 1
+
+(1 row(s) affected)
+
+As you can see, our data has been split up into separate tables based on the partitioning column. We can easily create additional tables to partition our data into (for example, an OrderPartitionMar08 table) and then alter our view to union in the additional table. Likewise, we can easily remove a block of data by excluding it from the view and then dropping the table.
+
+You can also spread the tables that support the partitioned view over multiple servers utilizing linked servers. This distributes the query load for those tables out to the various servers that house them, and is usually referred to as a "distributed partitioned view." The servers that support a given distributed partitioned view are said to be "federated."
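+
+As a rough sketch of what the distributed flavor looks like (the server and database names here are hypothetical, and real distributed partitioned views carry a number of additional requirements, such as linked server definitions and matching settings on each member server), the only structural difference is that remote member tables are referenced with four-part names:
+
+CREATE VIEW DistributedOrders
+
+AS
+
+SELECT * FROM OrderPartitionJan08 -- local member table
+
+UNION ALL
+
+SELECT * FROM ServerB.OrdersDB.dbo.OrderPartitionFeb08; -- remote member table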
+
+Summary
+
+Views tend to be either the most overused or most underused tools in most of the databases I've seen. Some people like to use them to abstract seemingly everything (often forgetting that they are adding another layer to the process when they do this). Others just seem to forget that views are even an option. Personally, as with most things, I think you should use a view when it's the right tool to use—not before, not after.
+
+Common uses for views include:
+
+ * Filtering rows
+ * Protecting sensitive data
+ * Reducing database complexity
+ * Abstracting multiple physical databases into one logical database
+ * Creating indexes that effectively pre-join data between tables
+
+Things to remember with views include:
+
+ * Stay away from building views based on views—instead, adapt the appropriate query information from the first view into your new view.
+ * Remember that a view using the WITH CHECK OPTION provides some flexibility that can't be duplicated with a normal CHECK constraint.
+ * Encrypt views when you don't want others to be able to see your source code—either for commercial products or general security reasons.
+ * Using an ALTER VIEW completely replaces the existing view other than permissions. This means you must include the WITH ENCRYPTION and WITH CHECK OPTION clauses in the ALTER statement if you want encryption and restrictions to be in effect in the altered view.
+ * Use the OBJECT_DEFINITION() system function to display the supporting code for a view—avoid using system tables.
+ * Minimize the use of views for production queries—they can add additional overhead and hurt performance.
+ * Indexing a view puts additional load on any data modification process that affects the data participating in the indexed view.
+ * Distributed partitioned views can be utilized to distribute data and query load across multiple servers, but, for single server partitioning, partitioned tables are typically a better choice.
+
+In our next chapter, we'll take a look at batches and scripting. Batches and scripting will lead us right into stored procedures and user-defined functions—the closest thing that SQL Server has to its own programs.
+9
+
+Scripts and Batches
+
+Geez. I've been writing too long. For some reason, when I see the phrase "Scripts and Batches" it reminds me of the old song "Love and Marriage" (Frank Sinatra, for the curious). While scripts and batches do go together like a horse and carriage, they are hardly as lyrical—but I digress....
+
+We have, of course, already written many SQL scripts in this book. My assumption, given that this is a "Professional" level book, is that you already have most of the script basics down. After all, every CREATE statement that you write, every ALTER, every SELECT is all (if you're running a single statement) or part (multiple statements) of a script. It is, however, rather difficult to get excited about a script with one line in it. Could you imagine Hamlet's "To be, or not to be...?" if it had never had the following lines? We wouldn't have any context for what he was talking about.
+
+SQL scripts are much the same way. Things get quite a bit more interesting when we string several commands together into a longer script—a full play, or at least an act, to finish our Shakespeare analogy. Now imagine that we add a richer set of language elements from .NET to the equation. Now we're ready to write an epic!
+
+Scripts generally have a unified goal. That is, all the commands that are in a script are usually building up to one overall purpose. Examples include scripts to build a database (these might be used for a system installation), scripts for system maintenance (such as backups and Database Consistency Checker utilities (DBCCs)), and scripts for anything where several commands are usually run together.
+
+We will be reviewing the notion of scripts during this chapter, and adding in the notion of batches, which control how SQL Server groups your commands together. In addition, we will take a look at SQLCMD, the command-line utility, and how it relates to scripts.
+
+SQLCMD was introduced as the new command-line scripting tool in SQL Server 2005. For backward compatibility only, SQL Server continues to support osql.exe (the previous tool that did command-line work). You may also see references to isql.exe, which served this same function in earlier releases. (Do not confuse this with isqlw.exe.) Isql.exe is no longer supported, but, since the options are pretty much the same, migration to osql or SQLCMD is generally not that difficult.
+
+Script Basics
+
+A script technically isn't a script until you store it in a file where it can be pulled up and reused. SQL scripts are stored as text files. SQL Server Management Studio provides many tools to help you with your script writing, but, technically, you can do the writing in any text editor. Keep in mind, however, that to actually test your script, you're going to have to use something that can connect to a SQL Server. With SQL Server 2008, the Management Studio gains the additional advantage of supporting IntelliSense.
+
+I continue to occasionally make use of a highly robust text editor for its ability to handle regular expressions and other text-editing features that Management Studio, and even Visual Studio, will likely never have. That said, the Management Studio has, as it has added more features, become my preferred tool for editing SQL scripts for SQL Server.
+
+Scripts are usually treated as a unit. That is, you are normally executing the entire script or nothing at all. They can make use of both system functions and local variables. As an example, let's look at a simple script that could be used to INSERT order records into a typical order header and order detail table scenario:
+
+USE SomeDatabase
+
+DECLARE @Ident int
+
+INSERT INTO Orders
+
+(CustomerID,OrderDate)
+
+VALUES
+
+(25, DATEADD(day,-1,GETDATE())) -- this always sets the OrderDate to yesterday
+
+SELECT @Ident = @@IDENTITY
+
+INSERT INTO Details
+
+(OrderID, ProductID, UnitPrice, Quantity)
+
+VALUES
+
+(@Ident, 1, 50, 25)
+
+SELECT 'The OrderID of the INSERTed row is ' + CONVERT(varchar(8),@Ident)
+
+We have six distinct commands working here, covering a range of different things that we might do in a script. We're using both system functions and local variables, the USE statement, INSERT statements, and both assignment and regular versions of the SELECT statement. They are all working in unison to accomplish one task—to insert complete orders into the database.
+
+Batches
+
+A batch is a grouping of T-SQL statements into one logical unit, and, while this seems a pretty basic concept (indeed, I cover it at length in my Beginning title), I find it to be one of the more frequently misunderstood concepts in SQL Server, even among experienced administrators and developers.
+
+All of the statements within a batch are combined into one execution plan, so all statements are parsed together and must pass a validation of the syntax or none of the statements will execute. Note, however, that this does not prevent runtime errors from happening. In the event of a runtime error, any statement that has been executed prior to the runtime error will still be in effect. To summarize, if a statement fails at parse-time, then nothing runs. If a statement fails at runtime, then all statements up to the statement that generated the error have already run.
+
+All the scripts we have run up to this point are made up of one batch each. Even the script we've been analyzing so far in this chapter is just one batch. To separate a script into multiple batches, we make use of the GO statement. The GO statement:
+
+ * Must be on its own line (nothing other than a comment can be on the same line); there is an exception to this discussed shortly, but think of a GO as needing to be on a line to itself
+ * Causes all statements since the beginning of the script or the last GO statement (whichever is closer) to be compiled into one execution plan and sent to the server independently of any other batches
+ * Is not a T-SQL command, but, rather, a command recognized by the various SQL Server command utilities (such as SQLCMD, OSQL, and Management Studio)
+
+A Line to Itself
+
+The GO command should stand alone on its own line. Technically, you can start a new batch on the same line after the GO command, but you'll find this puts a serious damper on readability. T-SQL statements cannot precede the GO statement, or the GO statement will often be misinterpreted and cause either a parsing error or some other unexpected result. For example, if I use a GO statement after a WHERE clause:
+
+SELECT * FROM Customers WHERE CustomerID = 2 GO
+
+The parser becomes somewhat confused:
+
+Msg 102, Level 15, State 1, Line 1
+
+Incorrect syntax near 'GO'.
+
+Each Batch Is Sent to the Server Separately
+
+Because each batch is processed independently, an error in one batch does not preclude another batch from running. To illustrate, take a look at some code:
+
+USE AdventureWorks2008;
+
+DECLARE @MyVarchar varchar(50); --This DECLARE only lasts for this batch!
+
+SELECT @MyVarchar = 'Honey, I''m home...';
+
+PRINT 'Done with first Batch...';
+
+GO
+
+PRINT @MyVarchar; --This generates an error since @MyVarchar
+
+--isn't declared in this batch
+
+PRINT 'Done with second Batch';
+
+GO
+
+PRINT 'Done with third batch'; -- Notice that this still gets executed
+
+-- even after the error
+
+GO
+
+If there were any dependencies between these batches, then either everything would fail—or, at the very least, everything after the point of error would fail—but it doesn't. Look at the results if you run the preceding script:
+
+Done with first Batch...
+
+Msg 137, Level 15, State 2, Line 2
+
+Must declare the scalar variable "@MyVarchar".
+
+Done with third batch
+
+Again, each batch is completely autonomous in terms of runtime issues. Keep in mind though that you can build in dependencies in the sense that one batch may try to perform work that depends on the first batch being complete. We'll see some of this in the next section when we talk about what can and can't span batches.
+
+GO Is Not a T-SQL Command
+
+Thinking that GO is a T-SQL command is a common mistake. GO is a command that is recognized only by the editing tools (Management Studio, SQLCMD). If you use a third-party tool, then it may or may not support the GO command, but most that claim SQL Server support will.
+
+When the editing tool encounters a GO statement, it sees it as a flag to terminate that batch, package it up, and send it as a single unit to the server, without including the GO. That's right; the server itself has absolutely no idea what GO is supposed to mean.
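+
+You can see this for yourself right from a query window with a small (and admittedly contrived) sketch: EXEC ships its string to the server as a batch with no client tool in the middle, so a GO buried in that string reaches the parser as-is.
+
+-- The server's parser, not the tool, sees the GO here and rejects it;
+-- you should get something along the lines of "Incorrect syntax near 'GO'."
+EXEC ('SELECT BusinessEntityID FROM Person.Person WHERE BusinessEntityID = 1 GO');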
+
+If you try to execute a GO command in a pass-through query using ODBC, OLE DB, ADO, ADO.NET, or any other access method, you'll get an error message back from the server. The GO is merely an indicator to the tool that it is time to end the current batch, and time, if appropriate, to start a new one. In the case of the aforementioned access methods, they each have the concept of a "command" object. That command object may include multiple statements, but each execution of the command object is implied to represent exactly one batch.
+
+Keep this notion in mind if you are building scripts you want to be compatible with other RDBMSs. Your non-SQL Server target system will likely fail if you pass it the GO keyword.
+
+Errors in Batches
+
+Errors in batches fall into two categories:
+
+ * Syntax errors
+ * Runtime errors
+
+If the query parser finds a syntax error, the processing of that batch is canceled immediately. Since syntax checking happens before the batch is compiled or executed, a failure during the syntax check means none of the batch will be executed, regardless of the position of the syntax error within the batch.
+
+Runtime errors work quite a bit differently. Any statement that has already executed before the runtime error was encountered is already done, so anything that statement did will remain intact unless it is part of an uncommitted transaction. (Transactions are covered in Chapter 11, but the relevance here is that they imply an all or nothing situation.) What happens beyond the point of the runtime error depends on the nature of the error. Generally speaking, runtime errors will terminate execution of the batch from the point where the error occurred to the end of the batch. Some runtime errors, such as a referential-integrity violation, will prevent only the offending statement from executing; all other statements in the batch will still be executed. This latter scenario is why error checking is so important. We will cover error checking in full in our chapter on stored procedures (see Chapter 10).
+
+When to Use Batches
+
+Batches have several purposes, but they all have one thing in common: They are used when something has to happen either before or separately from everything else in your script.
+
+Statements That Require Their Own Batch
+
+There are several commands that absolutely must be part of their own batch. These include:
+
+ * CREATE DEFAULT
+ * CREATE FUNCTION
+ * CREATE PROCEDURE
+ * CREATE RULE
+ * CREATE SCHEMA
+ * CREATE TRIGGER
+ * CREATE VIEW
+
+If you want to combine any of these statements with other statements in a single script, then you will need to break them up into their own batch by using a GO statement.
+
+Note that, if you DROP an object, you may want to place the DROP in its own batch or at least with a batch of other DROP statements. Why? Well, if you're going to create an object later with the same name, the CREATE will fail during the parsing of your batch unless the DROP has already happened. That means you need to run the DROP in a separate and prior batch so it will be complete when the batch with the CREATE statement executes.
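+
+A minimal illustration of the rule (the view name is mine, invented for the example). Run as a single batch, this fails to parse, with an error along the lines of "'CREATE VIEW' must be the first statement in a query batch":
+
+USE AdventureWorks2008;
+
+CREATE VIEW TrivialPerson_vw
+
+AS
+
+SELECT FirstName FROM Person.Person;
+
+Add a GO, so the CREATE VIEW gets a batch to itself, and the same script runs fine:
+
+USE AdventureWorks2008;
+
+GO
+
+CREATE VIEW TrivialPerson_vw
+
+AS
+
+SELECT FirstName FROM Person.Person;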
+
+Using Batches to Establish Precedence
+
+Perhaps the most likely scenario for using batches is when precedence is required—that is, you need one task to be completely done before the next task starts. Most of the time, SQL Server deals with this kind of situation just fine. The first statement in the script is the first executed, and the second statement in the script can rely on the server being in the proper state when the second statement runs. There are times, however, when SQL Server can't resolve this kind of issue.
+
+Let's take the example of creating a database together with some tables:
+
+CREATE DATABASE Test;
+
+CREATE TABLE TestTable
+
+(
+
+col1 int,
+
+col2 int
+
+);
+
+Execute this and, at first, it appears that everything has gone well:
+
+Command(s) completed successfully.
+
+However, things are not as they seem. Check out the INFORMATION_SCHEMA in the Test database, and you'll notice something is missing:
+
+SELECT TABLE_CATALOG
+
+FROM INFORMATION_SCHEMA.TABLES
+
+WHERE TABLE_NAME = 'TestTable';
+
+TABLE_CATALOG
+
+-----------------------------------------------------------------------------
+
+master
+
+(1 row(s) affected)
+
+Hey! Why was the table created in the wrong database? The answer lies in what database was current when we ran the CREATE TABLE statement. In our case, it happened to be the master database, so that's where our table was created.
+
+Note that you may have been somewhere other than the master database when you ran this, so you may get a different result. That's kind of the point though. You could be in pretty much any database. That's why making use of the USE statement is so important.
+
+When you think about it, this seems like an easy thing to fix. Just make use of the USE statement, but before we test our new theory, we have to get rid of the old (okay, not that old) database:
+
+USE MASTER;
+
+DROP DATABASE Test;
+
+We can then run our newly modified script:
+
+CREATE DATABASE Test;
+
+USE Test;
+
+CREATE TABLE TestTable
+
+(
+
+col1 int,
+
+col2 int
+
+);
+
+Unfortunately, this has its own problems:
+
+Msg 911, Level 16, State 1, Line 3
+
+Database 'Test' does not exist. Make sure that the name is entered correctly.
+
+The parser tries to validate our code and finds that we are referencing a database with a USE command that doesn't exist. Ahh, now we see the need for our batches. We need the CREATE DATABASE statement to be completed before we try to use the new database:
+
+CREATE DATABASE Test;
+
+GO
+
+USE Test;
+
+CREATE TABLE TestTable
+
+(
+
+col1 int,
+
+col2 int
+
+);
+
+Now things work a lot better. Our immediate results look the same:
+
+Command(s) completed successfully.
+
+But when we run our INFORMATION_SCHEMA query, things are confirmed:
+
+TABLE_CATALOG
+
+------------------------------------------------------------------------------
+
+Test
+
+(1 row(s) affected)
+
+Let's move on to another example that shows an even more explicit need for precedence.
+
+When you use an ALTER TABLE statement that significantly changes the type of a column or adds columns, you cannot make use of those changes until the batch that makes the changes has completed.
+
+If we add a column to our TestTable table in our Test database and then try to reference that column without ending the first batch:
+
+USE Test;
+
+ALTER TABLE TestTable
+
+ADD col3 int;
+
+INSERT INTO TestTable
+
+(col1, col2, col3)
+
+VALUES
+
+(1,1,1);
+
+We get an error message. SQL Server cannot resolve the new column name and therefore complains:
+
+Msg 207, Level 16, State 1, Line 6
+
+Invalid column name 'col3'.
+
+Add one simple GO statement after the ADD col3 int though, and everything works fine:
+
+(1 row(s) affected)
+
+SQLCMD
+
+SQLCMD is a utility that allows you to run scripts from a command prompt in a Windows command box. This can be very nice for executing conversion or maintenance scripts, as well as a quick-and-dirty way to capture query output to a text file.
+
+SQLCMD replaces the older OSQL. OSQL is still included with SQL Server for backward compatibility only. An even older command-line utility—ISQL—is no longer supported.
+
+The syntax for running SQLCMD from the command line includes a large number of different switches, and looks like this:
+
+sqlcmd
+
+[
+
+{ { -U <login id> [ -P <password> ] } | -E }
+
+]
+
+[ -S <server> [ \<instance> ] ] [ -H <workstation> ] [ -d <database> ]
+
+[ -l <login timeout> ] [ -w <column width> ] [ -a <packet size> ]
+
+[ -e ] [ -I ]
+
+[ -c <command terminator> ] [ -L [ c ] ] [ -q "<query>" ] [ -Q "<query>" ]
+
+[ -m <error level> ] [ -V <severity level> ] [ -W ] [ -u ] [ -r [ 0 | 1 ] ]
+
+[ -i <input file> ] [ -o <output file> ]
+
+[ -f <codepage> | i:<codepage> [ ,o:<codepage> ] ]
+
+[ -k [ 1 | 2 ] ]
+
+[ -y <display width> ] [ -Y <display width> ]
+
+[ -p [ 1 ] ] [ -R ] [ -b ] [ -v <var> = <value> ] [ -A ] [ -X [ 1 ] ] [ -x ]
+
+[ -? ]
+
+]
+
+The single biggest thing to keep in mind with these flags is that many of them (but, oddly enough, not all of them) are case sensitive. For example, both -Q and -q will execute queries, but the first will exit SQLCMD when the query is complete, and the second won't.
+
+So, let's try a quick query direct from the command line. Again, remember that this is meant to be run from the Windows command prompt (don't run it from inside Management Studio):
+
+SQLCMD -Usa -Pmypass -Q "SELECT * FROM AdventureWorks2008.HumanResources.Employee"
+
+The -P is the flag that indicates the password. If your server is configured with something other than a blank password (and it should be!), then you'll need to provide that password immediately following the -P with no space in between.
+
+If you run this from a command prompt, you should get something like 290 rows back. Now, let's create a quick text file to see how it works when including a file. At the command prompt, type the following:
+
+C:\>copy con testsql.sql
+
+This should take you down to a blank line (with no prompt of any kind), where you can enter this:
+
+SELECT * FROM AdventureWorks2008.HumanResources.Employee
+
+Then press F6 and Return (this ends the creation of our text file). You should get back a message like:
+
+1 file(s) copied.
+
+Now let's retry our earlier query, using a script file this time. The command line at the prompt has only a slight change to it:
+
+C:\>sqlcmd -Usa -Pmypass -i testsql.sql
+
+This should get us exactly the same results as when we ran the query using -Q. The major difference is, of course, that we took the command from a file. The file could have had hundreds—if not thousands—of different commands in it.
+
+There are a wide variety of different parameters for SQLCMD, but the most important are the login, the password, and the one that says what you want to do (straight query or input file). You can mix and match many of these parameters to obtain fairly complex behavior from this seemingly simple command-line tool.
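+
+As a small aside, if your server is set up for Windows authentication, the -E flag is worth knowing about; it uses your current Windows credentials, so no password ends up on the command line or in your command history. A quick sketch (the server name here is an assumption; substitute your own, and -o simply sends the results to a file):
+
+C:\>sqlcmd -E -S localhost -i testsql.sql -o results.txt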
+
+Dynamic SQL: Generating Your Code on the Fly with the EXEC Command
+
+Okay, so all this saving stuff away in scripts is all fine and dandy, but what if you don't know what code you need to execute until runtime?
+
+As a side note, notice that we are done with SQLCMD for now. The following examples should be run utilizing the Management Studio.
+
+SQL Server allows us, with a few gotchas, to build our SQL statement on the fly using string manipulation. The need to do this usually stems from not being able to know the details about something until runtime. The syntax looks like this:
+
+EXEC ({<string variable> | '<literal command string>'})
+
+Or:
+
+EXECUTE ({<string variable> | '<literal command string>'})
+
+As with executing a stored proc, whether you use EXEC or EXECUTE makes no difference.
+
+Let's build an example in the AdventureWorks2008 database by creating a dummy table from which to grab our dynamic information:
+
+USE AdventureWorks2008;
+
+GO
+
+--Create the table. We'll pull info from here for our dynamic SQL
+
+CREATE TABLE DynamicSQLExample
+
+(
+
+TableID int IDENTITY NOT NULL
+
+CONSTRAINT PKDynamicSQLExample
+
+PRIMARY KEY,
+
+SchemaName varchar(128) NOT NULL,
+
+TableName varchar(128) NOT NULL
+
+);
+
+GO
+
+/* Populate the table. In this case, we're grabbing every user
+
+** table object in this database */
+
+INSERT INTO DynamicSQLExample
+
+SELECT s.name AS SchemaName, t.name AS TableName
+
+FROM sys.schemas s
+
+JOIN sys.tables t
+
+ON s.schema_id = t.schema_id;
+
+This should get us a response something like:
+
+(78 row(s) affected)
+
+To quote the old advertising disclaimer: "actual results may vary." It's going to depend on which examples you've already followed along with in the book, which ones you haven't, and for which ones you took the initiative and did a DROP on once you were done with them. In any case, don't sweat it too much.
+
+Okay, so what we now have is a list of all the tables in our current database. Now let's say that we wanted to select some data from one of the tables, but we wanted to identify the table only at runtime by using its ID. For example, I'll pull out all the data for the table with an ID of 15:
+
+DECLARE @SchemaName varchar(128);
+
+DECLARE @TableName varchar(128);
+
+-- Now, grab the table name that goes with our ID
+
+SELECT @SchemaName = SchemaName, @TableName = TableName
+
+FROM DynamicSQLExample
+
+WHERE TableID = 15;
+
+-- Finally, pass that value into the EXEC statement
+
+EXEC ('SELECT * FROM ' + @SchemaName + '.' + @TableName);
+
+If your table names went into the DynamicSQLExample table the way mine did, then a TableID of 15 should equate to the ProductProductPhoto table. If so, you should wind up with something like this:
+
+ProductID ProductPhotoID Primary ModifiedDate
+
+----------- -------------- ------- -----------------------
+
+1 1 1 1998-05-02 00:00:00.000
+
+2 1 1 1998-05-02 00:00:00.000
+
+3 1 1 1998-05-02 00:00:00.000
+
+...
+
+...
+
+997 102 1 2003-06-01 00:00:00.000
+
+998 102 1 2003-06-01 00:00:00.000
+
+999 102 1 2003-06-01 00:00:00.000
+
+(504 row(s) affected)
+
+The Gotchas of EXEC
+
+Like most things that are of interest, using EXEC is not without its little trials and tribulations. Among the gotchas of EXEC are:
+
+ * It runs under a separate scope from the code that calls it—that is, the calling code can't reference variables inside the EXEC statement, and the EXEC can't reference variables in the calling code after they are resolved into the string for the EXEC statement.
+ * By default, it runs under the same security context as the current user—not that of the calling object. Use the EXECUTE AS option to override this.
+ * It runs under the same connection and transaction context as the calling object (we'll discuss this further with transactions in Chapter 11).
+ * Concatenation that requires a function call must be performed on the EXEC string prior to actually calling the EXEC statement. You can't do the concatenation of a function in the same statement as the EXEC call.
+ * EXEC cannot be used inside a user-defined function.
+
+Each of these can be a little difficult to grasp, so let's look at each individually.
+
+The Scope of EXEC
+
+Determining variable scope with the EXEC statement is something less than intuitive. The actual statement line that calls the EXEC statement has the same scope as the rest of the batch or procedure that the EXEC statement is running in, but the code that is performed as a result of the EXEC statement is considered to be in its own batch. As is so often the case, this is best shown with an example:
The Scope of EXEC

Determining variable scope with the EXEC statement is something less than intuitive. The actual statement line that calls the EXEC statement has the same scope as the rest of the batch or procedure in which the EXEC statement is running, but the code that is performed as a result of the EXEC statement is considered to be in its own batch. As is so often the case, this is best shown with an example:

USE AdventureWorks2008;

/* First, we'll declare two variables. One for stuff we're putting into
** the EXEC, and one that we think will get something back out (it won't)
*/
DECLARE @InVar varchar(50);
DECLARE @OutVar varchar(50);

-- Set up our string to feed into the EXEC command
SET @InVar = 'SELECT @OutVar = FirstName FROM Person.Person
WHERE BusinessEntityID = 1';

-- Now run it
EXEC (@InVar);

-- Now, just to show there's no difference, run the SELECT without using an input variable
EXEC ('SELECT @OutVar = FirstName FROM Person.Person WHERE BusinessEntityID = 1');

-- @OutVar will still be NULL because we haven't been able to put anything in it
SELECT @OutVar;

Now, look at the output from this:

Msg 137, Level 15, State 1, Line 1
Must declare the scalar variable '@OutVar'.
Msg 137, Level 15, State 1, Line 1
Must declare the scalar variable '@OutVar'.

--------------------------------------------------
NULL

(1 row(s) affected)

SQL Server wastes no time in telling us that we are scoundrels and clearly don't know what we're doing. Why do we get a "Must declare" error message when we have already declared @OutVar? Because we've declared it in the outer scope—not within the EXEC itself.

Let's look at what happens if we run things a little differently:

USE AdventureWorks2008;

-- This time, we only need one variable. It does need to be longer though.
DECLARE @InVar varchar(200);

/* Set up our string to feed into the EXEC command. This time we're going
** to feed it several statements at a time. They will all execute as one
** batch.
*/
SET @InVar = 'DECLARE @OutVar varchar(50)
SELECT @OutVar = FirstName FROM Person.Person
WHERE BusinessEntityID = 1
SELECT ''The Value Is '' + @OutVar';

-- Now run it
EXEC (@InVar);

This time we get back results closer to what we expect:

---------------------------------------------------------------
The Value Is Ken

Notice the way that I'm using two single quotation marks right next to each other to indicate that I really want a quotation mark rather than to terminate my string.

So, what we've seen here is that we have two different scopes operating, and never the twain shall meet. There is, unfortunately, no way to pass information between the inside and outside scopes without using an external mechanism such as a temporary table. If you decide to use a temp table to communicate between scopes, just remember that any temporary table created within the scope of your EXEC statement will live only for the life of that EXEC statement.

This behavior of a temp table lasting only for the life of the scope that created it will show up again when we are dealing with triggers and sprocs.
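As a quick, hedged illustration of that temp table approach (the table name #Comm is mine, not from the text): create the table in the outer scope, and the EXEC, being an inner scope, can still see it and write to it:

USE AdventureWorks2008;

-- Created in the OUTER scope, so it survives the EXEC
CREATE TABLE #Comm (OutVal varchar(50));

EXEC ('INSERT INTO #Comm (OutVal)
SELECT FirstName FROM Person.Person WHERE BusinessEntityID = 1');

-- The value is still here, because #Comm belongs to our scope, not the EXEC's
SELECT OutVal FROM #Comm;

DROP TABLE #Comm;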
A Small Exception to the Rule

There is one thing that happens inside the scope of the EXEC that can be seen after the EXEC is done—system functions. So, things like @@ROWCOUNT can still be used. Again, let's look at a quick example:

USE AdventureWorks2008;

EXEC('SELECT * FROM Sales.Customer');
SELECT 'The Rowcount is ' + CAST(@@ROWCOUNT as varchar);

This yields us (after the result set):

The Rowcount is 19820

Security Contexts and EXEC

When you give someone the right to run a stored procedure, you imply that he or she also gains the right to perform the actions called for within the sproc. For example, let's say we had a stored procedure that lists all the employees hired within the last year. Someone who has rights to execute the sproc can do so (and get results back) even if he or she does not have rights to the HumanResources.Employee table directly. This is really handy, as it allows you to grant access to information for a very specific need without granting more general access to the underlying object.

Developers usually assume that this same implied right is valid for an EXEC statement also—not necessarily. Indeed, by default, any reference made inside an EXEC statement will be run under the security context of the current user. So, let's say I have the right to run a procedure called spNewEmployee, but I do not have rights to the Employee table. If spNewEmployee gets the values by running a simple SELECT statement, then everything is fine. If, however, spNewEmployee uses an EXEC statement to execute that SELECT statement, the EXEC statement will fail because I don't have the rights to perform a SELECT on the Employee table.

Fortunately, we now have some (albeit limited) options to get around this by utilizing the EXECUTE AS option that was added beginning in SQL Server 2005. We'll discuss the specifics of how to do so as we work with security in Chapter 19, when we will discuss how to run under a specific user context.

The security context of an EXEC statement run within a stored procedure, user-defined function, or trigger can be overridden using the EXECUTE AS clause within the sproc, function, or trigger. EXECUTE AS will be discussed more fully when we discuss security in Chapter 19.

Use of Functions in Concatenation and EXEC

This one is actually more of a nuisance than anything else, since there is a reasonably easy workaround. Simply put, you can't run a function against your EXEC string in the argument for an EXEC. For example:

USE AdventureWorks2008;

-- This won't work
DECLARE @NumberOfLetters int;
SET @NumberOfLetters = 3;
EXEC('SELECT LEFT(LastName,' + CAST(@NumberOfLetters AS varchar) + ') AS FilingName
FROM Person.Person');
GO

-- But this does
DECLARE @NumberOfLetters AS int;
SET @NumberOfLetters = 3;
DECLARE @str AS varchar(255);
SET @str = 'SELECT LEFT(LastName,' + CAST(@NumberOfLetters AS varchar) + ') AS
FilingName FROM Person.Person';
EXEC(@str);

The first instance gets us an error message because the CAST function needs to be fully resolved prior to the EXEC line:

Msg 102, Level 15, State 1, Line 6
Incorrect syntax near 'CAST'.

But the second works just fine because the argument is already a complete string:

FilingName
----------
Abb
Abe
Abe
...
Zuk
Zwi
Zwi

(19972 row(s) affected)

EXEC and UDFs

In short, you can't get there from here. You are not allowed to use EXEC to run dynamic SQL within a UDF—period. (Using EXEC to run a sproc is, however, legal in a few cases.)
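Before we leave dynamic SQL entirely: although this section sticks to EXEC, SQL Server also provides the sp_executesql system stored procedure, which accepts genuine parameters rather than values concatenated into the string. A minimal sketch (the query itself is my own illustration, not from the text):

USE AdventureWorks2008;

-- @id travels as a real parameter instead of being built into the string
EXEC sp_executesql
N'SELECT FirstName FROM Person.Person WHERE BusinessEntityID = @id',
N'@id int',
@id = 1;

Among other things, this sidesteps the function-concatenation gotcha we just saw and reduces the opportunity for SQL injection when the values come from users.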
Control-of-Flow Statements

Control-of-flow statements are a veritable must for any programming language these days. I can't imagine having to write code without being able to change which commands run depending on a condition.

Given that we're assuming at least an intermediate knowledge of both programming and SQL, we're not going to dwell on these a lot, but since "intermediate" means different things to different people, we had best give these the once-over.

T-SQL offers most of the classic choices for control-of-flow situations, including:

 * IF...ELSE
 * GOTO
 * WHILE
 * WAITFOR
 * TRY/CATCH

We also have the CASE statement (a.k.a. SELECT CASE, DO CASE, and SWITCH/BREAK in other languages), but it doesn't have quite the level of control-of-flow capability that you've come to expect from other languages.

The IF...ELSE Statement

IF...ELSE statements work much as they do in any language, although I equate them most closely to C in the way they are implemented. The basic syntax is:

IF <Boolean expression>
<SQL statement> | BEGIN <code series> END
[ELSE
<SQL statement> | BEGIN <code series> END]

The expression can be pretty much any expression that evaluates to a Boolean.

This brings us back to one of the most common traps that I see SQL programmers fall into—improper use of NULLs. I can't tell you how often I have debugged stored procedures only to find a statement like:

IF @myvar = NULL

This will, of course, never be true on most systems (see the exception shortly) and will wind up bypassing all their NULL values. Instead, it needs to read:

IF @myvar IS NULL

The exception to this is dependent on whether you have set the ANSI_NULLS option ON or OFF. The default is ON, in which case you'll see the behavior described previously. You can change this behavior by setting ANSI_NULLS to OFF. I strongly recommend against this, since it violates the ANSI standard (it's also just plain wrong).

Note that only the very next statement after the IF will be considered to be conditional (as per the IF). You can include multiple statements as part of your control-of-flow block using BEGIN...END, but we'll discuss that one a little later in the chapter.

To show off a simple version of this, let's run an example that's very common in build scripts. Imagine for a moment that we want to CREATE a table if it's not there, but leave it alone if it already exists. We could make use of the EXISTS operator. (You may recall my complaint that the Books Online calls EXISTS a keyword when I consider it an operator.)

-- We'll run a SELECT for our table to start with to prove it's not there
SELECT 'Found Table ' + s.name + '.' + t.name
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest';

-- Now we'll run our conditional CREATE statement
IF NOT EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);

-- And now look again to prove that it's been created.
SELECT 'Found Table ' + s.name + '.' + t.name
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest';
The meat of this is in the middle. Notice that our CREATE TABLE statement runs only if no matching table already exists:

------------------------------------------------------------------------------

(0 row(s) affected)

------------------------------------------------------------------------------
Found Table dbo.OurIFTest

(1 row(s) affected)

The ELSE Clause

Now this thing about being able to run a statement conditionally is just great, but it doesn't really deal with all the scenarios we might want to handle. Quite often—indeed, most of the time—when we deal with an IF condition, we have specific statements we want to execute not just for the true condition, but also a separate set of statements that we want to run if the condition is false—the ELSE condition.

You will run into situations where a Boolean cannot be evaluated—that is, the result is unknown (for example, if you are comparing to a NULL). Any expression that returns a result that would be considered an unknown result will be treated as FALSE.

The ELSE statement works pretty much as it does in any other language. The exact syntax may vary slightly, but the nuts and bolts are still the same: the statements in the ELSE clause are executed if the statements in the IF clause are not.

To expand our earlier example just a bit, let's actually print a warning message if we do not create our table:

-- Now we'll run our conditional CREATE statement
IF NOT EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);
ELSE
PRINT 'WARNING: Skipping CREATE as table already exists';

If you have already run the preceding example, then the table will already exist, and running this second example should get you the warning message:

WARNING: Skipping CREATE as table already exists
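One more pattern worth noting before we move on (my own quick sketch, not from the text): because ELSE simply governs the next statement, that statement can itself be another IF, which gives you the familiar ELSE IF chain:

DECLARE @myvar int;
SET @myvar = 5;

IF @myvar IS NULL
PRINT 'No value supplied';
ELSE IF @myvar < 0
PRINT 'Negative';
ELSE
PRINT 'Zero or positive'; -- this is what prints for 5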
Grouping Code into Blocks

Sometimes you need to treat a group of statements as though they were all one statement. (If you execute one, then you execute them all; otherwise, you don't execute any of them.) For instance, the IF statement will, by default, consider only the very next statement after the IF to be part of the conditional code. What if you want the condition to require several statements to run? Life would be pretty miserable if you had to create a separate IF statement for each line of code you wanted to run if the condition holds.

Thankfully, like most any language with an IF statement, SQL Server gives us a way to group code into blocks that are considered to all belong together. The block is started when you issue a BEGIN statement and continues until you issue an END statement. It works like this:

IF <expression>
BEGIN --First block of code starts here -- executes only if
--expression is TRUE
Statement that executes if expression is TRUE
Additional statements
...
...
Still going with statements from TRUE expression
IF <expression> --Only executes if this block is active
BEGIN
Statement that executes if both outside and inside
expressions are TRUE
Additional statements
...
...
Still statements from both TRUE expressions
END
Out of the condition from inner condition, but still
part of first block
END --First block of code ends here
ELSE
BEGIN
Statement that executes if expression is FALSE
Additional statements
...
...
Still going with statements from FALSE expression
END

Notice our ability to nest blocks of code. In each case, the inner blocks are considered to be part of the outer block of code. I have never heard of there being a limit to how many levels deep you can nest your BEGIN...END blocks, but I would suggest that you minimize them. There are definite practical limits to how deep you can go and still keep them readable—even if you are particularly careful about the formatting of your code.

Just to put this notion into play, let's make yet another modification to our table creation. This time, we're going to provide an informational message regardless of whether the table was created or not:

-- This time we're adding a check to see if the table DOES already exist
-- We'll remove it if it does so that the rest of our example can test the
-- IF condition. Just remove this first IF EXISTS block if you want to test
-- the ELSE condition below again.
IF EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
DROP TABLE OurIFTest;

-- Now we'll run our conditional CREATE statement
IF NOT EXISTS (
SELECT s.name AS SchemaName, t.name AS TableName
FROM sys.schemas s
JOIN sys.tables t
ON s.schema_id = t.schema_id
WHERE s.name = 'dbo'
AND t.name = 'OurIFTest'
)
BEGIN
PRINT 'Table dbo.OurIFTest not found.'
PRINT 'CREATING: Table dbo.OurIFTest'
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);
END
ELSE
PRINT 'WARNING: Skipping CREATE as table already exists';

Now, we've mixed all sorts of uses of the IF statement here. We have the most basic IF statement—with no BEGIN...END or ELSE. In our other IF statement, the IF portion uses a BEGIN...END block, but the ELSE does not.

I did one this way just to illustrate how you can mix them. That said, I recommend you go back to my old axiom of "be consistent." It can be really hard to tell which statement is being controlled by which IF...ELSE condition if you are mixing the way you group things. In practice, if I'm using BEGIN...END on any statement within a given IF, then I use it for every block of code in that IF statement, even if there is only one statement for that particular condition.

The CASE Statement

The CASE statement is, in some ways, the equivalent of one of several different statements, depending on the language from which you're coming. Statements in procedural programming languages that work in a similar way to CASE include:

 * Switch: C, C++, C#, Delphi
 * Select Case: Visual Basic
 * Do Case: Xbase
 * Evaluate: COBOL

I'm sure there are others; these are just from the languages that I've worked with in some form or another over the years. The big drawback in using a CASE statement in T-SQL is that it is, in many ways, more of a substitution operator than a control-of-flow statement.

There is more than one way to write a CASE statement: with an input expression or a Boolean expression. The first option is to use an input expression that will be compared with the value used in each WHEN clause. The SQL Server documentation refers to this as a simple CASE:

CASE <input expression>
WHEN <when expression> THEN <result expression>
[...n]
[ELSE <result expression>]
END

Option number two is to provide an expression with each WHEN clause that will evaluate to TRUE/FALSE.
The docs refer to this as a searched CASE:

CASE
WHEN <Boolean expression> THEN <result expression>
[...n]
[ELSE <result expression>]
END

Perhaps what's nicest about CASE is that you can use it "inline" with (that is, as an integral part of) a SELECT statement. This can actually be quite powerful.

A Simple CASE

A simple CASE takes an expression that equates to a Boolean result. Let's get right to an example:

USE AdventureWorks2008;
GO

SELECT TOP 10 SalesOrderID, SalesOrderID % 10 AS 'Last Digit', Position =
CASE SalesOrderID % 10
WHEN 1 THEN 'First'
WHEN 2 THEN 'Second'
WHEN 3 THEN 'Third'
WHEN 4 THEN 'Fourth'
ELSE 'Something Else'
END
FROM Sales.SalesOrderHeader;

For those of you who aren't familiar with it, the % operator is the modulus operator. A modulus works in a similar manner to divide (/), but it gives you only the remainder. Therefore, 16 % 4 = 0 (4 goes into 16 evenly), but 16 % 5 = 1 (16 divided by 5 has a remainder of 1). In the example, since we're dividing by 10, the modulus gives us the last digit of the number we're evaluating.

Let's see what we got with this:

SalesOrderID Last Digit Position
------------ ----------- --------------
75124 4 Fourth
43793 3 Third
51522 2 Second
57418 8 Something Else
43767 7 Something Else
51493 3 Third
72773 3 Third
43736 6 Something Else
51238 8 Something Else
53237 7 Something Else

(10 row(s) affected)

Notice that whenever there is a matching value in the list, the THEN clause is invoked. Since we have an ELSE clause, any value that doesn't match one of the previous values will be assigned whatever we've put in our ELSE. If we had left the ELSE out, then any such value would be given a NULL.

Let's go with one more example that expands on what we can use as an expression. This time, we'll use another column from our query:

USE AdventureWorks2008;
GO

SELECT TOP 10 SalesOrderID % 10 AS 'OrderLastDigit',
ProductID % 10 AS 'ProductLastDigit',
"How Close?" = CASE SalesOrderID % 10
WHEN ProductID % 1 THEN 'Exact Match!'
WHEN ProductID % 1 - 1 THEN 'Within 1'
WHEN ProductID % 1 + 1 THEN 'Within 1'
ELSE 'More Than One Apart'
END
FROM Sales.SalesOrderDetail
ORDER BY SalesOrderID DESC;

Notice that we've used equations at every step of the way on this one, yet it still works....

OrderLastDigit ProductLastDigit How Close?
-------------- ---------------- -------------------
4 5 More Than One Apart
3 2 More Than One Apart
3 9 More Than One Apart
3 8 More Than One Apart
2 2 More Than One Apart
2 8 More Than One Apart
1 7 Within 1
1 0 Within 1
1 1 Within 1
0 2 Exact Match!

(10 row(s) affected)

As long as the expression evaluates to a specific value that is of a type compatible with the input expression, it can be analyzed, and the proper THEN clause applied.

A Searched CASE

This one works pretty much the same as a simple CASE, with only two slight twists:

 * There is no input expression. (Remember, that's the part between the CASE and the first WHEN.)
 * The WHEN expression must evaluate to a Boolean value (whereas in the simple CASE examples we've just looked at, we used values such as 1, 3, and ProductID + 1).

Perhaps what I find the coolest about this kind of CASE is that we can completely change around what is forming the basis of our expression—mixing and matching column expressions, depending on our different possible situations.
As usual, I find the best way to get across how this works is via an example:

SELECT TOP 10 SalesOrderID % 10 AS 'OrderLastDigit',
ProductID % 10 AS 'ProductLastDigit',
"How Close?" = CASE
WHEN (SalesOrderID % 10) < 3 THEN 'Ends With Less Than Three'
WHEN ProductID = 6 THEN 'ProductID is 6'
WHEN ABS(SalesOrderID % 10 - ProductID) <= 1 THEN 'Within 1'
ELSE 'More Than One Apart'
END
FROM Sales.SalesOrderDetail
ORDER BY SalesOrderID DESC;

This is substantially different from our simple CASE examples, but it still works:

OrderLastDigit ProductLastDigit How Close?
-------------- ---------------- -------------------------
4 5 More Than One Apart
3 2 More Than One Apart
3 9 More Than One Apart
3 8 More Than One Apart
2 2 Ends With Less Than Three
2 8 Ends With Less Than Three
1 7 Ends With Less Than Three
1 0 Ends With Less Than Three
1 1 Ends With Less Than Three
0 2 Ends With Less Than Three

(10 row(s) affected)

These are a few of the things to pay particular attention to in how SQL Server evaluated things:

 * Even when two conditions evaluate to TRUE, only the first condition is used. For example, the second-to-last row meets both the first (the last digit is smaller than 3) and third (the last digit is within 1 of the ProductID) conditions. Many languages, including Visual Basic, work this way. If you're from the C world, however, you'll need to remember this when you are coding: no "break" statement is required—a CASE statement always terminates after one condition is met.
 * You can mix and match which fields you're using in your condition expressions. In this case, we used SalesOrderID, ProductID, and both together.
 * You can perform pretty much any expression, as long as, in the end, it evaluates to a Boolean result.

Looping with the WHILE Statement

The WHILE statement works much as it does in other languages to which you have probably been exposed. Essentially, a condition is tested each time you come to the top of the loop. If the condition is still TRUE, then the loop executes again; if not, you exit.

The syntax looks like this:

WHILE <Boolean expression>
<sql statement> |
[BEGIN
<statement block>
[BREAK]
<sql statement> |
[CONTINUE]
END]

While you can just execute one statement (much as you do with an IF statement), you'll almost never see a WHILE that isn't followed by a BEGIN...END with a full statement block.

The BREAK statement is a way of exiting the loop without waiting for the bottom of the loop to come and the expression to be re-evaluated.

I'm sure I won't be the last to tell you this, but using a BREAK is generally thought of as something of bad form in the classical sense. I tend to sit on the fence on this one. I avoid using them if reasonably possible. Most of the time, I can indeed avoid them just by moving a statement or two around, while still coming up with the same results. The advantage of this is usually more readable code. It is simply easier to handle a looping structure (or any structure, for that matter) if you have a single point of entry and a single exit. Using a BREAK violates this notion.

All that being said, sometimes you can actually make things worse by reformatting the code to avoid a BREAK. In addition, I've seen people write much slower code for the sake of not using a BREAK statement—bad idea.

The CONTINUE statement is something of the opposite of a BREAK statement. In short, it tells the WHILE loop to go back to the beginning. Regardless of where you are in the loop, you immediately go back to the top and re-evaluate the expression (exiting if the expression is no longer TRUE).
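Here is a tiny sketch of both keywords in action (mine, not the book's; the numbers are arbitrary). It counts from 1 to 10, skips the even numbers with CONTINUE, and bails out early with BREAK:

DECLARE @Counter int;
SET @Counter = 0;

WHILE @Counter < 10
BEGIN
SET @Counter = @Counter + 1;
IF @Counter % 2 = 0
CONTINUE; -- even number: back to the top of the loop
IF @Counter > 7
BREAK; -- past 7: exit the loop entirely
PRINT @Counter; -- prints 1, 3, 5, 7
END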
We'll go ahead and do something of a short example here just to get our feet wet. As I mentioned before, WHILE loops tend to be rare in non-cursor situations, so forgive me if this example seems lame.

What we're going to do is create something of a monitoring process using our WHILE loop and a WAITFOR command. (We'll look at the specifics of WAITFOR in the next section.) We're going to automatically update our statistics once per day:

WHILE 1 = 1
BEGIN
WAITFOR TIME '01:00'
EXEC sp_updatestats
RAISERROR('Statistics Updated for Database', 1, 1) WITH LOG
END

This would update the statistics for every table in our database every night at 1 AM and write a log entry of that fact to both the SQL Server log and the Windows application log. If you want to check whether this works, leave it running all night and then check your logs in the morning.

Note that using an infinite loop like this isn't the way you would normally want to schedule a task. If you want something to run every day, set up a job using Management Studio. In addition to not keeping a connection open all the time (which the preceding example would do), you also get the capability to make follow-up actions dependent on the success or failure of your script. Also, you can e-mail or net send messages regarding the completion status.

The WAITFOR Statement

There are often things that you either don't want to or simply can't have happen right this moment, but you also don't want to have to hang around waiting for the right time to execute something.

No problem—use the WAITFOR statement and have SQL Server wait for you. The syntax is incredibly simple:

WAITFOR
DELAY <'time'> | TIME <'time'>

The WAITFOR statement does exactly what it says it does. It waits for whatever you specify as the argument to occur. You can specify either an explicit time of day for something to happen, or you can specify an amount of time to wait before doing something.

The DELAY Parameter

The DELAY parameter choice specifies an amount of time to wait. You cannot specify a number of days—just time in hours, minutes, and seconds. The maximum allowed delay is 24 hours. So, for example:

WAITFOR DELAY '01:00'

would run any code prior to the WAITFOR, then reach the WAITFOR statement, and stop for one hour, after which execution of the code would continue with whatever the next statement was.

The TIME Parameter

The TIME parameter choice specifies to wait until a specific time of day. Again, we cannot specify any kind of date—just the time of day using a 24-hour clock. Once more, this gives us a one-day limit for the maximum amount of delay. For example:

WAITFOR TIME '01:00'

would run any code prior to the WAITFOR, then reach the WAITFOR statement, and stop until 1 AM, after which execution of the code would continue with whatever the next statement was after the WAITFOR.
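If you would rather not wait an hour to see the effect, here is a quick sanity check of my own (the five-second value is arbitrary). Note that DELAY accepts seconds as well:

SELECT GETDATE() AS TimeBefore;
WAITFOR DELAY '00:00:05'; -- hours:minutes:seconds
SELECT GETDATE() AS TimeAfter; -- roughly five seconds later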
TRY/CATCH Blocks

This is yet another one of those areas that I would consider critical when you're learning your basics, so, in theory, you should know it well by the time you get to the "Professional" level. That said, TRY/CATCH is still relatively new (it was added in SQL Server 2005), and, if you've grown up supporting an older application, you may not have seen this lovely addition, or you may have been avoiding it for backward compatibility reasons.

In days of yore (meaning anything before SQL Server 2005), our error-handling options were pretty limited. We could check for error conditions, but we had to do so proactively. Indeed, in some cases we could have errors that would cause us to leave our procedure or script without an opportunity to trap them at all. (This can still happen, but is much more limited.) We're going to save a fuller discussion of error handling for our stored procedures discussion in Chapter 10, but we'll touch on the fundamentals of TRY/CATCH blocks here.

A TRY/CATCH block in SQL Server works remarkably similarly to those used in any C-derived language (C, C++, C#, Delphi, and a host of others). The syntax looks like this:

BEGIN TRY
{ <sql statement(s)> }
END TRY
BEGIN CATCH
{ <sql statement(s)> }
END CATCH [ ; ]

In short, SQL Server will "try" to run anything within the BEGIN...END that goes with your TRY block. If, and only if, an error condition occurs that has an error level of 11–19, SQL Server will exit the TRY block immediately and begin with the first line in your CATCH block. Since there are more possible error levels than just 11–19, take a look at what we have:

Error Level | Nature
---|---
1–10 | Informational only. This would include things like context changes, such as settings being adjusted or NULL values found while calculating aggregates. These will not trigger a CATCH block, so if you need to test for this level of error, you'll need to do so manually by checking @@ERROR.
11–19 | Relatively severe errors, but ones that can be handled by your code (foreign key violations, as an example). Some of these can be severe enough that you are unlikely to want to continue processing (such as a memory exceeded error), but at least you can trap them and exit gracefully.
20–25 | Very severe. These are generally system-level errors. Your server-side code will never know this kind of error happened, as the script and connection will be terminated immediately, and the CATCH block will never execute.

Keep these in mind—if you need to handle errors outside the 11–19 level range, then you'll need to make other plans. The good news is that most errors we need to trap fall in that 11–19 range.
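Just to see the mechanics in isolation first, here is a minimal sketch of my own. A divide by zero raises error 8134 at severity 16, so it lands squarely in the CATCH block:

BEGIN TRY
SELECT 1 / 0; -- severity 16: control jumps straight to the CATCH block
END TRY
BEGIN CATCH
PRINT 'Caught error ' + CAST(ERROR_NUMBER() AS varchar); -- prints 8134
END CATCH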
Now, to test this out against real code, we'll make some alterations to the CREATE script that we built back when we were looking at IF...ELSE statements. You may recall that part of the reason for our original test to see whether the table already existed was to avoid creating an error condition that might have caused our script to fail. That kind of test is the way things have been done historically (and there really wasn't much in the way of other options). With the advent of TRY/CATCH blocks, we can just try the CREATE and then handle the error if one is given:

BEGIN TRY
-- Try and create our table
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
)
END TRY
BEGIN CATCH
-- Uh oh, something went wrong, see if it's something
-- we know what to do with
DECLARE @ErrorNo int,
@Severity tinyint,
@State smallint,
@LineNo int,
@Message nvarchar(4000)
SELECT
@ErrorNo = ERROR_NUMBER(),
@Severity = ERROR_SEVERITY(),
@State = ERROR_STATE(),
@LineNo = ERROR_LINE(),
@Message = ERROR_MESSAGE()
IF @ErrorNo = 2714 -- Object exists error, we knew this might happen
PRINT 'WARNING: Skipping CREATE as table already exists'
ELSE -- hmm, we don't recognize it, so report it and bail
RAISERROR(@Message, 16, 1)
END CATCH

Notice that I used some special functions to retrieve the error condition, so let's take a look at those.

Also note that I moved them into variables that I controlled so they would not be lost. I must admit this is a holdover habit from the days before TRY/CATCH, when you would lose the error code on the next statement. The functions used here persist within the scope of the particular CATCH block, so you are relatively safe against losing their values. The primary reason to move the values over, at this point, is if you want to utilize the error values after you exit the CATCH block.

Function | Returns
---|---
ERROR_NUMBER() | The actual error number. If this is a system error, there will be an entry in sys.messages that matches that error and contains some of the information you'll get from the other error-related functions.
ERROR_SEVERITY() | This equates to what is sometimes called "error level" in other parts of this book and Books Online. My apologies for the inconsistency; I'm guilty of perpetuating something that Microsoft started doing a version or two ago. Again, the severity must be 11–19 before the error will wind up in a CATCH block. (See the previous table in this chapter for further discussion of this.)
ERROR_STATE() | I use this as something of a place marker. This will always be 1 for system errors. When we discuss error handling in more depth in Chapter 10, you'll see how to raise your own errors. At that point, you can use state to indicate things like at what point in your stored procedure, function, or trigger the error occurred (this helps with situations where a given error can be handled in any one of many places).
ERROR_PROCEDURE() | We did not use this in the preceding example, as it is only relevant to stored procedures, functions, and triggers. This supplies the name of the procedure that caused the error—very handy if your procedures are nested at all, as the procedure that causes the error may not be the one to actually handle that error.
ERROR_LINE() | Just what it says—the line number of the error.
ERROR_MESSAGE() | The text that goes with the message. For system messages, this is the same as what you'll see if you select the message from the sys.messages catalog view. For user-defined errors, it will be the text supplied to the RAISERROR function.

In our example, I utilized a known error ID that SQL Server raises if we attempt to create an object that already exists. You can see all system error messages by selecting them from the sys.messages catalog view.

Beginning with SQL Server 2005, the sys.messages output grew so lengthy that it's hard to find what you're looking for by just scanning it. My solution is less than elegant but rather effective: I just artificially create the error I'm looking for and see what error number it gives me (simple solutions for simple minds like mine!).
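An alternative that avoids creating the error at all is to filter the view directly. A quick sketch of my own (the LIKE pattern is just an example):

SELECT message_id, severity, text
FROM sys.messages
WHERE language_id = 1033 -- English messages only
AND text LIKE '%already an object named%'; -- finds error 2714, among others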
Either way, the usage itself is simple: I execute the code I want to execute (in this case, the CREATE statement) and handle the error if there is one; there really isn't much more to it than that.

We will look at error handling in a far more thorough fashion in Chapter 10. In the meantime, you can use TRY/CATCH to give basic error handling to your scripts.

Summary

Understanding scripts and batches is the cornerstone of programming with SQL Server. The concepts of scripts and batches lay the foundation for a variety of functions, from scripting complete database builds to programming stored procedures and triggers.

Local variables have scope for only one batch. Even if you have declared the variable within the same overall script, you will still get an error message if you don't redeclare it (and start over with assigning values) before referencing it in a new batch.

You can use batches to create precedence between different parts of your scripts. The first batch starts at the beginning of the script and ends at the end of the script or the first GO statement, whichever comes first. The next batch (if there is another) starts on the line after the first one ends and continues to the end of the script or the next GO statement—again, whichever comes first. The process continues to the end of the script. The first batch from the top of the script is executed first, the second is executed second, and so on. All commands within each batch must pass validation in the query parser, or none of that batch will be executed; however, any other batches will be parsed separately and will still be executed (if they pass the parser).

In addition, we reviewed the constructs that deal with control-of-flow and error-handling conditions. We can use these to build complex scripts that are able to adapt to different runtime environments (such as recognizing that a script needs to process an upgrade of a database instead of an installation, or even determining which version of your schema it is upgrading from).

Finally, we also saw how we can create and execute SQL dynamically. This affords us the opportunity to deal with scenarios that aren't always 100 percent predictable, or situations where something we need to construct our statement is actually itself a piece of data.

In the next couple of chapters, we will take the notions of scripting and batches to the next level and apply them to stored procedures and triggers—the closest things that SQL Server has to actual programs. We will also see how we can utilize any .NET language to add more complex language functionality to our stored procedures, functions, and triggers.

10

Advanced Programmability

When deciding where the cutoff should be between my Beginning and Professional titles, this was, perhaps, the most difficult area for me to reconcile. The thing is, how much a supposed "SQL Server jock" knows about things beyond basic DML really varies a lot, so what exactly qualifies someone as ready for the "Professional" level title?

In this chapter, I'm going to assume that you already know the basics of stored procedures and user-defined functions (the differences between them, types of SQL-based user-defined functions, parameterization, and basic control-of-flow statements).
After all, if they are "the basics," then they seem more appropriate for a beginning title (and, indeed, I cover them at length in Beginning SQL Server 2008 Programming). So what, then, is this chapter all about? Well, it's about all the things that go beyond the basics. In this chapter, we'll cover:

 * OUTPUT parameters (often misunderstood by even advanced SQL programmers)
 * Error handling (again, I cover this somewhat in the Beginning title, but it's so often misunderstood even amongst advanced SQL programmers that it deserves revisiting)
 * Table-valued parameters (new with SQL Server 2008)
 * .NET-based stored procedures and user-defined functions

Even paring out the so-called basics, there is a lot to be covered, so let's get to it.

Most of the concepts provided in this chapter apply relatively equally to both stored procedures and user-defined functions.

A More Advanced Look at Stored Procedures

Stored procedures—or "sprocs"—have long been fundamental to truly "programming" in SQL Server. Prior to SQL Server 2005, they could be complex, but even the most complex was still relatively mundane given the limitations of T-SQL. With each release, however, Microsoft has added more to the puzzle. It was a rather big leap in the case of .NET assemblies (again, beginning with SQL Server 2005—we'll cover those a little later in this chapter), and the addition of table-valued parameters in SQL Server 2008 brings a lot of continuity to what we can do inside of a stored procedure.

Let's start this section off with a review of the general sproc syntax:

CREATE PROCEDURE|PROC <sproc name>
[<parameter name> [<schema>.]<data type> [VARYING]
[= <default value>] [OUT[PUT]] [READONLY]
[, ...n]]
[WITH
RECOMPILE | ENCRYPTION | [EXECUTE AS { CALLER|SELF|OWNER|'<user name>' }]]
[FOR REPLICATION]
AS
<code> | EXTERNAL NAME <assembly name>.<class name>.<method name>

Most of this should be second nature at this point, but, before this chapter is done, we will have captured any elements of the syntax that you may not be as comfortable with.

Let's start by taking a look at output parameters.

Output Parameters

Sometimes, you want to pass non-recordset information out to whatever called your sproc. Perhaps one of the most common uses for this is with sprocs that do inserts into tables with identity values. Often the code calling the sproc wants to know what the identity value was when the process is complete.

To show this off, we'll utilize a stored procedure that is already in the AdventureWorks2008 database—uspLogError. It looks like this:

-- uspLogError logs error information in the ErrorLog table about the
-- error that caused execution to jump to the CATCH block of a
-- TRY...CATCH construct. This should be executed from within the scope
-- of a CATCH block otherwise it will return without inserting error
-- information.
CREATE PROCEDURE [dbo].[uspLogError]
@ErrorLogID [int] = 0 OUTPUT -- contains the ErrorLogID of the row inserted
AS -- by uspLogError in the ErrorLog table
BEGIN
SET NOCOUNT ON;
-- Output parameter value of 0 indicates that error
-- information was not logged
SET @ErrorLogID = 0;
BEGIN TRY
-- Return if there is no error information to log
IF ERROR_NUMBER() IS NULL
RETURN;
-- Return if inside an uncommittable transaction.
-- Data insertion/modification is not allowed when
-- a transaction is in an uncommittable state.
IF XACT_STATE() = -1
BEGIN
PRINT 'Cannot log error since the current transaction is in an
uncommittable state. '
+ 'Rollback the transaction before executing uspLogError in order to
successfully log error information.';
RETURN;
END
INSERT [dbo].[ErrorLog]
(
[UserName],
[ErrorNumber],
[ErrorSeverity],
[ErrorState],
[ErrorProcedure],
[ErrorLine],
[ErrorMessage]
)
VALUES
(
CONVERT(sysname, CURRENT_USER),
ERROR_NUMBER(),
ERROR_SEVERITY(),
ERROR_STATE(),
ERROR_PROCEDURE(),
ERROR_LINE(),
ERROR_MESSAGE()
);
-- Pass back the ErrorLogID of the row inserted
SET @ErrorLogID = @@IDENTITY;
END TRY
BEGIN CATCH
PRINT 'An error occurred in stored procedure uspLogError: ';
EXECUTE [dbo].[uspPrintError];
RETURN -1;
END CATCH
END;

Note the sections that I've highlighted here—these are the core of our output parameter. The first declares the parameter as an output parameter. The second makes the insert that generates the identity value, and finally the SET statement captures that identity value. When the procedure exits, the value in @ErrorLogID is passed to the calling script.

Let's utilize our TRY/CATCH example from the tail end of the previous chapter, but this time we'll make the call to uspLogError:

USE AdventureWorks2008;

BEGIN TRY
-- Try and create our table
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
)
END TRY
BEGIN CATCH
-- Uh oh, something went wrong, see if it's something
-- we know what to do with
DECLARE @MyOutputParameter int;
IF ERROR_NUMBER() = 2714 -- Object exists error, we knew this might happen
BEGIN
PRINT 'WARNING: Skipping CREATE as table already exists';
EXEC dbo.uspLogError @ErrorLogID = @MyOutputParameter OUTPUT;
PRINT 'An error was logged. The Log ID for our error was '
+ CAST(@MyOutputParameter AS varchar);
END
ELSE -- hmm, we don't recognize it, so report it and bail
RAISERROR('something not good happened this time around', 16, 1);
END CATCH

If you run this in a database that does not already have the OurIFTest table, you will get a simple:

Command(s) completed successfully.

But run it where the OurIFTest table already exists (for example, run it twice if you haven't run the CREATE code before), and you get something to indicate the error:

WARNING: Skipping CREATE as table already exists
An error was logged. The Log ID for our error was 1

Now run a little SELECT against the error log table:

SELECT ErrorLogID, UserName, ErrorMessage
FROM ErrorLog
WHERE ErrorLogID = 1; -- change this value to whatever your
-- results said it was logged as

And you can see that the error was indeed properly logged:

ErrorLogID UserName ErrorMessage
----------- ----------- ---------------------------------------------------
1 dbo There is already an object named 'OurIFTest'...

(1 row(s) affected)

There are several things that you should take note of between the sproc itself and the usage of it by the calling script:

 * The OUTPUT keyword was required for the output parameter in the sproc declaration.
 * You must use the OUTPUT keyword when you call the sproc, much as you did when you declared the sproc. This gives SQL Server advance warning about the special handling that parameter will require. Be aware, however, that forgetting to include the OUTPUT keyword won't create a runtime error (you won't get any messages about it), but the value for the output parameter won't be moved into your variable (you'll just wind up with what was already there—most likely a NULL value). This means that you'll have what I consider to be the most dreaded of all computer terms—unpredictable results.
 * The variable you assign the output result to does not have to have the same name as the internal parameter in the sproc. For example, in our previous sproc, the internal parameter in the error logging sproc was called @ErrorLogID, but the variable the value was passed to was called @MyOutputParameter.
 * The EXEC (or EXECUTE) keyword was required, since the call to the sproc wasn't the first thing in the batch (you can leave off the EXEC if the sproc call is the first thing in a batch)—personally, I recommend that you train yourself to use it regardless.
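To see that "forgot the OUTPUT keyword" trap for yourself, here is a quick sketch of my own. Called outside of a CATCH block, uspLogError just sets its parameter to 0 and returns, which is enough to show the difference:

DECLARE @ID int;

EXEC dbo.uspLogError @ErrorLogID = @ID; -- no OUTPUT: @ID is left untouched
SELECT @ID AS WithoutOutput; -- still NULL

EXEC dbo.uspLogError @ErrorLogID = @ID OUTPUT; -- the value comes back out
SELECT @ID AS WithOutput; -- 0 (nothing was logged)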
Dealing with Errors

This is one of those sections that squarely overlaps with my "Beginning" title. If you think about it a while, I hope you won't be too surprised.

The problem is fairly simple: Many who learn SQL do so almost by accident. That is, they either don't have a beginning book to read at all, or, at best, they skim an SQL title just enough to get some statements crammed into their client language and eventually move on to some basic sprocs. While they know error handling in their client environment, they suddenly find themselves writing fairly complex stored procedures, having learned the things required to actually make a sproc run, but not much about how a sproc really should be written. I overlap here to back up and catch a spot that a lot of intermediate to fairly advanced stored procedure authors often have very little real exposure to.

If you already have the whole error handling in SQL thing down cold, then I'd suggest just skimming through this section for new ideas and otherwise moving on to the coverage of table-valued parameters and .NET programming in SQL Server.

Four common types of errors can happen in SQL Server:

 * Errors that create runtime errors and stop your code from proceeding further.
 * Errors that are informational in nature and do not create runtime errors. A non-zero error number is returned (if you ask), but no error is raised (and so no error trapping will be activated unless you are testing for that specific error).
 * Errors that create runtime errors but continue execution within SQL Server, such that you can trap them and respond in the manner of your choosing.
 * Errors that are more logical in nature and to which SQL Server is essentially oblivious.

Now, here things get a bit sticky, and versions become important, so hang with me as I lead you down a winding road.

We touched on TRY/CATCH blocks in our last chapter and examined how to make use of them, but they weren't always a part of T-SQL. The possibilities for error handling have changed a lot over the years, particularly back in SQL Server 2005. Today, we have genuine error traps in the form of the aforementioned TRY/CATCH blocks. There is, as you might expect, backward compatibility to consider, but that continues to be less of a consideration as SQL Server 2000 fades in support.

One thing remains common between the old and new error-handling models: higher-level runtime errors. Some general errors cause SQL Server to terminate the script immediately. This was true prior to TRY/CATCH, and it remains true even in the TRY/CATCH era. Errors that have enough severity to generate a runtime error are problematic from the SQL Server side of the equation.
The new TRY/CATCH logic is a bit more flexible for some errors than the error-trapping model that preceded it, but even now your sproc won't necessarily know when something bad happens (it just depends how bad "bad" is). On the bright side, all the current data access object models pass through the message on such errors, so you know about them in your client application and can do something about them there.

The Way We Were

In older versions of SQL Server (prior to 2005), there was no formal error handler. You didn't have an option that essentially said, "If any error happens, go run this code over in this other spot." Instead, you had to monitor for error conditions within your own code and then decide what to do at the point you detected the error—possibly well after the actual error occurred. Let's go ahead and take a look at how we handle errors in that model.

In case you're in the "since we have the new TRY/CATCH blocks, why do I even care about this?" frame of mind, let me point out that there is tons of code out there written for those earlier versions of SQL Server (before TRY/CATCH), and older-style code continues to be written by developers who either don't know the newer way or are just too much creatures of habit to use it. In short, it's important to understand this way of doing things so you understand other code that you will see in your career.

Handling Inline Errors

Inline errors are those pesky little things where SQL Server keeps running as such, but hasn't, for some reason, succeeded in doing what you wanted it to do. For example, try to insert a record into the Person.EmailAddress table that doesn't have a corresponding record in the Person.BusinessEntity table:

USE AdventureWorks2008;
GO

INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(0, 'robv@professionalsql.com');

SQL Server won't perform this insert for you because there is a FOREIGN KEY constraint on BusinessEntityID that references another table. Since there is no matching record in that table, the record we are trying to insert into Person.EmailAddress violates that foreign key constraint and is rejected:

Msg 547, Level 16, State 0, Line 2
The INSERT statement conflicted with the FOREIGN KEY constraint
"FK_EmailAddress_Person_BusinessEntityID". The conflict occurred in database
"AdventureWorks2008", table "Person.Person", column 'BusinessEntityID'.
The statement has been terminated.

Pay attention to that error 547 up there. That's something you can use.

Using @@ERROR

@@ERROR contains the error number of the last T-SQL statement executed. If the value is zero, then no error occurred. This is somewhat similar to the ERROR_NUMBER() function we saw in the previous chapter when we first discussed TRY/CATCH blocks. The difference is that ERROR_NUMBER() is valid only within a CATCH block (where it remains the same for the life of that CATCH block), whereas @@ERROR receives a new value with each statement you execute.

The caveat with @@ERROR, then, is that it is reset with each new statement. This means that if you want to defer analyzing the value, or you want to use it more than once, you need to move the value into some other holding bin—a local variable that you have declared for this purpose.

Let's play with this just a bit using the INSERT example from before:

USE AdventureWorks2008;
GO

DECLARE @Error int;

-- Bogus INSERT - there is no BusinessEntityID of 0.
INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(0, 'robv@professionalsql.com');

-- Move our error code into safekeeping. Note that, after this statement,
-- @@ERROR will be reset to whatever error number applies to this statement
SELECT @Error = @@ERROR;

-- Print out a blank separator line
PRINT '';

-- The value of our holding variable is just what we would expect
PRINT 'The Value of @Error is ' + CONVERT(varchar, @Error);

-- The value of @@ERROR has been reset - it's back to zero
PRINT 'The Value of @@ERROR is ' + CONVERT(varchar, @@ERROR);

Now execute the script, and you can examine how @@ERROR is affected:

Msg 547, Level 16, State 0, Line 6
The INSERT statement conflicted with the FOREIGN KEY constraint
"FK_EmailAddress_Person_BusinessEntityID". The conflict occurred in database
"AdventureWorks2008", table "Person.Person", column 'BusinessEntityID'.
The statement has been terminated.
The Value of @Error is 547
The Value of @@ERROR is 0

This illustrates pretty quickly the issue of saving the value from @@ERROR. The first error statement is only informational in nature: SQL Server has thrown that error but hasn't stopped the code from executing. Indeed, the only part of that message that your sproc has access to is the error number. That error number resides in @@ERROR for just the next T-SQL statement; after that, it's gone.

Notice that @Error and @@ERROR are two separate and distinct variables and can be referred to separately. This isn't because of the case difference. (Depending on how you have your server configured, case sensitivity can affect your variable names.) It's because the @ or @@ prefix is part of the name, so the number of @ symbols on the front makes each one separate and distinct from the other.

Using @@ERROR in a Sproc

OK, so let's start with an assumption here: If you're using @@ERROR, then the likelihood is that you are not using TRY/CATCH blocks. If you have not made this choice for backward compatibility reasons, I'm going to bop you upside the head and suggest you reconsider—TRY/CATCH is much cleaner and all-around better.

TRY/CATCH will handle varieties of errors that, in previous versions, would have terminated the execution of your script.

That said, TRY/CATCH is out of the equation if backward compatibility with SQL Server 2000 or prior is what you need, so let's take a quick look.

What we're going to do is look at two short procedures to see how inline error checking works when it works, and how it doesn't when it doesn't (in particular, where inline does not work but TRY/CATCH would).

Let's start with the referential integrity example we did a moment ago:

USE AdventureWorks2008;
GO

INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(0, 'robv@professionalsql.com');

You may recall this got us a simple 547 error. This is one of those that is trappable. We could trap this in a simple script, but let's do it as a sproc, since procedural stuff is supposedly what we're working on here....
USE AdventureWorks2008;
GO

CREATE PROC spInsertValidatedEmailAddress
@BusinessEntityID int,
@EmailAddress nvarchar(50)
AS
BEGIN
DECLARE @Error int;

INSERT INTO Person.EmailAddress
(BusinessEntityID, EmailAddress)
VALUES
(@BusinessEntityID, @EmailAddress);

SET @Error = @@ERROR;

IF @Error = 0
PRINT 'New Record Inserted';
ELSE
BEGIN
IF @Error = 547 -- Foreign Key violation. Tell them about it.
PRINT 'At least one provided parameter was not found. Correct and retry';
ELSE -- something unknown
PRINT 'Unknown error occurred. Please contact your system admin';
END
END

Now try executing this with values that work:

EXEC spInsertValidatedEmailAddress 1, 'robv@professionalsql.com';

Our insert happens correctly, so no error condition is detected (because there isn't one):

(1 row(s) affected)
New Record Inserted

Now, try something that should blow up:

EXEC spInsertValidatedEmailAddress 0, 'robv@professionalsql.com';

And you see not only the actual SQL Server message but also the message from our error trap (note that there is no way of squelching the SQL Server message):

Msg 547, Level 16, State 0, Procedure spInsertValidatedEmailAddress, Line 10
The INSERT statement conflicted with the FOREIGN KEY constraint
"FK_EmailAddress_Person_BusinessEntityID". The conflict occurred in database
"AdventureWorks2008", table "Person.Person", column 'BusinessEntityID'.
The statement has been terminated.
At least one provided parameter was not found. Correct and retry

As you can see, we were able to detect our error without a TRY/CATCH block.

Now, let's move on to an example of why TRY/CATCH is better—a situation where a TRY/CATCH works fine, but where inline error checking fails. To show this one off, all we need to do is use our TRY/CATCH example from the scripting chapter. It looked like this:

BEGIN TRY
-- Try and create our table
CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
)
END TRY
BEGIN CATCH
-- Uh oh, something went wrong, see if it's something
-- we know what to do with
DECLARE @ErrorNo int,
@Severity tinyint,
@State smallint,
@LineNo int,
@Message nvarchar(4000)
SELECT
@ErrorNo = ERROR_NUMBER(),
@Severity = ERROR_SEVERITY(),
@State = ERROR_STATE(),
@LineNo = ERROR_LINE(),
@Message = ERROR_MESSAGE()
IF @ErrorNo = 2714 -- Object exists error, we knew this might happen
PRINT 'WARNING: Skipping CREATE as table already exists'
ELSE -- hmm, we don't recognize it, so report it and bail
RAISERROR(@Message, 16, 1)
END CATCH

It worked just fine. But if I try to do this using inline error checking, I have a problem:

CREATE TABLE OurIFTest(
Col1 int PRIMARY KEY
);

IF @@ERROR != 0
PRINT 'Problems!';
ELSE
PRINT 'Everything went OK!';

Run this (you'll need to run it twice to generate the error if the table isn't already there), and we quickly find out that, without the TRY block, SQL Server aborts the script entirely on the particular error we're generating here:

Msg 2714, Level 16, State 6, Line 2
There is already an object named 'OurIFTest' in the database.

Notice that our PRINT statements never got a chance to execute—SQL Server had already terminated processing. With TRY/CATCH we were able to trap and handle this error, but with inline error checking, our attempts to trap an error like this fail.
Manually Raising Errors

Sometimes you have errors that SQL Server doesn't really know about, but you wish it did. For example, perhaps in the previous example you don't want to return -1000. Instead, you'd like to be able to create a runtime error at the client end that the client would then use to invoke an error handler and act accordingly. To do this, you use the RAISERROR command in T-SQL. The syntax is pretty straightforward:

RAISERROR (<message ID|message string|@variable>, <severity>, <state>
    [, <argument>
    [,<...n>]])
    [WITH option[,...n]]

Message ID/Message String

The message ID or message string you provide determines which message is sent to the client.

Using a message ID creates a manually raised error with the ID that you specified and the message that is associated with that ID as found in the sys.messages system view.

If you want to see what your SQL Server has as predefined messages, you can always perform a SELECT * FROM sys.messages. This includes any messages you've manually added to your system using the sp_addmessage stored procedure or through Management Studio.

You can also just supply a message string in the form of ad hoc text without creating a more permanent message in sys.messages:

RAISERROR ('Hi there, I''m an error', 1, 1);

This raises a rather simple error message:

Hi there, I'm an error
Msg 50000, Level 1, State 1

Notice that the assigned message number, even though you didn't supply one, is 50000. This is the default error value for any ad hoc error. It can be overridden using the WITH SETERROR option. (We'll look at that briefly in a moment.)

Severity

We got a quick overview of this when looking at TRY/CATCH in the chapter on scripting. For those of you already familiar with Windows servers, severity should be an old friend. Severity is an indication of just how bad things really are based on this error. For SQL Server, however, what severity codes mean can get a little bizarre. They can range from informational (severities 1–18), to system level (19–25), and even catastrophic (20–25). If you raise an error of severity 19 or higher (system level), the WITH LOG option must also be specified. 20 and higher automatically terminates the users' connections. (They hate that!)

So, to get back to what I meant by bizarre: SQL Server actually varies its behavior into more ranges than Windows does, or even than the Books Online will tell you about. Errors fall into five major groupings, as shown in the following table:

Severity | Behavior
---|---
1–10 | Purely informational, but will return the specific error code in the message information.
11–16 | If you do not have a TRY/CATCH block set up, then these terminate execution of the procedure and raise an error at the client. The state is shown to be whatever value you set it to. If you have a TRY/CATCH block defined, then that handler will be called rather than raising an error at the client.
17 | Usually, only SQL Server should use this severity. Basically, it indicates that SQL Server has run out of resources—for example, tempdb was full—and can't complete the request. Again, a TRY/CATCH block will get this before the client does.
18–19 | Both of these are severe errors and imply that the underlying cause requires system administrator attention. With 19, the WITH LOG option is required, and the event will show up in the NT or Windows Event Log if you are using that OS family. These are the final levels at which you can trap the error with a TRY/CATCH block—after this, it will go straight to the client.
20–25 | Your world has just caved in, as has the user's connection. Essentially, this is a fatal error. The connection is terminated. As with 19, you must use the WITH LOG option, and a message will, if applicable, show up in the Event Log.
State

State is an ad hoc value. It's something that recognizes that exactly the same error may occur at multiple places within your code. The notion is that this gives you an opportunity to send something of a place marker for where exactly the error occurred.

State values can be between 1 and 127. If you are troubleshooting an error with Microsoft tech support, they apparently have some arcane knowledge that hasn't been shared with us about what some of these mean. I'm told that if you make a tech support call to Microsoft, they are likely to ask about and make use of this state information.

One way I make use of State when raising my own errors is as a location tool. There will be instances where your procedure has the potential to raise the same error in multiple places in the sproc—I will change the State information in my RAISERROR to provide an extra indication of which specific line raised the error.

Error Arguments

Some predefined errors accept arguments. These allow you to make the error somewhat more dynamic, adapting to the specific nature of the error. You can also format your own error messages to accept arguments.

When you want to use dynamic information in what is otherwise a static error message, you need to format the fixed portion of your message so that it leaves room for the parameterized section of the message. You do so by using placeholders. If you're coming from the C or C++ world, then you'll recognize the parameter placeholders immediately; they are similar to printf() arguments. If you're not from the C world, these may seem a little odd to you. All the placeholders start with the % sign and are then coded for the kind of information you'll be passing to them, as shown in the following table.

Placeholder Type Indicator | Type of Value
---|---
d | Signed integer. Books Online indicates that i is an acceptable choice, but I've had problems getting it to work as expected.
o | Unsigned octal.
p | Pointer.
s | String.
u | Unsigned integer.
X or x | Unsigned hexadecimal.

In addition, there is the option to prefix any of these placeholder indicators with some additional flag and width information:

Flag | What It Does
---|---
− (dash or minus sign) | Left-justify. Only makes a difference when you supply a fixed width.
+ (plus sign) | Indicates the positive or negative nature if the parameter is a signed numeric type.
0 | Tells SQL Server to pad the left side of a numeric value with zeros until it reaches the width specified in the width option.
# (pound sign) | Applies only to octal and hex values. Tells SQL Server to use the appropriate prefix (0 or 0x) depending on whether it is octal or hex.
' ' (space) | Pads the left of a numeric value with spaces if positive.

Last, but not least, you can also set the width, precision, and long/short status of a parameter:

* Width—Set by simply supplying an integer value for the amount of space you want to hold for the parameterized value. You can also specify a *, in which case SQL Server will automatically determine the width according to the value you've set for precision.
* Precision—Determines the maximum number of digits output for numeric data.
* Long/Short—Set by using an h (short) or l (long) when the type of the parameter is an integer, octal, or hex value.

Let's use these in an example:

RAISERROR ('This is a sample parameterized %s, along with a zero
padding and a sign%+010d', 1, 1, 'string', 12121);

If you execute this, you get back something that looks a little different from what's in the quotation marks:

This is a sample parameterized string, along with a zero
padding and a sign+000012121
Msg 50000, Level 1, State 1

The extra values supplied were inserted, in order, into your placeholders, with the final value being reformatted as specified.

WITH

Currently, there are three options you can specify with WITH: LOG, which writes the error to the SQL Server error log and the Windows Event Log (and is required for severities of 19 and higher); NOWAIT, which sends the message to the client immediately instead of waiting for the output buffer to fill; and SETERROR, which sets @@ERROR to the message ID (50000 for an ad hoc message) regardless of the severity used.
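Here's a quick sketch of NOWAIT and SETERROR in action (the message text is purely a placeholder):

-- NOWAIT pushes the message to the client right away, which is handy
-- for progress reporting in long-running scripts
RAISERROR ('Step 1 complete...', 0, 1) WITH NOWAIT;

-- SETERROR forces @@ERROR to pick up the message ID (50000 for ad hoc
-- text), even though a severity this low would normally leave it at 0
RAISERROR ('Hi there, I''m an error', 1, 1) WITH SETERROR;

PRINT 'The Value of @@ERROR is ' + CONVERT(varchar, @@ERROR); -- 50000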
The syntax for creating an aggregate from an assembly is mercifully brief:

CREATE AGGREGATE [<schema name>.]<aggregate name>
    (@param_name <input type>)
RETURNS <return type>
EXTERNAL NAME <assembly name>[.<class name>][;]

So, to create the aggregate from our assembly, we would do something like:

CREATE AGGREGATE dbo.Product(@input float)
RETURNS float
EXTERNAL NAME ExampleAggregate.Product;

And, with that, we're ready to try it out. To test it, we'll create a small sample table that includes some data that can be multiplied along with a grouping column, so we can test how our aggregate works in a GROUP BY scenario.

CREATE TABLE TestAggregate
(
    PK int NOT NULL PRIMARY KEY,
    GroupKey int NOT NULL,
    Value float NOT NULL
);

Now we just need some test data:

INSERT INTO TestAggregate(PK, GroupKey, Value)
VALUES (1, 1, 2),
       (2, 1, 6),
       (3, 1, 1.5),
       (4, 2, 2),
       (5, 2, 6);

And we're ready to give our aggregate a try. What we're going to be doing is returning the PRODUCT of all the rows within each group (our sample data has two groups, so this should work out to two rows).

SELECT GroupKey, dbo.Product(Value) AS Product
FROM TestAggregate
GROUP BY GroupKey;

Run this and we get back two rows (just as we expected):

GroupKey    Product
----------- ----------------------
1           18
2           12

(2 row(s) affected)

Do the math on our sample data, and you'll see we got back just what we wanted.

If you're thinking about it, you should be asking yourself, "OK, this is great, but how often am I really going to use this?" For most of you, the answer will be "never." There are, however, those times where what's included just isn't ever going to do the job. Aggregates are one of those places where special cases come rarely, but when they come, they really need exactly what they need and nothing else. In short, I wouldn't crowd your brain cells by memorizing every little thing about this section, but do take the time to learn what's involved and get a concept for what it can and can't do so you know what's available should you need it.

Creating Triggers from Assemblies

Note that we have a bit of a "chicken or the egg" (which came first?) thing going on with triggers and .NET. Triggers are not covered until Chapter 12, but I wanted to keep all .NET items close together for reference reasons. If you understand the basics of triggers, you'll be fine with this—if not, you may want to read Chapter 12 first, and then come back to this.

Much like the other assembly types we've worked with so far in this chapter, triggers have a lot in common with the rest, but also their own little smattering of special things.

The differences will probably come to mind quickly if you think about it for any length of time:

* How do we deal with the contextual nature of triggers? That is, how do we know to handle things differently if it's an INSERT trigger situation versus a DELETE or UPDATE trigger?
* How do we access the inserted and deleted tables?

You may recall from earlier examples how we can obtain the "context" of the current connection—it is by utilizing this context that we are able to gain access to different objects that we are interested in.
For example, the SqlContext object that we've obtained a connection from in prior examples also contains a SqlTriggerContext object—we can use that to get properties such as whether we are dealing with an insert, update, or delete scenario (the first question we had). The fact that we have access to the current connection also implies that we are able to access the inserted and deleted tables simply by querying them. Let's get right to putting this to use in an example.

Start by creating a new SQL Server project in Visual Studio (I've called mine ExampleTrigger this time). Once your project is up, right-click the project in the Solution Explorer and select Add ⇒ Trigger.

Visual Studio is nice enough to provide you with what is, for the most part, a working template. Indeed, it would run right as provided except for one issue:

using System;
using System.Data;
using System.Data.SqlClient;
using Microsoft.SqlServer.Server;

public partial class Triggers
{
    // Enter existing table or view for the target and uncomment the attribute line
    // [Microsoft.SqlServer.Server.SqlTrigger (Name="ExampleTrigger",
    //     Target="Table1", Event="FOR UPDATE")]
    public static void ExampleTrigger()
    {
        // Replace with your own code
        SqlContext.Pipe.Send("Trigger FIRED");
    }
}

The commented-out attribute is the key code line here. At issue is that we must provide more information to SQL Server than we do in our other object types. Specifically, we must identify what table and events we're going to be executing our trigger against. We're actually going to create a special demonstration table for this before the trigger is put into action, so we can just use the table name TestTrigger for now.

[Microsoft.SqlServer.Server.SqlTrigger (Name="ExampleTrigger",
    Target="TestTrigger", Event="FOR INSERT, UPDATE, DELETE")]

Notice that I've also altered what events will fire our trigger to include all event types.

Now we'll update the meat of things just a bit, so we can show off different actions we might take in our trigger and, perhaps more importantly, how we can check the context of things and make our actions specific to what has happened to our table. We'll start by getting our method going:

public static void ExampleTrigger()
{
    // Get a handle to our current connection
    SqlConnection cn = new SqlConnection("context connection=true");
    cn.Open();

    SqlTriggerContext ctxt = SqlContext.TriggerContext;
    SqlCommand cmd = new SqlCommand();
    cmd.Connection = cn;

So far, this isn't much different from what we've used in our other .NET examples. Perhaps the only significant difference from things we've seen already is the SqlTriggerContext object—we will use this later on to determine what action caused the trigger to fire.

We're ready to start the code that is conditional on the action the trigger is firing for (based on the TriggerAction property of the TriggerContext of the SqlContext). For this, I'm going to use a simple switch command (though there are those that will call me a programming charlatan for using a switch statement—to them I say "deal with it!"). I'm also going to pipe out various things to the client to report what we're doing.

In practice, you generally do not want to be outputting information from a trigger—figure that they should usually run silently as far as the client is concerned. I've gone ahead and output several items in this example just to make it readily apparent what the trigger is doing under what scenario.
    switch (ctxt.TriggerAction)
    {
        case TriggerAction.Insert:
            cmd.CommandText = "SELECT COUNT(*) AS NumRows FROM INSERTED";
            SqlContext.Pipe.Send("Insert Trigger Fired");
            SqlContext.Pipe.ExecuteAndSend(cmd);
            break;
        case TriggerAction.Update:
            // This time, we'll use datareaders to show how we can
            // access the data from the inserted/deleted tables
            SqlContext.Pipe.Send("Update Trigger Fired");
            SqlContext.Pipe.Send("inserted rows...");
            cmd.CommandText = "SELECT * FROM INSERTED";
            SqlContext.Pipe.Send(cmd.ExecuteReader());
            break;
        case TriggerAction.Delete:
            // And now we'll go back to what we did with the inserted rows...
            cmd.CommandText = "SELECT COUNT(*) AS NumRows FROM DELETED";
            SqlContext.Pipe.Send("Delete Trigger Fired");
            SqlContext.Pipe.ExecuteAndSend(cmd);
            break;
    }

    SqlContext.Pipe.Send("Trigger Complete");
    }
}

And, with that, we're ready to compile and upload it. The assembly upload works just as most of them have so far (we're back to not needing anything other than the default PERMISSION_SET):

CREATE ASSEMBLY ExampleTrigger
FROM '\ExampleTrigger\bin\Debug\ExampleTrigger.dll';

Before we get to creating the reference to the trigger, however, we need a table. For this example, we'll just create something very simple:

CREATE TABLE TestTrigger
(
    PK int NOT NULL PRIMARY KEY,
    Value varchar(max) NOT NULL
);

With the assembly uploaded and the table created, we're ready to create our trigger reference.

Much like stored procedures and functions, a .NET trigger creation is made from the same statement as T-SQL-based triggers. We eliminate the T-SQL side of things and replace it with the EXTERNAL NAME declaration:

CREATE TRIGGER trgExampleTrigger
ON TestTrigger
FOR INSERT, UPDATE, DELETE
AS EXTERNAL NAME ExampleTrigger.Triggers.ExampleTrigger;

And with that, our trigger should be in place on our table and ready to be fired whenever one of its trigger actions occurs (which happens to be every action), so let's test it.

We'll start by getting a few rows inserted into our table. And, wouldn't you just know it? That will allow us to test the insert part of our trigger. (Note that these are two separate statements, so the trigger will fire twice.)

INSERT INTO TestTrigger
(PK, Value)
VALUES
(1, 'first row');

INSERT INTO TestTrigger
(PK, Value)
VALUES
(2, 'second row');

Run this, and we not only get our rows in, but we also get a little bit of feedback that is coming out of our trigger:

Insert Trigger Fired

NumRows
-----------
1

(1 row(s) affected)

Trigger Complete

(1 row(s) affected)

Insert Trigger Fired

NumRows
-----------
1

(1 row(s) affected)

Trigger Complete

(1 row(s) affected)

As you can see, we're getting output from our trigger. Notice that we're getting the "(1 row(s) affected)" both from the query running inside the trigger and from the one that actually inserted the data. We could have taken any action that could have been done in a T-SQL trigger (though many are more efficient if you stay in the T-SQL world). The key is that we could do so much more if we had the need. We could, for example, make an external call or perform a calculation that isn't doable in the T-SQL world.
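As an aside, if you ever lose track of which .NET modules are actually registered in a database, the catalog views will tell you. A small sketch (this lists the methods from this chapter's assemblies, along with any others you've loaded):

-- One row per method exposed by each uploaded assembly
SELECT a.name AS AssemblyName,
       m.assembly_class,
       m.assembly_method
FROM sys.assemblies AS a
JOIN sys.assembly_modules AS m
    ON a.assembly_id = m.assembly_id;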
There is an old saying: "Caution is the better part of valor." This could have been written with triggers in mind. I can't possibly express enough about the "be careful" when it comes to what you're doing in triggers. Just because you can make an external call doesn't make it a smart thing to do. Assess the need—is it really that important that the call be made right then? Realize that these things can be slow, and whatever transaction that trigger is participating in will not complete until the trigger completes—this means you may be severely damaging performance.

Okay, so with all that done, let's try an update:

UPDATE TestTrigger
SET Value = 'Updated second row'
WHERE PK = 2;

And let's see what we get back:

Update Trigger Fired
inserted rows...

PK          Value
----------- ---------------------------------------
2           Updated second row

(1 row(s) affected)

Trigger Complete

(1 row(s) affected)

The result set we're getting back is the one our trigger is outputting. That's followed by some of our other output as well as the base "(1 row(s) affected)" that we would normally expect from our single-row update. Just as with the insert statement, we were able to see what had happened and could have adapted accordingly.

And so, that leaves us with just the delete statement. This time, we'll delete all the rows, and we'll see how the count of our deleted table does indeed reflect both of the deleted rows.

DELETE TestTrigger;

And again check the results:

Delete Trigger Fired

NumRows
-----------
2

(1 row(s) affected)

Trigger Complete

(2 row(s) affected)

Now, these results may be just a little confusing, so let's look at what we have.

We start with the notification that our trigger fired. That comes from our trigger. (Remember, we send that message down the pipe ourselves.) Then comes the result set from our SELECT COUNT(*). Notice the "(1 row(s) affected)"—that's from our result set rather than the DELETE that started it all. We then get to the end of execution of our trigger (again, we dropped that message in the pipe), and, finally, the "(2 row(s) affected)" that was from the original DELETE statement.

And there we have it. We've done something to address every action scenario, and we could have, of course, done a lot more within each. We could also do something to address an INSTEAD OF trigger if we needed to.

Custom Data Types

Sometimes you have the need to store data that you want to be strongly typed but that doesn't fit within SQL Server's simple data type list. Indeed, you may need to invoke a complex set of rules in order to determine whether or not the data properly meets the type requirement.

Requests for support of complex data types have been around a very long time. Indeed, I can recall being at the Sphinx Beta 2.0—known to most as Beta 2 for SQL Server 7.0—event in 1998, and having that come up as something like the second most requested item in a request session I was at. Well, it took a lot of years, but it's finally here.

By utilizing a .NET assembly, we can achieve a virtually limitless number of possibilities in our data types. The type can have complex rules or even contain multiple properties.

Before we get to the syntax for adding assemblies, let's get an assembly constructed.

The sample used here will be the ComplexNumber.sln solution included in the SQL Server samples. You will need to locate the base directory for the solution—the location of which will vary depending on your particular installation.

We need to start by creating the signature keys for this project.
To do this, I recommend starting with your solution directory as the current directory and then calling sn.exe using a fully qualified path (or, if your .NET Framework directory is already in your PATH, then it's that much easier!). For me, it looks like this:

C:\Program Files\Microsoft.NET\SDK\v2.0 64bit\LateBreaking\SQLCLR\UserDefinedDataType>
"C:\Program Files (x86)\Microsoft Visual Studio 8\SDK\v2.0\Bin\sn" -k temp.snk

And with that, you're ready to build your DLL.

Let's go ahead and upload the actual assembly (alter this to match the paths on your particular system):

CREATE ASSEMBLY ComplexNumber
FROM '\ComplexNumber\bin\debug\ComplexNumber.dll'
WITH PERMISSION_SET = SAFE;

And with the assembly loaded, we're ready to begin.

Creating Your Data Type from Your Assembly

So, you have an assembly that implements your complex data type and have uploaded it to SQL Server using the CREATE ASSEMBLY command. You're ready to instruct SQL Server to use it. This works pretty much as other assemblies have. The syntax (you may recall from Chapter 7) looks like this:

CREATE TYPE [<schema name>.]<type name>
EXTERNAL NAME <assembly name>[.<class name>][;]

You'll notice immediately that it looks like our previous assembly-related constructs, and, indeed, the use is the same.

So, utilizing our complex type created in the last section, it would look like this:

CREATE TYPE ComplexNumber
EXTERNAL NAME [ComplexNumber].[Microsoft.Samples.SqlServer.ComplexNumber];

Accessing Your Complex Data Type

Microsoft has provided a file called test.sql for testing the assembly we just defined as our complex data type, but I find it falls just slightly short of where we want to be in our learning here. What I want to emphasize is how the various functions of the supporting class for our data type are still available. In addition, each individual property of the variable is fully addressable. So, let's run a modified version of the provided script:

USE AdventureWorks2008;
GO

-- create a variable of the type, create a value of the type and invoke
-- a behavior over it
DECLARE @c ComplexNumber;
SET @c = CONVERT(ComplexNumber, '(1, 2i)');

SELECT @c.ToString() AS FullValueAsString;
SELECT @c.Real AS JustRealProperty;
GO

Now run it, and check out the results:

FullValueAsString
------------------
(1,2i)

(1 row(s) affected)

JustRealProperty
------------------
1

(1 row(s) affected)

In the first result that was returned, the ToString function was called as defined as a method of our class. The string is formatted just as our method desires. If we had wanted to reverse the order of the numbers or some silly thing like that, we would only have needed to change the ToString function in the class, recompile it, and re-import it into our database.

In our second result, we address just one property of our complex data type. The simple dot (.) delimiter told SQL Server that we were looking for a property—just as it would in C# or VB.NET.

Dropping Data Types

As you might expect, the syntax for dropping a user-defined data type works just like other drop statements:

DROP TYPE [<schema name>.]<type name>[;]

And it's gone—maybe.

Okay, so why a "maybe" this time? Well, if there is most any object out there that references this data type, then the DROP will be disallowed and will fail. So, if you have a table that has a column of this type, then an attempt to drop it would fail.
Likewise, if you have a schema-bound view, stored procedure, trigger, or function defined that utilizes this type, then a drop would also fail.

Note that this form of restriction appears in other places in SQL Server—such as dropping a table when it is the target of a foreign key reference—but those restrictions tend to be less all-encompassing than this one is (virtually any use of the type in your database at all will block the drop), so I haven't felt as much need to point them out (they were more self-explanatory).

Summary

Well, if you aren't thinking to yourself something along the lines of "Wow, some of that stuff is pretty powerful," then I can only guess you somehow skipped straight to the summary without reading the rest of the chapter. That's what this chapter is all about—giving you the power to do very complex things (or, in a few cases, simple things that still weren't possible before).

There is a lot to think about coming out of this chapter. You have table-valued parameters, which allow a sharp reduction in round trips from the client and further allow you to bundle more logic into a single parent sproc.

When using assemblies, you need to be careful. Think about what you're doing, and analyze each of the steps that your assembly is going to be taking even more thoroughly than you already do. Consider the latency you're going to be adding if you create long-running processes. Consider the external dependencies you are creating if you make external calls—how reliable are those external processes? You need to know, as your system is now only as reliable as the external systems you're calling.

As always, think about what you need, and don't make your solution any more complex than it needs to be. Keep in mind, however, that what seems at first to be the more complex solution may actually be simpler in the end. I've seen assemblies that solved the seemingly unsolvable T-SQL problem. Keeping your system away from assemblies would seem to make it simpler, but what's better: a 300-line, complex T-SQL stored proc or an assembly that is concise and takes only 25 lines including declarations?

Choose wisely.

11

Transactions and Locks

What to do...? What to do...? This I pondered when considering this chapter. Since I usually teach this topic even to so-called "beginners" (and I have coverage of it in Beginning SQL Server 2008 Programming), I seriously debated removing this subject from the Professional title. The problem with that, however, is that, while fundamental in nature, transactions and locks are a fundamental that even lots of fairly advanced users don't quite "get." You see, while nothing in this chapter is wildly difficult, transactions and locks tend to be two of the most misunderstood areas in the database world.

This is one of those chapters that, when you go back to work, will make you sound like you've had your Wheaties today. As such, this "beginning" (or at least I think it's a basic) concept is going to make you start to look like a real pro.

In this chapter, we're going to:

* Examine transactions
* Examine how the SQL Server log and "checkpoints" work
* Unlock your understanding of locks

Now, lest you think that I've suddenly decided to treat you like a rookie, rest assured, we will look a tad more in depth in several places than I necessarily do for beginning readers.

Transactions

Transactions are all about atomicity. Atomicity is the concept that something should act as a unit.
From our database standpoint, it's about the smallest grouping of one or more statements that should be considered to be "all or nothing."

Often, when dealing with data, we want to make sure that if one thing happens, another thing happens, or that neither of them does. Indeed, this can be carried out to the degree where 20 things (or more) all have to happen together or nothing happens. Let's look at a classic example.

Imagine that you are a banker. Sally comes in and wants to transfer $1,000 from checking to savings. You are, of course, happy to oblige, so you process her request.

Behind the scenes, we have something like this happening:

UPDATE checking
SET Balance = Balance - 1000
WHERE Account = 'Sally';

UPDATE savings
SET Balance = Balance + 1000
WHERE Account = 'Sally';

This is a hypersimplification of what's going on, but it captures the main thrust of things: You need to issue two different statements—one for each account.

Now, what if the first statement executes and the second one doesn't? Sally would be out a thousand dollars! That might, for a short time, seem okay from your perspective (heck, you just made a thousand bucks!), but not for long. By that afternoon you'd have a steady stream of customers leaving your bank. It's hard to stay in the bank business with no depositors.

What you need is a way to be certain that if the first statement executes, the second statement executes. At first, it would seem that there really isn't a way that we can be certain of that. All sorts of things can go wrong, from hardware failures to simple things such as violations of data integrity rules. Fortunately, however, there is a way to do something that serves the same overall purpose. We can essentially forget that the first statement ever happened. We can enforce at least the notion that if one thing didn't happen, then nothing did—at least within the scope of our transaction.

In order to capture this notion of a transaction, however, we need to be able to define boundaries. A transaction has to have very definitive begin and end points. Actually, every SELECT, INSERT, UPDATE, and DELETE statement you issue in SQL Server is part of an implicit transaction. Even if you issue only one statement, that one statement is considered to be a transaction. Everything about the statement will be executed, or none of it will. Indeed, by default, that is the length of a transaction—one statement.

Again: Every SELECT, INSERT, UPDATE, and DELETE statement you issue in SQL Server is part of an implicit transaction. Even if you issue only one statement, that one statement is considered to be a transaction. Everything about the statement will be executed, or none of it will.

But what if we need to have more than one statement be all or nothing—such as our preceding bank example? In such a case, we need a way of marking the beginning and end of a transaction, as well as the success or failure of that transaction. To that end, there are several T-SQL statements that we can use to "mark" these points in a transaction. We can:

* BEGIN a transaction: Set the starting point.
* COMMIT a transaction: Make the transaction a permanent, irreversible part of the database.
* ROLLBACK a transaction: Essentially say that you want to forget that it ever happened.
* SAVE a transaction: Establish a specific marker to allow us to do only a partial rollback.

Let's look over all of these individually before we put them together into our first transaction.
BEGIN TRAN

The beginning of the transaction is probably one of the easiest concepts to understand in the transaction process. Its sole purpose in life is to denote the point that is the beginning of a unit. If, for some reason, we are unable to or do not want to commit the transaction, this is the point to which all database activity will be rolled back. That is, everything beyond this point that is not eventually committed will effectively be forgotten, as far as the database is concerned.

The syntax is:

BEGIN TRAN[SACTION] [<transaction name>|<@transaction variable>]
[WITH MARK ['<description>']][;]

The WITH MARK section is optional and is, in practice, rarely used, but don't discount it as unimportant—quite the contrary!

If you're marking the transaction, you must include the transaction name. (Note that it's the name, not the description, that is required. The name is optional if you're not marking the transaction.) If supplied, the description should be a maximum of 255 characters. (It can be longer, but, if so, it will be truncated to 255.)

Regarding Marking Transactions

Beginning back in SQL Server 2005, we gained the ability, when restoring a database from backups and logs, to restore to a specific point in time. You could specify an exact time that you wanted a backup rolled forward to (utilizing a log), and SQL Server would recover everything up to that point, and nothing beyond. A marked transaction expands this capability by creating a special notation in the transaction log. When performing a point-in-time recovery, you can specify the marked transaction as the point you want to recover to, instead of the time, by simply specifying the description of the mark. You can use this for things such as:

* Marking a point when a critical action took place so that, if necessary, you can recover to just that point
* Marking activity in two databases so that those databases can be restored to a synchronized point in time

This concept of marking your point in time can be a handy thing to have available. While it is something of an extreme use, you will find scenarios where you need to synchronize with external systems (not even necessarily a SQL Server) on backups.

COMMIT TRAN

The committing of a transaction is the end of a completed transaction. At the point that you issue the COMMIT TRAN, the transaction is considered to be what is called durable. That is, the effect of the transaction is now permanent and will last even if you have a system failure (as long as you have a backup or the database files haven't been physically destroyed). The only way to "undo" whatever the transaction accomplished is to issue a new transaction that, functionally speaking, is a reverse of your first transaction.

The syntax for a COMMIT looks pretty similar to a BEGIN:

COMMIT [TRAN[SACTION] [<transaction name>|<@transaction variable>]][;]

Note that, similar to the way EXECUTE can be truncated to EXEC, TRANSACTION can be truncated down to TRAN. While TRANSACTION is the more full and clear form of the word, you'll find, in practice, that most developers use the shortened TRAN. (What can I say? We're apparently a rather lazy bunch.)

SQL Server also supports a more ANSI-compliant syntax in the form of:

COMMIT [WORK][;]

The notion of a transaction name moniker is not supported under this syntax, and, while it is more ANSI compliant, it has, for whatever reason (probably its late addition to the product), been virtually nonutilized with SQL Server in actual practice.
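Tying BEGIN and COMMIT back to the bank example from the start of the chapter, a minimal sketch might look like this (the checking and savings tables are, of course, illustrative only, and we'll fold error handling and ROLLBACK into the mix next):

BEGIN TRAN MoveMoney;

-- Everything from BEGIN to COMMIT is one unit of work
UPDATE checking
SET Balance = Balance - 1000
WHERE Account = 'Sally';

UPDATE savings
SET Balance = Balance + 1000
WHERE Account = 'Sally';

-- Make it durable
COMMIT TRAN MoveMoney;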
ROLLBACK TRAN

Whenever I think of a ROLLBACK, I think of the old movie The Princess Bride. If you've ever seen the film (if you haven't, I highly recommend it), you'll know that the character Vizzini (considered a genius in the film) always said, "If anything goes wrong, go back to the beginning."

That was some mighty good advice. A ROLLBACK does just what Vizzini suggested. It goes back to the beginning. In this case, it's your transaction that goes back to the beginning. Anything that happened since the associated BEGIN statement is effectively forgotten. The only exception to going back to the beginning occurs when using what are called savepoints, which I'll describe shortly.

The syntax for a ROLLBACK again looks pretty much the same, with the exception of allowance for a savepoint:

ROLLBACK TRAN[SACTION] [<transaction name>|<savepoint name>|
<@transaction variable>|<@savepoint variable>][;]

Alternatively, you can use the ANSI syntax similar to what we saw with COMMIT:

ROLLBACK [WORK][;]

SAVE TRAN

To save a transaction is essentially to create something of a bookmark. You establish a name for your bookmark. (You can have more than one.) After this "bookmark" is established, you can reference it in a rollback. What's nice about this is that you can roll back to the exact spot in the code that you want just by naming the savepoint to which you want to roll back.

Names for savepoints must conform to the rules for identifiers that we discussed back in Chapter 1. There is, however, a difference; savepoint names are limited to 32 characters in length.

The syntax is simple enough:

SAVE TRAN[SACTION] [<savepoint name>|<@savepoint variable>][;]

The thing to remember about savepoints is that they are cleared on ROLLBACK—that is, even if you save five savepoints, once you perform one ROLLBACK they are all gone. You can start setting new savepoints again, and rolling back to those, but whatever savepoints you had when the ROLLBACK was issued are gone.

Savepoints were something of a major confusion area for me when I first came across them. Books Online indicates that, after rolling back to a savepoint, you must run the transaction to a logical conclusion. (This is technically correct.) Where the confusion came was in the Books Online implication that seemed to indicate that you had to go to a ROLLBACK or COMMIT without using any more savepoints. This is not the case. You just can't use the savepoints that you declared prior to the ROLLBACK. Savepoints set after that are just fine.

Let's test this out with a bit of code to see what happens when we mix the different types of TRAN commands. Type the following code in, and then we'll run through an explanation of it:

USE AdventureWorks2008; -- We're making our own table - what DB doesn't matter

-- Create table to work with
CREATE TABLE MyTranTest
(
    OrderID INT PRIMARY KEY IDENTITY
);

-- Start the transaction
BEGIN TRAN TranStart;

-- Insert our first piece of data using default values.
-- Consider this record No1. It is also the 1st record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Create a "Bookmark" to come back to later if need be
SAVE TRAN FirstPoint;

-- Insert some more default data (this one will disappear
-- after the rollback).
-- Consider this record No2.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Roll back to the first savepoint. Anything up to that
-- point will still be part of the transaction. Anything
-- beyond is now toast.
ROLLBACK TRAN FirstPoint;

-- Insert some more default data.
-- Consider this record No3. It is the 2nd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Create another point to roll back to.
SAVE TRAN SecondPoint;

-- Yet more data. This one will also disappear,
-- only after the second rollback this time.
-- Consider this record No4.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Go back to second savepoint
ROLLBACK TRAN SecondPoint;

-- Insert a little more data to show that things
-- are still happening.
-- Consider this record No5. It is the 3rd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Commit the transaction
COMMIT TRAN TranStart;

-- See what records were finally committed.
SELECT TOP 3 OrderID
FROM MyTranTest
ORDER BY OrderID DESC;

-- Clean up after ourselves
DROP TABLE MyTranTest;

First, we create a table to work with for our test:

-- Create table to work with
CREATE TABLE MyTranTest
(
    OrderID INT PRIMARY KEY IDENTITY
);

Since we're creating our own table to play with, what database we are using doesn't really matter for this demonstration.

Then it's time to begin the transaction. This starts our grouping of "all or nothing" statements. We then INSERT a row. At this juncture, we have just one row inserted:

-- Start the transaction
BEGIN TRAN TranStart;

-- Insert our first piece of data using default values.
-- Consider this record No1. It is also the 1st record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

Next, we establish a savepoint called FirstPoint and insert yet another row. At this point, we have two rows inserted, but remember, they are not committed yet, so the database doesn't consider them to be part of the database:

-- Create a "Bookmark" to come back to later if need be
SAVE TRAN FirstPoint;

-- Insert some more default data (this one will disappear
-- after the rollback).
-- Consider this record No2.
INSERT INTO MyTranTest
DEFAULT VALUES;

We then ROLLBACK—explicitly saying that it is not the beginning that we want to roll back to, but just to FirstPoint. With the ROLLBACK, everything between the ROLLBACK and the FirstPoint savepoint is undone. Since we have one INSERT statement between the ROLLBACK and the SAVE, that statement is rolled back. At this juncture, we are back down to just one row inserted. Any attempt to reference a savepoint would now fail, since all savepoints have been reset with our ROLLBACK:

-- Roll back to the first savepoint. Anything up to that
-- point will still be part of the transaction. Anything
-- beyond is now toast.
ROLLBACK TRAN FirstPoint;

We add another row, putting us back up to a total of two rows inserted, and we also create a brand new savepoint. This is perfectly valid, and we can now refer to this savepoint, since it is established after the ROLLBACK:

-- Insert some more default data.
-- Consider this record No3. It is the 2nd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

-- Create another point to roll back to.
SAVE TRAN SecondPoint;

Time for yet another row to be inserted, bringing our total number of still-valid inserts up to three:

-- Yet more data. This one will also disappear,
-- only after the second rollback this time.
-- Consider this record No4.
INSERT INTO MyTranTest
DEFAULT VALUES;

Now we perform another ROLLBACK, this time referencing our new savepoint (which happens to be the only one valid at this point, since FirstPoint was reset after the first ROLLBACK). This one undoes everything between it and the savepoint it refers to—in this case, just one INSERT statement. That puts us back at two INSERT statements that are still valid:

-- Go back to second savepoint
ROLLBACK TRAN SecondPoint;

We then issue yet another INSERT statement, bringing our total number of INSERT statements that are still part of the transaction back up to three:

-- Insert a little more data to show that things
-- are still happening.
-- Consider this record No5. It is the 3rd record that stays
-- after all the rollbacks are done.
INSERT INTO MyTranTest
DEFAULT VALUES;

Last (for our transaction anyway), but certainly not least, we issue the COMMIT TRAN statement that locks our transaction in and makes it a permanent part of the history of the database:

-- Commit the transaction
COMMIT TRAN TranStart;

-- See what records were finally committed.
SELECT TOP 3 OrderID
FROM MyTranTest
ORDER BY OrderID DESC;

Note that if either of these ROLLBACK statements had not included the name of a savepoint, or had included a name that had been set with the BEGIN statement, then the entire transaction would have been rolled back, and the transaction would be considered to be closed.

With the transaction complete, we can issue a little statement that shows us our three rows. When you look at this, you'll be able to see what's happened in terms of rows being added to and then removed from the transaction:

OrderID
-----------
5
3
1

(3 row(s) affected)

Sure enough, every other row was inserted.

Finally, we clean up after ourselves. This really has nothing to do with the transaction:

DROP TABLE MyTranTest;

How the SQL Server Log Works

You definitely must have the concept of transactions down before you get into trying to figure out the way that SQL Server tracks what's in your database. You see, what you think of as your database is only rarely a complete version of all the data. Except for rare moments when it happens that everything has been written to disk, the data in your database is made up of not only the data in the physical database file(s) but also any transactions that have been committed to the log since the last checkpoint.

In the normal operation of your database, most activities that you perform are "logged" to the transaction log rather than written directly to the database. A checkpoint is a periodic operation that forces all dirty pages for the database currently in use to be written to disk. Dirty pages are log or data pages that have been modified after they were read into the cache, but whose modifications have not yet been written to disk. Without a checkpoint, the log would fill up and/or use all the available disk space. The process works something like the diagram in Figure 11.1.

Figure 11.1

Don't mistake all this as meaning that you have to do something special to get your data out of the cache. SQL Server handles all of this for you. This information is only provided here to facilitate your understanding of how the log works, and, from there, the steps required to handle a transaction. Whether something is in cache or not can make a big difference to performance, so understanding when things are logged and when things go in and out of the cache can be a big deal when you are seeking maximum performance.
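If you'd like to watch the log at work on your own system, there's a simple way to see how full it is at any given moment; run something long and log-intensive, and then take a look. A minimal sketch (no assumptions here beyond being connected to a server):

-- Reports one row per database: Log Size (MB), Log Space Used (%), and Status
DBCC SQLPERF(LOGSPACE);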
Note that the need to read data into a cache that is already full is not the only reason that a checkpoint would be issued. Checkpoints can be issued under the following circumstances:

* By a manual statement—using the CHECKPOINT command.
* At normal shutdown of the server (unless the WITH NOWAIT option is used).
* When you change any database option (for example, single user only, dbo only, and so on).
* When the Simple Recovery option is used and the log becomes 70 percent full.
* When the amount of data in the log since the last checkpoint (often called the active portion of the log) exceeds the size that the server could recover in the amount of time specified in the recovery interval option.

Let's look at each of these more carefully.

Using the CHECKPOINT Command

One way—but probably the least often used way—for the database to have a checkpoint issued is for it to be done manually. You can do this anytime by just typing in the word:

CHECKPOINT

It's just that simple.

SQL Server does a very good job of managing itself in the area of checkpoints, so the times when issuing a manual checkpoint makes sense are fairly rare.

One place that I will do this is during the development cycle, when I have the simple recovery model turned on for my database (you are very unlikely to want that for a production database). It's not at all uncommon during the development stage of your database to perform actions that are long running and fill up the log rather quickly. While I could always just issue the appropriate command to truncate the log myself, CHECKPOINT is a little shorter and faster and, when using the simple recovery model, has the same effect.

At Normal Server Shutdown

Ever wonder why SQL Server can sometimes take a very long time to shut down? Besides the deallocation of memory and other destructor routines that have to run to unload the system, SQL Server must also first issue a checkpoint before the shutdown process can begin. This means that you'll have to wait for any data that's been committed in the log to be written out to the physical database before your shutdown can continue. Checkpoints also occur when the server is stopped:

* Using the Management Studio
* Using the NET STOP MSSQLSERVER instruction at a command window (a DOS box, some would call it) prompt
* Using the Services icon in the Windows Control Panel, selecting the MSSQLSERVER service, and clicking the stop button

Unlike Checkpoint on Recovery, this is something that I like. I like the fact that all my committed transactions are in the physical database (not split between the log and database), which just strikes me as being cleaner, with less chance of data corruption.

There is a way you can get around the delay if you so choose. To use it, you must be shutting down using the SHUTDOWN command in T-SQL. To eliminate the delay associated with the checkpoint (and the checkpoint itself, for that matter), you just add the WITH NOWAIT key phrase to your shutdown statement:

SHUTDOWN [WITH NOWAIT]

Note that I recommend highly against using this unless you have some programmatic need to shut down your server.
It will cause the subsequent restart to take a longer time than usual to recover the databases on the server, and it means that your shutdown is not as clean. (Some data is only in the log rather than all of it being in the database file.)

At a Change of Database Options

A checkpoint is issued anytime you issue a change to your database options, regardless of how the option gets changed (such as using sp_dboption or ALTER DATABASE). The checkpoint is issued prior to making the actual change in the database.

When the Truncate on Checkpoint Option Is Active

If you have turned on the Truncate On Checkpoint database option (which is a common practice during the development phase of your database), then SQL Server will automatically issue a checkpoint any time the log becomes more than 70 percent full.

When Recovery Time Would Exceed the Recovery Interval Option Setting

As we saw briefly earlier (and will see more closely next), SQL Server performs a process called recovery every time the SQL Server is started up. SQL Server will automatically issue a checkpoint any time the estimated time to run the recovery process would exceed the amount of time set in a database option called recovery interval. By default, the recovery interval is set to zero, which means that SQL Server will decide for you. (In practice, this means about one minute.)

Failure and Recovery

A recovery happens every time that SQL Server starts up. SQL Server takes the database file and then applies (by writing them out to the physical database file) any committed changes that are in the log since the last checkpoint. Any changes in the log that do not have a corresponding commit are rolled back—that is, they are essentially forgotten about.

Let's take a look at how this works depending on how transactions have occurred in your database. Imagine five transactions that span the log, as pictured in Figure 11.2.

Figure 11.2

Let's look at what would happen to these transactions one by one.

Transaction 1

Absolutely nothing would happen. The transaction has already been through a checkpoint and has been fully committed to the database. There is no need to do anything at recovery, because any data that is read into the data cache would already reflect the committed transaction.

Transaction 2

Even though the transaction existed at the time that a checkpoint was issued, the transaction had not been committed (the transaction was still going). Without that commitment, the transaction does not actually participate in the checkpoint. This transaction would, therefore, be "rolled forward." This is just a fancy way of saying that we would need to read all the related pages back into cache and then use the information in the log to re-run all the statements that we ran in this transaction. When that's finished, the transaction should look exactly as it did before the system failed.

Transaction 3

It may not look the part, but this transaction is exactly the same as Transaction 2 from the standpoint of what needs to be done. Again, because Transaction 3 wasn't finished at the time of the last checkpoint, it did not participate in that checkpoint, just like Transaction 2 didn't. The only difference is that Transaction 3 didn't even exist at that time, but, from a recovery standpoint, it makes no difference—it's where the commit is issued that makes all the difference.

Transaction 4

This transaction wasn't completed at the time of system failure and must, therefore, be rolled back.
In effect, it never happened from a row data perspective. The user would have to re-enter any data, and any process would need to start from the beginning.

Transaction 5

This one is no different than Transaction 4. It appears to be different because the transaction has been running longer, but that makes no difference. The transaction was not committed at the time of system failure and must therefore be rolled back.

Implicit Transactions

Primarily for compatibility with other major RDBMS systems, such as Oracle or DB2, SQL Server supports (it is off by default but can be turned on if you choose) the notion of what is called an implicit transaction. Implicit transactions do not require a BEGIN TRAN statement—instead, they are automatically started with your first statement. They then continue until you issue a COMMIT TRAN or ROLLBACK TRAN statement. The next transaction then begins with your next statement.

Theoretically, the purpose behind this is to make sure that every statement is part of a transaction. SQL Server also wants every statement to be part of a transaction but, by default, takes a different approach—if there is no BEGIN TRAN, then SQL Server assumes you have a transaction of just one statement and automatically begins and ends that transaction for you. With some other systems, though, you'll find the implied transaction approach. Those systems will assume that any one statement is only the beginning of the transaction and therefore require that you explicitly end every transaction with a COMMIT or ROLLBACK.

By default, the IMPLICIT_TRANSACTIONS option is turned off (and the connection is in autocommit transaction mode). You can turn it on by issuing the command:

SET IMPLICIT_TRANSACTIONS ON;

After that, any of the following statements will initiate a transaction:

CREATE
ALTER TABLE
GRANT
REVOKE
SELECT
UPDATE
DELETE
INSERT
TRUNCATE TABLE
DROP
OPEN
FETCH

The transaction will continue until you COMMIT or ROLLBACK. Note that the implicit transactions option will affect only the current connection—any other users will still have the option turned off unless they have also executed the SET statement.

The implicit transactions option is dangerous territory, and I highly recommend that you leave this option off unless you have a very specific reason to turn it on (such as compatibility with code written for another system).

Here's a common scenario: A user calls up and says, "I've been inserting data for the last half hour, and none of my changes are showing." So, you go run a DBCC OPENTRAN, and discover that there's a transaction that's been there for a while—you can take a guess at what's happened. The user has a transaction open, and his or her changes won't appear until that transaction is committed. The user may have done it using an explicit BEGIN TRAN statement, but he or she may also have executed some code that turned implicit transactions on and then didn't turn it off. A mess follows.
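If you want to see the behavior for yourself, here's a minimal sketch (I'm reusing the MyTranTest table from earlier in the chapter, so re-create it first if you've already dropped it):

SET IMPLICIT_TRANSACTIONS ON;

-- This INSERT silently opens a transaction...
INSERT INTO MyTranTest
DEFAULT VALUES;

-- ...and that transaction is still open (and holding locks)
SELECT @@TRANCOUNT AS OpenTranCount; -- returns 1

COMMIT TRAN; -- or ROLLBACK TRAN

SET IMPLICIT_TRANSACTIONS OFF;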
Locks and Concurrency

Concurrency is a major issue for any database system. It addresses the notion of two or more users trying to interact with the same object at the same time. The nature of that interaction may be different for each user (updating, deleting, reading, inserting), and the ideal way to handle the competition for control of the object changes depending on just what all the users in question are doing and just how important their actions are. The more users—more specifically, the more transactions—that you can run with reasonable success at the same time, the higher your concurrency is said to be.

In the Online Transaction Processing (OLTP) environment, concurrency is usually the first thing we deal with in data, and it is the focus of most of the database notions put forward in this book. (Online Analytical Processing [OLAP] is usually something of an afterthought; it shouldn't necessarily be that way, but it is.) Dealing with the issue of concurrency can be critical to the performance of your system. At the foundation of dealing with concurrency in databases is a process called locking.

Locks are mechanisms for preventing a process from performing an action on an object that conflicts with something already being done to that object. That is, you can't do some things to an object if someone else got there first. What you can and cannot do depends on what the other user is doing. A lock is also a means of describing what is being done, so the system knows whether or not the second process's action is compatible with the first process. For example, 1, 2, 10, 100, 1,000, or whatever number of user connections the system can handle are usually all able to share the same piece of data at the same time, as long as they all want the record on a read-only basis. Think of it as being like a crystal shop: Lots of people can be in looking at things—even the same thing—as long as they don't move it, buy it, or otherwise change it. If more than one person does that at the same time, you're liable to wind up with broken crystal. That's why the shopkeeper usually keeps a close eye on things and will usually decide who gets to handle it first.

The SQL Server lock manager is that shopkeeper. When you come into the SQL Server "store," the lock manager asks what your intent is—what it is you're going to be doing. If you say "just looking," and no one else already there is doing anything but "just looking," then the lock manager will let you in. If you want to "buy" (update or delete) something, then the lock manager will check to see if anyone's already there. If so, then you must wait, and everyone who comes in behind you will also wait. When you are let in to "buy," no one else will be let in until you are done.

By doing things this way, SQL Server is able to help us avoid a mix of different problems that can be created by concurrency issues. We will examine the possible concurrency problems and how to set a transaction isolation level that will prevent each, but for now, let's move on to what can and cannot be locked, and what kinds of locks are available.

What Problems Can Be Prevented by Locks

Locks can address four major problems:

* Dirty reads
* Non-repeatable reads
* Phantoms
* Lost updates

Each of these presents a separate set of problems and can be handled by a mix of solutions that usually includes proper setting of the transaction isolation level. Just to help make things useful as you look back at this chapter later, I'm going to include information on which transaction isolation level is appropriate for each of these problems. We'll take a complete look at isolation levels shortly, but for now, let's first make sure that we understand what each of these problems is all about.

Dirty Reads

Dirty reads occur when a transaction reads a record that is part of another transaction that isn't complete yet. If the first transaction completes normally, then it's unlikely there's a problem. But what if the transaction were rolled back? You would have information from a transaction that never happened from the database's perspective! Let's look at it in an example series of steps.
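Here's a minimal sketch of those steps that you can run in two query windows (the Accounts table, the values, and the timing are all illustrative only; note that Connection 2 has to opt in to dirty reads with READ UNCOMMITTED, a point we'll come back to in a moment):

-- Connection 1: change a value inside a transaction, then change your mind
BEGIN TRAN;

UPDATE Accounts
SET Balance = Balance + 500
WHERE AccountID = 1;

WAITFOR DELAY '00:00:10'; -- leave the uncommitted change sitting there

ROLLBACK TRAN; -- the +500 never really happened

-- Connection 2: run this during the delay to see the dirty value
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT Balance
FROM Accounts
WHERE AccountID = 1; -- reads the +500 from the uncommitted transaction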
But what if the transaction were rolled back? You would have information from a transaction that never happened from the database's perspective!

Let's look at it in an example series of steps, using our bank account once more:

Transaction 1 | Transaction 2
---|---
Begins a transaction. |
Adds a $50 deposit; the balance goes from $125 to $175. |
| Reads the balance: $175.
Rolls back; the balance reverts to $125. |
| Continues working with $175—a value that, officially, never existed.

Oops—problem!!!

Transaction 2 has now made use of a value that isn't valid! If you try to go back and audit to find where this number came from, you'll wind up with no trace and an extremely large headache.

Fortunately, this scenario can't happen if you're using the SQL Server default for the transaction isolation level (called READ COMMITTED, which will be explained later in the section "Setting the Isolation Level").

Non-Repeatable Reads

It's really easy to get this one mixed up with a dirty read. Don't worry about that—it's only terminology. Just get the concept.

A non-repeatable read is caused when you read the record twice in a transaction, and a separate transaction alters the data in the interim. For this one, let's go back to our bank example. Remember that we don't want the value of the account to go below 0 dollars:

Transaction 1 | Transaction 2
---|---
Begins a transaction. |
Reads the balance: $125. |
IF check passes: $125 is enough to cover a $100 withdrawal. |
| Withdraws $50; the balance is now $75.
UPDATE subtracts $100; the balance is now –$25. |

Again, we have a problem. Transaction 1 has prescanned (which can be a good practice in some instances) to make sure that the value is valid and that the transaction can go through (there's enough money in the account). The problem is that, before the UPDATE was made, Transaction 2 beat Transaction 1 to the punch. If there isn't any CHECK constraint on the table to prevent the negative value, then it will indeed be set to –25—even though it logically appeared that we prevented this through the use of our IF statement.

We can prevent this problem in only two ways:

 * Create a CHECK constraint and monitor for the 547 Error.
 * Set our ISOLATION LEVEL to be REPEATABLE READ or SERIALIZABLE.

The CHECK constraint seems fairly obvious. The thing to realize here is that you are taking something of a reactive rather than a proactive approach with this method. Nonetheless, it is my preferred choice in most circumstances where we have the potential for non-repeatable reads.

We'll be taking a full look at isolation levels shortly, but for now, suffice to say that there's a good chance that setting it to REPEATABLE READ or SERIALIZABLE is going to cause you as many headaches (or more) as it solves. Still—it's an option.

Phantoms

No—we're not talking the "of the opera" kind here—what we're talking about are records that appear mysteriously, as if unaffected by an UPDATE or DELETE statement that you've issued. This can happen quite legitimately in the normal course of operating your system, and doesn't require any kind of elaborate scenario to illustrate. Here's a classic example of how this happens.

Let's say you are running a fast-food restaurant. If you're typical of that kind of establishment, you probably have a fair number of employees working at the "minimum wage" as defined by the government. The government has just decided to raise the minimum wage from $6.55 to $7.25 per hour, and you want to run an update on a table called Employees to move anyone making less than $7.25 per hour up to the new minimum wage. No problem, you say, and you issue the rather simple statement:

UPDATE Employees
SET HourlyRate = 7.25
WHERE HourlyRate < 7.25;

ALTER TABLE Employees
ADD CONSTRAINT ckWage CHECK (HourlyRate >= 7.25);

GO

That was a breeze, right? Wrong!
Just for illustration, we're going to say that you get an error message back:

Msg 547, Level 16, State 1, Line 1
ALTER TABLE statement conflicted with COLUMN CHECK constraint 'ckWage'. The conflict occurred in database 'FastFood', table 'Employees', column 'HourlyRate'.

So, you run a quick SELECT statement checking for values below $7.25, and sure enough you find one. The question is likely to come rather quickly, "How did that get there? I just ran the UPDATE that should have fixed that!" You did run the statement, and it ran just fine—you just got a phantom.

The instances of phantom reads are rare and require just the right circumstances to happen. In short, someone performed an INSERT statement at the very same time your UPDATE was running. Since it was an entirely new row, it didn't have a lock on it, and it proceeded just fine.

The only cure for this is setting your transaction isolation level to SERIALIZABLE, in which case any inserts or updates to the table must not fall within your WHERE clause, or they will be locked out.

Lost Updates

Lost updates happen when one update is successfully written to the database but is accidentally overwritten by another transaction. I can just hear you right about now, "Yikes! How could that happen?"

Lost updates can happen when two transactions read an entire record, and then each writes updated information back to the record—the second write silently overwriting the first. Let's look at an example.

Let's say that you are a credit analyst for your company. You get a call that customer X has reached his or her credit limit and would like an extension, so you pull up the customer information to take a look. You see a credit limit of $5,000, and a history of paying on time.

While you're looking, Sally, another person in your credit department, pulls up customer X's record to enter a change in the address. The record she pulls up also shows the credit limit of $5,000.

At this point, you decide to go ahead and raise customer X's credit limit to $7,500, and press Enter. The database now shows $7,500 as the credit limit for customer X.

Sally now completes her update to the address, but she's using the same edit screen that you are—that is, she updates the entire record. Remember what her screen showed as the credit limit? $5,000. Oops, the database now shows customer X with a credit limit of $5,000 again. Your update has been lost!

The solution to this depends on your code somehow recognizing that another connection has updated your record between the time when you read the data and when you went to update it. How this recognition happens varies depending on what access method you're using.
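One common recognition scheme is optimistic concurrency based on a rowversion (timestamp) column. What follows is only a sketch—the Customers table, its RowVer column, and the variable are all hypothetical:

DECLARE @RowVerWhenRead binary(8);
-- Imagine @RowVerWhenRead was captured when the record was first displayed.

UPDATE Customers
SET CreditLimit = 7500
WHERE CustomerID = 42
  AND RowVer = @RowVerWhenRead;  -- Succeeds only if no one changed the row since we read it

IF @@ROWCOUNT = 0
   RAISERROR('The record was changed by another user since you read it.', 16, 1);

Because SQL Server bumps the rowversion value on every modification, a zero row count tells us someone else got there first, and we can re-read and retry rather than silently clobbering their change.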
Lockable Resources

There are six different lockable resources for SQL Server, and they form a hierarchy. The higher the level of the lock, the less granularity it has (that is, you're choosing a higher and higher number of objects to be locked in something of a cascading action just because the object that contains them has been locked). These include, in ascending order of granularity:

 * Database: The entire database is locked. This usually happens during database schema changes.
 * Table: The entire table is locked. This includes all the data-related objects associated with that table, including the actual data rows (every one of them) and all the keys in all the indexes associated with the table in question.
 * Extent: The entire extent is locked. Remember that an extent is made up of eight pages, so an extent lock means that the lock has control of the extent, the eight data or index pages in that extent, and all the rows of data in those eight pages.
 * Page: All the data or index keys on that page are locked.
 * Key: There is a lock on a particular key or series of keys in an index. Other keys in the same index page may be unaffected.
 * Row or Row Identifier (RID): Although the lock is technically placed on the row identifier (an internal SQL Server construct), it essentially locks the entire row.

Lock Escalation and Lock Effects on Performance

Escalation is all about recognizing that maintaining a finer level of granularity (say, a row lock instead of a page lock) makes a lot of sense when the number of items being locked is small. However, as we get more and more items locked, the overhead associated with maintaining those locks actually hinders performance. It can cause the lock to be in place longer, thus creating contention issues; the longer the lock is in place, the more likely it is that someone will want that particular record. When you think about this for a bit, you'll realize there's probably a balancing act to be done somewhere, and that's exactly what the lock manager uses escalation to do.

When the number of locks being maintained reaches a certain threshold, the lock is escalated to the next higher level, and the lower-level locks no longer have to be individually managed (freeing resources and reducing overhead).

Note that the escalation is based on the number of locks rather than the number of users. The importance here is that you can single-handedly lock a table by performing a mass update: a large number of row or page locks can escalate to a full table lock. That means that you could potentially be locking every other user out of the table. If your query makes use of multiple tables, it's actually quite possible to wind up locking everyone out of all of those tables.

While you certainly would prefer not to lock all the other users out of your object, there are times when you still need to perform updates that are going to have that effect. There is very little you can do about escalation other than to keep your queries as targeted as possible. Recognize that escalations will happen, so make sure you've thought about what the possible ramifications of your query are.

Lock Modes

Beyond considering just what resource level you're locking, you also should consider what lock mode your query is going to acquire. Just as there are a variety of resources to lock, there are also a variety of lock modes.

Some modes are exclusive of each other (which means they don't work together). Some modes do nothing more than essentially modify other modes. Whether modes can work together is based on whether they are compatible. We'll take a closer look at compatibility between locks later in this chapter.

Just as we did with lockable resources, let's take a look at lock modes one by one.

Shared Locks

This is the most basic type of lock there is. A shared lock is used when you only need to read the data—that is, when you won't be changing anything. A shared lock wants to be your friend, as it is compatible with other shared locks. That doesn't mean it still won't cause you grief—while a shared lock doesn't mind most other kinds of locks, there are other locks that don't like shared locks at all.

Shared locks tell other locks that you're out there. It's the old "Look at me! Ain't I special?" thing.
They don't serve much of a purpose, yet they can't really be ignored. However, one thing that shared locks do accomplish is to prevent users from performing dirty reads.

Exclusive Locks

Exclusive locks are just what they sound like. Exclusive locks are not compatible with any other lock. They cannot be acquired if any other lock exists, nor will they allow a new lock of any form to be created on the resource while the exclusive lock is still active. This prevents two people from updating, deleting, or doing whatever at the same time.

Update Locks

Update locks are something of a hybrid between shared locks and exclusive locks. An update lock is a special kind of placeholder. Think about it—in order to do an UPDATE, you need to validate your WHERE clause (assuming there is one) to figure out just what rows you're going to be updating. That means that you only need a shared lock until you actually go to make the physical update. At the time of the physical update, you'll need an exclusive lock.

Update locks indicate that you have a shared lock that's going to become an exclusive lock after you've done your initial scan of the data to figure out what exactly needs to be updated. This acknowledges the fact that there are two distinct stages to an update:

 * First, the stage where you are figuring out what meets the WHERE clause criteria (what's going to be updated). This is the part of an update query that has an update lock.
 * Second, the stage where, if you actually decide to perform the update, the lock is upgraded to an exclusive lock. Otherwise, the lock is converted to a shared lock.

What's nice about this is that it forms a barrier against one variety of deadlock. A deadlock is not a type of lock in itself, but rather a situation where a paradox has been formed by other locks: one transaction's lock can't clear or upgrade because a second transaction is holding the resource it needs, while that second transaction is itself stuck waiting for the first transaction's lock to clear.

Without update locks, these deadlocks would crop up all the time. Two update queries would be running in shared mode. Query A completes its scan of the data and is ready for the physical update. It wants to upgrade to an exclusive lock, but it can't, because Query B is still scanning and still holds its shared lock. Query B then finishes its scan, except that it, too, needs to do the physical update. To do that, Query B must upgrade to an exclusive lock, but it can't, because Query A still holds its shared lock. This creates an impasse.

An update lock prevents any other update locks from being established. The instant that a second transaction attempts to acquire an update lock, the new transaction will be put into a wait status for whatever the lock timeout is; the lock will not be granted. If the first lock clears before the lock timeout is reached, then the lock will be granted to the new requester, and that process can continue. If not, an error will be generated.

Update locks are compatible only with shared locks and intent shared locks.
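You can take an update lock deliberately as well, by using the UPDLOCK hint covered later in this chapter. A quick sketch of the read-then-update pattern (the dbo.Inventory table here is hypothetical):

BEGIN TRAN;

-- Take an update lock during the read, reserving our right to upgrade to exclusive later
SELECT Quantity
FROM dbo.Inventory WITH (UPDLOCK)
WHERE ProductID = 1;

-- Other readers are still allowed in, but no other connection can take an update
-- or exclusive lock on this row until we finish
UPDATE dbo.Inventory
SET Quantity = Quantity - 1
WHERE ProductID = 1;

COMMIT TRAN;

Two connections running this code at the same time will serialize on the update lock rather than deadlocking during the upgrade to exclusive.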
Intent Locks

An intent lock is a true placeholder and is meant to deal with the issue of object hierarchies. Imagine a situation where you have a lock established on a row, but someone wants to establish a lock on a page or extent, or to modify a table. You wouldn't want another transaction to go around yours by going higher up the hierarchy, would you?

Without intent locks, the higher-level objects wouldn't even know that you had the lock at the lower level. Intent locks improve performance, as SQL Server needs to examine intent locks only at the table level, and not check every row or page lock on the table, to determine whether a transaction can safely lock the entire table. Intent locks come in three different varieties:

 * Intent Shared Lock: A shared lock has been or is going to be established at some lower point in the hierarchy. For example, a page is about to have a page-level shared lock established on it. This type of lock applies only to tables and pages.
 * Intent Exclusive Lock: This is the same as intent shared, but with an exclusive lock about to be placed on the lower-level item.
 * Shared with Intent Exclusive Lock: A shared lock has been or is about to be established lower down the object hierarchy, but the intent is to modify data, so it will become an intent exclusive at some point.

Schema Locks

These come in two flavors:

 * Schema Modification Lock (Sch-M): A schema change is being made to the object. No queries or other CREATE, ALTER, or DROP statements can be run against this object for the duration of the Sch-M lock.
 * Schema Stability Lock (Sch-S): This is very similar to a shared lock; this lock's sole purpose is to prevent a Sch-M while there are already locks for other queries (or CREATE, ALTER, or DROP statements) active on the object. This is compatible with all other lock types.

Bulk Update Locks

A bulk update lock (BU) is really just a variant of a table lock with one little (but significant) difference. Bulk update locks allow parallel loading of data—that is, the table is locked from any other "normal" (T-SQL statement) activity, but multiple BULK INSERT or bcp operations can be performed at the same time.

Ranged Keylocks

Ranged keylocks are merely a way for SQL Server to control individual locks more efficiently internally. Rather than being its own lock, this is, instead, just a method of tracking which locks are being held. Instead of holding an individual lock for each row in a range being accessed, SQL Server is able to maintain one lock that addresses the entire range (thus saving memory and lock operations).

Lock Compatibility

The table that follows shows the compatibility of the resource lock modes (listed in increasing lock strength). Existing locks are shown by the columns; requested locks by the rows:

Requested mode | IS | S | U | IX | SIX | X
---|---|---|---|---|---|---
IS (Intent Shared) | Yes | Yes | Yes | Yes | Yes | No
S (Shared) | Yes | Yes | Yes | No | No | No
U (Update) | Yes | Yes | No | No | No | No
IX (Intent Exclusive) | Yes | No | No | Yes | No | No
SIX (Shared with Intent Exclusive) | Yes | No | No | No | No | No
X (Exclusive) | No | No | No | No | No | No

Also:

 * The Sch-S is compatible with all lock modes except the Sch-M.
 * The Sch-M is incompatible with all lock modes.
 * The BU is compatible only with schema stability and other bulk update locks.
 * RangeS-S, RangeS-U, RangeI-N, and RangeX-X are range locks that match with the corresponding S, U, and X lock types where applicable, and, in the case of RangeI-N (the N stands for null), lock a range of potential rows to prevent phantoms.

Specifying a Specific Lock Type—Optimizer Hints

Sometimes you want to have more control over how the locking goes, either in your query or perhaps in your entire transaction. You can do this by making use of what are called optimizer hints.

Optimizer hints are ways of explicitly telling SQL Server to use a specific type or granularity of lock.
They are included right after the name of the table (in your SQL statement) that they are to act against, and are designated as follows:

Hint | Description
---|---
SERIALIZABLE/HOLDLOCK | Once a lock is established by a statement in a transaction, that lock is not released until the transaction is ended (via ROLLBACK or COMMIT). Inserts are also prevented if the inserted record would match the criteria in the WHERE clause in the query that established the lock (no phantoms). This is the highest isolation level, and guarantees absolute consistency of data.
READUNCOMMITTED/NOLOCK | Obtains no lock (not even a shared lock) and does not honor other locks. While a very fast option, it can generate dirty reads as well as a host of other problems.
READCOMMITTED | The default. Honors all locks, but how it handles acquiring locks depends on the database option READ_COMMITTED_SNAPSHOT. If that setting is on, then READCOMMITTED will not acquire locks, and will instead use a row versioning scheme to determine whether any conflicts have occurred. In practice, this should work just fine, and READCOMMITTED should be the way for you to go for both backward compatibility and what is likely better performance.
READCOMMITTEDLOCK | This is nuance stuff here. Consider this one to be largely the same as READCOMMITTED in most situations. (Indeed, this one works exactly as READCOMMITTED did in prior versions of SQL Server.) It honors all locks but releases any locks held as soon as the object in question is no longer needed. Performs the same as the READ COMMITTED isolation level.
REPEATABLEREAD | Once a lock is established by a statement in a transaction, that lock is not released until the transaction is ended (via ROLLBACK or COMMIT). New data can be inserted, however.
READPAST | Rather than waiting for a lock to clear, skips all locked rows. The skip is limited to row locks (it still waits for page, extent, and table locks) and can only be used with a SELECT statement.
NOWAIT | Causes the query to fail immediately rather than wait if any locks are detected.
ROWLOCK | This forces the initial level of the lock to be at the row level, even if the optimizer would have otherwise selected a less granular locking strategy. It does not prevent the lock from being escalated to those less granular levels if the number of locks reaches the system's lock threshold.
PAGLOCK | Uses a page-level lock regardless of the choice that otherwise would have been made by the optimizer. The usefulness of this can go both ways—sometimes you know that a page lock is more appropriate than a row lock for resource conservation—other times you want to minimize contention where the optimizer might have chosen a table lock.
TABLOCK | Forces a full table lock rather than whatever the lock manager would have used. Can really speed up known table-scan situations but creates big contention problems if other users want to modify data in the table.
TABLOCKX | Similar to TABLOCK, but creates an exclusive lock—locks all other users out of the table for the duration of the statement or transaction, depending on how the TRANSACTION ISOLATION LEVEL is set.
UPDLOCK | Uses an update lock instead of a shared lock. This is a highly underutilized tool in the war against deadlocks, as it still allows other users to obtain shared locks but ensures that no data modifications (or other update locks) are established until you end the statement or transaction (presumably after going ahead and updating the rows).
XLOCK | With its roots in TABLOCKX, this one first appeared in SQL Server 2000. The advantage here is that you can specify an exclusive lock regardless of what lock granularity you have chosen (or not chosen) to specify.

Most of these can be very useful in specific situations, but, before you get too attached to using these, make sure that you also check out the concept of isolation levels later in the chapter.

The syntax for using lock hints is fairly easy—just add the hint after the table name, or after the alias if you're using one:

....
FROM <table name> [[AS] <table alias>] [[WITH] (<table hint>)]

So, to put this into a couple of examples, any of these would be legal, and all would force a table lock (rather than the more likely key or row lock) on the SalesOrderHeader table:

SELECT * FROM Sales.SalesOrderHeader AS ord WITH (TABLOCKX)

SELECT * FROM Sales.SalesOrderHeader AS ord (TABLOCKX)

SELECT * FROM Sales.SalesOrderHeader WITH (TABLOCKX)

SELECT * FROM Sales.SalesOrderHeader (TABLOCKX)

Now look at it from a multiple-table perspective. The following queries would do the same thing as the previous ones in terms of locking. They would force an exclusive table lock on the SalesOrderHeader table. The thing to note, though, is that they do not place any kind of special lock on the SalesOrderDetail table; the SQL Server lock manager is still in complete control of that table.

SELECT *
FROM Sales.SalesOrderHeader AS ord WITH (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON ord.SalesOrderID = od.SalesOrderID;

SELECT *
FROM Sales.SalesOrderHeader AS ord (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON ord.SalesOrderID = od.SalesOrderID;

SELECT *
FROM Sales.SalesOrderHeader WITH (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON Sales.SalesOrderHeader.SalesOrderID = od.SalesOrderID;

SELECT *
FROM Sales.SalesOrderHeader (TABLOCKX)
JOIN Sales.SalesOrderDetail AS od
   ON Sales.SalesOrderHeader.SalesOrderID = od.SalesOrderID;

We also could have done something completely different here and placed a totally separate hint on the SalesOrderDetail table. It's all up to you.

Determining Locks Using the Management Studio

Perhaps the nicest way of all to take a look at your locks is by using Management Studio. Management Studio will show you locks in two different ways—by process ID or by object—by utilizing the Activity Monitor.

To make use of Management Studio's lock display, just navigate to the server, right-click, and choose Activity Monitor. You should come up with a new window that looks something like Figure 11.3 (I've expanded the Processes frame).

Figure 11.3

Just expand the node that you're interested in (either the Process ID or the Object), and you'll see the various locks.

Perhaps the coolest feature in Management Studio shows itself when you double-click a specific lock in the right-hand side of the window. A dialog box will come up and tell you the last statement that was run by that process ID. This can be very handy when you are troubleshooting deadlock situations.
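If you'd rather stay in T-SQL than use the GUI, the sys.dm_tran_locks dynamic management view exposes much of the same information the Activity Monitor displays (viewing it assumes you hold the VIEW SERVER STATE permission):

SELECT resource_type,       -- what kind of resource (KEY, PAGE, OBJECT, and so on)
       request_mode,        -- the lock mode (S, U, X, IS, and so on)
       request_status,      -- GRANT, WAIT, or CONVERT
       request_session_id   -- which connection holds or wants the lock
FROM sys.dm_tran_locks;

A row with a request_status of WAIT is a blocked request—exactly the kind of thing you'd go hunting for when users complain about hangs.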
Setting the Isolation Level

We've seen that several different kinds of problems can be prevented by different locking strategies. We've also seen what kinds of locks are available and how they have an impact on the availability of resources. Now it's time to take a closer look at how these process management pieces work together to ensure overall data integrity and to make certain that you can get the results you expect.

The first thing to understand about the relationship between transactions and locks is that they are inextricably linked with each other. By default, any lock that is data-modification related will, once created, be held for the duration of the transaction. If you have a long transaction, this means that your locks may be preventing other processes from accessing the objects you have a lock on for a long time. It probably goes without saying that this can be rather problematic.

However, that's only the default. In fact, there are five different isolation levels that you can set at the transaction level:

 * READ COMMITTED (the default)
 * READ UNCOMMITTED
 * REPEATABLE READ
 * SERIALIZABLE
 * SNAPSHOT

The syntax for switching between them is pretty straightforward:

SET TRANSACTION ISOLATION LEVEL <READ COMMITTED | READ UNCOMMITTED | REPEATABLE READ | SERIALIZABLE | SNAPSHOT>

The change in isolation level will affect only the current connection, so you don't need to worry about adversely affecting other users (or them affecting you).

Let's start by looking at the default situation (READ COMMITTED) a little more closely.

READ COMMITTED

With READ COMMITTED, any shared locks you create will be automatically released as soon as the statement that created them is complete. That is, if you start a transaction, run several statements, run a SELECT statement, and then run several more statements, the locks associated with the SELECT statement are freed as soon as the SELECT statement is complete. SQL Server doesn't wait for the end of the transaction.

Action queries (UPDATE, DELETE, and INSERT) are a little different. If your transaction performs a query that modifies data, then those locks will be held for the duration of the transaction (in case you need to roll back).

By keeping this default, with READ COMMITTED, you can be sure that you have enough data integrity to prevent dirty reads. However, non-repeatable reads and phantoms can still occur.

READ UNCOMMITTED

READ UNCOMMITTED is the most dangerous of all isolation level choices, but it also has the highest performance in terms of speed.

Setting the isolation level to READ UNCOMMITTED tells SQL Server not to set any locks, and not to honor any locks. With this isolation level, it is possible to experience any of the various concurrency issues we discussed earlier in the chapter (most notably a dirty read).

Why would one ever want to risk a dirty read? When I watch the newsgroups on Usenet, I see the question come up on a regular basis. It's surprising to a fair number of people, but there are actually good reasons to have this isolation level, and they almost always have to do with reporting.

In an OLTP environment, locks are both your protector and your enemy. They prevent data integrity problems, but they also often prevent, or block, you from getting at the data you want. It is extremely commonplace to see a situation where management wants to run reports regularly, but the data entry people are often prevented from or delayed in entering data because of locks held by the manager's reports.

By using READ UNCOMMITTED, you can often get around this problem—at least for reports where the numbers don't have to be exact. For example, let's say that a sales manager wants to know just how much has been done in sales so far today. Indeed, we'll say he's a micro-manager and asks this same question (in the form of re-running the report) several times a day.

If the report happened to be a long-running one, then there's a high chance that his running it would damage the productivity of other users due to locking considerations. What's nice about this report, though, is that it is a truly nebulous report: The exact values are probably meaningless. The manager is really just looking for ballpark numbers.

By having an isolation level of READ UNCOMMITTED, we do not set any locks, so we don't block any other transactions. Our numbers will be somewhat suspect (because of the risk of dirty reads), but we don't need exact numbers anyway, and we know that the numbers are still going to be close even on the off chance that a dirty read is rolled back.
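As a sketch of what the manager's ballpark report might look like (the date filter shown here is purely illustrative):

SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT SUM(TotalDue) AS SalesSoFarToday
FROM Sales.SalesOrderHeader
WHERE OrderDate >= CAST(GETDATE() AS date);  -- today's orders only

SET TRANSACTION ISOLATION LEVEL READ COMMITTED;  -- remember to set it back when done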
You can get the same effect as READ UNCOMMITTED by adding the NOLOCK optimizer hint in your query. The advantage to setting the isolation level is that you don't have to use a hint for every table in your query, or use it in multiple queries. The advantage to using the NOLOCK optimizer hint is that you don't need to remember to set the isolation level back to the default for the connection. (With READ UNCOMMITTED, you do.)

REPEATABLE READ

REPEATABLE READ escalates your isolation level somewhat and provides an extra level of concurrency protection by preventing not only dirty reads (the default already does that) but also non-repeatable reads.

That prevention of non-repeatable reads is a big upside, but holding even shared locks until the end of the transaction can block users' access to objects, and therefore hurt productivity. Personally, I prefer to use other data integrity options (such as a CHECK constraint together with error handling) rather than this choice, but it remains an available option.

The equivalent optimizer hint for the REPEATABLE READ isolation level is REPEATABLEREAD (the same words, only without the space).

SERIALIZABLE

SERIALIZABLE is something of the fortress of isolation levels. It prevents all forms of concurrency issues except for a lost update. Even phantoms are prevented.

When you set your isolation to SERIALIZABLE, you're saying that any UPDATE, DELETE, or INSERT to the table or tables used by your transaction must not meet the WHERE clause of any statement in that transaction. Essentially, if another user is going to do something that your transaction would be interested in, then that user must wait until your transaction has been completed.

The SERIALIZABLE isolation level can also be simulated by using the SERIALIZABLE or HOLDLOCK optimizer hint in your query. Again, as with the READ UNCOMMITTED and NOLOCK debate, the option of not having to set it every time versus not having to remember to change the isolation level back is the big issue.

Going with an isolation level of SERIALIZABLE would, on the surface, appear to be the way you want to do everything. Indeed, it does provide your database with the highest level of what is called consistency—that is, the update process works the same for multiple users as it would if all your users did one transaction at a time (processed things serially).

As with most things in life, however, there is a trade-off. Consistency and concurrency can, in a practical sense, be thought of as polar opposites. Making things SERIALIZABLE can prevent other users from getting to the objects they need; that equates to lower concurrency. The reverse is also true: Increasing concurrency (by dropping down to REPEATABLE READ, for example) reduces the consistency of your database.

My personal recommendation on this is to stick with the default (READ COMMITTED) unless you have a specific reason not to.

SNAPSHOT

Note that the SNAPSHOT transaction isolation level is not available by default. To utilize it, you must enable the ALLOW_SNAPSHOT_ISOLATION option for your database utilizing the ALTER DATABASE command.

This was first added in SQL Server 2005, and was not particularly well publicized (and still isn't well documented, if you ask me!).
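For reference, enabling the option looks like this (shown against AdventureWorks2008 purely as an example):

ALTER DATABASE AdventureWorks2008 SET ALLOW_SNAPSHOT_ISOLATION ON;

A related option, READ_COMMITTED_SNAPSHOT (discussed in a moment), is enabled the same way:

ALTER DATABASE AdventureWorks2008 SET READ_COMMITTED_SNAPSHOT ON;

Be aware that the READ_COMMITTED_SNAPSHOT change generally requires that no other connections be active in the database at the time you run it.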
SNAPSHOT utilizes what is referred to as "row versioning." Transactions that would have been blocked from a given record are instead allowed read access to that record in its last known good state—which is to say, the way it was before whatever transaction is blocking began its modifications to the row.

SNAPSHOT is something of a mixed blessing. On one hand, concurrency is increased, as read transactions are allowed to continue forward unabated with a value that is technically the correct value for that moment in time (at least in terms of what data has been truly committed). The down side, however, is that those transactions are being allowed to continue with data that has a significant chance of being inaccurate soon.

Which should you use? Well, as you can imagine, my answer would be "It depends." The safer answer is to stick with the default of READ COMMITTED. Sometimes, however, we don't need that safety, and higher concurrency is the better choice.

The default isolation level of READ COMMITTED can be switched over to a version that utilizes row versioning, effectively the same as SNAPSHOT, by enabling the READ_COMMITTED_SNAPSHOT database option with the ALTER DATABASE command. Make certain, however, that you fully understand the differences between the two READ COMMITTED implementations before making such a change.

Dealing with Deadlocks (a.k.a. "A 1205")

Okay. So now you've seen locks, and you've also seen transactions. Now that you've got both, we can move on to the rather pesky problem of dealing with deadlocks.

As we've already mentioned, a deadlock is not a type of lock in itself, but rather a situation where a paradox has been formed by other locks. Like it or not, you'll bump into these on a regular basis (particularly when you're just starting out), and you'll be greeted with an error number 1205. So prolific is this particular problem that you'll hear many a database developer refer to deadlocks simply by the number.

Deadlocks are caused when one lock can't do what it needs to do in order to clear because a second lock is holding that resource, and vice versa. When this happens, somebody has to win the battle, so SQL Server chooses a deadlock victim. The deadlock victim's transaction is then rolled back, and it is notified that this happened through the 1205 error. The other transaction can continue normally. (Indeed, it will be entirely unaware that there was a problem, other than seeing an increased execution time.)

How SQL Server Figures Out There's a Deadlock

Every 5 seconds, SQL Server checks all the current transactions for what locks they are waiting for but haven't yet been granted. As it does this, it essentially makes a note that the request exists. It will then re-check the status of all open lock requests again, and, if one of the previous requests has still not been granted, it will recursively check all open transactions for a circular chain of lock requests. If it finds such a chain, then one or more deadlock victims will be chosen.

How Deadlock Victims Are Chosen

By default, a deadlock victim is chosen based on the "cost" of the transactions involved. The transaction that costs the least to roll back will be chosen (in other words, the one SQL Server has to do the fewest things to undo). You can, to some degree, override this by using the DEADLOCK_PRIORITY SET option available in SQL Server; this is, however, generally both ill-advised and outside the scope of this book. (I consider this to be very much in the camp of the administrator rather than the developer.)
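Because the victim is told about its fate via error 1205, your code can catch it and simply try again. A minimal retry sketch using TRY...CATCH (the dbo.UpdateInventory procedure is hypothetical, standing in for whatever unit of work you want to protect):

DECLARE @retries int = 3;

WHILE @retries > 0
BEGIN
   BEGIN TRY
      EXEC dbo.UpdateInventory;  -- the real work, wrapped in its own transaction
      SET @retries = 0;          -- success, so stop looping
   END TRY
   BEGIN CATCH
      IF ERROR_NUMBER() = 1205 AND @retries > 1
         SET @retries = @retries - 1;  -- we were the deadlock victim: try again
      ELSE
      BEGIN
         SET @retries = 0;
         RAISERROR('Giving up after repeated deadlocks or other error.', 16, 1);
      END
   END CATCH
END

A retry loop treats the symptom, though; the rules that follow treat the disease.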
Avoiding Deadlocks

Deadlocks can't be avoided 100 percent of the time in complex systems, but you can almost always totally eliminate them from a practical standpoint—that is, make them so rare that they have little relevance to your system.

To cut down on or eliminate deadlocks, follow these simple (okay, usually simple) rules:

 * Use your objects in the same order.
 * Keep your transactions as short as possible and in one batch.
 * Use the lowest transaction isolation level necessary.
 * Do not allow open-ended interruptions (user interactions, batch separations) within the same transaction.
 * In controlled environments, use bound connections.

Nearly every time I run across deadlocking problems, at least one (usually more) of these rules has been violated. Let's look at each one individually.

Using Objects in the Same Order

This is the most common problem area within the few rules that I consider to be basic. What's great about using this rule is that it almost never costs you anything to speak of; it's more a way of thinking. You decide early in your design process how you want to access your database objects, including order, and it becomes a habit in every query, procedure, or trigger that you write for that project.

Think about it for a minute. If our problem is that our two connections each have what the other wants, then it implies that we're dealing with the problem too late in the game. Let's look at a simple example.

Consider that we have two tables: Suppliers and Products. Now say that we have two processes that make use of both of these tables. Process 1 accepts inventory entries, updates Products with the new amount of product on hand, and then updates Suppliers with the total amount of product that we've purchased. Process 2 records sales; it updates the total amount of product sold in the Suppliers table and then decreases the inventory quantity in Products.

If we run these two processes at the same time, we're begging for trouble. Process 1 will grab an exclusive lock on the Products table. Process 2 grabs an exclusive lock on the Suppliers table. Process 1 then attempts to grab a lock on the Suppliers table, but it will be forced to wait for Process 2 to clear its existing lock. In the meantime, Process 2 tries to create a lock on the Products table, but it will have to wait for Process 1 to clear its existing lock. We now have a paradox: Both processes are waiting for each other. SQL Server will have to pick a deadlock victim.

Now let's rearrange that scenario, with Process 2 changed to first decrease the inventory quantity in Products and then update the total amount of product sold in the Suppliers table. This is functionally equivalent to the first way we organized the processes, and it will cost us nothing to do it this new way. The impact, though, will be stunning: no more deadlocks (at least not between these two processes)! Let's walk through what will now happen.

When we run these two processes at the same time, Process 1 will grab an exclusive lock on the Products table (so far, it's the same). Process 2 then also tries to grab a lock on the Products table, but will be forced to wait for Process 1 to finish. (Notice that we haven't done anything with Suppliers yet.) Process 1 finishes with the Products table but doesn't release the lock because the transaction isn't complete yet.
Process 2 is still waiting for the lock on Products to clear. Process 1 now moves on to grab a lock on the Suppliers table. Process 2 continues to wait for the lock to clear on Products. Process 1 finishes and commits or rolls back the transaction as required, but frees all locks in either case. Process 2 now is able to obtain its lock on the Products table and moves through the rest of its transaction without further incident.

Just swapping the order in which these two queries are run has eliminated a potential deadlock problem. Keep things in the same order wherever possible and you, too, will experience far fewer deadlocks.

Keeping Transactions As Short As Possible

This is another of the basics. Again, it should become just an instinct—something you don't really think about, something you just do.

This one really never has to cost you anything. Put what you need to put in the transaction, and keep everything else out. It's just that simple. The reason this works isn't rocket science: The longer the transaction is open, and the more it touches (within the transaction), the higher the likelihood that you're going to run into some other process that wants one or more of the objects that you're using (reducing concurrency). If you keep your transactions short, you minimize the number of objects that can potentially cause a deadlock, plus you cut down on the time that you have your locks on them. It's as simple as that.

Keeping transactions in one batch minimizes network round-trips during a transaction, reducing possible delays in completing the transaction and releasing locks.

Using the Lowest Transaction Isolation Level Possible

This one is considerably less basic and requires some serious thought. As such, it isn't surprising just how often it isn't thought of at all. Consider it Rob's axiom: That which requires thought is likely not to be thought of. Be different—think about it.

We have several different transaction isolation levels available. The default is READ COMMITTED. Using a lower isolation level holds shared locks for a shorter duration than a higher isolation level, thereby reducing locking contention.

Allowing No Open-Ended Transactions

This one probably makes the most common sense out of all the recommendations here, but it's one that's often violated because of past practices.

One of the ways we used to prevent lost updates (mainframe days here, folks!) was just to grab the lock and hold it until we were done with it. I can't tell you how problematic this was. (Can you say yuck?)

Imagine this real-life example: Someone in your service department likes to use update (exclusive lock) screens instead of display (shared lock) screens to look at data. "After all," he says, "that way I'm right there, ready to edit if I see something that needs to be changed." He goes on to look at a work order. Now his buddy calls and asks if he's ready for lunch. "Sure!" comes the reply, and the service clerk heads off to a rather long lunch (1–2 hours). Everyone who is interested in this record is now locked out of it for the duration of this clerk's lunch.

Wait—it gets worse. In the days of the mainframe, you used to see the concept of queuing far more often. (It actually can be quite efficient.) Now someone submits a print job (which is queued) for this work order. It sits in the queue waiting for the record lock to clear.
Since it's a queue environment, every print job your company has for work orders now piles up behind that first print job (which is going to wait for that person's lunch before clearing).

This is a rather extreme example, but it is a real-life scenario I've seen many times, and I hope that it clearly illustrates the point. Don't ever create locks that will still be open when you begin some form of open-ended process. Usually we're talking user interaction (like our lunch lover), but it could be any process that has an open-ended wait to it.

Using Bound Connections

Hmm. I had to debate even including this one, because it's something of a can of worms: Once you open it, you're never going to get them all back in. I'll just say that this is one that is used extremely rarely and is not for the faint of heart.

It's not that it doesn't have its uses; it's just that things can become convoluted rather quickly, so you need to manage things well. It's my personal opinion that there is usually a better solution.

That brings up the question of what exactly a bound connection is. Bound connections are connections that have been associated with one another and are essentially allowed to share the same set of locks. What that means is that the two transactions can operate in tandem without any fear of deadlocking each other or being blocked by one another. The flip side of this is that you essentially are on your own in terms of dealing with most concurrency issues—locks aren't keeping you safe anymore.

Given my distaste for these in 99.9 percent of situations, we're going to forget that they exist now that we've seen that they are an option. If you're going to insist on using them, just remember that you're going to be dealing with an extremely complex relationship between connections, and you need to manage the activities in those connections rather closely if you are going to maintain data integrity within the system.

Summary

Transactions and locks are both cornerstones of how SQL Server works and, therefore, of maximizing your development of solutions in SQL Server.

By using transactions, you can make sure that everything you need to have happen as a unit happens, or none of it does. SQL Server's use of locks ensures that we avoid the pitfalls of concurrency to the maximum extent possible. (You'll never avoid them entirely, but it's amazing how close you can come with a little—OK, a lot—of planning.) By using the two together, you are able to pass what the database industry calls the ACID test. If a transaction is ACID, then it has:

 * Atomicity: The transaction is all or nothing.
 * Consistency: All constraints and other data integrity rules have been adhered to, and all related objects (data pages, index pages) have been updated completely.
 * Isolation: Each transaction is completely isolated from any other transaction. The actions of one transaction cannot be interfered with by the actions of a separate transaction.
 * Durability: After a transaction is completed, its effects are permanently in place in the system. The data is "safe," in the sense that things such as a power outage or other non-disk system failure will not lead to data that is only half-written.

In short, by using transactions and locks, you can minimize deadlocks, ensure data integrity, and improve the overall efficiency of your system.

In our next chapter, we'll be looking at triggers.
Indeed, we'll see that, for many of the likely uses of triggers, the concepts of transactions and rollbacks will be at the very center of the trigger.

12

Triggers

I am often asked, "Should I use triggers?" The answer is, as with most things in SQL, "It depends." There's little that's black and white in the wonderful world of SQL Server; triggers are definitely a very plain shade of gray.

Know what you're doing before you go the triggers route; it's important for the health and performance of your database. The good news is that's what we're here to learn.

As with most of the core subjects we've covered in this book (save for a few that were just too important to rush), we're going to be moving along quickly on the assumption that you already know the basics. Still, this also happens to be one of those topics where you can have become a relatively advanced user of SQL Server and never hit it at all. That is, triggers can be needed by the beginner for some installations, and yet never be touched by the "Pro" in others (SQL is just that way...). The result is that, if you've read my Beginning SQL Server 2008 Programming title, then you'll definitely notice some overlap (but you'll find much more depth here). If you're in that group of people, feel free to skip ahead to the INSTEAD OF triggers section.

In this chapter, we'll try to look at triggers in all of their colors—from black all the way to white and a whole lot in between. The main issues we'll be dealing with include:

 * What is a trigger (the very quick and dirty version)?
 * Using triggers for more flexible referential integrity
 * Using triggers to create flexible data integrity rules
 * Using INSTEAD OF triggers to create more flexible updatable views
 * Other common uses for triggers
 * Controlling the firing order of triggers
 * Performance considerations

By the time we're done, you should have an idea of just how complex the decision about when—and when not—to use triggers really is. You'll also have an inkling of just how powerful and flexible they can be.

Most of all, if I've done my job well, you won't be a trigger extremist (which so many SQL Server people I meet are) with the distorted notion that triggers are evil and should never be used. Neither will you side with the other end of the spectrum: those who think that triggers are the solution to all the world's problems. The right answer in this respect is that triggers can do a lot for you, but they can also cause a lot of problems. The trick is to use them when they are the right things to use, and not to use them when they aren't.

Some common uses of triggers include:

 * Enforcement of referential integrity: Although I recommend using declarative referential integrity (DRI) whenever possible, there are many things that DRI won't do (for example, referential integrity across databases or even servers, many complex types of relationships, and so on). The use of triggers for RI is becoming a very special-case thing, but it's still out there.
 * Creating audit trails, which means writing out records that keep track of not just the most current data but also the actual change history for each record.
 * Functionality similar to a CHECK constraint, but which works across tables, databases, or even servers.
 * Substituting your own statements in the place of a user's action statement (usually used to enable inserts in complex views).
In addition, you have the newer and likely much rarer case of the DDL trigger (these are still relatively new, so only time will tell for sure)—which is about monitoring changes in the structure of your database.

And these are just a few. So, with no further ado, let's look at exactly what a trigger is.

What Is a Trigger?

A trigger is a special kind of stored procedure that responds to specific events. There are two kinds of triggers: Data Definition Language (DDL) triggers and Data Manipulation Language (DML) triggers.

DDL triggers fire in response to someone changing the structure of your database in some way (CREATE, ALTER, DROP, and similar statements). These were first added back in SQL Server 2005 and are critical to some installations (particularly high-security installations) but are pretty narrow in use. In general, you will need to look into using these only where you need extreme auditing of changes to, and the history of, your database structure. We will save these until last.

DML triggers are pieces of code that you attach to a particular table or view. Unlike sprocs, where you need to explicitly invoke the code, the code in triggers is automatically run whenever the event(s) you attached the trigger to occurs in the table. Indeed, you can't explicitly invoke triggers—the only way to do this is by performing the required action in the table that they are assigned to.

Beyond not being able to explicitly invoke a trigger, you'll find two other things that exist for sprocs but are missing from triggers: parameters and return codes.

While triggers take no parameters, they do have a mechanism for figuring out what records they are supposed to act on (we'll investigate this further later in the chapter). And, while you can use the RETURN keyword, you cannot return a specific return code (because you didn't explicitly call the trigger, what would you return a return code to?).

What events can you attach triggers to? The three "action" query types you use in SQL. So, you wind up with triggers based on inserts, updates, and/or deletes (you can mix and match which events you want the trigger to be attached to).

It's worth noting that there are times when a trigger will not fire—even though it seems that the action you are performing falls into one of the preceding categories. At issue is whether or not the operation you are doing is a logged activity. For example, a DELETE statement is a normal, logged activity that would fire any delete trigger, but a TRUNCATE TABLE, which has the effect of deleting rows, just deallocates the space used by the table. There is no individual deletion of rows logged, and no trigger is fired.

The syntax for creating triggers looks an awful lot like all of our other CREATE syntax, except that it has to be attached to a table, somewhat like an index; a trigger can't stand on its own.

Let's take a look:

CREATE TRIGGER <trigger name>

ON [<schema name>.]<table or view name>
[WITH ENCRYPTION | EXECUTE AS <CALLER | SELF | 'user name'>]

{{FOR | AFTER} {[DELETE] [,] [INSERT] [,] [UPDATE]} | INSTEAD OF}

[WITH APPEND]

[NOT FOR REPLICATION]

AS

<sql statements> | EXTERNAL NAME <assembly method specifier>

As you can see, the all-too-familiar CREATE is still there, as well as the execution stuff we've seen in many other objects—we've just added the ON clause to indicate the table to which this trigger is going to be attached, as well as when and under what conditions it fires.

ON

This part just names what object you are creating the trigger against. Keep in mind that if the type of the trigger is an AFTER trigger (if it uses FOR or AFTER to declare the trigger), then the target of the ON clause must be a table—AFTER triggers are not supported for views.

WITH ENCRYPTION

This works just as it does for views and sprocs. If you add this option, you can be certain that no one will be able to view your code (not even you!). This is particularly useful if you are going to be building software for commercial distribution, or if you are concerned about security and don't want your users to be able to see what data you're modifying or accessing. Obviously, you should keep a copy of the code required to create the trigger somewhere else, in case you want to re-create it sometime later.

As with views and sprocs, the thing to remember when using the WITH ENCRYPTION option is that you must reapply it every time you ALTER your trigger. If you make use of an ALTER TRIGGER statement and do not include the WITH ENCRYPTION option, then the trigger will no longer be encrypted.

The FOR|AFTER versus the INSTEAD OF Clause

In addition to deciding what kinds of queries will fire your trigger (INSERT, UPDATE, and/or DELETE), you also have some choice as to when the trigger fires. While the FOR (alternatively, you can use the keyword AFTER if you choose) trigger is the one that has been around a long time and is the one people generally think of, you also have the ability to run what is called an INSTEAD OF trigger. Choosing between these two will affect whether you enter your trigger before or after the data has been modified. In either case, you will be in your trigger before any changes are truly committed to the database.

Confusing? Probably. Let's try it a different way with a diagram that shows where each choice fires (see Figure 12.1).

The thing to note here is that, regardless of which choice you make, SQL Server will put together two working tables—one holding a copy of the records that were inserted (and, incidentally, called INSERTED) and one holding a copy of any records that were deleted (called DELETED). We'll look into the details of the uses of these working tables a little later. For now, realize that with INSTEAD OF triggers the creation of these working tables happens before any constraints are checked, while with FOR triggers, these tables are created after constraints are checked.

The key to INSTEAD OF triggers is that you can actually run your own code in the place of whatever the user requested. This means we can clean up ambiguous insert problems in views (remember the problem back in Chapter 8 with inserting when there was a JOIN in the view?). It also means that we can take action to clean up constraint violations before the constraint is even checked.

Triggers using the FOR and AFTER declarations behave identically to each other. The big difference between them and INSTEAD OF triggers is that they build their working tables after any constraints have been checked.
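Before we dig into each clause, a minimal sketch may help make the INSERTED and DELETED tables concrete. This AFTER trigger audits price changes on the Production.Product table; the dbo.PriceHistory audit table is hypothetical:

CREATE TRIGGER trgProductPriceAudit

ON Production.Product

AFTER UPDATE

AS

INSERT INTO dbo.PriceHistory (ProductID, OldPrice, NewPrice, ChangedOn)

SELECT d.ProductID, d.ListPrice, i.ListPrice, GETDATE()

FROM Deleted d

JOIN Inserted i

   ON d.ProductID = i.ProductID

WHERE d.ListPrice <> i.ListPrice;  -- only rows whose price actually changed

Notice how the DELETED table supplies the "before" image and the INSERTED table the "after" image of each updated row—exactly the two working tables just described.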
The AFTER (or, alternatively, you can use FOR) clause indicates under what type of action(s) you want this trigger to fire. You can have the trigger fire whenever there is an INSERT, UPDATE, or DELETE, or any mix of the three. So, for example, your FOR clause could look something like:

AFTER INSERT, DELETE

... or:

AFTER UPDATE, INSERT

... or:

AFTER DELETE

As was stated in the section about the ON clause, triggers declared using the AFTER or FOR clause can only be attached to tables—no views are allowed (see INSTEAD OF triggers for those).

It's worth noting that, unlike prior editions of this book, I actually do advise a specific choice between AFTER and FOR. While both are equally usable, and there is no indication that either will be deprecated, the AFTER clause is the "standard" way of doing things, so it is more likely to be supported by other database vendors.

Figure 12.1

INSERT Trigger

The code for any trigger that you mark as being FOR INSERT will be executed any time that someone inserts a new row into your table. For each row that is inserted, SQL Server will create a copy of that new row and insert it in a special table that exists only within the scope of your trigger. That table is called INSERTED, and we'll see much more of it over the course of this chapter. The big thing to understand is that the INSERTED table only lives as long as your trigger does. Think of it as not existing before your trigger starts or after your trigger completes.

DELETE Trigger

This works much the same as an INSERT trigger does, save that the INSERTED table will be empty (after all, you deleted rather than inserted, so there are no records for the INSERTED table). Instead, a copy of each record that was deleted is inserted into another table called DELETED. That table, like the INSERTED table, is limited in scope to just the life of your trigger.

UPDATE Trigger

More of the same, save for a twist. The code in a trigger declared as being FOR UPDATE will be fired whenever an existing record in your table is changed. The twist is that there's no such table as UPDATED. Instead, SQL Server treats each row as if the existing record had been deleted and a totally new record was inserted. As you can probably guess from that, a trigger declared as FOR UPDATE contains not one but two special tables called INSERTED and DELETED. The two tables have exactly the same number of rows, of course.

WITH APPEND

WITH APPEND is something of an oddball and, in all honesty, you're pretty unlikely to use it; nonetheless, since this is, after all, a "Professional" title, we'll cover it here for that "just-in-case" scenario. WITH APPEND applies only when you are running in 6.5 compatibility mode (which can be set using sp_dbcmptlevel).

SQL Server 6.5 and prior did not allow multiple triggers of the same type on any single table. For example, if you had already declared a trigger called trgCheck to enforce data integrity on updates and inserts, then you couldn't create a separate trigger for cascading updates. Once one update (or insert, or delete) trigger was created, that was it—you couldn't create another trigger for the same type of action.

This was a real pain. It meant that you had to combine logically different activities into one trigger. Trying to get what amounted to two entirely different procedures to play nicely together could, at times, be quite a challenge. In addition, it made reading the code something of an arduous task.
+ +Along came SQL Server 7.0 and the rules changed substantially. No longer do we have to worry about how many triggers we have for one type of action query—you can have several if you like. When running our database in 6.5 compatibility mode, though, we run into a problem: Our database is still working on the notion that there can only be one trigger of a given type on a given table. + +WITH APPEND gets around this problem by explicitly telling SQL Server that we want to add this new trigger even though we already have a trigger of that type on the table; both will be fired when the appropriate trigger action (INSERT, UPDATE, DELETE) occurs. It's a way of having a bit of both worlds. + +Again, this option is not really needed unless you're running SQL Server in the "way back machine" version, that is, 6.5 compatibility mode. Do not use this unless you know you have a very specific reason you need it. + +At this juncture, running in 6.5 compatibility mode means that you are asking SQL Server to run as it was more than a decade ago, and with a version compatibility level that is now four versions old. If the code is important enough to still be running after this much time has passed, it would seem important enough to warrant updating to a more recent version of support. + +NOT FOR REPLICATION + +Adding this option slightly alters the rules as to when the trigger is fired. With this option in place, the trigger will not be fired whenever a replication-related task modifies your table. Usually a trigger is fired (to do the housekeeping/cascading/and so on) when the original table is modified and there is no point in doing it again. + +AS + +Exactly as it was with sprocs, this is the meat of the matter. The AS keyword tells SQL Server that your code is about to start. From this point forward, we're into the scripted portion of your trigger. + +Using Triggers for Data Integrity Rules + +Although they shouldn't be your first option, triggers can also perform the same functionality as a CHECK constraint or even a DEFAULT. The answer to the question "Should I use triggers or CHECK constraints?" is the rather definitive: "It depends." If a CHECK can do the job, then it's probably the preferable choice. There are times, however, when a CHECK constraint just won't do the job, or when something inherent in the CHECK process makes it less desirable than a trigger. Examples of where you would want to use a trigger over a CHECK include: + + * Your business rule needs to reference data in a separate table. + * Your business rule needs to check the delta (difference between before and after) of an update. + * You require a customized error message. + +This really just scratches the surface of things. Since triggers are highly flexible, deciding when to use them really just comes down to whenever you need something special done. To provide at least some guidance though, here's a comparison table I've included in past books: + +Restriction | Pros | Cons +---|---|--- +Constraints | Fast. | Must be redefined for each table. +| Can reference other columns. | Can't reference other tables. +| Happens before the command occurs. | Can't be bound to data types. +| ANSI compliant. | +Triggers | Ultimate flexibility. | Happens after the command occurs. +| Can reference other columns and other tables. | High overhead. +| Can even use .NET to reference information that is external to your SQL Server. | + +Note that this is deliberately non-specific. 
Every situation varies, so what I've tried to provide here is a set of guidelines about where each option succeeds or fails.

Some of you may have noticed that, when I included the preceding table, I did not include the option for Rules and Defaults as I have in previous editions. Why not? Well, because Rules and Defaults (the Default object, not the DEFAULT constraint) have been considered deprecated for several releases now, so I am gradually intensifying my presentation of the idea that they are there for backward compatibility only.

Dealing with Requirements Sourced from Other Tables

CHECK constraints are great—fast and efficient—but they don't do everything you'd like them to. Perhaps the biggest shortcoming shows up when you need to verify data across tables.

To illustrate this, let's take a look at the Products and SalesOrderDetail tables in AdventureWorks2008 as well as the related SpecialOfferProduct table. The relationship looks like Figure 12.2.

So, under normal DRI, you can be certain that no order line item can be entered into the SalesOrderDetail table unless there is a matching ProductID in the Products table (via the chain through the SpecialOfferProduct table). We are, however, looking for something more than just the "norm" here.

Figure 12.2

Our Inventory department has been complaining that our Customer Support people keep placing orders for products that are discontinued. They would like to have such orders rejected before they get into the system.

We can't deal with this using a CHECK constraint because the discontinued status lives in a separate table (the Products table) from the one on which we are placing the restriction (the SalesOrderDetail table). Don't sweat it though; you can tell the Inventory department, "No problem!" You just need to use a trigger:

USE AdventureWorks2008;
GO

CREATE TRIGGER OrderDetailNotDiscontinued
ON Sales.SalesOrderDetail
AFTER INSERT, UPDATE
AS
IF EXISTS
(
   SELECT 'True'
   FROM Inserted i
   JOIN Production.Product p
      ON i.ProductID = p.ProductID
   WHERE p.DiscontinuedDate IS NOT NULL
)
BEGIN
   RAISERROR('Order Item is discontinued. Transaction Failed.',16,1);
   ROLLBACK TRAN;
END

Let's go ahead and test our handiwork. First, we need at least one record that will fail when it hits our trigger. That means we need a discontinued item in the Products table; the problem is, there is no such record currently.

SELECT ProductID, Name
FROM Production.Product
WHERE DiscontinuedDate IS NOT NULL;

ProductID Name
----------- --------------------------------------------------

(0 row(s) affected)

So, we'll pick one and change it ourselves for test purposes:

UPDATE Production.Product
SET DiscontinuedDate = GETDATE()
WHERE ProductID = 680;

With that done, we're ready to see if our trigger works, so let's go ahead and add a line item that violates this constraint. I'm going to make use of a SalesOrderHeader that already exists, so we don't have to get overly elaborate building up a full order:

INSERT Sales.SalesOrderDetail
(
   SalesOrderID,
   OrderQty,
   ProductID,
   SpecialOfferID,
   UnitPrice,
   UnitPriceDiscount
)
VALUES
(
   43660,
   5,
   680,
   1,
   1431,
   0
);

This gets the rejection that we expect:

Msg 50000, Level 16, State 1, Procedure OrderDetailNotDiscontinued, Line 14
Order Item is discontinued. Transaction Failed.
Msg 3609, Level 16, State 1, Line 1
The transaction ended in the trigger. The batch has been aborted.

Remember that we could, if desired, also create a custom error message to raise, instead of the ad hoc message that we used with the RAISERROR command.

Using Triggers to Check the Delta of an Update

Sometimes, you're not interested as much in what the value was or is as you are in how much it changed. While there isn't any one column or table that gives you that information, you can calculate it by making use of both the Inserted and Deleted tables in your trigger.

A quick example of this might be to write audit records for security reasons. Let's say, for example, that you wanted to track every adjustment to inventory, regardless of what initiated it, for auditing purposes (for example, inventory adjustments might be made directly against inventory tables rather than via an order item).

To implement something like this, we need an audit table, plus a trigger that makes use of both the Inserted and Deleted tables:

USE AdventureWorks2008;

CREATE TABLE Production.InventoryAudit
(
   TransactionID int IDENTITY PRIMARY KEY,
   ProductID int NOT NULL
      REFERENCES Production.Product(ProductID),
   NetAdjustment smallint NOT NULL,
   ModifiedDate datetime DEFAULT(CURRENT_TIMESTAMP)
);
GO

CREATE TRIGGER ProductAudit
ON Production.ProductInventory
FOR INSERT, UPDATE, DELETE
AS
INSERT INTO Production.InventoryAudit
   (ProductID, NetAdjustment)
SELECT COALESCE(i.ProductID, d.ProductID),
   ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) AS NetAdjustment
FROM Inserted i
FULL JOIN Deleted d
   ON i.ProductID = d.ProductID
   AND i.LocationID = d.LocationID
WHERE ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) != 0;

Before we test this, let's analyze what we're doing here. I've started by adding an audit table to receive information about changes to our base table. From there, I've created a trigger that will fire on any change to the table and will write the net change out to our new audit table.
Now, let's check this out by running a test script:

PRINT 'The values before the change are:';

SELECT ProductID, LocationID, Quantity
FROM Production.ProductInventory
WHERE ProductID = 1
  AND LocationID = 50;

PRINT 'Now making the change';

UPDATE Production.ProductInventory
SET Quantity = Quantity + 7
WHERE ProductID = 1
  AND LocationID = 50;

UPDATE Production.ProductInventory
SET Quantity = Quantity - 7
WHERE ProductID = 1
  AND LocationID = 50;

PRINT 'The values after the change are:';

SELECT ProductID, LocationID, Quantity
FROM Production.ProductInventory
WHERE ProductID = 1
  AND LocationID = 50;

SELECT * FROM Production.InventoryAudit;

And we can use the before and after output to verify that our audit records were properly written:

The values before the change are:

ProductID LocationID Quantity
----------- ---------- --------
1 50 353

(1 row(s) affected)

Now making the change

(1 row(s) affected)

(1 row(s) affected)

(1 row(s) affected)

(1 row(s) affected)

The values after the change are:

ProductID LocationID Quantity
----------- ---------- --------
1 50 353

(1 row(s) affected)

TransactionID ProductID NetAdjustment ModifiedDate
------------- ----------- ------------- -----------------------
1 1 7 2008-12-15 22:29:11.900
2 1 -7 2008-12-15 22:29:11.900

(2 row(s) affected)

Using Triggers for Custom Error Messages

We've already touched on this in some of our other examples, but remember that triggers can be handy for retaining control over the error message or number that gets passed out to your user or client application.

With a CHECK constraint, for example, you're just going to get the standard 547 error along with its rather nondescript explanation. As often as not, this is less than helpful in terms of the user really figuring out what went wrong; indeed, your client application often doesn't have enough information to make an intelligent and helpful response on behalf of the user.

In short, sometimes you create triggers when there is already something that would give you the data integrity that you want but won't give you enough information to handle it.

Other Common Uses for Triggers

In addition to the straight data integrity uses, triggers have a number of other uses. Indeed, the possibilities are fairly limitless, but here are a few common examples:

 * Updating summary information
 * Feeding de-normalized tables for reporting
 * Setting condition flags

Updating Summary Information

Sometimes we like to keep aggregate information around to help with reporting or to speed performance when checking conditions.

Take, for instance, the example of a customer's credit limit versus their current balance. The limit is a fairly static thing and is easily stored with the rest of the customer information. The current balance is another matter. We can always figure out the current balance by running a query to total all of the unpaid balances for any orders the customer has, but think about that for a moment. Let's say that you work for Sears, and you do literally millions of transactions every year. Now think about how your table is going to have many millions of records for your query to sort through, and that you're going to be competing with many other transactions in order to run your query. Things would perform an awful lot better if we could just go to a single place to get that total—but how to maintain it?
We certainly could just make sure that we always use a stored procedure for adding and paying order records, and then have the sproc update the customer's current balance. But that would mean that we would have to be sure that every sproc with a potential effect on the customer's balance contains the update code. If just one sproc leaves it out, then we have a major problem, and figuring out which sproc is the offending one is a hassle at best, and problematic at worst. By using a trigger, however, the updating of the customer balance becomes pretty easy.

We could maintain virtually any aggregation we want to keep track of. Keep in mind, however, that every trigger that you add increases the amount of work that has to be done to complete your transactions. That means that you are placing an additional burden on your system and increasing the chances that you will run into deadlock problems.

Feeding Data into De-normalized Tables for Reporting

I'm going to start right off by saying this isn't the way you should do things in most circumstances. Usually, this kind of data transfer should be handled as part of a batch process run at night or during non-peak hours for your system—depending on the nature of what you are moving, replication may also be an excellent answer. We will be discussing replication in detail in Chapter 17.

That being said, sometimes you need the data in your reporting tables to be right up-to-the-minute. The only real ways to take care of this are to modify all your sprocs and other access points into your system so that they update the reporting tables at the same time as they update the Online Transaction Processing (OLTP) tables (YUCK!), or to use triggers to propagate any updates to records.

What's nice about using triggers to propagate the data is that you are always certain to be up-to-the-minute on what's happening in the OLTP tables. That being said, it defeats a large part of the purpose of keeping separate reporting tables. While keeping the data in a de-normalized format can greatly improve query performance, one of its main goals, in most installations, is to clear reporting needs out of the main OLTP database and minimize concurrency issues. If all your OLTP updates still have to update information in your reporting tables, then all you've done is move the database in which the actual deadlock or other concurrency issue happens. From the OLTP standpoint, you've added work without gaining any benefits.

The thing you have to weigh here is whether you're going to gain enough performance in your reporting to make it worth the damage you're going to do to performance on your OLTP system.

Setting Condition Flags

Condition flags are typically used much as aggregations are—you maintain a flag as changes are made rather than having to look for a certain condition across a complete table. Lookup flags are one of those little things that, while they usually break the rules of normalization (you're not supposed to store data that can be derived elsewhere), can really boost system performance substantially.

For the example on this topic, let's assume that we maintain a variety of information on the products that we sell. Material Safety Data Sheets (MSDS), information on suppliers—imagine there can be an unlimited number of different documents that all provide some sort of information on our products.
Now, further imagine that we have something more than the mere 504 products that are in the AdventureWorks2008 database (it's not at all uncommon for businesses to have 50,000 or more different line items in their catalog). The number of possible informational records could get extremely high.

We want to be able to put a flag on our Customer Support screens that tells the order taker whether there is any additional information available for this product. If we were living by the rules of a normalized database, we would have to look in the ProductDocument table to see if it had any records that matched up with our ProductID.

Rather than do those lookups, we can just place a bit field in our Products table that is a yes/no indicator of whether other information is available. We would then put a trigger on the ProductDocument table that updates the bit flag in the Products table. If a record is inserted into ProductDocument, then we set the bit flag to TRUE for the corresponding product. When a ProductDocument record is deleted, we look to see whether it was the last one, and, if so, set the bit flag in the Products table back to FALSE.

We'll go for an ultra-quick example. First, we need to set up by adding the bit flag field to the Product table:

ALTER TABLE Production.Product
ADD InformationFlag bit NOT NULL
   CONSTRAINT InformationFlagDefault
   DEFAULT 0 WITH VALUES;

Then we need to fix the data in the table to allow for documentation we already have:

UPDATE p
SET p.InformationFlag = 1
FROM Production.Product p
WHERE EXISTS
(
   SELECT 1
   FROM Production.ProductDocument pd
   WHERE pd.ProductID = p.ProductID
);

Then we're ready to add our trigger:

CREATE TRIGGER DocumentBelongsToProduct
ON Production.ProductDocument
FOR INSERT, DELETE
AS
DECLARE @Count int;

SELECT @Count = COUNT(*) FROM Inserted;
IF @Count > 0
BEGIN
   UPDATE p
   SET p.InformationFlag = 1
   FROM Inserted i
   JOIN Production.Product p
      ON i.ProductID = p.ProductID;
END

IF @@ERROR != 0
   ROLLBACK TRAN;

SELECT @Count = COUNT(*) FROM Deleted;
IF @Count > 0
BEGIN
   -- Clear the flag only for products that no longer have any documents
   UPDATE p
   SET p.InformationFlag = 0
   FROM Deleted d
   JOIN Production.Product p
      ON d.ProductID = p.ProductID
   WHERE NOT EXISTS
   (
      SELECT 1
      FROM Production.ProductDocument pd
      WHERE pd.ProductID = p.ProductID
   );
END

IF @@ERROR != 0
   ROLLBACK TRAN;

And we're ready to test:

SELECT ProductID, InformationFlag
FROM Production.Product p
WHERE p.ProductID = 1;

INSERT INTO Production.ProductDocument
   (ProductID, DocumentNode)
VALUES
   (1, 0x);

SELECT ProductID, InformationFlag
FROM Production.Product p
WHERE p.ProductID = 1;

This yields the proper update:

ProductID InformationFlag
----------- ---------------
1 0

(1 row(s) affected)

(1 row(s) affected)

(1 row(s) affected)

ProductID InformationFlag
----------- ---------------
1 1

(1 row(s) affected)

And the delete:

DELETE Production.ProductDocument
WHERE ProductID = 1
  AND DocumentNode = 0x;

SELECT ProductID, InformationFlag
FROM Production.Product p
WHERE p.ProductID = 1;

Again, this gets the proper update:

ProductID InformationFlag
----------- ---------------
1 0

(1 row(s) affected)

Now we can find out whether there's product documentation right in the very same query with which we grab the base information on the product. We won't incur the overhead of the query to the ProductDocument table unless there really is something out there for us to retrieve.
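For contrast, here's a sketch of what the normalized alternative would force the support screen to run on every refresh (the HasMoreInfo alias is hypothetical); this is exactly the per-lookup cost the flag avoids:

SELECT p.ProductID, p.Name,
   CASE WHEN EXISTS (SELECT 1
                     FROM Production.ProductDocument pd
                     WHERE pd.ProductID = p.ProductID)
        THEN 1 ELSE 0
   END AS HasMoreInfo
FROM Production.Product p
WHERE p.ProductID = 1;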
Other Trigger Issues

You have most of it now, but if you're thinking you are finished with triggers, then think again. As I indicated early in the chapter, triggers create an awful lot to think about. The sections that follow attempt to point out some of the biggest issues you need to consider, plus they provide some information on additional trigger features and possibilities.

Triggers Can Be Nested

A nested trigger is one that does not fire directly as a result of a statement that you issued but rather because of a statement that was issued by another trigger.

This can actually set off quite a chain of events—with one trigger causing another trigger to fire which, in turn, causes yet another trigger to fire, and so on. Just how deep the chain can go depends on:

 * Whether nested triggers are turned on for your system (this is a system-wide, not database-level, option; it is set using Management Studio or sp_configure, and defaults to on).
 * The built-in nesting limit—a chain of triggers can go at most 32 levels deep.
 * Whether a trigger has already been fired. A trigger can, by default, only be fired once per trigger transaction. Once fired, it will ignore any other calls as a result of activity that is part of the same trigger action. Once you move on to an entirely new statement (even within the same overall transaction), the process can start all over again.

In most circumstances, you actually want your triggers to nest (thus the default), but you need to think about what's going to happen if you get into a circle of triggers firing triggers. If it comes back around to the same table twice, then the trigger will not fire the second time, and something you think is important may not happen; for example, a data integrity violation may get through. It's also worth noting that, if you do a ROLLBACK anywhere in the nesting chain, then the entire chain is rolled back. In other words, the entire nested trigger chain behaves as a transaction.

Triggers Can Be Recursive

What is a recursive trigger? A trigger is said to be recursive when something the trigger does eventually causes that same trigger to be fired. This may happen directly (by an action query done to the table on which the trigger is set) or indirectly (through the nesting process).

Recursive triggers are rare. Indeed, by default, recursive triggers are turned off. Recursion is, however, a way of dealing with the situation just described, where you are nesting triggers and you want the update to happen the second time around. Recursion, unlike nesting, is a database-level option; it is set with the RECURSIVE_TRIGGERS option of ALTER DATABASE (or, in older releases, the sp_dboption system sproc).

The danger in recursive triggers is that you'll get into some form of unintended loop. As such, you'll need to make sure that you get some form of recursion check in place to stop the process if necessary.

Debugging Triggers

Debugging triggers is a hassle at best. Since you have something of a level of indirection (you write a statement that causes the trigger to fire, rather than explicitly firing it yourself), it always seems like you have to second-guess what's going on.

You can utilize the same debugger we utilized in Chapter 10—you just need to get tricky to do it. The trick? Create a block of code (stored procedure or batch) that will cause your trigger to fire, and then step into that block of code. You can then step your way right into the trigger, as the sketch that follows shows.
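A minimal sketch, assuming the ProductAudit trigger created earlier in this chapter is still in place:

BEGIN TRAN;

-- Step into this UPDATE in the debugger; from here you can step
-- into any UPDATE triggers on the table (ProductAudit, in our case).
UPDATE Production.ProductInventory
SET Quantity = Quantity   -- a no-op change still fires the trigger
WHERE ProductID = 1
  AND LocationID = 50;

ROLLBACK TRAN;   -- leave the data exactly as we found it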
When debugging with the built-in tool is a trial, use PRINT and SELECT statements to output your values from within the triggers. Beyond telling you what your variables are doing along the way, they can also tip you off to recursion and, in some cases, nesting problems.

Nesting issues can be one of the biggest gotchas of trigger design. It is not at all uncommon to see situations where you execute a command and wind up with unexpected results because you didn't realize how many other triggers were, in turn, going to be fired. What's more, if a nested trigger performs updates to the initiating table, the initiating trigger will not fire a second time—this can open data integrity holes in tables where you were certain your trigger was preventing them. The trigger probably has the right code for the first firing, but it doesn't even run the second time around in a nested situation.

You can also make use of SELECT @@NESTLEVEL to show just how deep into a nesting situation you've gotten.

Keep in mind, though, that PRINT and result set generating SELECT statements don't really have anywhere to send their data other than the screen (in Management Studio) or as an informational message (data access models). This is usually far more confusing than anything else. As such, I highly recommend removing these statements once you've finished debugging, and before you go to production release.

Triggers Don't Get in the Way of Architecture Changes

This is a classic good news/bad news story.

Using triggers is positively great in terms of making it easy to make architecture changes. Indeed, I often use triggers for referential integrity early in the development cycle (when I'm more likely to be making lots of changes to the design of the database) and then change to DRI late in the cycle when I'm close to production.

When you want to drop a table and re-create it using DRI, you must first drop all of the constraints before dropping the table. This can create quite a maze in terms of dropping multiple constraints, making your changes, and then adding the constraints again. It can be quite a wild ride trying to make sure that everything drops that is supposed to so that your changed scripts will run. Then it's just as wild a ride to make sure that you've got everything back on that needs to be. Triggers take care of all this because they don't care that anything has changed until they actually run.

There's the rub though—when they run. You see, it means that you may change architecture and break several triggers without even realizing that you've done it. It won't be until the first time that those triggers try to address the object(s) in question that you find the error of your ways. By that time, you may find difficulty in piecing together exactly what you did and why.

Both sides have their hassles; just keep the hassles in mind no matter which method you're employing.

Triggers Can Be Turned Off without Being Removed

Sometimes, just like with CHECK constraints, you want to turn off the integrity feature, so you can do something that will violate the constraint but still has a valid reason for happening (importation of data is probably the most common of these).

Another common reason for doing this is when you are performing some sort of bulk insert (importation again), but you are already 100 percent certain the data is valid. In this case, you may want to turn off the triggers to eliminate their overhead and speed up the insert process.
You can turn a trigger off and on by using an ALTER TABLE statement. The syntax looks like this:

ALTER TABLE <table name>
<ENABLE|DISABLE> TRIGGER <ALL | <trigger name>>
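For instance, to suspend the document flag trigger from earlier in the chapter around a bulk load (a sketch; substitute your own table and trigger names, or use ALL to hit every trigger on the table):

ALTER TABLE Production.ProductDocument
DISABLE TRIGGER DocumentBelongsToProduct;

-- ... perform the bulk import here ...

ALTER TABLE Production.ProductDocument
ENABLE TRIGGER DocumentBelongsToProduct;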
As you might expect, my biggest words of caution in this area are, "Don't forget to re-enable your triggers!"

One last thing: If you're turning them off to do some form of mass importation of data, I highly recommend that you kick out all your users and go to RESTRICTED_USER mode. This will make sure that no one sneaks in behind you while you have the triggers turned off.

Be sure to consider the ability to disable triggers when addressing security concerns. If you are counting on triggers to perform audits for you, but you are allowing the disabling of triggers (granted, they would have to have some degree of security already, but you still need to fully consider the possibilities), then you have a loophole in your auditing.

Trigger Firing Order

In long ago releases of SQL Server (7.0 and prior), we had no control over firing order. Indeed, you may recall me discussing how there was only one of any particular kind of trigger (INSERT, UPDATE, DELETE) prior to 7.0, so firing order was something of a moot point. Later releases of SQL Server provide a limited amount of control over which triggers go in what order. For any given table (not views, since firing order can only be specified for AFTER triggers and views accept only INSTEAD OF triggers), you can elect to have one (and only one) trigger fired first. Likewise, you may elect to have one (and only one) trigger fired last. All other triggers are considered to have no preference on firing order—that is, you have no guarantee in what order a trigger with a firing order of "none" will fire, other than that it will fire after the FIRST trigger (if there is one) is complete and before the LAST trigger (again, if there is one) begins (see Figure 12.3).

The creation of a trigger that is to be first or last works just the same as for any other trigger. You state the firing order preference after the trigger has already been created by using a special system stored procedure, sp_settriggerorder.

The syntax of sp_settriggerorder looks like this:

sp_settriggerorder [@triggername =] '<trigger name>',
   [@order =] '{FIRST|LAST|NONE}',
   [@stmttype =] '{INSERT|UPDATE|DELETE}'

There can be only one trigger that is considered to be "first" for any particular action (INSERT, UPDATE, or DELETE). Likewise, there can be only one "last" trigger for any particular action. Any number of triggers can be considered to be "none"—that is, the number of triggers that don't have a particular firing order is unlimited.

Figure 12.3

So, the question should be, "Why do I care what order they fire in?" Well, often you won't care at all. At other times, it can be important logic-wise or just a good performance idea. Let's consider what I mean in a bit more detail.

Controlling Firing Order for Logic Reasons

Why would you need to have one trigger fire before another? The most common reason would be that the first trigger lays some sort of foundation for, or otherwise validates, what will come afterward. Under SQL Server 6.5 and earlier, we didn't have to think about this kind of thing much—we were only allowed one trigger of any particular type (UPDATE, DELETE, or INSERT) for a given table. This meant that having one thing happen before another wasn't really a problem. Because you combined all logic into one trigger, you just put the first thing that needed to happen first in the code and the last part last (no real rocket science there at all).

Version 7.0 came along and made things both better and worse than they were before.
You were no longer forced to jam all of your logic into one trigger. This was really cool because it meant that you could physically separate parts of your trigger code that were logically different, which, in turn, both made the code much easier to manage and allowed one part of the code to be disabled (remember the ALTER TABLE...DISABLE TRIGGER trick from a few sections ago?) while other parts of the code continued to function. The downside was that if you went ahead and separated out your code that way, you lost the logical stepping order that the code had when it was in one trigger.

By gaining at least a rudimentary level of control over firing order, we now have something of the best of both worlds: We can logically separate our triggers but still maintain the necessary order of precedence on what piece of code runs first or last.

Controlling Firing Order for Performance Reasons

On the performance front, a FIRST trigger is the only one that really has any big thing going for it. If you have multiple triggers, but only one of them is likely to generate a rollback (for example, it may be enforcing a complex data integrity rule that a constraint can't handle), you would want to consider making such a trigger a FIRST trigger. This ensures that the most likely cause of a rollback is already complete before you invest any more activity in your transaction. The more you do before the rollback is detected, the more that will have to be rolled back. So, deal with the most likely source of a rollback before performing any additional activity.

INSTEAD OF Triggers

While it can work against tables, the primary purpose of an INSTEAD OF trigger is usually to allow updates to views in places where they were previously not possible.

Essentially, an INSTEAD OF trigger is a block of code we can use as something of an interceptor for anything that anyone tries to do to our table or view. We can either elect to go ahead and do whatever the user requests or, if we choose, we can go so far as to do something entirely different.

As with FOR/AFTER triggers, INSTEAD OF triggers come in three different flavors—INSERT, UPDATE, and DELETE. Unlike FOR/AFTER triggers, however, you can only have one trigger per table or view for each of the different flavors (one each for INSERT, UPDATE, and DELETE).

If we're going to explore these, we need to get some appropriate sample tables out there.
To that end, let's take the following four tables (you can change the script to use an existing database if you wish):

CREATE DATABASE OurInsteadOfTest;
GO

USE OurInsteadOfTest;

CREATE TABLE dbo.Customers
(
   CustomerID varchar(5) NOT NULL PRIMARY KEY,
   Name varchar(40) NOT NULL
);

CREATE TABLE dbo.Orders
(
   OrderID int IDENTITY NOT NULL PRIMARY KEY,
   CustomerID varchar(5) NOT NULL
      REFERENCES Customers(CustomerID),
   OrderDate datetime NOT NULL
);

CREATE TABLE dbo.Products
(
   ProductID int IDENTITY NOT NULL PRIMARY KEY,
   Name varchar(40) NOT NULL,
   UnitPrice money NOT NULL
);

CREATE TABLE dbo.OrderItems
(
   OrderID int NOT NULL
      REFERENCES dbo.Orders(OrderID),
   ProductID int NOT NULL
      REFERENCES dbo.Products(ProductID),
   UnitPrice money NOT NULL,
   Quantity int NOT NULL,
   CONSTRAINT PKOrderItem PRIMARY KEY CLUSTERED
      (OrderID, ProductID)
);

-- INSERT sample records
INSERT dbo.Customers
VALUES ('ABCDE', 'Bob''s Pretty Good Garage');

INSERT dbo.Orders
VALUES ('ABCDE', CURRENT_TIMESTAMP);

INSERT dbo.Products
VALUES ('Widget', 5.55),
       ('Thingamajig', 8.88);

INSERT dbo.OrderItems
VALUES (1, 1, 5.55, 3);

We will use these tables for all three of the upcoming examples of INSTEAD OF triggers.

INSTEAD OF INSERT Triggers

The INSTEAD OF INSERT trigger allows us to examine the data that is about to go into our table or view and decide what we want to do with it prior to the insert physically occurring. The typical use will be on a view—where manipulating the data before the actual physical insert is attempted can mean the difference between the insert succeeding or failing.

Let's look at an example by creating an updatable view—specifically, one that will accept INSERTs where, before INSTEAD OF INSERT triggers, we wouldn't have been able to do it.

In this case, we'll create a view that demonstrates the update problem and then look at how to fix it. Let's take the case of showing some order line items, but with fuller information about the products (be sure you're using the database you created the sample tables in):

USE OurInsteadOfTest;
GO

CREATE VIEW CustomerOrders_vw
WITH SCHEMABINDING
AS
SELECT o.OrderID,
   o.OrderDate,
   od.ProductID,
   p.Name,
   od.Quantity,
   od.UnitPrice
FROM dbo.Orders AS o
JOIN dbo.OrderItems AS od
   ON o.OrderID = od.OrderID
JOIN dbo.Products AS p
   ON od.ProductID = p.ProductID;

The view is not fully updatable in its current state. How would SQL Server know which data went to which table? Sure, one could make a case for a straight update statement working, but we don't have the primary key for every table here. Even worse, what if we wanted to do an insert (which, as it happens, we do)?

The answer is something that SQL Server can't give you by itself—you need to provide more instructions as to what you want to do in such complex situations. That's where INSTEAD OF triggers really shine.
Let's take a look at our example order:

SELECT *
FROM CustomerOrders_vw
WHERE OrderID = 1;

This gets us back the one row we used to prime our sample:

Bob's Pretty Good Garage...1...2006-04-13 05:14:22.780...1...Widget...3...5.55

Now, just to prove it doesn't work, let's try to INSERT a new order item:

INSERT INTO CustomerOrders_vw
(
   OrderID,
   OrderDate,
   ProductID,
   Quantity,
   UnitPrice
)
VALUES
(
   1,
   '1998-04-06',
   2,
   10,
   6.00
);

As expected, it doesn't work:

Server: Msg 4405, Level 16, State 1, Line 2
View or function 'CustomerOrders_vw' is not updatable because the modification affects multiple base tables.

It's time for us to take care of this with an INSTEAD OF trigger. What we need to do here is decide ahead of time what scenarios we want to handle (in this case, just the insert of new OrderItem records) and what we want to do about it.

We're going to treat any INSERT as an attempt to add a new order item. We're going to assume for this example that the customer already exists (if we wanted to get complex, we could break things up further) and that we have an OrderID available. Our trigger might look something like:

CREATE TRIGGER trCustomerOrderInsert ON CustomerOrders_vw
INSTEAD OF INSERT
AS
BEGIN
   -- Check to see whether the INSERT actually tried to feed us any rows.
   -- (A WHERE clause might have filtered everything out)
   IF (SELECT COUNT(*) FROM Inserted) > 0
   BEGIN
      INSERT INTO dbo.OrderItems
      SELECT i.OrderID,
         i.ProductID,
         i.UnitPrice,
         i.Quantity
      FROM Inserted AS i
      JOIN Orders AS o
         ON i.OrderID = o.OrderID;

      -- If we have records in Inserted, but no records could join to
      -- the orders table, then there must not be a matching order
      IF @@ROWCOUNT = 0
         RAISERROR('No matching Orders. Cannot perform insert',10,1);
   END
END

So, let's try that insert again:

INSERT INTO CustomerOrders_vw
(
   OrderID,
   OrderDate,
   ProductID,
   Quantity,
   UnitPrice
)
VALUES
(
   1,
   '1998-04-06',
   2,
   10,
   6.00
);

We've explicitly addressed what table we're going to insert into, and so SQL Server is happy. We could easily extend this to address non-nullable columns that don't participate in the view if we needed to. (The customer can't provide values to those columns because they are not in the view the customer is using.)

INSTEAD OF UPDATE Triggers

We've now seen how INSERT statements against views can lead to ambiguous situations and also how to fix them with an INSTEAD OF INSERT trigger—but what about updates?

Even on the update side of things our statements can become ambiguous; if we update the ProductName in CustomerOrders_vw, does that mean we want to change the actual name on the product or does it mean that we want to change what product this line item is selling? The answer, of course, is that it depends on the situation. For one system, changing the ProductName might be the correct answer. For another system, changing the product sold might be the thing.

Much like INSTEAD OF INSERT triggers, INSTEAD OF UPDATE triggers give us the chance to trap what is coming in and address it explicitly. In our ProductName example, we could have chosen to do it either way. By default, SQL Server would update the name in the Products table. We could, however, use an INSTEAD OF UPDATE trigger to trap it and explicitly look up the ProductName to find the ProductID if that is what the user intended.
From there, we could generate an error if the provided ProductID did not match the one that went with the name.

INSTEAD OF DELETE Triggers

Okay, this is the last of our INSTEAD OF triggers and, most likely, the one that you'll run into the least often. As with the other two INSTEAD OF trigger types, these are used almost exclusively to allow views to delete data in one or more underlying tables.

So, continuing with our CustomerOrders_vw example, we'll add some delete functionality. This time, however, we're going to raise the complexity bar a bit. We want to delete all the rows for a given order, but if deleting those rows means that the order has no detail items left, then we also want to delete the order header.

We know from our last section (assuming you've been playing along) that we have two rows in Order 1 (the one we seeded when we built the table and the one we inserted in the INSTEAD OF INSERT example) but, before we start trying to delete things, let's build our trigger:

CREATE TRIGGER trCustomerOrderDelete ON CustomerOrders_vw
INSTEAD OF DELETE
AS
BEGIN
   -- Check to see whether the DELETE actually tried to feed us any rows
   -- (A WHERE clause might have filtered everything out)
   IF (SELECT COUNT(*) FROM Deleted) > 0
   BEGIN
      DELETE oi
      FROM dbo.OrderItems AS oi
      JOIN Deleted AS d
         ON d.OrderID = oi.OrderID
         AND d.ProductID = oi.ProductID;

      -- If no detail rows remain for the order, remove the header too
      DELETE Orders
      FROM Orders AS o
      JOIN Deleted AS d
         ON o.OrderID = d.OrderID
      LEFT JOIN OrderItems AS oi
         ON oi.OrderID = d.OrderID
      WHERE oi.OrderID IS NULL;
   END
END

And now we're ready to test. We'll start off by deleting just a single row from our CustomerOrders_vw view:

DELETE CustomerOrders_vw
WHERE OrderID = 1
  AND ProductID = 2;

We're ready to run our select again:

SELECT ProductID, UnitPrice, Quantity
FROM CustomerOrders_vw
WHERE OrderID = 1;

Sure enough, the row that we first inserted in our INSTEAD OF INSERT section is now gone:

ProductID UnitPrice Quantity
----------- --------------------- -----------
1 5.55 3

(1 row(s) affected)

So, our deleting of individual detail lines is working just fine. Now let's get a bit more cavalier and delete the entire order:

DELETE CustomerOrders_vw
WHERE OrderID = 1;

To really check that this worked okay, we need to go all the way to our Orders table:

SELECT * FROM Orders WHERE OrderID = 1;

Sure enough—the order has been removed.

While we don't have to think about individual columns with INSTEAD OF DELETE triggers (you delete by row, not by column), we do need to be aware of what referential integrity actions exist on any table (not view) for which we are defining an INSTEAD OF DELETE trigger. Just like INSTEAD OF UPDATE triggers, INSTEAD OF DELETE triggers are not allowed on tables that have referential integrity actions.

IF UPDATE() and COLUMNS_UPDATED()

In an UPDATE trigger, we can often limit the amount of code that actually executes within the trigger by checking to see whether the column(s) we are interested in are the ones that have been changed. To do this, we make use of the UPDATE() or COLUMNS_UPDATED() functions. Let's look at each.

The UPDATE() Function

The UPDATE() function has relevance only within the scope of a trigger. Its sole purpose in life is to provide a Boolean response (true/false) to whether a particular column has been updated or not.
You can use this function to decide whether or not a particular block of code needs to run—for example, if that code is only relevant when a particular column is updated.

Let's run a quick example of this by modifying one of our earlier triggers:

USE AdventureWorks2008;
GO

ALTER TRIGGER Production.ProductAudit
ON Production.ProductInventory
FOR INSERT, UPDATE, DELETE
AS
IF UPDATE(Quantity)
BEGIN
   INSERT INTO Production.InventoryAudit
      (ProductID, NetAdjustment)
   SELECT COALESCE(i.ProductID, d.ProductID),
      ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) AS NetAdjustment
   FROM Inserted i
   FULL JOIN Deleted d
      ON i.ProductID = d.ProductID
      AND i.LocationID = d.LocationID
   WHERE ISNULL(i.Quantity, 0) - ISNULL(d.Quantity, 0) != 0;
END

With this change, we will now limit the rest of the code to run only when the Quantity column (the one we care about) has been changed. The user can change the value of any other column, and we don't care. This means that we'll be executing fewer lines of code and, therefore, this trigger will perform slightly better than our previous version.

The COLUMNS_UPDATED() Function

This one works somewhat differently from UPDATE() but has the same general purpose. What COLUMNS_UPDATED() gives us is the ability to check multiple columns at one time. In order to do this, the function uses a bit mask that relates individual bits in one or more bytes of varbinary data to individual columns in the table. It ends up looking something like Figure 12.4.

In this case, our single byte of data is telling us that the second, third, and sixth columns were updated—the rest were not.

In the event that there are more than eight columns, SQL Server just adds another byte on the right-hand side and keeps counting (see Figure 12.5).

Figure 12.4

Figure 12.5

This time the second, ninth, and fourteenth columns were updated.

I can hear you out there: "Gee, that's nice—but how do I make any use of this?" Well, to answer that, we have to get into the world of Boolean algebra.

Making use of this information means that you need to add up the binary value of each bit you care about, remembering that the leftmost bit shown is the least significant. So, if you want your comparison to take into account columns 2, 5, and 7, then you need to add the binary values of those bits: 2 + 16 + 64. Then you compare the sum of the binary values of your columns to the bit mask by using bitwise operators:

 * | Represents bitwise OR
 * & Represents bitwise AND
 * ^ Represents bitwise Exclusive OR

As I read back over what I've just written, I realize that it is correct, but about as clear as mud, so let's look a little closer at what I mean with a couple of examples.

Imagine that we updated a table that contained five columns. If we updated the first, third, and fifth columns, the bit mask used by COLUMNS_UPDATED() would contain 10101000, which equates to 1 + 4 + 16 = 21.
We could use:

 * COLUMNS_UPDATED() > 0 to find out whether any column was updated
 * COLUMNS_UPDATED() ^ 21 = 0 to find out whether all of the columns specified (in this case 1, 3, and 5) were updated and nothing else was
 * COLUMNS_UPDATED() & 21 = 21 to find out whether all of the columns specified were updated, but the state of other columns doesn't matter
 * COLUMNS_UPDATED() | 21 != 21 to find out whether any column other than those we're interested in was updated

Understand that this is tough stuff—Boolean math is not exactly the easiest of concepts to grasp for most people, so check things carefully and TEST, TEST, TEST!

Performance Considerations

I've seen what appear to be almost holy wars over the pros and cons, evil and good, and light and dark of triggers. The worst of it tends to come from purists—people who love the theory and that's all they want to deal with, or people who have figured out how flexible triggers are and want to use them for seemingly everything.

My two bits' worth on this is, as I stated early in the chapter, use them when they are the right things to use. If that sounds sort of noncommittal and ambiguous—good! Programming is rarely black and white, and databases are almost never that way. I will, however, point out some facts for you to think about.

Triggers Are Reactive Rather Than Proactive

What I mean here is that triggers happen after the fact. By the time that your trigger fires, the entire query has run and your transaction has been logged (but not committed, and only to the point of the statement that fired your trigger). This means that, if the trigger needs to roll things back, it has to undo what is potentially a ton of work that's already been done. Slow! Keep this knowledge in balance, though. How big an impact this adds up to depends strongly on how big your query is.

"So what?" you say. Well, compare this to the notion of constraints, which are proactive—that is, they happen before your statement is really executed. That means that they prevent things that would eventually fail from happening before the majority of the work has been done. This will usually mean that they will run at least slightly faster—much faster on more complex queries. Note that this extra speed really only shows itself to any significant extent when a rollback occurs.

What's the end analysis here? Well, if you're dealing with very few rollbacks, and/or the complexity and runtime of the statements affected are low, then there probably isn't much of a difference between triggers and constraints. There's some, but probably not much. If, however, the number of rollbacks is unpredictable or if you know it's going to be high, you'll want to stick with constraints if you can (and frankly, I suggest sticking with constraints unless you have a very specific reason not to).

Triggers Don't Have Concurrency Issues with the Process That Fires Them

You may have noticed throughout this chapter that we often make use of the ROLLBACK statement, even though we don't issue a BEGIN TRAN. That's because a trigger is always implicitly part of the same transaction as the statement that caused the trigger to fire.

If the firing statement was not part of an explicit transaction (one where there was a BEGIN TRAN), then it would still be part of its own one-statement transaction. In either case, a ROLLBACK TRAN issued inside the trigger will still roll back the entire transaction.
Another upshot of this part-of-the-same-transaction business is that triggers inherit the locks already open on the transaction they are part of. This means that we don't have to do anything special to make sure that we don't bump into the locks created by the other statements in the transaction. We have free access within the scope of the transaction, and we see the database based on the modifications already placed by previous statements within the transaction.

Keep It Short and Sweet

I feel like I'm stating the obvious here, but it's for a good reason.

I can't tell you how often I see bloated, stupid code in sprocs and triggers. I don't know whether it's that people get in a hurry, or if they just think that the medium they are using is fast anyway, so it won't matter.

Remember that a trigger is part of the same transaction as the statement in which it is called. This means the statement is not complete until your trigger is complete. Think about it—if you write long-running code in your trigger, then every piece of code you create that causes that trigger to fire will, in turn, be long running. This can really cause heartache in terms of trying to figure out why your code is taking so long to run. You write what appears to be a very efficient sproc, but it performs terribly. You may spend weeks and yet never figure out that your sproc is fine—it just fires a trigger that isn't.

Don't Forget Triggers When Choosing Indexes

Another common mistake: You look through all your sprocs and views figuring out what the best mix of indexes is—and totally forget that you have significant code running in your triggers.

This is the same notion as the "Short and Sweet" section—long-running queries make for long-running statements which, in turn, lead to long-running everything. Don't forget your triggers when you optimize!

Try Not to Roll Back within Triggers

This one's hard, since rollbacks are so often a major part of what you want to accomplish with your triggers.

Just remember that AFTER triggers (which are far and away the most common type of trigger) happen after most of the work is already done—that means a rollback is expensive. This is where DRI picks up almost all of its performance advantage. If you are using many ROLLBACK TRAN statements in your triggers, then make sure that you pre-process looking for errors before you execute the statement that fires the trigger. That is, because SQL Server can't be proactive in this situation, be proactive for it. Test for errors beforehand rather than waiting for the rollback.

Dropping Triggers

Dropping triggers is as easy as it has been for almost everything else thus far:

DROP TRIGGER <trigger name>;

And it's gone.

Summary

Triggers are an extremely powerful tool that can add tremendous flexibility to both your data integrity and the overall operation of your system. That being said, they are not something to take lightly. Triggers can greatly enhance the performance of your system if you use them for proper summarization of data, but they can also be the bane of your existence. They can be very difficult to debug (even now that we have the debugger), and a poorly written trigger affects not only the trigger itself but any statement that causes that trigger to fire.

13

SQL Cursors

Throughout this book thus far, we've been dealing with data in sets. This tends to go against the way that the more procedure-driven languages go about things.
Indeed, when the data gets to the client end, client applications almost always have to take our set and then deal with it row by row. What they are dealing with is a cursor. Indeed, even in traditional SQL Server tools, we can wind up in something of a cursor mode if we utilize a non-SQL-oriented language in our scripts using the new CLR-based language support.

In this chapter, we will be looking at:

 * What a cursor is
 * The life span of a cursor
 * Cursor types (sensitivity and scrollability)
 * Uses for cursors

We'll discover that there's a lot to think about when creating cursors.

Perhaps the biggest thing to think about when creating cursors is, "Is there a way I can get out of doing this?" If you ask yourself that question every time you're about to create a cursor, then you will be on the road to a better performing system. That being said, we shall see that there are times when nothing else will do.

What Is a Cursor?

Cursors are a way of taking a set of data and being able to interact with a single record at a time. It doesn't happen nearly as often as one tends to think, but there are indeed times when you just can't obtain the results you want by modifying or even selecting the data in an entire set. The set is generated by something all of the rows have in common (as defined by a SELECT statement), but then you need to deal with those rows on a one-by-one basis.

The result set that you place in a cursor has several distinct features that set it apart from a normal SELECT statement:

 * You declare the cursor separately from actually executing it.
 * The cursor and, therefore, its result set are named at declaration; you then refer to it by name.
 * The result set in a cursor, once opened, stays open until you close it.
 * Cursors have a special set of commands used to navigate the recordset.

While SQL Server has its own engine to deal with cursors, there are actually a few different object libraries that can also create cursors in SQL Server:

 * SQL Native Client (used by ADO.NET)
 * OLE DB (used by ADO)
 * ODBC (used by RDO, DAO, and in some cases, OLE DB/ADO)
 * JDBC (used by Java)
 * DB-Lib (now a distant legacy offering, but still used in some older apps)

These are the libraries that client applications will typically use to access individual records. Each provides its own syntax for navigating the recordset and otherwise managing the cursor. Each, however, shares the same set of basic concepts, so once you have one object model down for cursors, you're most of the way there for all of them.

Every data access API out there (ADO.NET, ADO, ODBC, OLE DB, JDBC, and so on) returns data to a client application or component in a cursor. It's simply the only way that non-SQL programming languages can currently deal with things. This is the source of a big difference between this kind of cursor and SQL Server cursors. With SQL Server cursors, you usually have a choice to perform things as a set operation, which is what SQL Server was designed to do. With the API-based cursors, all you have is cursors, so you don't have the same cursor versus no cursor debate that you have in your server-side activities.

The client-side part of your data handling is going to be done using cursors. That's a given, so don't worry about it. Instead, worry about making the server side of your data access as efficient as possible; that means not using cursors on the server side if you can possibly help it.
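To make that advice concrete: anything you find yourself wanting to do one row at a time should first be attempted as a single set operation. A hypothetical 10 percent price rise on a hypothetical dbo.Products table, for example, needs no cursor at all:

-- Set-based: one statement, one pass, and the optimizer does the work
UPDATE dbo.Products
SET UnitPrice = UnitPrice * 1.10
WHERE UnitPrice < 10.00;

The cursor equivalent would be twenty-odd lines of DECLARE, OPEN, FETCH, CLOSE, and DEALLOCATE, and it would touch the table one row at a time.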
The Life Span of a Cursor

Cursors have lots of little pieces to them, but I think that it's best if we get right into looking first at the most basic form of cursor and then build up from there.

Before we get into the actual syntax, though, we need to understand that using a cursor requires more than one statement. Indeed, it takes several. The main parts include:

 * The declaration
 * Opening
 * Utilizing/navigating
 * Closing
 * Deallocating

That being said, the basic syntax for declaring a cursor looks like this:

DECLARE <cursor name> CURSOR
FOR <select statement>
[FOR UPDATE [OF <column name> [,...n]]][;]

Or for better ANSI/ISO support:

DECLARE <cursor name> [INSENSITIVE] [SCROLL] CURSOR
FOR <select statement>
[FOR {READ ONLY | UPDATE [OF <column name> [,...n]]}][;]

The <select statement> section of the cursor declaration is at the very heart of the matter. This is a section that is required under even the most basic of cursor syntax, and that's because it's the one and only clause that determines what data should be placed in the cursor.

Almost any SELECT statement is valid—even those including an ORDER BY clause. As long as your SELECT statement provides a single result set, you should be fine. Examples of options that would create problems would be any of the summary options such as a CUBE or ROLLUP.

FOR UPDATE

By default, any cursor that is updatable at all is completely updatable—that is, if one column can be edited, then any of them can.

The FOR UPDATE option allows you to specify that only certain columns are to be editable within this cursor. If you include this option, then only the columns in your column list will be updatable. Any columns not explicitly mentioned will be considered read-only.

Navigating the Cursor: The FETCH Statement

I figure that whoever first created the SQL cursor syntax must have really liked dogs. They probably decided to think of the data they were after as being the bone, with SQL Server the faithful bloodhound. From this, I'm guessing, the FETCH keyword was born.

It's an apt term if you think about it. In a nutshell, it tells SQL Server to "go get it, boy!" With that, our faithful mutt (in the form of SQL Server) is off to find the particular bone (row) we were after. We've gotten a bit of a taste of the FETCH statement in some of the previous cursors in this chapter, but it's time to look at this very important statement more closely.

FETCH actually has many more options than what we've seen so far. Up to this point, we've seen three different options for FETCH (NEXT, PRIOR, and FIRST). These really aren't a bad start. Indeed, we really only need to add one more for the most basic set of cursor navigation commands, and a few after that for the complete set.

Let's look at each of the cursor navigation commands and see what they do for us:

FETCH Option | Description
---|---
NEXT | This moves you forward exactly one row in the result set and is the backbone option. Ninety percent or more of your cursors won't need any more than this. Keep this in mind when deciding whether to declare as FORWARD_ONLY or not. When you try to do a FETCH NEXT and it results in moving beyond the last record, you will have a @@FETCH_STATUS of −1.
PRIOR | As you have probably surmised, this one is the functional opposite of NEXT. This moves backward exactly one row. If you perform a FETCH PRIOR when you are at the first row in the result set, then you will get a @@FETCH_STATUS of −1 just as if you had moved beyond the end of the file.
FIRST | Like most cursor options, this one says what it is pretty clearly. If you perform a FETCH FIRST, then you will be at the first record in the recordset. The only time this option should generate a @@FETCH_STATUS of −1 is if the result set is empty.
If you perform a FETCH FIRST, then you will be at the first record in the recordset. The only time this option should generate a @@FETCH_STATUS of −1 is if the result set is empty.
LAST | The functional opposite of FIRST, FETCH LAST moves you to the last record in the result set. Again, the only way you'll get a −1 for @@FETCH_STATUS on this one is if you have an empty result set.
ABSOLUTE | With this one, you supply an integer value that indicates how many rows you want from the beginning of the cursor. If the value supplied is negative, then it is that many rows from the end of the cursor. Note that this option is not supported with dynamic cursors (since the membership in the cursor is redone with every fetch, you can't really know where you're at). This equates roughly to navigating to a specific "absolute position" in a few of the client access object models.
RELATIVE | No—this isn't your mother-in-law kind of thing. Instead, this is about navigating by moving a specified number of rows forward or backward relative to the current row.
+
We've already gotten a fair look at a few of these in our previous cursors. The other navigational choices work pretty much the same.
+
Altering Data within Your Cursor
+
Up until now, we've kind of glossed over the notion of changing data directly in the cursor. Now it's time to take a look at updating and deleting records within a cursor.
+
Since we're dealing with a specific row rather than set data, we need some special syntax to tell SQL Server that we want to update. Happily, this syntax is quite easy given that you already know how to perform an UPDATE or DELETE.
+
Essentially, we're going to update or delete data in the table that underlies our cursor. Doing this is as simple as running the same UPDATE and DELETE statements that we're now used to, but qualifying them with a WHERE clause that matches our cursor row. We just add one line of syntax to our DELETE or UPDATE statement:
+
WHERE CURRENT OF <cursor name>
+
Nothing remarkable about it at all. Just for grins though, we'll go ahead and implement a cursor using this syntax:
+
USE AdventureWorks2008;
+
/* Build the table that we'll be playing with this time */
+
SELECT SalesOrderID, CustomerID
+
INTO CursorTable
+
FROM Sales.SalesOrderHeader
+
WHERE SalesOrderID BETWEEN 43661 AND 43665;
+
-- Now create a unique index on it in the form of a primary key
+
ALTER TABLE CursorTable
+
ADD CONSTRAINT PKCursor
+
PRIMARY KEY (SalesOrderID);
+
/* The IDENTITY property was automatically brought over when
+
** we did our SELECT INTO, but I want to use my own OrderID
+
** value, so I'm going to turn IDENTITY_INSERT on so that I
+
** can override the identity value.
+
*/
+
SET IDENTITY_INSERT CursorTable ON;
+
-- Declare our cursor
+
DECLARE CursorTest CURSOR
+
SCROLL -- So we can scroll back and see if the changes are there
+
KEYSET
+
FOR
+
SELECT SalesOrderID, CustomerID
+
FROM CursorTable;
+
-- Declare our two holding variables
+
DECLARE @SalesOrderID int;
+
DECLARE @CustomerID varchar(5);
+
-- Get the cursor open and the first record fetched
+
OPEN CursorTest;
+
FETCH NEXT FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
-- Now loop through them all
+
WHILE @@FETCH_STATUS = 0
+
BEGIN
+
IF (@SalesOrderID % 2 = 0) -- Even number, so we'll update it
+
BEGIN
+
-- Make a change. This time though, we'll do it using cursor syntax
+
UPDATE CursorTable
+
SET CustomerID = -99999
+
WHERE CURRENT OF CursorTest;
+
END
+
ELSE -- Must be odd, so we'll delete it.
+
BEGIN
+
-- Now we'll delete a record so we can see how to deal with that
+
DELETE CursorTable
+
WHERE CURRENT OF CursorTest;
+
END
+
FETCH NEXT FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
END
+
-- Now go back to the top. We can do this since we have a scrollable cursor
+
FETCH FIRST FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
-- And loop through again.
+
WHILE @@FETCH_STATUS != -1
+
BEGIN
+
IF @@FETCH_STATUS = -2
+
BEGIN
+
PRINT ' MISSING! It probably was deleted.';
+
END
+
ELSE
+
BEGIN
+
PRINT CAST(@SalesOrderID AS varchar) + ' ' + CAST(@CustomerID AS varchar);
+
END
+
FETCH NEXT FROM CursorTest INTO @SalesOrderID, @CustomerID;
+
END
+
-- Now it's time to clean up after ourselves
+
CLOSE CursorTest;
+
DEALLOCATE CursorTest;
+
DROP TABLE CursorTable;
+
Again, I'm treating this one as an entirely new cursor. We've done enough deletions, additions, and updates that I suspect you'll find it easier to just key things in a second time rather than having to look through row by row to see what you might have missed.
+
We are also again using the modulus operator (%) that we saw earlier in the book. Remember that it gives us nothing but the remainder. Therefore, if the remainder of any number divided by 2 is zero, then we know the number was an even number.
+
The rest of the nuts and bolts of this don't require any rocket science, yet we can quickly tell that we got some results:
+
(5 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
(1 row(s) affected)
+
MISSING! It probably was deleted.
+
43662 *
+
MISSING! It probably was deleted.
+
43664 *
+
MISSING! It probably was deleted.
+
You can see the multiple "1 row affected" messages that are returned for every row affected by the UPDATE and DELETE statements. When we get down to the last result set enumeration, you can quickly tell that we deleted all the odd numbers (which is what we told our code to do), and that we updated the even-numbered rows with a new CustomerID. (The asterisks appear because −99999 won't fit in our varchar(5) holding variable; when SQL Server converts an int to a character type too small to hold it, it returns an asterisk rather than the value.)
+
No tricks—just a WHERE clause that makes use of the WHERE CURRENT OF syntax.
+
Summary
+
Cursors give us those memories of the old days when we could address things row by row. Ahhh, it sounds so romantic with that "old days" kind of thought. WRONG! I'd stick to set operations any day if I thought I could get away with it.
+
The fact is that set operations can't do everything. Cursors are going to be the answer any time a solution must be done on a row-by-row basis. Notice that I used the word "must" in there, and that's the way you should think of it. Cursors are great for taking care of some problems that can't be solved by any other means.
+
That being said, remember to avoid cursor use wherever possible. Cursors are resource pigs, and a cursor-based solution will often carry a performance penalty of 100 times or worse versus the equivalent set operation. It is extremely tempting—especially if you come from the mainframe world or from a dBase background—to just keep thinking in that row-by-row method. Don't fall into that trap! Cursors are meant to be used only when no other options are available.
+
14
+
Reporting Services
+
There are a few chapters in my books where I've chosen to overlap content between the Beginning and Professional titles.
Now, it may seem like beginning and professional topics would be mutually exclusive, but that holds true only in a perfect world where everyone is gaining experience in the same way and in the same order, and where everyone has the same definition of beginning and professional. + +In case you haven't already guessed it, this is one of those chapters where, if you've read my Beginning title, you're going to notice a little bit of overlap. In the case of Reporting Services, the reasons are multifold, but a couple of the key ones are: + + * Some people get into database development specifically driven by the need to control more of their own reporting destiny (in which case they may have almost started with Reporting Services, and then started learning the queries they need to support the data in the report). Others are long-term database "experts" who are just getting around to using one of those "extras" that SQL Server provides. + * It's a relatively new feature (in the grand life of SQL Server as a product), so it's "new" to many professional-level people. + +Now, don't go rushing off yet if you read the chapter on Reporting in the Beginning title. While we do repeat some key items, we go a bit deeper here, and focus on more of the true developer-oriented items (and less on the model-driven aspects). Feel free, however, to skip ahead to the section on the data sources and data source views, where we will take a far more "Pro" look at things including parameterization, drill-throughs, and charting. + +A Quick Look at Reports as a Concept + +After all the queries have been written, and after all the stored procedures have been run, there remains a rather important thing we need to do in order to make our data useful—make it available to end users. + +Reporting is one of those things that seems incredibly simple, but turns out to be rather tricky. You see, you can't simply start sticking numbers in front of people's faces. The numbers must make sense and, if at all possible, capture the attention of the person you're reporting for. To produce reports that actually get used and, therefore, are useful, there are a few things to keep in mind: + + * Use Just the Right Amount of Data: Do not try to do too much in one report; nor should you do too little. A report that is a jumble of numbers is going to lose a reader's attention quickly, and you'll find that it doesn't get utilized after the first few times it is generated. Likewise, a barren report will get just a glance and get tossed without any real thought. Find a balance of mixing the right amount of data with the right data. + * Make it Appealing: Sad as it is to say, another important element in reporting is what one of my daughters would call making it "prettiful," which is to say, making it look nice and pleasing to the eye. An ugly report is a dead report. + +In this chapter, we're going to be taking a look at a few key concepts of Reporting Services (often referred to as SSRS), and then moving on to some more advanced aspects. While I do indeed skip some of the "basics," I cover some fundamental items necessary to make any sense out of the more advanced topics, but then quickly move on to the Report Designer, which allows for the most advanced reporting options Reporting Services has to offer. + +For the sake of brevity (and to minimize overlap), I cover report models in this book only with a discussion of what they are there for, not with a specific example. 
This is one of the places where I draw the line between Beginning- and Pro-level information. That said, even if you did not already understand report models before reading this chapter, you'll find that learning about core items such as data sources and the Report Designer will make learning how to use the Report Modeler and the Report Model designer largely intuitive. The building of actual reports will be similarly easy. + +Reporting Services 101 + +Odds are that you've already generated some reports in your day. They may have been paper reports off a printer (perhaps in something as rudimentary as Access's reporting area, which is actually one of the best parts of Access to me). Or perhaps you have used a rather robust reporting engine such as Crystal Reports. Even if you haven't used tools that fancy, one can argue that handing your boss the printout from a stored procedure is essentially a very simple (albeit not necessarily nice-looking) report. I would tend to agree with that argument. + +The reality, however, is that our managers and coworkers today expect something more. This is where Reporting Services comes in. Reporting Services really has two different varieties of operation: + + * Report Models: This is making use of a relatively simple, Web-driven interface that is meant to allow end users to create their own simple reports. + * Reports Generated in the Business Intelligence Development Studio: While this doesn't necessarily mean you have to write code (you can actually create some fairly robust reports using drag-and-drop functionality), you can get pretty fancy and do very complex things depending on just how far you want to take it. + +Note that, while your users can eventually access these reports from the same Reporting Services Web host, they are based on somewhat different architectures (and are created in different fashions). + +In addition, Reporting Services provides features for pre-generating reports (handy if the queries that underlie the report take a while to run) as well as for distributing the report via e-mail. Exported reports can be rendered in PDF, Excel, and Word formats. + +Tools Used with Reporting Services + +Reporting Services has several tools to help you create, use, and manage reports. These include: + + * Reporting Services Configuration Manager: This tool can be found in the Configuration Tools subfolder under the main SQL Server. This allows you to configure such things as the account Reporting Services runs under, the IP addresses and ports the supporting Web server will respond to, the virtual directory names used for Reporting Services, e-mail accounts to be used, and the database used to keep track of Reporting Services information, as well as encryption keys and scalability configuration information. + * Business Intelligence Development Studio (BIDS): This is essentially Visual Studio with a set of templates installed that focus on Reporting Services, Analytics, Integration Services, and Data Mining. If you already have Visual Studio 2008 installed, BIDS just adds some more templates and shortcuts to get to Visual Studio. We will be utilizing the Development Studio extensively over several of the remaining chapters of this book (sometimes in its base SQL Server installed form, and sometimes as part of a full Visual Studio installation). + * SQL Server Management Studio: In the Management Studio, you can connect to virtually all of the different SQL Server–related services in order to (can you see this one coming? 
Of course you can!), manage things about that particular service. While only the base data engine has what I would consider "full functionality" entirely wrapped up in the Management Studio, the Studio is the place to perform most security-related tasks as well as anything tied to job scheduling.
 * The Report Server Website: This is where you go to actually run most of the reports you'll want executed in Reporting Services but, through the Site Settings link (in the upper-right side of the browser), it is also a place to manage some elements of your server (in particular, caching, assigning roles, and scheduling).
+
Unfortunately, no individual tool does everything involved in Reporting Services. Indeed, none of them even comes close to covering everything in the way that Management Studio does for the database engine. But by utilizing a combination of the various tools, we're able to manage all the aspects of our Report Server.
+
Other Means of Accessing Reporting Services
+
Reporting Services also supports a fairly robust Web service model. There is a set of libraries provided to support .NET projects accessing the Reporting Services Web Service API. We will take a look at the basics of that toward the end of the chapter.
+
Report Server Projects
+
Report Models (the primary discussion of Reporting Services in my Beginning title) can be considered "scratching the surface" of things. Reporting Services has much more flexibility than that. (Indeed, there are entire books solely about Reporting Services; there is that much to it.) In addition to the Report Modeler, the Business Intelligence Development Studio will allow you to create Report Server Projects.
+
As I said earlier, there are entire books about this subject, so the approach we're going to take here is to start with a little taste of the possibilities through a simple example. We'll then expand on things a bit.
+
A lot has changed with the look and feel of Report Server Projects for this release. Microsoft licensed a number of the Reporting Services components from Dundas (a component development company). These are a significant upgrade in the componentry for Reporting Services.
+
In our journey to look at Report Server Projects, we'll start with several core items that are common to both the Report Modeler and Report Server Projects. If you are already familiar with data sources and data source views, you can just scan the next two sections to pick up the relevant parts of the project example and then skip ahead to the section where we discuss the actual report layout.
+
So, let's get started with a Report Server Project. Start by opening the Business Intelligence Development Studio and creating a new project. You'll want to use the Report Server Project template found under the Business Intelligence Projects node, as shown in Figure 14.1.
+
Note that the exact appearance of this dialog may vary somewhat depending on whether you have Visual Studio installed and, if so, which specific languages and templates you've installed. The image shown is of a full version of Visual Studio, as it is required for some of the more advanced topics of this book.
+
Figure 14.1
+
This will serve as the project for most of what we are going to do in this chapter. With our project now created, we're ready to get into some of the key concepts of a report.
Some of these will be a review if you've read my Beginning title, but you'll want to get this first report together to have it available for some of the more robust examples later.
+
Data Sources
+
Data sources and data source views (we'll be looking at those next) are perhaps the most central items in Reporting Services. Each serves in some fashion regardless of what specific type of report you're building and regardless of whether it's using the Report Modeler or a Report Project. Although they have similar names, they serve slightly different levels in the hierarchy of pulling data together into a report.
+
A data source is essentially the definition required for connecting to wherever you're getting your data from. This can be a connection to a SQL Server or any OLE DB or ODBC data source. If you ponder the possibilities of that for a moment, you should quickly come to the conclusion that, although Reporting Services is associated with SQL Server, you have the prospect of using a wide variety of non–SQL Server data sources in your reports. This is a very powerful concept indeed.
+
There are two types of data sources:
+
 * Embedded: This type of data source is stored within the same file that defines the report. We will take a look at the XML (called Report Definition Language—or RDL) a little later in the chapter, but suffice it to say that all the relevant information for the data source is stored in an XML block within the report definition file. Access to this kind of data source definition is limited to the report in which it is embedded.
 * Shared: This is largely the same as an embedded data source, except that the definition for the data source is stored in its own file (usually with the extension .rds).
+
We will be making use of a shared data source later in the chapter.
+
Regardless of the type, data sources store several pieces of required information, and optionally store additional items to deal with security scenarios.
+
Creating a Data Source
+
Let's go ahead and create a data source that we will use throughout the remainder of this chapter.
+
If your Visual Studio environment is still in its default configuration, you should see the Solution Explorer on the upper-right side. Right-click Shared Data Sources and choose Add New Data Source, as shown in Figure 14.2.
+
Figure 14.2
+
This will bring up the Shared Data Source Properties dialog (as shown in Figure 14.3).
+
Figure 14.3
+
The dialog has two major elements, the first of which allows us to define the name (I've named mine for the database we're going to connect to) as well as the connection string for our data source. (For those not familiar with connection strings, a connection string tells whatever object is connecting to your data source where to go and how to log in.) You can either edit the connection string directly or click the Edit button to bring up the Connection Properties dialog shown in Figure 14.4.
+
The first time I saw this dialog, I was mildly surprised to see that it was different from the connection dialog that had been used repeatedly in the Management Studio; nonetheless, it does contain the same basic elements, just in a slightly different visual package (in short, don't worry if it looks a little different).
+
Figure 14.4
+
In my case, I've selected the local server, the system administrator account (sa), and our old friend, the AdventureWorks2008 database.
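For reference, the connection string this dialog builds for the selections just described comes out looking something like the following (the server name will, of course, vary with your environment):
+
Data Source=localhost;Initial Catalog=AdventureWorks2008
+
Notice that the credentials are not part of the string; Reporting Services keeps those separate, which is exactly what the Credentials page we're about to visit is for.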
+
Go ahead and click OK, then choose the Credentials option in the Data Source Properties dialog, and we get the security options for our data source (see Figure 14.5).
+
Figure 14.5
+
We have several options here worth discussing—they include:
+
 * Use Windows Authentication: This is what it sounds like. It authenticates based on the user who executes the report. This means that the related Windows user account must have access to not only the report, but also all underlying data related to the report.
 * Use this user Name and Password: The user name and password referenced will be SQL Server login information (not Windows).
 * Prompt for Credentials: Again, this is predictable. Credentials are obtained from the user at run time. The credentials supplied will be passed to whatever data provider the report utilizes.
 * No Credentials: This forces anonymous access, so the data provider needs to support such access or you will get an authentication error when you run the report.
+
In Figure 14.5, I've chosen to use sa and provided the related password. This means that the supplied login and password will be persisted (in an encrypted form) with the data source in the .rds file.
+
When we click OK for this dialog, we wind up back at our relatively generic Visual Studio project, but we have our new data source and are ready to create more of the required pieces for our report.
+
Using the Report Wizard
+
Even though we didn't choose the Report Wizard project type when we created this project, elements of the Report Wizard are still available as we create reports. Indeed, the simple act of asking for a new report will, by default, bring up the Report Wizard. You can cancel out of the wizard to create a blank report, but, unless you do, Visual Studio will try to use the wizard to do some of the work for you.
+
To move on with the example we're building, we'll go ahead and add a report to walk through the Report Wizard process. For our example, we'll say that our manager has asked us for a summary report showing the total sales by category for all the sales invoices written by David Campbell in July 2003. She has warned us that she may ask about other salespeople and periods later, but, for now, the information on Mr. Campbell for July 2003 is all she needs.
+
To get started, right-click the Reports node in the Solution Explorer, select Add New Report as shown in Figure 14.6, and it should bring up the Report Wizard Welcome dialog.
+
Figure 14.6
+
Click Next to move on to the data source selection dialog shown in Figure 14.7. Note that, while I've chosen to use the shared data source we created a few moments ago, I could also create a new data source as part of this dialog. (The new data source would be embedded, but could be converted to shared later if we so chose.)
+
Figure 14.7
+
Again click Next to move on to the Query Builder dialog shown in Figure 14.8.
I've already created a query, and it looks like this:
+
SELECT per.FirstName + ' ' + per.LastName AS Employee,
+
ps.Name AS Subcategory,
+
SUM(sod.LineTotal) AS Sales,
+
soh.SalesOrderID,
+
soh.SalesOrderNumber,
+
p.Name AS Product,
+
SUM(sod.OrderQty) AS OrderQty,
+
sod.UnitPrice,
+
pc.Name AS Category
+
FROM Sales.SalesOrderHeader soh
+
JOIN Sales.SalesPerson sp
+
ON sp.BusinessEntityID = soh.SalesPersonID
+
JOIN Sales.SalesOrderDetail sod
+
ON soh.SalesOrderID = sod.SalesOrderID
+
JOIN HumanResources.Employee e
+
ON soh.SalesPersonID = e.BusinessEntityID
+
JOIN Person.Person per
+
ON per.BusinessEntityID = sp.BusinessEntityID
+
JOIN Production.Product p
+
ON sod.ProductID = p.ProductID
+
JOIN Production.ProductSubcategory ps
+
ON p.ProductSubcategoryID = ps.ProductSubcategoryID
+
JOIN Production.ProductCategory pc
+
ON ps.ProductCategoryID = pc.ProductCategoryID
+
WHERE (DATEPART(Year, soh.OrderDate) = 2003)
+
AND (DATEPART(Month, soh.OrderDate) = 7)
+
AND (soh.SalesPersonID = 283)
+
GROUP BY per.FirstName + ' ' + per.LastName,
+
DATEPART(Month, soh.OrderDate),
+
soh.SalesOrderID,
+
soh.SalesOrderNumber,
+
p.Name,
+
ps.Name,
+
sod.UnitPrice,
+
pc.Name
+
There isn't any real rocket science to this query. It is simply gathering up sales totals for the salesperson with an ID of 283 (which happens to be David Campbell) in July of 2003. We will look at how to make this selectable later in the chapter, but, for now, we'll go with the simple, hard-coded query.
+
Figure 14.8
+
Paste in this query code (you can find it in the downloadable sample code on the wrox.com or www.professionalsql.com websites), and click Next to choose between a tabular and a matrix report. A tabular report is a classic row-by-row data layout. A matrix looks for an intersection of data, and is more oriented around displaying totals at the intersection of a column and row. For this particular report, we'll go with the tabular option, and then click Next to move on to the dialog shown in Figure 14.9.
+
Figure 14.9
+
The sales report we're generating is going to show the total for each sales order that Mr. Campbell issued in July 2003. The selections we're making now will have the wizard create part of the formatting we need. Choose SalesOrderNumber as a Group By item, and the Category and Sales fields as detail items, and click Next. In the next dialog (shown in Figure 14.10), I've chosen a block format. There isn't any real magic in it. I've just chosen it because I think it suits this particular data best. I've also chosen to include subtotals. Since we're grouping by SalesOrderNumber, this means we will get a total for each SalesOrderNumber value.
+
Figure 14.10
+
Again click Next to choose a style for the wizard to use in configuring the report. I happen to be choosing Ocean, but anything will work. Click Next one last time to see a summary, as shown in Figure 14.11, of what the wizard is going to do and to name your report. (I've chosen SalesOrderSummary. I'd suggest using that name since we will alter this report as we go through the chapter.) You're then ready to click Finish to generate the actual report.
+
Figure 14.11
+
The report that first comes up (shown in Figure 14.12) doesn't look that complex.
+
Figure 14.12
+
Go ahead and choose the Preview tab to see what the report looks like with real data (shown in Figure 14.13).
+
Figure 14.13
+
This is indeed a nice start, but it has some significant flaws, so let's look at editing the report.
+
Editing Reports
+
To edit a report, we move back to the Design tab for the report in Visual Studio. Continuing our example, we have a few issues we would like to take care of to clean up the look of the report:
+
 * The title should reflect a more proper title format.
 * The number values should look more like currency values.
 * We're seeing each instance of a category sale, not a total as was requested.
+
Let's take each of these in turn.
+
First up, let's change the title. This is the easiest of the changes we'll make. Simply click the area of the title once to select it, and a second time to make your cursor active so you can edit it much as you would any other label object. Double-clicking has the same effect. Go ahead and select it and change the title to D. Campbell, July 2003 Summary.
+
Next, we'll take on the number formatting issue. Again, this isn't that difficult. Simply right-click the field that holds our Sales information, and select Text Box Properties as shown in Figure 14.14.
+
Figure 14.14
+
This brings up the dialog shown in Figure 14.15, which allows us to set a wide variety of properties for the cell of our report table (which, incidentally, is called a tablix). In Figure 14.15, I've chosen the Numbers node, and set our number display to round to the nearest whole unit of currency and to use a separator for thousands.
+
Notice that it doesn't ask what you want to use as a thousands separator, nor does it just assume that you want to use a comma. The thousands separator will vary based on what localization your report server is configured for, and can be overridden on a report-by-report basis.
+
Figure 14.15
+
That takes us to the last and trickiest of the changes we decided to make: rolling up each category to a total within each sales order. To do this, we again right-click the cell that contains the [Sales] value as shown in Figure 14.16. We choose the row group, and modify the properties using the dialog shown in Figure 14.17. This will limit the rows returned to just one per category within the larger SalesOrderNumber group. (Notice the brackets on the far left of the tablix. Remember we added that one by selecting it when we were in the Report Wizard.) We're not quite done in here, though. Since we're focused on categories, we should probably sort the categories to make them a bit more readable. To do that, we can choose the Sorting node in the current dialog, as shown in Figure 14.18.
+
Figure 14.16
+
Figure 14.17
+
Figure 14.18
+
So, with all that accomplished, it would seem that we're ready to preview our report again, but, when we do, we see that, while things are vastly improved, we still have a few problems (as shown in Figure 14.19).
+
Figure 14.19
+
While our report is starting to look good, we have some problems with our numbers. If you were to compare them with the earlier values that were returned (you can go back to Figure 14.13 to see those), you would quickly see that our numbers don't add up. Indeed, the report is not showing the totals for each category, but rather the first row returned in each category. We can't have that!
+
To fix this, we need to explicitly indicate what we want done for each cell. Once again, right-click our [Sales] cell, but, this time, click Expression as shown in Figure 14.20.
+
Figure 14.20
+
The dialog returned shows that we are currently returning the exact value from the Sales field in the data set:
+
=Fields!Sales.Value
+
What we need, however, is a total—or a Sum—for the field within the group. To do this, we can use one of the many built-in functions of Reporting Services. In this case, the Sum function:
+
=Sum(Fields!Sales.Value)
+
So, to see how this looks in the dialog, check out Figure 14.21.
+
Figure 14.21
+
Click OK, and preview the report again, and we now have a reasonably well-formatted report (don't get too carried away formatting it—we're just getting started with this report!) shown in Figure 14.22, and we're ready to run it, print it (or export it to another format), and deliver it to our manager.
+
Figure 14.22
+
Parameterizing Reports
+
Getting this report on David Campbell is all well and good, but it is pretty limiting. Recall that our manager warned us that she might want it for other people and other time periods later on. It's time to implement that functionality.
+
Parameterization is a vital part of most reporting projects. Fortunately, making SQL Server recognize a report as parameterized is relatively easy. Once a report is parameterized, SQL Server will prompt the user in some fashion to supply a parameter value. As we'll learn in this section, we have many options for making parameter choices easy on the user.
+
As our first step, we will add the most rudimentary parameterization to our report. Making our report reliant on parameters starts with simply altering our query to expect those parameters. We'll then just need to tell the report to request the parameters before the report is executed. Let's start by editing our query. Go to the Report Data item in the View menu for the project. (It's also available as a tab in the Solution Explorer pane.) The Report Data tab is shown in Figure 14.23. Just double-click our one data set for this report to bring up the dialog shown in Figure 14.24, which will, among other things, allow us to edit our query. (Some reports can have several data sets. This particular report just has one.)
+
Note that you can also edit the query in a separate Query Editor window by right-clicking the data set and selecting Query.
+
Figure 14.23
+
Figure 14.24
+
I have already changed our hard-coded values for David Campbell's BusinessEntityID, the month of July, and the year of 2003 to be parameter values (@BusinessEntityID, @Month, and @Year, respectively). With this complete, we're ready to move on to the Parameters node of the dialog as shown in Figure 14.25.
+
Figure 14.25
+
I've added each of the parameters in this dialog, so I can now click OK, and I'm ready to preview (or just downright run) the report. In Figure 14.26, I have run it via the Preview tab. Notice at the top of the pane how it has asked for (and I have provided) the three parameters.
+
Figure 14.26
+
In looking over the report, you can see that we wound up with exactly the same values that we had in our original report, only now we could run the report for a different time period or for a different sales rep. Our report just became a lot more flexible.
+
Providing/Controlling Parameter Values and How They Are Used
+
Well, the report as we have it seems pretty nice. We can not only provide a report on David Campbell as we could before, but we can now input different parameters, including a different employee's BusinessEntityID and a completely different time period. We do, however, still have several usability issues.
Some of these include: + + * The input values are free-form, which means users may input illegal values. + * There are no hints at what might be a proper input value, so the user is left to know ahead of time, or guess. This is not too horrible for the date and year, but would be problematic in terms of getting the right salesperson's BusinessEntityID. + * No matter which sales person you input, the header is hard-coded to say David Campbell. A similar issue exists for the month and year. + +Let's take a look at how to fix these issues. + +Creating Pre-set Parameter Lists + +Reporting Services gives you the ability to create pre-defined value lists for your parameters. This functionality utilizes the parameters that we've already defined, and simply adds additional properties to them. + +To add fixed lists to our @Month and @Year parameters, we navigate to the Parameters node of the Report Data tab, expand the list, and then double-click the parameter for which we're interested in supplying values. (You could also right-click the parameter and then select Parameter Properties.) Go ahead and try this for the @Month parameter, which should bring up the dialog shown in Figure 14.27. + +Figure 14.27 + +Notice that I could set a custom prompt for my parameter. (It doesn't have to be the parameter name.) I can also control the initial visibility of the parameter (perhaps for a parameter that is only valid if another parameter is set to a specific value) as well as the nullability or acceptance of blank values. + +I've mostly stuck with the defaults here, but I did change the data type to be an Integer. (Remember we are taking the month number as a parameter.) We're then ready to move on to the Available Values node shown in Figure 14.28. + +Figure 14.28 + +I've made several modifications in this dialog—most notably supplying separate labels and values. The label indicates what the user will be shown to choose from, and the value will be what is passed to the parameter when the report is executed. I was given the ability to create this list as part of choosing the Specify Values option. Note, however, that I could also have made the list query driven. (We'll get to one of those shortly.) + +Go ahead and switch over to the Defaults node, and you can see we are allowed to supply a default value. (In Figure 14.29, I've chosen the value of 7 that we have been working with thus far.) + +Figure 14.29 + +Finally, switch over to the Advanced node (shown in Figure 14.30), and we are given the option of selecting when our report data will change if the user changes the parameter value. We can force a refresh every time, require the user to explicitly call for the refresh, or allow SQL Server to decide when it is the right time. + +Figure 14.30 + +Go ahead and try this out on your own by setting the data type for the @Year parameter to Integer and the default value to 2003. Then we're ready to preview or run the report again to check out the effects of our changes as shown in Figure 14.31. + +Figure 14.31 + +While you can't see any significant difference in the BusinessEntityID and Year parameters, you should quickly notice that Month is now a drop-down list that supplies the name of each month even though the parameter will really use the integer value for the month. You can also test out entering text into the year field. SQL Server will indicate the type mismatch relatively gracefully. (It isn't the prettiest thing ever, but it's better than a full blown error.) 
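Before we move on to query-driven lists, it's worth seeing in one place what the parameterization did to the data set. Here is roughly how the WHERE clause of our original query reads once the hard-coded values are swapped for the three parameters (the rest of the query is unchanged):
+
WHERE (DATEPART(Year, soh.OrderDate) = @Year)
+
AND (DATEPART(Month, soh.OrderDate) = @Month)
+
AND (soh.SalesPersonID = @BusinessEntityID)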
+
Creating Parameter Lists from Queries
+
Supplying a pre-populated list for our @BusinessEntityID parameter is a bit trickier than it was for the other two parameters. We could create a fixed list much as we did with @Month, but that would mean we would have to edit the report every time the list of salespersons changed. While months are likely to remain very stable (unless Einstein comes back from the dead with a new theory on time), salespeople have a tendency to come and go with high frequency. Editing the report each time is very impractical, particularly when we already have salesperson information entered elsewhere in the system.
+
To get this started, we need to create a new data set. Start by right-clicking the data source in the Report Data tab, and select Add Dataset as shown in Figure 14.32.
+
Figure 14.32
+
This, in turn, brings up the dialog shown in Figure 14.33.
+
Figure 14.33
+
I've already supplied a query that lists all of the salespeople. It is entirely visible in the dialog, but just to make it clear, it looks like this:
+
SELECT p.BusinessEntityID, p.LastName + ', ' + p.FirstName AS Name
+
FROM Person.Person p
+
JOIN Sales.SalesPerson sp
+
ON p.BusinessEntityID = sp.BusinessEntityID;
+
Now continue to the Fields node as shown in Figure 14.34. This allows us to select what the returned fields are going to be named (so you can access them) in any reports that use this data set. I am sticking with the defaults here, but we could have altered the names on the results if we had so chosen. Click OK, and our data set is created. We're now ready to use it to populate our parameter list.
+
Figure 14.34
+
Double-click the BusinessEntityID parameter again to open it back up for editing, then move to the Available Values node shown in Figure 14.35. I have again filled in suitable values. I have, as you might expect, chosen the Get values from a query option. I have likewise chosen which data set to use as a source and which fields from the data set relate to the value and label fields (which function just as they did when we manually supplied values for them). I am also going to go to the Default Values node and set a default of 283 (our old friend, David Campbell) before previewing or executing the report as shown in Figure 14.36.
+
Figure 14.35
+
Figure 14.36
+
So, just that quickly, we have all of our parameters defaulted and data typed as appropriate. All that leaves us to deal with is the fixed header.
+
Getting Headings and Other Fields from Parameters
+
Editing a text box to use parameter values is relatively easy. Start by selecting the text box that holds our current fixed value and getting it into an edit mode. To make it dynamic, I need to combine several items. First, I'll start off with a prefix to my dynamic values; I'll use the phrase "Summary for:". I then need to right-click again and choose Create Placeholder, bringing up the dialog in Figure 14.37. The placeholder will allow Reporting Services to distinguish between my literal text and my functional code. Note that the value field has a drop-down box, and that by expanding it you can choose between a wide array of dynamic values. In our case, I've supplied a reference to one of the parameters the user selected. Go ahead and click OK, and then preview or run the report to see the effect (shown in Figure 14.38).
+
Figure 14.37
+
Figure 14.38
+
To finish out this section, let's add another placeholder or two, but this time let's use the expression editor.
Add a comma and a space after the placeholder we just created, and then right-click and again choose Add Placeholder. This time, however, click the Fx button to the right of the Value field to bring up the dialog shown in Figure 14.39. In this figure, I'm in the middle of adding a reference to the Month parameter that the user selected when they ran the report, but notice that Visual Studio is providing me with IntelliSense while I edit. Go ahead and add placeholders for both Month and Year, and your report should now come out looking something like Figure 14.40. + +Figure 14.39 + +Figure 14.40 + +Adding Charts + +Reporting Services also supports chart objects. This is relatively powerful stuff, as it does a lot to allow our reports to become more than just a source for reporting, but also a venue for more genuine analysis. We're going to add one chart to our report to provide a visual representation of the sales this month between categories. + +Start by opening the Visual Studio toolbox and dragging a Chart object onto your report. (I'm placing mine to the right of our tablix.) This brings up the dialog shown in Figure 14.41, and allows us to choose between a wide array of chart types. + +Figure 14.41 + +Given that we don't have that many categories to choose from, I've decided to go with a pie chart in a 3D representation (shown in Figure 14.41). To get this working, I can just drag fields from data sets in my Report Data tab right into special receiver areas on the chart (shown in Figure 14.42). I've dragged the sales field from Dataset1 into the Drop Data Fields Here area, and the categories field into the Drop Category Fields Here area. + +Figure 14.42 + +Also change the (caption) field in the chart properties to Sales by Category, and we're again ready to run or preview the report as shown in Figure 14.43. + +Figure 14.43 + +Just that quickly, we have a basic chart available to provide a visual representation of the numbers in our tablix. + +Note that there is no interdependence between the two objects. They happen to make use of the same data set, but there is no requirement that they do so. Indeed, we did most of this report building without the chart, and we could, if we so desired, delete the tablix and work only with the chart. + +Linking Reports + +Reporting Services also allows you to link multiple reports, either drilling down into finer levels of detail, or drilling across into a totally different report. + +The linking process is supported through what are termed "Actions." Actions support both internal (other reports) and external (such as a website) links. + +Let's add one last element to the report we've been working on in this chapter. To make use of this link, you'll want to download (if you haven't already) the code for this book, and look for the SalesOrderDetail.rdl file that I've pre-created for you. You can add it to your project by right-clicking Reports in the Solution Explorer, and choosing Add⇒Existing Item. + +To make use of this new Sales Order Detail report, you need to edit the properties for the text box that has the Sales Order Number on your report, then access the Actions settings as shown in Figure 14.44. + +Figure 14.44 + +Once you have the SalesOrderDetail.rdl file properly added to the project and have configured the SalesOrderNumber action as shown in Figure 14.44, go ahead and run or preview your summary report one last time. 
Now click the first Sales Order Number for David Campbell in July of 2003, and you should get the Sales Order Detail report shown in Figure 14.45.
+
Figure 14.45
+
Deploying the Report
+
The only thing left to do is deploy the report. To deploy, you right-click the report in the Solution Explorer and choose Deploy. There is, however, a minor catch—you need to define the target to deploy to in the project definition.
+
1. Right-click the Report Server Project and choose Properties.
+
2. In the TargetReportFolder field, put whatever folder you want it to reside in when you log into the Report Manager.
+
3. In the TargetServerURL field, enter the URL to your ReportServer. In my case, this may be as simple as http://localhost/ReportServer, but the server name could be any server to which you have appropriate rights to deploy. (The Virtual Directory may also be something other than ReportServer if you defined it that way at install.)
+
After you've deployed (by right-clicking the project and selecting Deploy), you'll want to view the report. Navigate to your report server. (If it is on the local host and uses the default directory, it would be http://localhost/Reports.) Click your report folder, and choose your SalesOrderSummary report. It may take a bit to come up the first time you load it, but you should see your report just as we defined it in our project. (If you navigate back to it again, the report definition will be cached and thus come up fairly quickly.)
+
A Brief Note on RDL
+
RDL stands for Report Definition Language—an XML-based language that defines reports. All the changes we made to our report over the course of this chapter were translated into RDL by Visual Studio. If you want to see what the RDL for your report project looks like, right-click your report and choose View Code. The following is an excerpt from the report I produced as an example for this chapter. It defines the data set that supplied the values for our sales staff to the appropriate parameter:
+
<DataSet Name="DataSet2">
+
<Fields>
+
<Field Name="BusinessEntityID">
+
<DataField>BusinessEntityID</DataField>
+
<rd:TypeName>System.Int32</rd:TypeName>
+
</Field>
+
<Field Name="Name">
+
<DataField>Name</DataField>
+
<rd:TypeName>System.String</rd:TypeName>
+
</Field>
+
</Fields>
+
<Query>
+
<DataSourceName>AdventureWorks2008</DataSourceName>
+
<CommandText>SELECT p.BusinessEntityID, p.LastName + ', ' +
+
p.FirstName AS Name
+
FROM Person.Person p
+
JOIN Sales.SalesPerson sp
+
ON p.BusinessEntityID = sp.BusinessEntityID;</CommandText>
+
<rd:UseGenericDesigner>true</rd:UseGenericDesigner>
+
</Query>
+
</DataSet>
+
You can modify the RDL directly if you wish. (But be careful. It can be a hassle to figure out what exactly you did wrong if you introduce an error through direct editing.)
+
Summary
+
Reporting Services has had a major impact on many SQL Server installations. For many companies, having a relatively robust reporting server built right into their central data store has been liberating, making it much easier to disseminate information to data consumers. For other organizations, Reporting Services has provided an adequate solution to replace long-standing reporting packages such as Crystal Reports. SQL Server 2008 adds several new features and controls to allow for more elegant and powerful reports, plus the engine has been redesigned to allow for much higher scalability.
+
Even with the relatively robust report used in this chapter, we've really only just begun to taste the possibilities. Reports can be parameterized, you can embed charts, integrate with other products (such as Windows SharePoint Services or Microsoft Office SharePoint Server), drill through from one report to another, and even embed reports inside of other reports.
+
For more information on reporting, I'd suggest a book specific to Reporting Services.
+
15
+
Buying in Bulk: The Bulk Copy Program (BCP) and Other Basic Bulk Operations
+
If your system is going to be operating in something of a bubble, then you can probably skip this chapter and move on. Unfortunately, the real world doesn't work that way, so you probably ought to hang around for a while.
+
For most systems, there will eventually come a time (often, it's many times) when you need to move around large blocks of data. Sometimes you need to bring in data that's in the wrong format or that's sitting in another application's data files. Sometimes, you need to extract data to feed directly to another system. The good thing is that SQL Server has two tools to help you move data fast—the Bulk Copy Program (bcp) and SQL Server Integration Services (SSIS). In this chapter, we'll be looking primarily at the first of these. In addition, we'll take a look at bcp's close cousins—the BULK INSERT command and OPENROWSET (BULK).
+
We will examine SSIS in the next chapter.
+
bcp is something of an old friend. You know the one—where you hardly ever see them anymore, but, when you do, you reminisce about all the crazy things you used to do together. It was, for a very long time, the way we moved around large blocks of data; and it did so (still does, as far as that goes) amazingly fast. What it lacks, however, is sex appeal—well, frankly, since SQL Server 7.0, it has lacked appeal in a whole lot of areas.
+
So, why then am I even spending a chapter on it? Well, because bcp still definitely has its uses. Among its advantages are:
+
 * It's very compact.
 * It can move a lot of data very quickly.
 * It is legacy—that is, there may be code already running that is making effective use of it, so why change it?
 * It uses a cryptic, yet very traditional scripting style (which will probably appeal to some).
 * It is very consistent.
+
bcp is used for transferring text and SQL Server native-format data to and from SQL Server tables. It has changed very little in the last several versions, and other bulk features have continued to erode the usefulness of bcp, but it still holds its own. You can think of bcp as a data pump, with little functionality other than moving data from one place to the other as efficiently as possible. The various other bulk operations we'll look at in this chapter are often easier to use, but usually come at the price of less flexibility.
+
In this chapter, we will look at some of the ins and outs of bcp and then use what we learn about bcp to form the foundations of many of the other features that serve a similar purpose—to get data in and out of your system as quickly as possible.
+
bcp Utility
+
bcp runs from an operating system command prompt to import or export native data (specific to SQL Server), ASCII text, or Unicode text. This means that you can execute bcp from an operating system batch file or user-defined stored procedure, as well as from other places. bcp can also be run as part of a scheduled job, or executed from a .NET object through the use of a shell command.
+
Like most command-line utilities, options can be specified using a hyphen (-) or forward slash (/); however, unlike most DOS or Windows family utilities, option switches are case sensitive.
+
bcp Syntax
+
bcp {[[<database name>.][<owner>].]{<table name> | <view name>} | "<query>"}
{in | out | queryout | format} <data file>
+
[-m <max errors>] [-f <format file>] [-x] [-e <err file>]
+
[-F <first row>] [-L <last row>] [-b <batch size>]
+
[-n] [-c] [-w] [-N] [-V (60 | 65 | 70 | 80 | 90)] [-6]
+
[-q] [-C <code page>] [-t <field terminator>] [-r <row terminator>]
+
[-i <input file>] [-o <output file>] [-a <packet size>]
+
[-S <server name>[\<instance name>]] [-U <login id>] [-P <password>]
+
[-T] [-v] [-R] [-k] [-E] [-h "<hint> [,...n]"]
+
Geez—that's a lot to take in, so let's go through these switches one by one. (Thankfully, most of them are optional, so you will usually include only a fraction of them.)
+
Note that many of the switches for the bcp utility are case sensitive; often, a given letter has an entirely different meaning between cases.
+
Parameter | Description
---|---
<database name> | Exactly what it sounds like. Basically, this is a standard part of the four-part naming scheme. If not specified, the user's default database is assumed.
<owner> | More of the four-part naming scheme stuff. Again, exactly what it sounds like.
<table name>, <view name>, or "<query>" | Can only be one—table, view, or query. This is the input destination or output source table or view. A SQL Server query can be used only as the source for a bcp export, and only when queryout is specified. If the query returns multiple result sets, only the first result set is used by bcp.
in <data file>, out <data file>, queryout <data file>, or format <data file> | Again, can only be one. If using any of these, you must also supply a source or destination file. Establishes the direction of the bcp action. in indicates that you are importing data from a source file into a table or view. out indicates that you are exporting data from a table or view into the destination file. Use queryout only for output to the destination file using a query as its source. Use format to create a format file based on the format option you've selected. You must also specify -f, as well as format options (-n, -c, -w, -6, -C, or -N) or answer prompts from interactive bcp. The source or destination path and filename is specified as <data file> and cannot include more than 255 characters.
-m <max errors> | You can specify a maximum number of errors that you will allow before SQL Server cancels the bulk copy operation, defaulting to 10 errors. Each row that cannot be copied by bcp is counted as one error.
-f <format file> | A format file contains responses saved from a previous bcp operation on the same table or view. This parameter should include the full path and filename of the format file. This option is used primarily with the in and format options to specify the path and filename when making use of or creating a format file.
-x | Generates an XML-based format file instead of the straight-text version that is the default. (The non-XML version is legacy support, but remains the default for now.) It must be used with both the format and -f options.
-e <err file> | You can specify the full path and filename for an error file to store any rows that bcp is not able to transfer. Otherwise, no error file is created. Any error messages will be displayed at the client station.
-F <first row> | Use this option if you want to specify the first row to be copied by the bulk copy operation. If not specified, bcp defaults to a value of 1 and begins copying with the first row in the source data file. This option can be handy if you want to handle your loading in chunks, and can be used to pick back up where you left off in a previous loading run.
-L <last row> | This option is the complement of -F. It provides a method for determining the last row you want loaded as part of this bcp execution. If not specified, bcp defaults to a value of 0, the last row in the source file.
When used in conjunction with –F, this option can allow you to load your data one chunk at a time, loading small blocks of data and then picking up next time where the previous load left off. +-b batch size | You can specify the number of rows copied as a batch. A batch is copied as a single transaction. Like all transactions, the rows of the batch are committed in an "all or nothing" fashion—either every row is committed or the transaction is rolled back and it is as if the batch never happened. The –h (hint) switch has a similar option (ROWS_PER_BATCH), which should be considered to be mutually exclusive with –b (use neither or one of them, but not both). +-n | Native data types (SQL Server data types) are used for the copy operation. Using this option prevents the need to answer the questions regarding the data types to be used in the transfer (it just picks up the native type and goes with it). +-c | This specifies that the operation uses character data (text) for all fields, and, as such, does not require a separate data type question for each field. A tab character is assumed as field delimiter unless you use the –t option and a newline character as row separator unless you specify different terminator using -r. +-w | The -w option is similar to –c but specifies Unicode data type instead of ASCII for all fields. Again, unless you override with –t and –r, the tab character and row separator are assumed to be the field delimiter and newline character, respectively. This option cannot be used with SQL Server version 6.5 or earlier. +-N | This is basically the same as –w, using Unicode for character data but uses native data types (database data types) for non-character data. This option offers higher performance when going from SQL Server to SQL Server. As with –w, this option cannot be used with SQL Server version 6.5 or earlier. +-V (60|65|70|80|90) | Causes bcp to utilize data type formats that were available only in previous versions of SQL Server. 60 uses 6.0 data types, 65 uses 6.5 data types, 70 uses 7.0 data types, 80 uses 2000 data types, and 90 uses 2005 data types. This replaces the –6 option. +-6 | Use this option to force bcp to use SQL Server 6.0 or 6.5 data types. This option is used in conjunction with the -c or -n format options for backward-compatibility reasons only. Use –V whenever possible (when working with SQL Server 7.0 or newer, which should be pretty much always at this point). +-q | Use -q to specify that a table or view name includes non-ANSI characters. This effectively executes a SET QUOTED_IDENTIFIERS ON statement for the connection used by bcp. The fully qualified name, database, owner, and table or view must be enclosed in double quotation marks, in the format "database name.owner.table". +-C | This option is used to specify the code page for the data file data. It is only necessary to use this option with char, varchar, or text data having ASCII character values of less than 32 or greater than 127. A code page value of ACP specifies ANSI/Microsoft Windows (ISO 1252). OEM specifies the default client code page. If RAW is specified, there will be no code page conversion. You also have the option of providing a specific code page value. Avoid this option where possible; instead, use a specific collation in the format file or when asked by bcp. +-t | This option allows you to override the default field terminator. The default terminator is the tab character. 
You can specify the terminator as tab (\t), newline (\n), carriage return (\r), backslash (\\), null terminator (\0), any printable character, or a string of up to 10 printable characters. For example, you would use "-t," for a comma-delimited text file.
-r row terminator | This option works just like –t except that it allows you to override the default row terminator (as opposed to the field terminator). The default terminator is \n, the newline character. The rules are otherwise the same as for -t.
-i input file | You have the option of specifying a response file, as the input file, containing the responses to be used when running bcp in interactive mode. (This can save answering a ton of questions!)
-o output file | You can redirect bcp output from the command prompt to an output file. This gives you a way to capture command output and results when executing bcp from an unattended batch or stored procedure.
-a packet size | You have the option of overriding the default packet size for data transfers across the network. Larger packet sizes tend to be more efficient when you have good line quality (few CRC errors). The specified value must be between 4096 and 65535, inclusive, and overrides whatever default has been set up for the server. At installation, the default packet size is 4096 bytes. This can be overridden using SQL Server Management Studio or the sp_configure system stored procedure.
-S server name | If running bcp from a server, the default is the local SQL Server. This option lets you specify a different server and is required in a network environment when running bcp from a remote system.
-U login id | Unless connecting to SQL Server through a trusted connection, you must provide a valid username for login.
-P password | When you supply a username, you must also supply a password. Otherwise, you will be prompted for a password. Include -P as your last option with no password to specify a null password.
-T | You have the option of connecting to the server using network user credentials through a trusted connection. If a trusted connection is specified, there is no need to provide a login name or password for the connection.
-v | When this option is used, bcp returns version number and copyright information.
-R | Use this option to specify that the regional format defined by the client's locale settings is used when copying currency, date, and time data. The default is that regional settings are ignored.
-k | Use this option to override the use of column default values during bulk copy, ignoring any default constraints. Empty columns will retain a null value rather than the column default.
-E | This option is used during import when the import source file contains identity column values and is essentially equivalent to SET IDENTITY_INSERT ON. If not specified, SQL Server will ignore the values supplied in the source file and automatically generate identity column values. You can use the format file to skip the identity column when importing data from a source that does not include identity values and have SQL Server generate the values.
-h "hint[,...]" | The hint option lets you specify one or more hints to be used by the bulk copy operation. Option -h is not supported for SQL Server version 6.5 or earlier.
ORDER column [ASC|DESC] | You can use this hint to improve performance when the sort order of the source data file matches the clustered index in the destination table. If the destination table does not have a clustered index or if the data is sorted in a different order, the ORDER hint is ignored.
ROWS_PER_BATCH=nn | This can be used in place of the -b option to specify the number of rows to be transferred as a batch. Do not use this hint with the -b option.
KILOBYTES_PER_BATCH=nn | You can optionally specify batch size as the approximate number of kilobytes of data to be transferred in a batch.
TABLOCK | This will cause a table-level lock to be acquired for the duration of the operation. Default locking behavior is set by the table lock on bulk load table option.
CHECK_CONSTRAINTS | By default, check constraints are ignored during an import operation. This hint forces check constraints to be checked during import.
FIRE_TRIGGERS | Similar to CHECK_CONSTRAINTS, this option causes any triggers on the destination table to fire for the transaction. By default, triggers are not fired on bulk operations. This option is not supported in versions of SQL Server prior to 2000.

bcp runs in interactive mode, prompting for format information, unless -f, -c, -n, -w, -6, or -N is specified when the command is executed. When running in interactive mode, bcp will also prompt to create a format file after receiving the format information.

bcp Import

Okay, so up to this point we've been stuck in the preliminaries. Well, it's time to get down to the business of what bcp is all about.

Probably the most common use of bcp is to import bulk data into existing SQL Server tables and views. To import data, you must have access permissions to the server, either through a login ID or a trusted connection, and you must have INSERT and SELECT permissions on the destination table or view.

The source file can contain native-format data, ASCII character data, Unicode data, or mixed native and Unicode data. Remember to use the appropriate option to describe the source data. Also, for the data file to be usable, you must be able to describe the field and row terminators (using –t and –r) or the fields and rows must be terminated with the default tab and newline characters, respectively.

Be sure you know your destination before you start. bcp has a few quirks that can affect data import. Values supplied for timestamp or computed columns are ignored—if the source file has values for those columns, SQL Server simply throws them away. If the source file doesn't have values for these columns, you'll need a format file (which we'll see later in this chapter) so you can skip over them.

This is one of those really bizarre behaviors that you run across from time to time in about any piece of software you might use. In this case, if your destination table contains timestamp or computed columns, your source file is required to have columns to represent them even though SQL Server will just ignore that data—silly, isn't it? Again, the way around this is to use a format file that explicitly says to skip the columns in question.

For bcp operations, rules are ignored. Any triggers and check constraints are ignored unless the FIRE_TRIGGERS and/or CHECK_CONSTRAINTS hints are specified. Unique constraints, indexes, and primary/foreign key constraints are enforced. Default constraints are enforced unless the -k option is specified.

Data Import Example

The easiest way to see how bcp import works is to look at an example. Let's start with a simple one: a tab-delimited file containing department information for the AdventureWorks2008 database.
Here's how the data looks:

1 Smart Guys Research and Development 2006-04-01 00:00:00.000

2 Product Test Research and Development 2006-04-01 00:00:00.000

To import this into the Department table using a trusted connection at the local server, you run:

BCP AdventureWorks2008.HumanResources.Department in c:\DepartmentIn.txt -c -T

Two things are important here: First, up to this point, everything we've run has been done in Management Studio. For bcp, however, you type your command into a command-prompt box. Second, you'll need to change the preceding command line to match wherever you've downloaded the sample files/data for this book.

Because the first column in the Department table is an identity column and the -E option wasn't specified, SQL Server will ignore the identity values in the file and generate new values. The -c option identifies the source data as character data, and -T specifies to use a trusted connection.

Note that, if you have not been using Windows authentication and haven't set up your network login with appropriate rights in SQL Server, then you may need to modify the preceding example to utilize the –U and –P options.

When we execute it, SQL Server quickly tells us some basic information about how our bulk copy operation went:

2 rows copied.

Network packet size (bytes): 4096

Clock Time (ms.) Total : 109 Average : (18.35 rows per sec.)

We can go back into Management Studio and verify that the data went into the Department table as expected:

USE AdventureWorks2008;

SELECT * FROM HumanResources.Department;

which gets us back several rows—most importantly, the two we expect from our bcp operation:

DepartmentID Name GroupName ModifiedDate

------------ ---------------- -------------------------------------- --------------

1 Engineering Research and Development 1998-06-01...

2 Tool Design Research and Development 1998-06-01...

...

...

16 Executive Executive General and Administration 1998-06-01...

17 Smart Guys Research and Development 2006-04-01...

18 Product Test Research and Development 2006-04-01...

As always, note that, other than the two rows we just imported, your data may look a bit different depending on which parts of this book you've run the examples from, which you haven't, and how much playing around of your own you've done. For this example, you just want to see that Smart Guys and Product Test made it into the table with the appropriate information. The identity values will have been reassigned to whatever was next for your particular server.

Now let's look at a more involved example. Let's say we have a table called CustomerList. A CREATE statement to make our CustomerList table looks like this:

CREATE TABLE dbo.CustomerList

(

CustomerID nchar(5) NOT NULL

PRIMARY KEY,

CompanyName nvarchar(40) NOT NULL,

ContactName nvarchar(30) NULL,

ContactTitle nvarchar(30) NULL,

Address nvarchar(60) NULL,

City nvarchar(15) NULL,

Region nvarchar(15) NULL,

PostalCode nvarchar(10) NULL,

Country nvarchar(15) NULL,

Phone nvarchar(24) NULL,

Fax nvarchar(24) NULL

);

We have a comma-delimited file (in the same format as a .csv file) with new customer information. This time, the file looks like:

XWALL,Wally's World,Wally Smith,Owner,,,,,,(503)555-8448,,

XGENE,Generic Sales and Services,Al Smith,,,,,,,,(503)555-9339,,

XMORE,More for You,Paul Johnston,President,,,,,,(573)555-3227,,

What's with all the commas in the source file?
Those are placeholders for columns in the CustomerList table. The source file doesn't provide values for all of the columns, so commas are used to skip over those columns. This isn't the only way to handle a source file that doesn't provide values for all of the columns. You can use a format file to map the source data to the destination. We'll be covering format files a little later in the chapter.

Imagine for a moment that we are going to run bcp to import the data to a remote system. The command is:

BCP AdventureWorks2008.dbo.CustomerList in c:\newcust.txt -c -t, -r\n -Ssocrates

-Usa -Pbubbagump

The line wrapping shown here was added to make the command string easier to read. Do not press Enter to wrap if you try this example yourself. Type the command as a single string and allow it to wrap itself inside the command prompt.

Once again, the data is being identified as character data. The -t, option identifies the file as comma-delimited (terminated) data, and -r\n identifies the newline character as the row delimiter. Server connection information was also provided for a little variety this time, using sa as your login and bubbagump as the password.

Again, bcp confirms the transfer along with basic statistics:

Starting copy...

3 rows copied.

Network packet size (bytes): 4096

Clock Time (ms.) Total : 15 Average : (200.00 rows per sec.)

And again we'll also go verify that the data got there as expected:

USE AdventureWorks2008;

SELECT CustomerID, CompanyName, ContactName

FROM dbo.CustomerList

WHERE CustomerID LIKE 'X%';

And, sure enough, all our data is there...

CustomerID CompanyName ContactName

---------- ---------------------------------------- --------------------------

XGENE Generic Sales and Services Al Smith

XMORE More for You Paul Johnston

XWALL Wally's World Wally Smith

Logged vs. Non-logged

bcp can run in either fast mode (not logged) or slow mode (logged operation). Each has its advantages. Fast mode gives you the best performance, but slow mode provides maximum recoverability. Since slow mode is logged, you can run a quick transaction log backup immediately after the import and be able to recover the database should there be a failure.

Fast mode is usually your best option when you need to transfer large amounts of data. Not only does the transfer run faster, but, since the operation isn't logged, you don't have to worry about running out of space in the transaction log. What's the catch? There are several conditions that must be met for bcp to run as non-logged:

 * The target table cannot be replicated.
 * If the target table is indexed, it must not currently have any rows.
 * If the target table already has rows, it must not have any indexes.
 * The TABLOCK hint is specified.
 * The target table must have no triggers.
 * For versions prior to SQL Server 2000, the select into/bulkcopy option must be set to true.

Obviously, if you want to do a fast mode copy into an indexed table with data, you will need to:

 * Drop the indexes
 * Drop any triggers
 * Run bcp
 * Reindex the target table
 * Re-create any triggers

You need to immediately back up the destination database after a non-logged bcp operation.

If the target table doesn't meet the requirements for fast bcp, then the operation will be logged. This means that you run the risk of filling the transaction log when transferring large amounts of data.
You can run BACKUP LOG using the WITH TRUNCATE_ONLY option to clear the transaction log. The TRUNCATE_ONLY option truncates the inactive portion of the log without backing up any data.

I can't stress enough how deadly bcp operations can be to the size of your log. If you can't achieve a minimally logged operation, then consider adjusting your batch size down and switching to the Simple recovery model (the successor to the old trunc. log on chkpt. option) for the duration of the operation. Another solution is to use the –F and –L options to pull things in a block at a time and truncate the log in between each block of data. Recognize, however, that an important part of your backup strategy—the transaction log—is now missing part of the information it needs to properly restore the database. It is, therefore, critical that you create a fresh backup as soon as your bulk load activity is complete.

bcp Export

If you're going to be accepting data in via bulk operations, then it follows that you probably want to be able to pump data out, too.

bcp allows you to export data from a table, view, or query. You must specify a destination filename. If the file already exists, it will be overwritten. Unlike import operations, you are not allowed to skip columns during export. Timestamp, rowguid, and computed columns are exported in the same manner as any other SQL Server columns (just as if they were "real" data). To run an export, you must have appropriate SELECT authority on the source table or tables.

Let's look at a couple of quick examples using the HumanResources.Department table in the AdventureWorks2008 database.

To export to a data file using the default format, you could run:

BCP AdventureWorks2008.HumanResources.Department out c:\somedir\

DepartmentOut.txt -c -T

Note that if you're running Vista or a later version of the Windows operating system (including Windows Server 2008), new security controls will likely prevent you from doing a bulk extract to the root directory (C:\ on most systems)—thus my use of somedir in the preceding code.

This would create a file that looks like:

1 Engineering Research and Development 1998-06-01 00:00:00.000

2 Tool Design Research and Development 1998-06-01 00:00:00.000

...

...

17 Smart Guys Research and Development 2006-04-01 00:00:00.000

18 Product Test Research and Development 2006-04-01 00:00:00.000

In this case, we didn't have to use a format file, nor were we prompted for any field lengths or similar information. The use of the -c option indicated that we just wanted everything, regardless of type, exported as basic ASCII text in a default format. The default calls for tabs as field separators and the newline character to separate rows.

Keep in mind that the destination file will be overwritten if it already exists. This will happen without any kind of prompt or warning.

To modify the separator to something custom, we could run something like:

BCP AdventureWorks2008.HumanResources.Department out DepartmentOut.txt -c -T -t,

Notice the comma at the end. That is not a typo. The next character after the t is the field separator—in this case, a comma.

This would give us:

1,Engineering,Research and Development,1998-06-01 00:00:00.000

2,Tool Design,Research and Development,1998-06-01 00:00:00.000

...

...

17,Smart Guys,Research and Development,2006-04-01 00:00:00.000

18,Product Test,Research and Development,2006-04-01 00:00:00.000

We used a comma separator instead of a tab, and got what amounts to a .csv file.
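Before we move on, queryout deserves a quick illustration of its own, since it's the one direction keyword we haven't actually run yet. Here's a minimal sketch under the same assumptions as the preceding examples (AdventureWorks2008 on the local server, a trusted connection); the particular SELECT is just for illustration:

BCP "SELECT Name, GroupName FROM AdventureWorks2008.HumanResources.Department ORDER BY Name" queryout c:\somedir\DeptNames.txt -c -T

Because the source is a query rather than a table or view, plain out won't do; you must use queryout. And, as noted back in the switch table, if the query returns multiple result sets, only the first one makes it into the file.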
Format Files

If you have any previous experience dealing with the kinds of files we typically have handed to us with the dreaded "load this data into our database" order, then you have probably looked at my previous import examples and said, "Heh—I wish my data actually came in that cleanly formatted...." Yes indeed, data rarely looks as perfect as we would like it to, and that brings us to the concept of format files.

Format files were first mentioned in the previous section, and provide something of an import template. Among other things, they make it easier to support recurring import operations when:

 * Source file and target table structures or collations do not match.
 * You want to skip columns in the target table.
 * Your file contains data that makes the default data typing and collation difficult or unworkable.

Format files come in two varieties: non-XML and XML. We will start off by looking at the "old" way of doing things (the non-XML version) and then take a look at the newer XML format files.

To get a better idea of how each type of format file works, let's look at some specific examples. First you'll see how the file is structured when the source and destination match. Next, you can compare this to situations where the number of source file fields doesn't match the number of table columns or where source fields are ordered differently than the table columns.

You can create a default format file (which is non-XML for backward-compatibility reasons) to use as your source when you run bcp in interactive mode. After prompting for column value information, you're given the option of saving the file. The default filename is BCP.fmt, but you can give the format file any valid filename.

To create a default format file like this for the AdventureWorks2008 database HumanResources.Department table, you could run:

BCP AdventureWorks2008.HumanResources.Department out c:\somedir\department.txt –T

This is a handy way of creating a quick format file that you can then edit as needed. You can do this with any table, so you can use bcp to get a jump-start on your format file needs.

Accept the default prefix and data length information for each field, and, in this case, a comma as the field terminator. SQL Server will prompt you to save the format file after you've entered all of the format information; in my case, I'm going to save it off as Department.fmt. You can then edit the format file to meet your particular needs with any text editor, such as Windows Notepad.

Let's take a look at the format file we just produced:

10.0

4

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 8 "," 4 ModifiedDate ""

The first two lines in the file identify the bcp version number (10.0 for SQL Server 2008, 9.0 for SQL Server 2005, and so on) and the number of fields in the host file. The remaining lines describe the host data file and how the fields match up with target columns and collations.

The first column is the host file field number, numbering from 1 through the total number of fields. Next is the host file data type. The example file has a mix of a few data types. All of the text is in Unicode format, so the data type of the character fields is SQLNCHAR. Given that there are no special characters in this data, we could have just as easily gone with a SQLCHAR (ASCII) format.
The next two columns describe the prefix and data length for the data fields. The prefix is the number of prefix characters in the field. The prefix describes the length of the data in the actual bcp file and allows the data file to be compacted to a smaller size. The data length is the maximum length of the data stored in the field. Next is the field terminator (delimiter). In this case, a comma is used as the field terminator and newline as the row terminator. The next two columns describe the target table columns by providing the server column order and server column name. Since there is a direct match between the server columns and host fields in this example, the column and field numbers are the same, but it didn't necessarily have to work that way. Last, but not least, comes the collation for each column. (Remember that, with SQL Server 2000 and newer, we can have a different collation for every column in a table.)

Now, let's check the XML version. To create this, we run almost the same command, but add the -x switch:

BCP AdventureWorks2008.HumanResources.Department out c:\somedir\department.txt –T –x

The format file we wind up with looks radically different. The general shape of it is shown here (the exact field types and attributes will depend on the answers you give to the interactive prompts):

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="4" xsi:type="NativeFixed" LENGTH="8"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="4" NAME="ModifiedDate" xsi:type="SQLDATETIME"/>

</ROW>

</BCPFORMAT>

Notice that everything is explicitly called out. What's more, there is an XML schema document associated with XML format files, which means you can validate the XML in your XML editor of choice.

I'm not going to pick any bones about this. I LOVE the new XML-formatted version. If you don't need to worry about compatibility with versions prior to SQL Server 2005, this one seems a no-brainer to me.

The old format files work, but, every time I work with them extensively, I consider purchasing stock in a pain reliever company. They are that much of a headache if you have to do anything beyond the defaults. Everything about them has to be "just so," and, in larger tables, it's easy to miss a typo since fields are not clearly separated. XML tagging fixes all that and makes clear what every little entry is there for—debugging is much, much easier.

When Your Columns Don't Match

If only the world was perfect and the data files we received always looked just like our tables.

Okay, time to come out of dreamland. I'm reasonably happy with the world I live in, but it's hardly a perfect place, and the kinds of data files I need to do bulk operations on rarely look like their destination. So, what then are we to do when the source file and destination table do not match up the way we want? Or what about going the other way—from a table to an expected data file format that isn't quite the same?

Fortunately, format files allow us to deal with several different kinds of variations we may have between source and destination data. Let's take a look.

Files with Fewer Columns Than the Table

Let's start with the situation where the data file has fewer fields than the destination table. We need to modify the format file we've already been using to identify which columns do not exist in the data file and, accordingly, which columns in our table should be ignored. This is done by setting the prefix and data length to 0 for each missing field and the table column number to 0 for each column we are going to skip.
For example, if, as one might expect, the data file has only DepartmentID, Name, and GroupName, you would modify the file to:

10.0

4

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 0 "," 0 ModifiedDate ""

As you can see, the ModifiedDate field and column have been zeroed out. Because ModifiedDate is not supplied and the column has a default value (GETDATE()), that default value will be used for our inserted rows.

The XML version doesn't look all that different, but, instead of zeroing out elements of the definition, we simply don't define them:

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

</ROW>

</BCPFORMAT>

There was no fourth field in the file to define, so we didn't. We aren't sticking anything in the ModifiedDate column, so we skipped that, too (counting on the default in its case).

More Columns in the File Than in the Table

The scenario for a data file that has more columns than the table does is actually amazingly similar to the short data file scenario we just looked at. The only trick here is that you must add column information for the additional fields, but the prefix length, data length, and column number fields are all set to 0:

10.0

5

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 8 "," 4 ModifiedDate ""

5 SQLDATETIME 0 0 "," 0 CreatedDate ""

This time, the host file includes a field for the date the department was created. The target table doesn't have a column to receive this information. The field is added to the original format file as a dummy entry with a column number of 0. This will force bcp to ignore the field.

For this one, the XML version does have to deal with the fact that the file has a column that needs to be addressed. The destination, however, we can continue to ignore:

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="4" xsi:type="NativeFixed" LENGTH="8"/>

<FIELD ID="5" xsi:type="NativeFixed" LENGTH="8"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="4" NAME="ModifiedDate" xsi:type="SQLDATETIME"/>

</ROW>

</BCPFORMAT>

Mismatched Field Order

Another possibility is that the host and target have the same fields, but the field orders don't match. This is corrected by changing the server column order to match the host file order:

10.0

4

1 SQLSMALLINT 0 2 "," 1 DepartmentID ""

2 SQLNCHAR 2 100 "," 3 GroupName

SQL_Latin1_General_CP1_CI_AS

3 SQLNCHAR 2 100 "," 2 Name

SQL_Latin1_General_CP1_CI_AS

4 SQLDATETIME 0 8 "," 4 ModifiedDate ""

In this case, the group name is listed before the department name in the source file. The server column order has been changed to reflect this. Notice that the order in which the server columns are listed has not changed, but the server column numbers have been swapped.

So, to translate this to XML, we just need to change a field or two versus our original XML file:

<?xml version="1.0"?>

<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"

xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

<RECORD>

<FIELD ID="1" xsi:type="NativeFixed" LENGTH="2"/>

<FIELD ID="2" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="3" xsi:type="NCharPrefix" PREFIX_LENGTH="2" MAX_LENGTH="100"

COLLATION="SQL_Latin1_General_CP1_CI_AS"/>

<FIELD ID="4" xsi:type="NativeFixed" LENGTH="8"/>

</RECORD>

<ROW>

<COLUMN SOURCE="1" NAME="DepartmentID" xsi:type="SQLSMALLINT"/>

<COLUMN SOURCE="2" NAME="GroupName" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="3" NAME="Name" xsi:type="SQLNVARCHAR"/>

<COLUMN SOURCE="4" NAME="ModifiedDate" xsi:type="SQLDATETIME"/>

</ROW>

</BCPFORMAT>

Using Format Files

As an example, let's use a format file for an import. This command will copy records into the Department table based on a file named shortdept.txt. We'll use ShortDept.fmt as our non-XML format file example, and ShortDeptX.fmt as our XML-based format file.

BCP AdventureWorks2008.HumanResources.Department in c:\shortdept.txt

-f c:\shortdept.fmt –Usa -Pbubbagump

Just for a change of flavor, the preceding example command line uses SQL Server authentication instead of Windows authentication.
If you prefer Windows authentication, just replace the –U and –P parameters with the –T we've used frequently.

The sample files used in this example, ShortDept.txt, ShortDept.fmt, and ShortDeptX.fmt, are available for download from the Wrox website or from ProfessionalSQL.com.

Maximizing Import Performance

One obvious way of maximizing bcp performance is to make sure that the target table meets all the requirements for running bcp as a non-logged operation. This may mean you need to:

 * Drop any existing indexes on the target table. While this is actually required only if you want a minimally logged operation, the fact is that leaving indexes off during a bulk operation is greatly beneficial performance-wise regardless of the logging status. Be sure, however, to rebuild your indexes after the bulk operation is complete.
 * Attempt to have your source data files created in the same order as your clustered index (if there is one). During your index rebuild, this will allow you to make use of the SORTED_DATA_REORG option, which greatly speeds index creation (and thus the overall time of your bcp operation). Even if you have to leave a clustered index in place, performing the bcp with sorted data will allow the use of the ORDER column option (within the –h hint option).
 * Make sure your database's recovery model is set to Simple or Bulk-Logged. If it is set to Full Recovery, then bcp will not be allowed a minimally logged operation.

If you're looking for additional improvement when importing data into a table, you can run parallel data loads from multiple clients. To do this, you must:

 * Use the TABLOCK hint.
 * Remove all indexes (you can rebuild them after the operation is complete).
 * Set the server recovery option to Bulk-Logged.

How would this work? Rather than importing one very large file, break it up into smaller files. Then you launch bcp from multiple client systems, each client importing one of the smaller files. Obviously, you will be interested in doing this only if the expected performance increase saves more time on the import than you'll spend preparing the source files and copying them to the clients.

Parallel loads were not supported for SQL Server 6.5 or earlier.

With either of these operations, it will be necessary to re-create any indexes on the target table after completing the operation. Re-create the target table clustered index (if any) before any non-clustered indexes.

You can get additional performance improvement by letting SQL Server ignore check constraints and triggers, the default option. Keep in mind that this can result in loading data that violates the table's check constraints and any data integrity rules that are enforced by your triggers.

BULK INSERT

One of the "cousins" that I mentioned at the beginning of the chapter was the BULK INSERT command. In order to make use of this command, you must be a member of either the sysadmin or bulkadmin server role.

BULK INSERT essentially operates like a limited version of bcp that is available directly within T-SQL. The syntax looks like this:

BULK INSERT [['<database name>'.]['<owner>'].]'<table name>'
FROM '<data file>'

[WITH

(

[BATCHSIZE [= <batch size>]]

[, CHECK_CONSTRAINTS]

[, CODEPAGE [={'ACP'|'OEM'|'RAW'|'<code page>'}]]

[, DATAFILETYPE [={'char'|'native'|'widechar'|'widenative'}]]

[, FIELDTERMINATOR [= '<field terminator>']]

[, FIRSTROW [= <first row>]]

[, FIRE_TRIGGERS]

[, FORMATFILE = '<format file path>']

[, KEEPIDENTITY]

[, KEEPNULLS]

[, KILOBYTES_PER_BATCH [= <kilobytes per batch>]]

[, LASTROW [= <last row>]]

[, MAXERRORS [= <max errors>]]

[, ORDER ({column [ASC|DESC]} [,...n])]

[, ROWS_PER_BATCH [= <rows per batch>]]

[, ROWTERMINATOR [= '<row terminator>']]

[, TABLOCK]

[, ERRORFILE = '<file name>']

)

]

Now, if you are getting a sense of déjà vu, then you're on top of things for sure. These switches pretty much all have equivalents in the basic bcp import syntax with which we started off the chapter.

The special permission requirements of BULK INSERT are something of a hassle (not everyone belongs to sysadmin or bulkadmin), but BULK INSERT does carry with it a couple of distinct advantages:

 * It can be enlisted as part of a user-defined transaction using BEGIN TRAN and its associated statements.
 * It runs in-process to SQL Server, so it should pick up some performance benefits there as it avoids marshalling.
 * It's slightly (very slightly) less cryptic than the command-line syntax used by bcp.

The big issue with BULK INSERT is just that: It's bulk insert. BULK INSERT will not help you build format files. It will not export data for you. It's just a simple and well-performing way to get bcp functionality for moving data into your database from within SQL Server.

OPENROWSET (BULK)

Yet another cousin to bcp, but this one is a far more distant one. You can think of this cousin as being from the side of the family that got most of the money and power. (In case you can't tell, I like this one!) OPENROWSET (BULK) marries the bulk rowset provider with OPENROWSET's ability to be used within queries for fast and relatively flexible access to external files without necessarily needing to load them into an intermediate table.

One of the more common uses for bcp is to load external data files for use by some periodic process. For example, you may receive files that contain things like credit reports, vendor catalogs, and other data that is placed in a generic format by a vendor. This is vital information to you, but you're more interested in a one-time interaction with the data than in truly importing it. OPENROWSET (BULK) allows the possibility of treating that file—or just one portion of that file—as a table. What's more, it can utilize a format file to provide a better translation of the file layout than a simple linked table might provide. The syntax looks like this:

OPENROWSET

( BULK '<data file>',

{ [ FORMATFILE = '<format file path>' ]

[, CODEPAGE [={'ACP'|'OEM'|'RAW'|'<code page>'}]]

[, FIRSTROW [= <first row>]]

[, LASTROW [= <last row>]]

[, MAXERRORS [= <max errors>]]

[, ROWS_PER_BATCH [= <rows per batch>]]

[, ERRORFILE = '<file name>']

| SINGLE_BLOB | SINGLE_CLOB | SINGLE_NCLOB }

)

Keep in mind that OPENROWSET is more of a bulk access method than an insert method. You can most certainly do an INSERT INTO where the source of your data is an OPENROWSET (indeed, that's often how it's used), but OPENROWSET has more flexibility than that. Now, with that in mind, let's look at a couple of important bulk option issues when dealing with OPENROWSET.

ROWS_PER_BATCH

This is misleading. The big thing to remember is that, if you use this, you are essentially providing a hint to the Query Optimizer.
SQL Server will always process the entire file, but whatever you put in this value is going to be a hint to the Optimizer about how many rows are in your file. Try to make it accurate or leave it alone.

SINGLE_BLOB, SINGLE_CLOB, SINGLE_NCLOB

These say to treat the entire file as one thing—one row with just one column. The type will come through as varbinary(max) for SINGLE_BLOB, varchar(max) for SINGLE_CLOB, and nvarchar(max) for SINGLE_NCLOB. Because no conversion is applied, SINGLE_BLOB is the safe choice when the file's Windows encoding matters. SINGLE_CLOB will assume that your data is ASCII, and SINGLE_NCLOB will assume it is Unicode.

Summary

In this chapter, we looked at the first of our two major data import/export utilities. bcp is used primarily for importing and exporting data stored as text files to and from our SQL Server. We also took a look at some of bcp's brethren.

As a legacy utility, bcp will be familiar to most people who have worked with SQL Server for any length of time. Microsoft continues to enhance the core technology behind bcp, so I think it's safe to say that bcp is here to stay.

That said, bcp is quite often not your best option. Be sure to check your options with BULK INSERT (and the benefits of running in-process to SQL Server) as well as OPENROWSET (BULK).

In our next chapter, we will take a look at bcp's major competition—SQL Server Integration Services (SSIS). SSIS has the glamour and glitz that bcp is missing, but it also has its own quirks that can occasionally make the simplicity of bcp seem downright appealing.

16

Getting Integrated

SQL Server Integration Services—or SSIS—is a tool that is a descendant of another tool called Data Transformation Services—or DTS. Remembering DTS is important particularly because of how revolutionary it was at the time it was released (in early 1999 as part of SQL Server 7.0). Never before had a significant tool for moving and transforming large blocks of data been included in one of the major Relational Database Management Systems (RDBMSs). All sorts of things that were either very difficult or required very expensive third-party tools were suddenly a relative piece of cake. As we fast-forward to the SQL Server 2008 era, what is now called SSIS (the name was changed when the service was totally rewritten as part of SQL Server 2005) is still relatively unique in terms of making such an important tool so accessible.

In this chapter, we'll be looking at how to perform basic import and export of data, and we'll discuss some of the other things possible with tools like Integration Services. We will place our primary focus on the basics of SSIS packages, setting us up for a more advanced discussion of SSIS programmability in the Web-based chapter, Chapter 25.

Understanding the Problem

The problems being addressed by Integration Services exist in at least some form in a large percentage of systems—how to get data into or out of our system from or to foreign data sources. It can be things like importing data from the old system into the new, or a list of available items from a vendor—or who knows what. The common thread in all of it, however, is that we need to take data that doesn't necessarily match our table definitions and get that data into those tables anyway.

What we need is a tool that will let us Extract, Transform, and Load data into our database—a tool that does this is usually referred to simply as an "ETL" tool. Just how complex a problem this kind of tool can handle varies, but SQL Server Integration Services—or SSIS—can handle nearly every kind of situation you may have.
This may bring about the question, "Well, why doesn't everybody use it, then, since it's built in?" The answer largely comes down to how intuitive it is, particularly in a cross-platform environment. There are third-party packages out there that are much more seamless and have fancier UI environments. These are really meant to allow unsophisticated users to move data around relatively easily—they are also outrageously expensive. Under the old DTS product, I actually had customers that were Oracle- or other DBMS-oriented but purchased a full license for SQL Server just to make use of DTS. While the price of competing packages has come down, and SQL Server licensing prices have gone up, I'm sure that there are still SQL Server licenses out there that exist largely because of the need for SSIS.

An Overview of Packages

SSIS utilizes the notion of a "package" to contain a set of things to do. Each individual action is referred to as a "task." You can bundle up a series of tasks and even provide control-of-flow choices to conditionally run different tasks in an order of your choosing (for example, if one task were to fail, then run a different task). Packages can be created programmatically (using a rather robust object model that we will take an introductory look at in Chapter 25), but most initial package design is done in a designer that is provided with SQL Server.

Let's go ahead and create a simple package just to get a feel for the environment. To get to SSIS, you need to start the SQL Server Business Intelligence Development Studio from the Programs⇒Microsoft SQL Server 2008 menu on your system—then select Integration Services Project as your project type, as shown in Figure 16.1.

To be honest, I'm still not a fan of the Integration Services modeler being in the Business Intelligence Development Studio rather than the Management Studio. Nonetheless, Microsoft has this nasty habit of not consulting me before they move their tools around, so I guess we'll have to live with it!

So, to reiterate, the SSIS tool is in the Business Intelligence Development Studio (much like the Reporting Services–related items)—not in Management Studio, as most items we've looked at have been.

The exact look of the dialog in Figure 16.1 will vary depending on whether you also have Visual Studio installed and, if so, what parts of Visual Studio you included in your installation.

In this case, I've named my project an ever-so-descriptive "SSISProject"—from there, I simply click OK, and SQL Server creates the project and brings up the default project window, shown in Figure 16.2, for SSIS-related projects.

Figure 16.1

Figure 16.2

For those of you used to the Visual Studio environment, you should feel relatively at home. The only significant difference versus most Dev Studio projects is that, as we build the project, the design tab will be graphical in nature rather than in code.

There are four key windows in our project, so let's start by looking at these. We will then do a walkthrough example later in the chapter.

Tasks

On the left side of our project (depending on your settings, you may have to click a tab to expand it), we have the toolbox window.
The Control Flow Items list is at the top and thus what you first see, but you should also be able to find a section on Maintenance Plan tasks by scrolling down. (These are more in the realm of the administrator, but you should take note of them—they underline my earlier notion that Integration Services is not just about ETL activities but also about a wide array of other actions, including many that you might have expected to find in Management Studio.) You'll notice that many of these Control Flow Items entries are labeled as "tasks."

A task, much as the word implies, is generally an action that you want to take. They range from migration tasks (such as moving objects between servers) to data migration and transformation to tasks that manage the execution of other programs or packages. Though most are called tasks, you will also find some container objects that help organize or wrap the other objects in your package.

It's worth noting that you can reorganize the tasks. You can, for example, drag and drop tasks in the task list to reorder them (perhaps to move those you use most often up to the top where they are more visible), or create your own tabs to contain those tasks you use the most. In addition, you can add new tasks to the list much as you can add new controls to other Dev Studio projects. In short, the environment is very customizable.

There are a ton of tasks here, so let's take a quick look at what the base tasks do.

Task | Description
---|---
Pointer | Okay, it's silly to even have to describe this, but just in case: This puts things into a generic drag-and-drop mode. When the pointer is selected, clicking in the designer pane implies that you merely want to select an object that is already there as opposed to adding a new one.
For Loop Container | This is nothing more than a glorified FOR (or FOR/NEXT, depending on your language of choice) statement. The For Loop container allows you to initialize a control counter and set the conditions by which that counter is adjusted, as well as under what conditions you exit the loop. Use this task to allow for controlled repetition of other tasks.
For Each Container | Again, this is your run-of-the-mill FOR/EACH statement. Like the For Loop, it allows for controlled repetition, but, this time, rather than using a counter, the loop is based on iterating through a collection of some sort (perhaps a collection of tables or other objects). The object list can come from a wide variety of sources, ranging from such things as ADO and ADO.NET rowsets to SMO object lists.
Sequence Container | I think of this one as something of a "sub-package." The Sequence container allows you to group up tasks and treat them as a single unit. This is useful for things like wrapping several tasks into a single transaction (thus allowing your overall package to contain several separate transactions—each potentially having many tasks to perform). Individual Sequence containers can be made active or inactive conditionally, so you could, for example, turn off an entire set of tasks by disabling that Sequence container (you could even do that programmatically, based on conditions found in previous tasks!).
Script Tasks | One of those "what it sounds like" things—these let you run your own custom code using any ActiveX scripting language (JavaScript or VBScript, for example) or any .NET-based language. Use the ActiveX Script task for ActiveX languages, and use the Script task for .NET code.
Analysis Services Tasks | These allow you to construct or alter Analysis Services objects as well as execute them.
Bulk Insert Task | As you might guess, this allows for the bulk importing of data. It uses the same BULK INSERT facilities that we touched on in the bcp chapter, but allows the bulk operation to be part of a larger control flow. The Bulk Insert task is easily the fastest way for an SSIS package to get data into your system. Note, however, that any package containing a Bulk Insert task can be run only by a login that is a member of the sysadmin server role.
Data Flow Task | The Data Flow task wraps the connection between data sources along with any transformations you want to make in moving data between those data sources. The Data Flow task is among the most complex tasks in SSIS in that it operates as both a task and a container. It is a container in the sense that you associate several parts of a given data flow with it. Data Flow tasks define sources and destinations of data, as well as the transformations to take place between the source and destination. Editing Data Flow tasks will automatically take you to a different tab within the main editing window.
Data Mining Query Task | This task requires that you have already defined Data Mining Models in Analysis Services. You can utilize this task to run predictive queries and output the results into tables (you could then define additional tasks to make use of those tables).
Execute Tasks | These are somewhat specific to what you want to execute. They can range from running other packages (there are separate tasks for running old DTS packages versus the newer SSIS packages) to executing external programs to running SQL scripts.
File System Tasks | These allow you to create, move, and delete files and directories. In a wide variety of SSIS environments, the ability to transfer files is key to both performance and execution of your package. For example, you may need to copy a file from a remote location to local storage for performance reasons as you perform operations against that file. Likewise, you may only have network access that allows you to read or to create a file, but not to change it—File System tasks allow you to get just the right thing done.
FTP Tasks | This is something of a different slant on the File System tasks notion. This one, however, allows you to use the FTP protocol to retrieve files (very handy for doing things like transferring files to or from vendors, customers, or other partners).
Message Queue Task | This allows you to send and receive messages via Microsoft Message Queue. This is actually a very powerful tool that allows for the delivery and/or receipt of files and other messages even when the remote host is not currently online. Instead, you can "queue" the file, and that host can be notified that the file is available the next time it is online. Likewise, files can be left in the queue for your process to pick up when you execute the package.
Send Mail | Yup—yet another of those "what it sounds like" things. This allows you to specify a mail message, including attachments that may have been created earlier in your package execution. The only real trick on this one is that you must specify an SMTP connection (basically, the outbound mail server) to use to send the mail. SSL and Windows-based authentication are also supported.
Transfer Tasks | These range from server migration tasks, such as transferring logins, error messages, and master database stored procedures, to more straightforward transfers such as transferring a table.
Web Service Task | This allows you to execute a Web service method and retrieve the result into a variable. You can then make use of that result in the remaining tasks in your package.
WMI Tasks | Windows Management Instrumentation (WMI) is an API that allows for system monitoring and control. It is a Windows-specific implementation of Web-Based Enterprise Management (WBEM), which is an industry standard for accessing system information. SSIS includes tasks for monitoring WMI events (so you can tell when certain things have happened on your system) and for requesting data from WMI in the form of a WMI query. You could, for example, ask WMI what the total system memory is on your server.
XML Tasks | XML tasks allow for a wide variety of XML manipulation. You can apply XSLT transformations, merge documents, filter the XML document using XPath, and the list goes on.
Maintenance Tasks | Much of this is outside the scope of this book, but this set of tasks allows you to perform a wide variety of maintenance tasks on your server. From a developer perspective, a key use here would be things like a backup prior to a major import or another similar activity that is part of your package. Similarly, you may want to do index rebuilds or other maintenance after performing tasks that do major operations against a particular table.

The Main Window

This window makes up the center of your default SSIS package window arrangement in Dev Studio. The thing to note is that it has four tabs available, and each is something of its own realm—so let's take a look at each of them.

It's worth noting that you can change from the default tab-style interface to a window-based interface if you so choose. (It's in the options for Visual Studio.)

Control Flow

This is actually where the meat of your package comes together. No, a package isn't made up of flow alone, but this is where you initially drag all your tasks in and establish the order in which they will execute.

Data Flow

As you place data flow objects into the Control Flow pane, they become available for further definition in the Data Flow pane. Data Flow tasks require additional objects to define such things as data connections, sources, and destinations of data, as well as actual transformations.

Event Handlers

SSIS packages create a ton of events as they execute, and this tab allows you to trap certain events and act upon them. Some of the more key events worth trapping include:

Event | Description
---|---
OnError | This is a glorious new feature with SSIS. DTS had a quasi–error handler, but it was weak at best. This gives you something far more robust.
OnExecStatusChanged | This event is triggered any time the task is going into a different status. The possible statuses are idle, executing, abend (abnormal ending), completed, suspended, and validating. You can set traps for each of these conditions and have code run accordingly.
OnPostExecute | This one fires immediately after execution of the task is complete. In theory, this is the same as OnExecStatusChanged firing and having a status of completed, but I have to be honest and say I haven't tested this enough to swear to it.
OnProgress | This event is called regularly when any reasonably measurable progress happens in the package.
This one is probably more useful when you're controlling a package programmatically than through one of the other execution methods, but it is nice from the standpoint of providing a progress bar for your end users if you need one.

There are several other event methods available, but the preceding gives you a flavor of things.

Package Explorer

I find the location of this one to be a little odd. In a nutshell, this one presents a tree control of your package, complete with all the event handlers, connections, and executables (which include any tasks you have defined in the package). The reason I find this one a little odd is that I would have expected something like this to be part of, or at least similar to, Solution Explorer. Nonetheless, it does give you a way of looking over your project at an overall package level.

Solution Explorer

This is pretty much just like any other explorer window for Dev Studio. You get a listing of all the files that belong to your solution, broken down by their nature (packages and data source views, for example).

The Properties Window

This one is pretty much the same as any other property window you've seen throughout SQL Server and Dev Studio. The only real trick here is paying attention to what exactly is selected so you know what you're setting properties for. If you've selected an object within the package, then it should be that particular task or event object. If you have nothing selected, then it should be the properties for the entire package.

Building a Simple Package

Okay, it's time for us to put some application to all this. This is going to be something of a quick-and-dirty example run, but, in the end, we will have shown off several of the key features of SSIS.

Let's start with a little prep work. For this sample, we're going to be making use of a vbScript file that will generate some data for us to import. You can think of this script as simulating any kind of preprocess script you need to run before a major import or export.

Create a text file called CreateImportText.vbs with the following code:

Dim iCounter

Dim oFS

Dim oMyTextFile

Set oFS = CreateObject("Scripting.FileSystemObject")

Set oMyTextFile = oFS.CreateTextFile("C:\TextImport.txt", True)

For iCounter = 1 to 10

oMyTextFile.WriteLine(cstr(iCounter) & vbTab & """TestCol" & _

cstr(iCounter) & """")

Next

oMyTextFile.Close

This script, when executed, will create a new text file (or replace the existing file if it's there). It will add 10 rows of text to the file, containing two tab-separated columns with a newline row terminator. We will use this in conjunction with a few other tasks to create and populate a table in SQL Server.

In its default form, the CreateImportText.vbs file will try to write the text file it creates to the root directory of the C drive. If your system has User Account Control enabled, you may be prevented from running the script; if so, just move it to a directory below the root and adjust the later paths in this example.

This is a pretty simplistic sample, so please bear with me here. What, in the end, I hope you see from this is the concept of running a preprocess of some sort (that's our script that generates the file in this case, but it could have been any kind of script or external process), followed by SQL Server scripting and data pump activities.

With our sample vbScript created, we're ready to start building a package.
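If you'd like to sanity-check the script before the package runs it for you, you can execute it by hand from a command prompt using cscript (the console-based script host that ships with Windows) and then take a peek at the output file. Adjust the paths, of course, if you moved things out of the root directory:

cscript //nologo C:\CreateImportText.vbs

type C:\TextImport.txt

You should see 10 rows, each with a row number, a tab, and a quoted TestCol value.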
Let's start with the SSISProject project file we created in the previous section. At this point, our Control Flow should be empty. In order to get the proverbial ball rolling on this, we need to make a call to our vbScript to generate the text file we will be importing. Drag an Execute Process Task from the Toolbox into the main Control Flow window. Very little will happen other than SSIS adding the Execute Process Task to the Control Flow window, as shown in Figure 16.3.

Figure 16.3

To do much with our new task, we need to double-click the task to bring up the Execute Process Task Editor shown in Figure 16.4.

Figure 16.4

Note that I've switched to the Process options because they are a bit meatier to show in a screenshot than the General options are, but here's an overview of how we want to set things up:

Option | Setting
---|---
General⇒Name | GenerateImportFile
General⇒Description | Generates the text file for import
Process⇒Executable | CreateImportText.vbs (prefix it with the full path to your script file)
Process⇒Working Directory | C:\ (or other directory of your choosing—just make sure you're consistent)

When you're done making these changes, click OK, and very little will have changed in your Control Flow window, except that the name of the task will have been updated to GenerateImportFile.

Next, drag an Execute SQL Task object into your Control Flow. Now, select the GenerateImportFile task, and it should have an arrow hanging from the bottom of the task box, as shown in Figure 16.5.

Figure 16.5

Now comes the tricky part: Click the "output" of our GenerateImportFile task—that is, click the end of the little arrow. Drag the arrow into the top of the Execute SQL Task, and the builder should connect the two tasks (as shown in Figure 16.6)—notice how the arrow indicates the control flow.

Figure 16.6

For the moment, let's look at what this arrow represents. Double-click it, and you'll get a Precedence Constraint Editor, as shown in Figure 16.7.

Figure 16.7

Notice how it defines under what conditions this flow will be allowed to happen. In our case, it will move on to the Execute SQL Task only if our GenerateImportFile task completes successfully. We could define additional flows to deal with such things as the task failing, or allow our second task to run on completion of the first task regardless of whether it succeeds or fails.

Cancel back out of this dialog, and double-click the Execute SQL Task to bring up the Execute SQL Task Editor, as shown in Figure 16.8.

Figure 16.8

Again, I've edited the name a bit. Next, click the SQLStatement option to bring up the Enter SQL Query dialog shown in Figure 16.9.

We're checking to see whether the table already exists and, if it does, dropping it. Then, knowing that the table cannot already exist (if it did, we just dropped it), we go ahead and create our destination table.

One last thing we need is a connection to work with. Start by clicking in the Connection option and selecting New Connection. This will bring up a connection manager dialog. (If you were paying attention, you may have noticed a connection manager pane below the main pane for our package—this is essentially the same functional area.) Our package doesn't have any connections yet, so we need to click New again to get a somewhat run-of-the-mill OLE DB connection manager dialog.
How I've filled out the connection manager dialog is shown in Figure 16.10, but adjust yours to match your database server name (the simple period "." in mine implies that I mean my local server) and security model.

Figure 16.9

Figure 16.10

We're now able to connect to our database to create the destination table. And, with source data created and a destination table in place, we're ready to start working on actually transferring the data from our source to our destination. For that, we're going to utilize a Bulk Insert task, so go ahead and drag one of those into our model and connect the CreateTable task to the new BulkImport task, as shown in Figure 16.11.

Figure 16.11

Again, double-click our task (the Bulk Insert Task in this case) to bring up a relevant editor box. Of particular interest is the Connection tab shown in Figure 16.12.

Figure 16.12

We have several things to change here. For example, I've already changed the Row Delimiter to be the line feed that is written by our vbScript's WriteLine command. We do, however, need to do even more. Start by selecting the same connection you created to run the CREATE TABLE statement against. Then enter our destination table name ([AdventureWorks].[dbo].[TextImportTable]).

Note that the table must already exist for you to reference it in this dialog. I just manually run the CREATE statement once to prime the database and make sure anything that needs to reference the table at compile time can do so. This should do no harm, since the process will drop the table and create a new one each time anyway.

Finally, click in the File connection box and select New Connection to bring up the File Connection Management Editor for text files shown in Figure 16.13.

Figure 16.13

Notice the error that the file doesn't exist. This is the same issue that we had with the TextImportTable table. Either create an empty dummy file or run the CreateImportText.vbs file once to get an initial file out there; then refresh, and this error should go away.

Click OK all the way back out to our Control Flow, and we're ready to rock.

To execute our package immediately, click the run icon (the green arrow on the toolbar). Watch how Dev Studio indicates progress by changing the color of different tasks as they run.

A few more items of note: SSIS is capable of running multiple tasks at one time for you. For example, I made this project entirely linear (one piece at a time) based on the idea that we didn't want to drop the destination data until the last minute (when we are sure there's new data available). We could, however, have placed the link from the file generation directly to the bulk import, just the same as the CREATE TABLE dependency is linked directly to the import. If we had, SQL Server would have run both the table DROP/CREATE and the file creation at the same time but waited for both to complete before allowing the bulk import to execute.

Go ahead and build your package (choose the Build option in the Build menu), as we will be utilizing it in the next section!

Executing Packages

There are a few different ways to execute an SSIS package. We utilized one of these in something of a test mode within the Dev Studio, but this is hardly how you are likely to run your packages on a day-to-day basis.
The more typical methods of executing a package include:

* The Execute Package Utility: This is essentially an executable in which you can specify the package you want to execute, set up any required parameters, and have the utility run it for you on demand.
* As a Scheduled Task using the SQL Server Agent: I'll talk more about the SQL Server Agent in Chapter 22, but for now, realize that executing an SSIS package is one of the many types of jobs that the agent understands. You can specify a package name and the time and frequency with which to run it, and the SQL Server Agent will take care of it.
* From Within a Program: There is an entire object model supporting the notion of instantiating SSIS objects within your programs, setting properties for the packages, and executing them. This is fairly detailed stuff—so much so that Wrox has an entire book on the subject: Professional SQL Server 2008 Integration Services by Knight et al. (Wiley, 2009). We take a fast and dirty look at this in Chapter 25 (downloadable as special web content from either p2p.wrox.com or professionalsql.com), but if SSIS programmability is what you need, I recommend taking a look at Brian's work.

Using the Execute Package Utility

The Execute Package Utility is a little program by the name of DTExecUI.exe. You can fire it up to specify settings and parameters for existing packages and then execute them. You can also navigate using Windows Explorer to find a package in the file system (they end in .DTSX) and then double-click it to execute it. Do that to our text import package, and you should get the execute dialog shown in Figure 16.14.

Figure 16.14

As you can see, there are a number of different dialogs that you can select by clicking the various options to the left. Coverage of this could take up a book all to itself, but let's look at a few of the important things on several key dialogs within this utility.

General

Many fields on this first dialog are fairly self-explanatory, but let's pay particular attention to the Package Source field. We can store SSIS packages in one of three places:

* The File System: This is what you did with your Import/Export Wizard package. This option is really nice for mobility—you can easily save the package off and move it to another system.
* SQL Server: This one stores the package in SQL Server. Under this approach, your package will be backed up whenever you back up your MSDB database (which is a system database in every SQL Server installation).
* SSIS Package Store: This storage model provides the idea of an organized set of "folders" where you can store your package along with other packages of the same general type or purpose. The folders can be stored in either MSDB or the file system.

Configurations

SSIS allows you to define configurations for your packages. These are essentially a collection of settings to be used, and you can actually combine more than one of them into a suite of settings.

Command Files

These are batch files that you wish to run as part of your package. You can use these to do system-level things such as copying files around to places you need them (they will run under whatever account the Integration Services service is running under, so any required access on your network will need to be granted to that account).

Connection Managers

This is a bit of a misnomer—this isn't so much a list of connection managers as it is a list of connections.
By taking a look at the Description column, you'll see many of the key properties for each connection your package uses. Notice that in our example package, we have two connections; if you look closely, you'll see how one relates to file information (for our connection to the flat file we're using) and the other specifically relates to SQL Server (the destination we're importing into).

Execution Options

Do not underestimate the importance of this one. Not only does it allow you to specify how, at a high level, you want things to happen if something goes wrong (if there's an error), but it also allows you to establish checkpoint tracking—making it easy to see when and where your package is getting to different execution points. This can be critical in performance tuning and debugging.

Reporting

This one is all about letting you know what is happening. You can set up feedback; exactly how much feedback you get is based on which events you decide to track and the level of information you establish.

Logging

This one is fairly complex to set up and get going but has a very high "coolness" factor in terms of giving you a very flexible architecture for tracking even the most complex of packages.

Using this area, you can configure your package to write log information to a number of preconfigured "providers" (essentially, well-understood destinations for your log data). In addition to the preinstalled providers such as text files and even a SQL Server table, you can even create your own custom providers (not for the faint of heart). You can log at the package level, or you can get very detailed levels of granularity and write to different locations for different tasks within your package.

Set Values

This establishes the starting value of any runtime properties your package uses (there are none in our simple package).

Verification

Totally different packages can have the same filename (just in a different spot in the file system, for example). In addition, packages have the ability to retain different versions of themselves within the same file or package store. The Verification dialog is all about filtering or verifying what package/version you want to execute.

Command Line

You can execute SSIS packages from the command line using DTExec.exe (handy when, for example, you're trying to run SSIS packages out of a batch file). This option within the SSIS Package Execution Utility is about specifying parameters you would have used if you had run the package from the command line.

The utility will establish most of this for you—the option here is just to allow you to perform something of an override on the options used when you tell the utility to Execute.

Executing the Package

If you simply click Execute in the Package Execution Utility, your package will be off and running. After it runs, you should find a text file in whatever location you told your package to store it—open it up, take a look, and verify that it was what you expected.

Executing within Management Studio

While Management Studio doesn't give you a package editor, it does give you the ability to run your packages.

In the Object Explorer pane of Management Studio, click the Connect icon and choose Integration Services. Fill out the connection dialog. This should create a connection to Integration Services on that server and add an Integration Services node in your Object Explorer.
+ +To execute a package in this fashion (using Management Studio), the package must be local to that server (not in the file system). Fortunately, if you right-click the File System node under Stored Packages, SQL Server gives you the ability to import your package. Simply navigate the file system to the package we created, give it a name in the package store, and import it. You can then right-click and execute the package at any time. (It will bring up the execution utility we saw in a previous section, so you should be in familiar territory from here.) + +Summary + +SQL Server Integration Services is a robust Extract, Transform, and Load tool. You can utilize Integration Services to provide one-off or repeated import and export of data to and from your databases—mixing a variety of data sources while you're at it. + +In this chapter, we actually went just slightly beyond the basics—touching on external access and multi-stage control of flow. While becoming expert in all that Integration Services has to offer is a positively huge undertaking, getting basic imports and exports up and running is a relative piece of cake. I encourage you to start out simple and then add to it as you go. As you push yourself further and further with what SSIS can do, take a look at other books that are specific to what SSIS has to offer. +17 + +Replication + +Coming off the heels of significant change in 2005, replication is one of a few quiet areas in terms of version differences in SQL Server 2008. Indeed, virtually nothing has changed that isn't directly tied to a non-replication feature. (They had to allow for replication of the new data types, didn't they?) + +Replication is one of those things that everyone loves to ignore—until they need it. Then, it seems, there is a sudden crisis about learning and implementing it instantly (and not necessarily in that order, I'm sorry to say). + +So, what then, exactly, is replication? I'll shy entirely away from the Webster's definition of it and go to my own definition: + +Replication is the process of taking one or more databases and systematically providing a rule-based copy mechanism for that data to and potentially from a different database. + +Replication is often a topology and administration question. As such, many developers have a habit of ignoring it—bad idea. Replication has importance to software architects in a rather big way, as it can be a solution to many complex load and data distribution issues such as: + + * Making data available to clients that are generally not connected to your main network + * Distributing the load associated with heavy reporting demands + * Addressing latency issues with geographically dispersed database needs + * Supporting geographic redundancy + +And those are just a few of the biggies. + +So, with that in mind, we're going to take a long look at replication. I'm going to warn you in advance that this isn't going to have quite as many walkthroughs as I usually do, but patience, my young padawan—there is a reason. In simple terms, once you've built one or two of the styles of replication, you have most of the "constructing" part of the learning out of the way. What's more, the actual building up of the replication instance is indeed mostly an administrator's role. 
Instead, we're going to focus on understanding what's happened, and, from there, save most of the space in this chapter for understanding how different replication methods both create and solve problems for us and how we might use the different replication models to solve different problems. + +In this chapter we will look at things like: + + * General replication concepts + * What replication models are available (we will see an example or two here) + * Security considerations + * Replication Management Objects (RMO)—the programmatic way of managing replication + +In the end, while I can't promise to make you a replication expert (to be honest, I'm not really one myself), you will hopefully have a solid understanding of the fundamentals and have a reasonable understanding of the possibilities. + +Replication Basics + +Replication is like a big puzzle—made up of many pieces in order to form a complete unit. We have topology considerations (publisher, subscriber, and distributor) as well as publication models (merge, transactional, snapshot). Before you get to deciding on those, there are several things to take into account. + +Considerations When Planning for Replication + +There are a number of things to take into account when thinking about the topology and replication methods available. These should be part of an assessment you make at design time to determine what forms of replication should even be considered for your application. Among these are: + + * Autonomy + * Latency + * Data consistency + +Let's take a quick look at each of these. + +Autonomy + +Autonomy is all about how much a replication instance is able to run as its own thing. What data needs to be replicated and at what frequency? For example, you could be supporting a sales application where each site keeps separate customer records. You would want to have these replicated to a central database for reporting and, perhaps, such other things as automatic stock replacement. Each site is highly autonomous (they really don't care whether the central database gets its data or not; they can still continue to make sales based on the data they have on-site). Indeed, even the central database, while dependent, is probably not in a catastrophic situation if it misses data from a site for a day (depends how you're using the reports that come off it or how much lag you can have before you restock). + +Latency + +Latency refers to the time delay between updates; in other words, the time taken for a change at the publishing server to be made available at the subscribing server. The higher the autonomy between sites, the greater the latency between updates can be. + +Determining an acceptable delay can be tricky and will likely be tied into the aforementioned autonomy question. If our site information is only transmitted to the central server for periodic rollup reporting, then we can probably get away with only daily—or even longer—updates. If, however, the sites are drawing from a central shipping facility for some of the sales, then we need to update the central database in a timelier manner, so a product is not oversold (two sites trying to sell the one remaining piece of stock). + +Data Consistency + +Data consistency is obviously going to be a key concern of virtually any distributed system. 
This is, of course, all about making sure that your various replication instances contain the same values from end to end, and this can be accomplished in two ways:

* Data Convergence: All sites eventually end up with the same values; however, the values aren't necessarily the same as they would be if all of the changes had taken place on one server. An example might be our oversold situation. Had our two sales happened on the same server, the second sale would have known about the out-of-stock situation and perhaps not been completed. Instead, each database thought one item was available, and, depending on the way the inventory adjustment is handled, you may wind up with a negative inventory level. In the same vein, your data may wind up with exactly the same end value but may have taken a different set of steps to arrive at that value (the actual ordering of the updates may not be the same depending on how many replication clients were involved and at what time they synchronized).
* Transactional Consistency: The results at any server are the same as if all transactions were executed on a single server. This is implemented by the mechanism implied in the name—transactions. I'm sure, if you ponder this for a bit, you can recognize the latency impact (both good and bad) of this—before your transaction can complete, it has to complete on every server that is participating in that particular replication set.

Schema Consistency

Many developers who are used to developing in non-replicated environments take the ability to easily change the database schema for granted. Need to add or drop a column? No problem. Need to add a new table? No big deal. Well, beyond the basic problems of being so cavalier with your database in any environment, you'll quickly find that life gets a bit more complicated in a replicated world.

Replication or not, remember that any time you alter the schema of your table, you are essentially altering the foundation of your entire system (or at least the part that the schema object in question serves). Schema changes should always be treated as fairly serious alterations and be carefully considered as well as methodically planned. Some changes (additions in particular) can usually be made with relatively minor collateral impact. Things that change or remove existing objects, however, can be deadly when dealing with backward-compatibility issues. Also, keep in mind that others may have built "extensions" to your system that rely on your existing schema; this can mean impacts that are hard to plan for when you change your existing schema.

The good news is that SQL Server continues to increase its support for schema changes during replication. Columns that are added or dropped on the publisher may be propagated to all subscribers during future replication operations. The bad news is that your change procedures need to be much stricter. The bottom line is that, if you need to make frequent schema changes, you'll want to fully plan what your change strategy is going to be before implementing replication at all.

When the concept of replicating schema changes was first added to SQL Server, it was done through the use of special stored procedures called sp_repladdcolumn and sp_repldropcolumn rather than the more familiar ALTER TABLE command. This was changed back in SQL Server 2005, and sp_repladdcolumn and sp_repldropcolumn should be considered deprecated (avoid using them).
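If you're curious what this looks like in practice, the modern pattern is to make sure the publication allows DDL replication and then just issue a normal ALTER TABLE at the publisher. A minimal sketch, assuming a hypothetical transactional publication named InventoryPub (replicate_ddl is on by default for new publications):

-- Make sure the publication propagates schema changes (1 = on)
EXEC sp_changepublication
   @publication = N'InventoryPub',
   @property = N'replicate_ddl',
   @value = 1;

-- A plain ALTER TABLE at the publisher now flows out to subscribers
-- during the next synchronization
ALTER TABLE dbo.Inventory
   ADD QuantityReserved int NOT NULL DEFAULT 0;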
+ +Other Considerations + +Some other things to think about include: + + * How reliable is the connection between your servers? If it is a local connection, then you can probably count on it, but what if it is in a different geographic location? What if it's a different country? + * What kind of connection latency do you have? This falls somewhat into the reliability question, but is really its own issue. Do you really want to enforce transactional replication if it takes even a second or two for a simple ping to return (imagine that with a block of data now)? + * In the same vein as connection latency, how much bandwidth do you have? How much traffic are you going to be flushing over the wire, and what other processes are going to be using that same wire? Do you need to compress your replication related data? + * Is the replication method wired at all? That is, what if you don't have connectivity at all with the servers you want to replicate to? SQL Server supports a disconnected model, but what does that do to you between long updates? + +Replication Roles + +The process of replication is based on three basic roles: The publisher, distributor, and subscriber. Any one server can potentially be serving any one (or any subset) of these roles. Just to paint a picture of how flexible this can be, take a look at Figure 17.1. + +As you can see, multiple publishers can be utilizing the same distributor, and any given publication can have multiple subscribers. Let's take a little bit closer look at these roles. + +The Publisher + +The publisher can be considered to be the source database. Even in situations where the publisher and its various subscribers are sharing data equally, there is one database that can be thought of as something of the control database. + +The Distributor + +The distributor serves as something of the clearinghouse for changes. It has a special distribution database that keeps track of changes, as well as which subscribers have already received those changes. In addition, it will keep track of the results of any synchronization process and will know what happened in the case of any conflicts that had to be resolved (we'll look more into conflict resolution later). + +Figure 17.1 + +The Subscriber + +Any database that is participating in the replication publication, but is not the actual publisher, can be considered a subscriber. This does not, however, mean that the subscriber only receives data—indeed, depending on the specific model chosen (again, more on those later), the subscriber may well be both receiving and disseminating data. + +Subscriptions + +The subscriptions that a subscriber receives are called publications. A publication will contain one or more articles. An article is usually a table or some subsection of the data from a table, but it can be a stored procedure or a group of stored procedures. By subscribing to a publication, the subscriber is subscribing to all of the articles in the publication. The subscriber cannot subscribe to individual articles alone. + +Subscriptions can be set up as push subscriptions or pull subscriptions: + + * With push subscriptions, the publisher determines when updates go out to the subscriber. This is used most frequently when you want to keep latency to a minimum (since the publisher is often the only copy of the database receiving changes, it makes sense that it would be the one to know about changes as they happen and take appropriate action) or you want to keep full control at the publisher for some other reason. 
+ * With pull subscriptions, the subscriber requests updates. This allows for a higher level of autonomy since the subscriber decides when updates should occur. + +A publication can simultaneously support both push and pull subscriptions; however, any given subscriber is restricted to either a push or pull subscription—it cannot have both push and pull to the same publication. + +Types of Subscribers + +SQL Server supports three types of subscribers: + + * The default is a local subscriber. The publisher is the only server that knows about the subscriber. Local subscribers are often used as a security mechanism or when you want to maximize autonomy between servers. + * Global subscribers occur where all servers participating in the publication (be they the publisher or a subscriber) know about all the other subscribers. Global subscribers are commonly used in a multiserver environment where you want to be able to combine data from different publishers at the subscriber. + * Anonymous subscribers are visible only to the publisher while the subscriber is connected. This is useful when setting up Internet-based applications. + +Filtering Data + +SQL Server provides for the idea of horizontally or vertically filtering tables. Horizontal filtering (you may come across the term horizontal partitioning for this as well) identifies rows within the table (by way of a WHERE clause) for publication. For example, you could divide inventory information by warehouse as a way of maintaining separate warehouse totals. Vertical filtering (also known as vertical partitioning) identifies the columns to be replicated. For example, you might want to publish quantity on hand information from an inventory table, but not quantity on order. + +Replication Models + +We have three different models available to us in replication. They trade off between the notions of latency, autonomy, and some of the other considerations we discussed earlier in the chapter. Deciding which to choose is something of a balancing act between: + + * Degree of Autonomy: Is there a constant connection available between the servers? If so, what kind of bandwidth is available? How many transactions will be replicating? + * Conflict Management: What is the risk that the same data will be edited in multiple locations either at the same time or in between replicated updates? What is the tolerance for data on one or more of the replicated servers disagreeing? + +Some replication scenarios don't allow for connectivity except on a sporadic basis—others may never have connectivity at all (save, perhaps, through what is sarcastically referred to as "sneaker net"—where you run, mail, fly, or the like, a disk or other portable storage medium from one site to another). Other replication scenarios have an absolute demand for perfectly consistent data at all sites with zero data loss. + +From highest to lowest in autonomy, the three models are: + + * Snapshot replication + * Merge replication + * Transactional replication + +Let's look at the pros and cons of each replication model, outlining situations where it would be an appropriate solution and any data integrity concerns. + +It's important to note that you can mix and match the replication types as necessary to meet your implementation requirements. There are going to be some publications where you want to allow greater autonomy between sites. There will be other publications where minimizing latency is critical. 
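Before we dig into the models, one quick aside: the horizontal and vertical filtering described earlier is easy to picture in T-SQL terms. When a transactional publication is scripted rather than built through the wizard dialogs, a horizontal filter is just a WHERE clause supplied to sp_addarticle, and a vertical filter is applied with sp_articlecolumn. A rough sketch, with made-up publication and object names:

-- Horizontal filter: publish only one warehouse's inventory rows
EXEC sp_addarticle
   @publication = N'InventoryPub',
   @article = N'Inventory',
   @source_owner = N'dbo',
   @source_object = N'Inventory',
   @filter_clause = N'WarehouseID = 42';

-- Vertical filter: exclude the quantity-on-order column from the article
EXEC sp_articlecolumn
   @publication = N'InventoryPub',
   @article = N'Inventory',
   @column = N'QuantityOnOrder',
   @operation = N'drop';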
+ +Let me take a moment here to point out that a publication is just that—a publication. It does not necessarily map out that one publication equals one database. You may have one publication where the articles included in it make up only part of your subscribing database. Other objects in the subscribing database may be served by a different publication—potentially from a completely different publishing server. + +Snapshot Replication + +With snapshot replication, a "picture" is taken at the source of all of the data to be replicated (as shown in Figure 17.2). This is used to replace the data at the destination server. + +Figure 17.2 + +Snapshot replication, in its simplest form, is the easiest type of replication to set up and manage. Complete tables or table segments (for partitioned tables) are written to the subscribers during replication. Since updates occur on a periodic basis only, most of the time, there is minimal server or network overhead required to support replication. + +Snapshot replication is frequently used to update read-only tables on subscriber systems. It allows for a high level of autonomy at the subscriber, but at the cost of relatively high latency. You are able to keep tight control on when periodic updates occur when using snapshot replication. This means that you can schedule updates to occur when network and server activity is at a lull (or you can even carry the snapshot via disk or other hard medium). There is a potential concern about the time and resources to complete replication during the periodic updates. As source tables grow, the amount of data that has to be transferred during each update increases. Over time, it may become necessary to either change the replication type or partition the table to reduce the amount of data replicated to keep traffic to manageable levels. + +A variation of snapshot replication is snapshot replication with immediate-updating subscribers. With this, changes can be made to the data at the subscriber. Those changes are sent to the publishing server on a periodic basis unless immediate updating has been implemented, in which case distributed transactions are executed in real time. + +How Snapshot Replication Works + +Replication is implemented through replication agents. Each agent is essentially its own, small, independent program that takes care of the tasks of monitoring transactions and distributing data as required for that particular type of agent. + +Snapshot Agent + +The Snapshot Agent supports snapshot replication and initial synchronization of data tables for other types of replication (which all also rely on a snapshot for synchronizing data for the first time). All types of replication require that the source and destination tables must be synchronized, either by the replication agents or through manual synchronization, before replication can begin. In either case, the Snapshot Agent has the same responsibility. It takes the "picture" of the published data and stores the files on the distributor. + +Distribution Agent + +The Distribution Agent is used for moving data for initial synchronization and snapshot replication (and, as we'll see later, for transactional replication) from the publisher to the subscriber(s). For push subscriptions, the Distribution Agent typically runs on the distributor. For pull subscriptions, the Distribution Agent typically runs on the subscriber. The actual location of the Distribution Agent is an option that can be configured within Management Studio or via RMO. 
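To make the agents feel a little less abstract, here is roughly what creating a snapshot publication, along with the Snapshot Agent job that serves it, looks like when scripted in T-SQL (a sketch; the publication name is made up, and this assumes distribution is already configured):

-- Enable the database for publishing
EXEC sp_replicationdboption
   @dbname = N'AdventureWorks',
   @optname = N'publish',
   @value = N'true';

-- Create a publication that replicates via periodic snapshots
EXEC sp_addpublication
   @publication = N'CustomerSnapshotPub',
   @repl_freq = N'snapshot';

-- Create the Snapshot Agent job that generates the snapshot files
EXEC sp_addpublication_snapshot
   @publication = N'CustomerSnapshotPub';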
The Process of Snapshot Replication

Snapshot replication uses periodic updates (the frequency is up to you, but, in general, you'll schedule a job in the job manager to run your snapshot on a regular basis). During the updates, schemas and data files are created and sent to the subscribers. Let's step through the basic procedure (see Figure 17.3):

1. The Snapshot Agent places a shared lock on all articles in the publication to be replicated, ensuring data consistency.

2. A copy of each article's table schema is written to the distribution working folder on the distributor.

3. A snapshot copy of table data is written to the snapshot folder.

4. The Snapshot Agent releases the shared locks from the publication articles.

5. The Distribution Agent creates the destination tables and database objects, such as indexes, on the subscriber and copies in the snapshot data, overwriting the existing tables, if any.

Figure 17.3

Snapshot data is stored as a native bcp file (we explored these back in Chapter 15) if all of the subscribers are Microsoft SQL Servers. Character-mode files, instead of SQL Server bcp files, will be created if you are supporting heterogeneous (non-SQL Server) data sources.

SQL Server supports heterogeneous data sources for replication. Currently, transactional and snapshot replication are supported on all O/S platforms for Oracle as well as most O/S platforms for DB2.

When to Use Snapshot Replication

Use snapshot replication to update lookup data or read-only copies of data on remote servers. You can use snapshot replication when you want (or need) to connect to the publisher only intermittently.

As an example, think of how servers might be managed for a chain of garden supply stores. You have stores in several cities. Some larger cities have multiple stores. What are some good candidates for snapshot replication?

Customer records are an obvious choice. A customer, such as a landscape gardener, may turn up at different locations. In most cases, it won't matter if there's a delay updating customer information. This would also give you a way to make sure that only users who have access to the publishing server can change customer records.

Inventory records could be a little more of a problem. The items you keep in inventory are somewhat constant, with most changes taking place by season. Even then, you would probably keep the items on file, but with a zero quantity on hand. The problem is, you may want to replicate more up-to-date inventory records between stores. This would let you search for items you might not have on hand without having to call each of the stores. Timely updates would most likely mean transactional replication (which we will discuss shortly).

Special Planning Requirements

An important issue when setting up snapshot replication is timing. You need to make sure that users are not going to need write access to any published tables when the Snapshot Agent is generating its snapshot (remember that shared lock that gets set on every article in the publication? Well, that's going to prevent inserts, updates, and deletes to that data for the duration of that lock—which is to say, for the duration of the publishing of the distribution). You also want to be sure that the traffic generated by replication does not interfere with other network operations.

Storage space can also become an issue as published tables grow.
You have to verify that you have enough physical disk space available on the destination folder (CD-ROM, DVD, jump drive, tape, and so on) to support the snapshot folder.

Merge Replication

Snapshot is great, but we do not always live in a "read-only" world. Among the choices for dealing with data changes taking place at multiple servers is merge replication. The changes from all of the sites are merged when they are received by the publisher (see Figure 17.4). Updates can take place either periodically (via schedule—this is the typical way of doing things) or on demand.

Figure 17.4

Merge replication has a high level of autonomy but also has high latency and runs a risk of lower transactional consistency. Unlike transactional and snapshot replication, which guarantee consistency, merge replication does not. This is one of the more critical design considerations that you need to make when implementing merge replication—how important is consistency?

In a way, roles tend to get somewhat blurred in merge replication. The publisher is the initial source for the merge data, but changes can be made at the publisher or the subscribers. Changes can be tracked by row or by column. Transactional consistency is not guaranteed because conflicts can occur when different systems make updates to the same row. Data consistency is maintained through conflict resolution based on criteria you establish (you can even write custom resolution algorithms). You can determine whether conflicts are recognized by row or by column.

As with transactional replication, the Snapshot Agent prepares the initial snapshot for synchronization. The synchronization process is different, however, in that the Merge Agent performs synchronization. It will also apply any changes made since the initial snapshot.

Merge Agent

Just as we saw with snapshot replication, merge replication uses an agent—the Merge Agent. As shown in Figure 17.5, the agent copies the changes from all subscribers and applies them to the publisher. It then copies all changes at the publisher (including those made by the Merge Agent itself during the resolution process) to the subscribers. The Merge Agent typically runs on the distributor for push subscriptions and on the subscriber for pull subscriptions, but as with snapshot and transactional replication, this can be configured to run remotely.

Figure 17.5

The Process of Merge Replication

Assuming that the initial synchronization has already taken place (remember, that will be based on a snapshot), the steps to merge replication are:

1. Triggers installed by SQL Server track changes to published data.

2. Changes from the publisher are applied to subscribers.

3. Changes from subscribers are applied to the publisher, and any conflicts are resolved.

Merge triggers do not interfere with the placement or use of user-defined triggers.

Changes, whether occurring at the publisher or subscriber, are applied by the Merge Agent. Conflicts are resolved automatically through the Merge Agent, using a conflict resolver (you can select one and can even build your own). The Merge Agent tracks every row update for conflicts at the row or column level, depending on how you have configured conflict resolution. You will define the priority scheme to be used when conflicts occur between new (arriving) and current data values.

When to Use Merge Replication

One way of using merge replication is to support partitioned tables.
Going back to the garden supply business, you could set up filtering (partitioning) so that each store can view inventory information for any store but would only be able to directly update its own inventory. Changes would be propagated through merge replication. Data can be filtered horizontally or vertically. You can exclude rows to be replicated from a table, and you can exclude any table columns. Merge replication watches for changes to any column in a replicated row. In this particular scenario, there is little risk of conflict in inventory since each store can only update its own inventory, but what if you were allowing all stores to update customer data (such as a new address for the customer)? The right answer is situational, but this illustrates how different needs can place a different burden on your replication design. + +Special Planning Requirements + +When implementing merge replication, there are checks that you need to make to ensure that your data is ready for replication. While setting up merge replication, some changes may be made automatically by SQL Server to your database objects. Use care when selecting the tables to be published. Any tables required for data validation (such as lookup tables and other foreign key situations) must be included in the publication if you want that validation to apply on the subscribers. + +SQL Server will identify a column as a globally unique identifier for each row in a published table. If the table already has a uniqueidentifier column, SQL Server will automatically use that column. Otherwise, it will add a rowguid column (which will, as it happens, also be called rowguid) to the table and create an index based on the column. + +There will be triggers created on the published tables at both the publisher and the subscribers. These are used to track data changes for Merge Agent use based on row or column changes. + +There will also be several tables added for tracking purposes. These tables are used by the server to manage: + + * Conflict detection and resolution + * Data tracking + * Synchronization + * Reporting + +For example, conflicts are detected through a column in the MSmerge_contents table, one of the tables created when you set up merge replication. + +Transactional Replication + +The difference between transactional replication and snapshot replication is that incremental changes, rather than full tables, are replicated to the subscribers. Any changes logged to published articles, such as INSERT, UPDATE, and DELETE statements, are tracked and replicated to subscribers. In transactional replication, only changed table data is distributed, maintaining the transaction sequence. In other words, all transactions are applied to the subscriber in the same order that they were applied to the publisher. + +Note that only logged actions are properly replicated. Unlogged bulk operations (such as a bcp that has logging turned off) or Binary Large Object (BLOB) operations that do not generate full log entries will not be properly replicated. + +In its simplest form, as shown in Figure 17.6, changes can only be made at the publisher. Changes can be replicated to subscribers at set intervals or as near real-time updates. While you may have less control over when replication occurs, you are typically moving less data with each replication. Updates are occurring much more often and latency is kept to a minimum. 
Reliable and consistent near real-time subscriber updates (immediate transactional consistency) require a reliable network connection between the publisher and subscriber (make sure you have the bandwidth on your connection to handle the chatter between the publisher and the subscriber if it is a very high update frequency and/or volume). + +Figure 17.6 + +Just as with merge replication, the published articles must be initially synchronized between the publisher and the subscriber before transactional replication can take place. This is typically managed through automatic synchronization, using snapshot replication. In situations where automatic synchronization is neither practical nor efficient, manual synchronization can be used to prepare the subscriber. This is a relatively simple process: + +1. Run BACKUP DATABASE to back up the Publisher database. + +2. Deliver the tape backup to the subscriber system. + +3. Run RESTORE DATABASE to create the database and database objects, and to load the data. + +The publisher and subscriber are synchronized as of the point when the backup was run. + +Transactional replication can also be used to replicate stored procedures. In its simplest implementation, changes can only be made at the publishing server. This means that you don't have to worry about conflicts. + +You can also implement transactional replication as transactional replication with immediate-updating subscribers. This means that changes can be made at the publisher or at the subscriber. Transactions occurring at the subscriber are treated as distributed transactions. Microsoft Distributed Transaction Coordinator (MS DTC) is used to ensure that both the local data and data on the publisher are updated at the same time to avoid update conflicts. Queued updating—where updates are placed in an ordered "to be done" list—can be used as a fallback in the event that there is a network connectivity issue such as a disconnection or if the network is physically offline. + +Another option would be to implement distributed transactions directly rather than using transactional replication. This will get you a lower latency than that provided with transactional replication, but you will still have the distribution delay in getting changes posted at the publisher out to all of the subscribers. Assuming a solid connection between the servers involved, distributed transactions could provide near immediate updates to all servers when data is changed at any server. However, depending on the connection speed and reliability between servers, this could result in performance problems, including locking conflicts. + +Log Reader Agent + +The Log Reader Agent is used in transactional replication. After a database is set up for transactional replication, the associated transaction log is monitored by the Log Reader Agent for changes to published tables. The agent then has responsibility for copying those transactions marked for replication from the publisher to the distributor as shown in Figure 17.7. The Distribution Agent is also used in transactional replication and is responsible for moving transactions from the distributor to the subscriber(s). + +The Process of Transactional Replication + +Assuming that initial synchronization has already taken place, transactional replication follows these basic steps: + +1. Modifications are posted to the publisher database and recorded in the associated transaction log. + +2. The Log Reader Agent reads the transaction log and identifies changes marked for replication. 
3. Changes taken from the transaction log are written to the distribution database on the distributor.

4. The Distribution Agent applies the changes to the appropriate database tables.

Figure 17.7

You can set up the Log Reader Agent to read the transaction log continuously or on a schedule that you specify. As before, the Distribution Agent typically runs at the distributor for push subscriptions and at the subscriber for pull subscriptions, but this can be changed through Management Studio or RMO to run remotely.

When to Use Transactional Replication

Use transactional replication when you need or just want to reduce latency and provide subscribers with relatively up-to-date information. Near real-time updates usually require a local area network connection, but slower or less reliable links can often be managed through scheduled updates. If you choose to use scheduled updates, latency increases, but you gain control over when replication occurs.

Let's go back to our garden supply store and the inventory problem discussed earlier. You want each of the stores to have up-to-date, or at the very least relatively up-to-date, inventory information. You would probably use scheduled replication to pass data to the subscribers.

Now let's see if we can make things a little more difficult. Not only do you have a chain of stores; you also have traveling salespeople who visit and take orders from your largest customers. They need to have at least relatively up-to-date inventory information but can't spend their days sitting around waiting for updates from the publisher. For systems of this type, you may want to use pull subscriptions, letting the salespeople decide when they connect to the server and download recent transactions.

You've probably noticed a potential problem in both of these scenarios. The remote servers can receive data, but they are not able to make any changes to the data. We'll cover that problem a little later. Transactional replication, when implemented in this manner, is used to support read-only copies of the data at subscriber systems.

Special Planning Requirements

Space is an important issue when planning for transactional replication. You have to make sure that you allow adequate space for the transaction log on the publisher and for the distribution database on the distributor.

Check each of the tables that you are planning to publish. For a table to be published under transactional replication, it must have a primary key. There are also potential concerns if you are supporting text or image data types in any of the tables. INSERT, UPDATE, and DELETE are supported as for any data type, but you must be sure to use an option that utilizes the transaction log when performing BLOB or bulk operations.

You may encounter problems with the max text repl size parameter, which sets the maximum size of text or image data that can be replicated. Make sure that this server-level parameter is set to a high enough value to support your replication requirements.

Immediate-Update Subscribers

As indicated earlier in the chapter, you have the option of setting up subscribers to snapshot or transactional publications as immediate-updating subscribers. Immediate-updating subscribers have the ability to update subscribed data, as long as the updates can be immediately reflected at the publisher. This is accomplished using the two-phase commit protocol managed by MS DTC. There is effectively no latency in updating the publisher.
Updates to other subscribers are made normally (as if the change was initiated at the publisher), so latency when going to other subscribers will depend on the rate at which those subscribers are updated.

You should consider immediate-updating subscribers when you need to post changes to replicated data at one or more subscribers and propagate near-immediate updates. You might be using multiple servers to support an Online Transaction Processing (OLTP) application as a way of improving performance and providing near real-time redundancy. When a transaction is posted to any server, it will be sent to the publisher, and through the publisher, to the remaining servers.

Much as with any form of merge replication, conflicts can arise when using immediate-updating subscribers. In order to assist with conflict identification and management, a uniqueidentifier column will be added to any published tables that do not already have one (if your table has one, the column in question will have a column-level property of IsRowGUID of true—you can only have one RowGUID column per table).

A high-speed, reliable connection is required between the publisher and any immediate-updating subscribers, such as a local area network connection, unless queued updates are used. If queued updates are configured, then the replication process can tolerate an unreliable connection and will just process any queued transactions as soon as connectivity is restored.

Keep in mind that queued updates increase the opportunities for you to have a conflict. Since the subscriber is making changes that the publisher does not know about, there is an increased prospect for the publisher to be making changes to the same rows that the subscriber is.

Mixing Replication Types

You can mix and match replication types as needed. Indeed, not only can you have different replication types on the same server; you can even have different replication types for the same table.

As an example of why you might want to do this, imagine that a heavy equipment warehouse wants to have up-to-date inventory information and reference copies of invoices available at each of its locations. Each location has its own local SQL Server. Invoices are posted to a central location using an Internet-based application. These are replicated to all local servers through transactional replication so that inventory records are updated. You also want to have invoice and inventory information replicated to yet another server weekly. The information on this last server is used for business analysis and running weekly reports. This server is updated weekly through a separate snapshot publication referencing the same tables used by the distributed inventory servers that were getting immediate updates.

Replication Topology

Over the years, Microsoft has outlined a number of replication topology models to describe how replication can be physically implemented. Let's look at some of these here as examples of how things are commonly implemented. It's worth noting that it is not only possible to mix and modify these models but actually rather common to do so.

Your decisions about the type of replication you need to use and your replication model topology can be made somewhat independently of each other.
That said, there is a chance that restrictions imposed by your physical topology, such as transmission bandwidth, will influence your decisions.

Simple Models

Let's start with a look at the simpler models. Once you've got the basic idea, we can move on to some variations and ways these models are mixed.

Central Publisher/Distributor

This is the default SQL Server model. As shown in Figure 17.8, you have one system acting as publisher and as its own distributor. This publisher/distributor supports any number of subscribers. The publisher owns all replicated data and is the sole data source for replication. The most basic model assumes that all data is being published to the subscribers as read-only data. Read-only access can be enforced at the subscriber by giving users SELECT permission only on the replicated tables.

Figure 17.8

Since this is the easiest model to set up and manage, you should consider its use in any situation where it fits. If you have a single publisher, one or more subscribers, and read-only access to data at the subscriber, this is your best choice.

Central Publisher/Remote Distributor

You may find that the volume of replicated data and/or the amount of activity at the publisher may create the need to implement the publisher and distributor as separate systems. As shown in Figure 17.9, this is effectively, from an operational point of view, the same as the publisher/distributor model. The publisher is still the owner of—and only source for—replicated data. Once again, the simple model assumes that the data will be treated as read-only at the subscriber.

Figure 17.9

Obviously, you usually only use this model when a single publisher/distributor cannot handle both production activity and replication to subscribers.

Central Subscriber

In this model, as shown in Figure 17.10, you have only one subscriber receiving data, but there are multiple publishers. The publishers can be configured as publisher/distributor systems. This model provides a way to keep just local data at the local server but still have a way of consolidating the data at one central location. Horizontal filtering may be necessary to keep publishers from overwriting each other's data at the subscriber.

This is the model to use when you have data consolidation requirements, such as gathering distributed data up for use in a data warehouse.

Figure 17.10

Mixed Models

Now let's look at a few variations based on the idea that we will frequently want to mix and match the basic models. Consider these as just a taste of the possibilities—something of "just the beginning." The possibilities are almost endless.

Publishing Subscriber

Publishing subscribers (that is, subscribers that are also configured as publishers) can be added to any of the basic models. This model has two publishers publishing the same data. The original publisher replicates data to its subscribers, one of which is a publishing subscriber. The publishing subscriber can then pass the same data along to its subscribers.

This model, shown in Figure 17.11, is useful when you have pockets of servers or when you have an especially slow or expensive link between servers. Another possibility is that you don't have a direct link between the initial publisher and all of the potential subscribers. The publisher only needs to pass data to one system on the far side of the link, and the publishing subscriber can then pass the data along to the other subscribers.
+ +Figure 17.11 + +Publisher/Subscriber + +This is another case where you have SQL Servers acting as both publishers and subscribers (Figure 17.12). Each server has its own set of data for which it is responsible. This model can be used when you have data changes taking place at both locations and you want to keep both servers updated. This is different from publishing subscribers in that each server is generating its own data, not just passing along updates received from another server. + +Figure 17.12 + +Multiple Subscribers/Multiple Publishers + +Figure 17.13 shows one of the more complicated scenarios. Under this scenario, you have multiple publishers and multiple subscribers. Systems may or may not act as a publisher/subscriber or publishing subscriber. This model requires very careful planning to provide optimum communications and to ensure data consistency. + +Figure 17.13 + +Self-Publishing + +It is worth specifically calling out that you can have a server subscribe to its own published articles. This is actually fairly common in small installations, where there is a diverse need, but not necessarily enough load to justify more than one physical server. For example, you may want to segregate the data used for online transaction processing from the data used for decision making. You can use replication to make separate read-only copies of your data (updated on any schedule you consider appropriate) to be used as a reference. + +Whether to locate your other databases—such as a data warehouse—on the same physical server as your core system is a matter of taste and your particular scenario. An example of where this can be very valid is the scenario where you have relatively low transactional volume but complex analysis needs. In my experience, companies that have enough need for a separate data warehouse usually have a physical or operational need for that to be on a separate server, but that is far from an "always" scenario. Consider your particular situation: does your server have room to share the load? Can you risk both databases being offline at the same time in the event of a catastrophe? + +Planning for Replication + +Replication is one of those things where it can be easy to "just toss something together." It's also one of those things where it is easy to create a huge mess if you take such a cavalier approach. Keep in mind that SQL Server may automatically make some alterations to your schema to implement replication—do you really want SQL Server adding columns and objects to your database without fully thinking about that first? Of course not. + +Any replication installation worth doing is worth taking the time to plan out. Some planning considerations include: + + * What data is to be replicated + * Replication type + * Replication model + +Along with these are other factors that will influence your decision, such as current network topologies, current server configurations, server growth potential, activity levels, and so forth. Each replication method has its advantages and disadvantages, and there is not a one-size-fits-all approach to replicating data. For instance, if you have a slow network or unreliable connection, then you may not want to implement transactional replication. Instead, you may opt to use merge replication that runs during a scheduled connection time. As has been pointed out repeatedly in this chapter, however, you also need to balance that against consistency needs. 
+ +Data Concerns + +First, you have to consider what you are going to publish and to whom. You need to identify your articles (tables and specific columns to be published) and how you plan to organize them into publications. In addition, there are some other data issues of which you need to be aware. Some of these have already been mentioned, but it's worth our time to review them here. + +timestamp + +Include a timestamp column for transactional publications. That gives you a way of detecting conflicts on updates. By having a timestamp column already in place, you've already met part of the requirements for adding immediate-updating subscribers. + +uniqueidentifier + +A unique index and a globally unique identifier are required for merge replication. Remember, if a published table doesn't have a uniqueidentifier column, one will be added for you. + +User-Defined Data Types + +User-defined data types are not supported unless they exist in the subscriber's destination database. Alternatively, you can have user-defined data types converted to base data types during synchronization. + +NOT FOR REPLICATION + +The NOT FOR REPLICATION clause lets you disable table actions on subscribers. You can disable: + + * The IDENTITY property + * CHECK constraints + * Triggers + +These actions are essentially ignored when and only when the replication process changes data on the subscriber. Any other processes would still use them normally. So, for example, an insert into the database that originally receives it would have an identity value assigned, but as the row was subsequently published (in the form of an INSERT) to subscribers, the existing identity value would be used rather than a new value being generated. + +Mobile Devices + +SQL Server also comes in a "Mobile" version. This is an extremely small footprint version of SQL Server designed to run on Windows Mobile Edition. The Mobile edition supports replication from a subscriber point of view. Snapshot and merge replication are supported—transactional replication is not. + +Many of the considerations for mobile devices are just variants of the same themes that we've seen already in replication—bandwidth and space, for example. Just keep in mind that the constraints for mobile devices may be much more extreme than with a full server class system (or even your salesmen's laptops, for that matter). + +Setting Up Replication in Management Studio + +Setting up replication takes a few steps. In particular, you need to: + + * Configure your publication and distribution server(s) to be ready to perform those tasks + * Configure your actual publications + * Configure subscribers + +Let's take a look at how to do each of these within the Management Studio. + +Configuring the Server for Replication + +Before you can set up any publication or distribution on your server, your server must be configured for replication. + +To get at this in Management Studio, navigate to the Replication node, right-click, and select Configure Distribution. + +Note that, in order to configure replication, you must have connected to the Object Explorer using the actual name of the server ((local), a period (.), localhost, or an IP address are not supported). If you connected using anything other than the server's DNS name, you'll get an error and be required to reconnect.
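If you're not sure exactly what name the instance thinks it has (and therefore what you should connect with), a quick sanity check—just a convenience query on my part, not part of the wizard—is to ask the server itself: + +SELECT @@SERVERNAME; + +Reconnect the Object Explorer using exactly the name this returns, and the wizard should be happy.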
+ +SQL Server greets you with the standard splash screen that we've seen in other wizards, and then moves on to an intro dialog—in this case, it points out some of the options you will have as you go through this wizard. Click Next, and you are moved on to a dialog (shown in Figure 17.14) where you decide whether this publisher is to serve as its own distributor or should utilize an existing distributor. + +Figure 17.14 + +If we selected the option to use a different server as the distributor and chose Add, then we would get a standard connection dialog box (asking for login security information for the distribution server). For our example run, keep the default option (that this box will act as its own distributor) and click Next. + +Note that which dialog comes after the Distributor dialog will change depending on whether or not you have the SQL Server Agent configured to start automatically on system startup. + +If you do not have the SQL Server Agent configured to start automatically (although you almost certainly want it to be on a production server), SQL Server will pop up a dialog, shown in Figure 17.15, to ask you about this. (It will skip this dialog if your agent is already configured to start automatically when you start your system.) + +Feel free to leave your system configured however you already have it (SQL Server will, however, default this dialog to changing your SQL Server Agent service to start automatically), but keep in mind that the agent will need to be running for some forms of replication to work. + +Figure 17.15 + +Click Next. We move on to configuring a snapshot folder, as shown in Figure 17.16. This will default to a directory under your main SQL Server folder, which for many installations may not be on a volume large enough to hold snapshots of large databases. It can be configured as a local path or as a UNC path. Since I'm not going to assume you have a full server farm to try this stuff out on, we're going to take a "one server does everything" approach for this example, so accepting the default should be fine. + +Figure 17.16 + +From there, it's on to configuring the actual distribution database. SQL Server gives us a dialog to collect some typical database creation information (what you want to call it and where to store it), as shown in Figure 17.17. + +Figure 17.17 + +From here, we move on to what, at first, appears to be a rather boring dialog (shown in Figure 17.18) with seemingly nothing new. + +Figure 17.18 + +Looks can, however, be deceiving. If we click on the little ellipsis (...) on the right, we get yet another dialog (shown in Figure 17.19)—one that does have a key item of note. + +Figure 17.19 + +As Figure 17.19 shows, we have the ability to specifically set the connection mode we're going to use when connecting the agent to the publisher. In most cases, the default of impersonating the Agent process will be fine, but keep in mind that we can use specific SQL Server security credentials if need be. + +Cancel out of this properties dialog, and click Next back in the publishers dialog (the one in Figure 17.18). Figure 17.20 shows the confirmation dialog at the end of the wizard, summarizing what we want to do. Note how it provides not only the option of immediately configuring the distribution, but also the notion of scripting the configuration for later or potentially remote use (we'll sketch what such a script might look like in a moment). + +Figure 17.20 + +Go ahead and click Finish (the next dialog is just a summary, so there is no need to dwell there). SQL Server begins processing the configuration request.
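While that runs, it's worth knowing that the wizard's scripting option boils down to calls to the replication system stored procedures. A minimal sketch of such a script—assuming a local distributor and the default distribution database name, and omitting the many folder, retention, and security parameters the wizard fills in—might look like this: + +USE master; + +-- The distributor is this very server + +DECLARE @dist AS sysname = @@SERVERNAME; + +EXEC sp_adddistributor @distributor = @dist; + +-- Create the distribution database with default settings + +EXEC sp_adddistributiondb @database = N'distribution'; + +Treat this as illustrative rather than a drop-in replacement for the wizard's generated script.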
When the process is complete, go ahead and close the dialog. + +And, just that quick, you have a server configured for publication and distribution of replicated data. Obviously, were this a production environment, we might have some other choices to make in terms of specific locations or even whether we wanted the publisher and distributor to be on the same system, but the basic foundations of what we are doing remain the same regardless. + +If you're wondering where the distribution database wound up, you should now be able to find it under the "System Databases" subfolder of the Databases folder. + +Configuring a Publication + +With our server all nice and configured, we're ready to get down to creating an actual publication. + +To do this, navigate to the Replication node in Management Studio, right-click the Local Publications sub-node, and choose New Publication. + +After the usual intro dialog, we come to the Publication Database dialog shown in Figure 17.21. This allows us to choose what database we want to utilize for our publication. As you can see, I've selected our old friend, AdventureWorks2008. + +Figure 17.21 + +Click Next, and you're ready to move on to the Publication Type dialog shown in Figure 17.22. + +Figure 17.22 + +This allows us to select between the replication types that we looked at earlier in the chapter. I've chosen Transactional publication with updatable subscriptions. + +Click Next, and you move on to the Articles dialog. + +In Figure 17.23, I've expanded the Tables node and selected the Person.Person table. I'm taking most of that table, but I'm going to skip the AdditionalContactInfo and Demographics columns since they are schema-bound XML columns, and SQL Server does not allow for the replication of XML columns that are bound to an XML schema collection. I also could have taken other schema objects, such as stored procedures (I'm sticking to just the one object for simplicity's sake). + +Click Next to be taken to the Article Issues dialog, as shown in Figure 17.24. + +Notice that SQL Server detected several issues it wants to let us know about. This is one where I say "kudos to the SQL Server team" for attempting to let a user know about some fundamental things before they become a problem. + +Figure 17.23 + +Figure 17.24 + +Click Next to move on to the Filter Table Rows dialog shown in Figure 17.25. + +Figure 17.25 + +This one allows us to do horizontal partitioning—essentially just applying a WHERE clause so that only rows that meet a specific condition will go across in our publication. + +Click Add to get the dialog shown in Figure 17.26. + +In our example here, we've restricted the rows being replicated to those where the persons in question have been flagged as employees (PersonType = 'EM'). + +Click OK to return to the Filter Table Rows dialog, and then click Next to move on to the Snapshot Agent dialog shown in Figure 17.27. + +Remember that any subscription, regardless of whether it is to a snapshot, merge, or transactional replication model, must start by synchronizing based on a snapshot. Subsequent changes are applied relative to that snapshot. + +Figure 17.26 + +Figure 17.27 + +I've configured mine to run the snapshot immediately, but I could have just as easily scheduled it to be generated at a later time (remember that snapshots place share locks on every table the snapshot utilizes—do not run them at a time when such locks are going to block writes to your database that you need done in a timely fashion).
If, for example, you are getting frequent new subscribers, you may want to schedule a periodic update to the snapshot to give them a more up-to-date baseline to synchronize to. + +Click Next, and you're ready to define the Agent Security, as shown in Figure 17.28. + +I've used the Security Settings dialogs to set the agents to use the SQL Server Agent account. This is not, however, good practice in a production environment for security reasons. Give the agents their own account to impersonate, to both limit agent access and increase your ability to audit. + +Figure 17.28 + +Click Next, and you'll find an Action dialog (just like the one back in Figure 17.20) where you can indicate whether you want the publication created immediately or scheduled for later execution. + +One more click of the Next button, and you're ready for a summary and to define a publication name, as shown in Figure 17.29 (I've chosen Employees). + +Go ahead and click Finish to create your publication, and, just like that, you're ready to have subscribers! + +Figure 17.29 + +Setting Up Subscribers (via Management Studio) + +Setting up subscribers utilizes the same basic notions we've already leveraged with publications. Before we get started with an example, however, let's set up a dummy database to play the part of our subscriber: + +CREATE DATABASE AWSubscriber; + +And, with that created, we're ready to subscribe to some data. + +Start by right-clicking the Local Subscriptions sub-node below the Replication node in Management Studio, and selecting New Subscription. After the usual intro dialog, we move on to identifying our publication, as shown in Figure 17.30. Since we have only one publication, there really isn't a lot to choose from, but the list could have easily been many, many publications. + +Figure 17.30 + +Click Next to move on to the Agent location, as shown in Figure 17.31. Remember that we can run our replication agent on either the subscriber or the distributor. In our case, it doesn't matter much since these are the same box, but you may make different choices depending on server loading issues. + +Figure 17.31 + +Click Next to move on to the Subscribers dialog shown in Figure 17.32. I've already chosen our AWSubscriber database, but notice how we could choose Add SQL Server Subscriber and configure multiple subscribers at one time. + +From there it's on to the Distribution Agent Security dialog. Here we define what security context we want to run under for both the distributor and subscriber (in this case, it's the same system, but it could easily have been remote). In Figure 17.33 I've chosen to impersonate the SQL Server Agent security context, but, again, on a production server you would generally want a more specific security context for your replication agent for security reasons. + +Figure 17.32 + +Figure 17.33 + +We can move quickly through the remaining dialogs by setting the agent to "Run continuously" and leaving the default "Commit at publisher" setting of "Simultaneously commit changes." That takes us to the Login For Updatable Subscriptions dialog shown in Figure 17.34. + +Figure 17.34 + +Since this is all (distribution and subscription) happening on the same server, a linked server is implied (a server is always available to itself as a linked server). Were we using a remote distributor, we could have either used a regular SQL Server login or again gone with a linked server (though, in the latter case, we would need to configure the linked server separately—a sketch of which follows).
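For the curious, that separate configuration is done with the sp_addlinkedserver system stored procedure. A minimal, hedged sketch—RemoteDist is a made-up server name purely for illustration: + +-- Alias a remote SQL Server so it can be referenced by name + +EXEC sp_addlinkedserver + +@server = N'RemoteDist', + +@srvproduct = N'SQL Server'; + +With @srvproduct set to N'SQL Server', no provider details are needed; login mappings for the link are then handled separately (via sp_addlinkedsrvlogin).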
+ +A linked server is another SQL Server or ODBC data source that has had an alias established for it on your server. When you refer to a linked server by name, you are essentially grabbing a reference to the connection information for that linked server. + +Figure 17.35 allows us to choose when to initialize our subscription (I've stayed with the default of immediately). The initialization involves pulling down the snapshot from the distributor and applying it. Subsequent synchronizations will be done using the snapshot as a baseline to apply changes to. + +Click Next to get the same finishing dialogs that we've seen in prior examples (when to run things and a summary page), and then click Finish. + +Figure 17.35 + +Using Our Replicated Database + +Once the replicated database is in place, the problem largely becomes one of administration. If things are running smoothly, there is very little to see. Users can access our AWSubscriber database and the Person.Person table within it. Since we configured for updating subscribers, changes made to the AWSubscriber version of the Person.Person table will be immediately reflected in the source AdventureWorks2008 database. Likewise, changes made to our AdventureWorks2008 database will be reflected in our subscriber database. + +Start with a quick look at the list of tables in the AWSubscriber database (using Management Studio, sp_help, or sys.tables)—you should find the Person.Person table that we replicated. Then go ahead and take a look in our AdventureWorks2008 database. You should find a table called Person.conflict_Employees_Person. This new table is for conflict tracking—it should receive data only in the event that changes we make in our subscriber run into a conflict with those on the publisher. + +In the event of a conflict, the default publishing agent chooses the publisher's data over the subscriber's. You can change this to resolve conflicts based on such things as which change is the most recent, along with other ready-made criteria. You can also write custom resolution algorithms to encompass any unusual rules you may have for resolving conflicts. + +Now let's test out our transaction-based replication by making a change to our data. We'll start by taking a look at the starting value of the row we're going to change: + +SELECT aw.FirstName AS PubFirst, + +aw.LastName AS PubLast, + +aws.FirstName AS SubFirst, + +aws.LastName AS SubLast + +FROM AdventureWorks2008.Person.Person aw + +JOIN AWSubscriber.Person.Person aws + +ON aw.BusinessEntityID = aws.BusinessEntityID + +WHERE aw.BusinessEntityID = 38; + +What I've done here is join across the databases so that we can see both the publisher and subscriber at the same time. This way, we can, in one query, compare the source and the destination. The first time we run this script (before we make any changes), we can see our starting values, and that they are indeed the same: + +PubFirst PubLast SubFirst SubLast + +---------- ---------- ---------- ----------- + +Kim Abercrombie Kim Abercrombie + +(1 row(s) affected) + +Okay, now let's make a change. We'll say that Kim has gotten married and decided to change her name to Abercrombie-Smith.
+ +USE AdventureWorks2008; + +UPDATE Person.Person + +SET LastName = 'Abercrombie-Smith' + +WHERE BusinessEntityID = 38; + +Now, we run our original SELECT statement again to check the results: + +PubFirst PubLast SubFirst SubLast + +--------------- --------------- --------------- --------------- + +Kim Abercrombie-Smith Kim Abercrombie-Smith + +(1 row(s) affected) + +As you can see, both the publisher and subscriber received the update. + +Now, let's change the script just slightly to run inside the subscriber database, and see what happens on the publisher's side. This time, we'll change Kim's name back (perhaps she changed her mind...): + +USE AWSubscriber; + +UPDATE Person.Person + +SET LastName = 'Abercrombie' + +WHERE BusinessEntityID = 38; + +And now we're ready to run our original SELECT statement one more time: + +PubFirst PubLast SubFirst SubLast + +-------------- ---------------- -------------- -------------------- + +Kim Abercrombie Kim Abercrombie + +(1 row(s) affected) + +Again, our change was seen in both databases. + +The change was seen in both directions and was replicated immediately because we had selected transactional replication with immediately updating subscribers. Other replication choices would have introduced latency in the change, or potentially not replicated the change at all without some form of manual intervention. Be sure to review all of the replication types (discussed earlier in the chapter) to understand the behavior of each. + +Replication Management Objects (RMO) + +Replication Management Objects, or RMO, is a .NET object model that was first seen in SQL Server 2005 and replaced the replication portion of the COM-based Distributed Management Objects (DMO) object model that was used in SQL Server 2000 and earlier. You can think of RMO as being something of a companion to SQL Management Objects (SMO), which we discuss extensively in Chapter 23. + +RMO gives you programmatic access to any portion of your replication creation and configuration using any .NET language. Examples of RMO use would be automating operations such as: + + * Creating and Configuring a Publication: You can make use of the ReplicationDatabase as well as the TransPublication or MergePublication objects to define publications. + * Adding and Removing Articles: The TransArticle object supports the addition and removal of articles within your publication. In addition, you can add column filters or add a FilterClause property to limit what rows are replicated. + * Republishing your snapshot. + +These are just a few everyday examples. RMO is, however, capable of creating, modifying, or deleting any part of the replication process. + +RMO can be utilized in Visual Studio by adding a reference to the Microsoft.SqlServer.Replication .NET Programming Interface library. You then point your include, imports, or using directives to Microsoft.SqlServer.RMO. As with any of the management libraries that support SQL Server, you will also need a reference to the Microsoft.SqlServer.ConnectionInfo library. + +An example application that utilizes RMO to create the same publication we created earlier in the chapter using the GUI can be downloaded from the Wrox Web site (wrox.com) or professionalsql.com. + +Summary + +As much as there was to absorb in this chapter, this really was something of an introduction to replication.
We covered a lot of the considerations for architects reasonably well, but the scope of replication is such that entire books are written on just that topic. Indeed, there is much to consider in order to build just the right model for complex scenarios. The good news is that, if you really grasped this chapter, then you are prepared for perhaps 90 percent of what you are likely to ever face. Time and the proverbial "school of hard knocks" will teach you the rest. + +If you've taken anything from this chapter, I hope that it's an understanding of some of the general problems that replication can solve and how replication works best when you plan ahead, both in terms of topology planning and in your application's general architecture (making sure it understands the special needs of replication). + +In our next chapter, we'll take a look at yet another "extension" area for SQL Server—full-text indexing. +18 + +Looking at Things in Full: Full-Text Search + +Full-Text Search is an area of significant architectural change in SQL Server 2008. While the core use and functionality haven't changed all that much, the full-text features are far more integrated into the core of SQL Server as of this release. If you feel you are already familiar with full-text and are ready to skip this chapter, I would encourage you to at least browse the architectural changes and consider their ramifications on things like backup and recovery as well as expanded query result support. + +Using plain old T-SQL (without full-text functionality), our options for querying text information are somewhat limited. Indeed, we have only a couple of options: + + * Use a LIKE clause. This is generally woefully inefficient and is not able to utilize any kind of index structure unless your search pattern starts with an explicit value. If the search starts with a wildcard (say "%" or "_"), then SQL Server wouldn't know which spot in the index to begin with—any indexes become worthless. + * Use some other form of pattern matching, such as PATINDEX or CHARINDEX. These are generally even more inefficient, but can allow us to do things that LIKE will not. + +With Full-Text Search, however, we gain the ability to index the contents of the text—essentially keeping a word list that lets us know what words we can find and in what rows. In addition, we are not limited to just pattern-matching algorithms. We can search for the inflected forms of words. For example, we might use the word university but have SQL Server still find the word universities, or, even better, SQL Server can find a word like drunk when the word we asked for was drink. It's up to us to decide how precise we want to be, but even if the word we are searching for is located deep in the middle of a large text block, SQL Server can quickly find the rows that contain the word in question. + +Full-Text Search, or FTS, supports any document type that has a filter registered on the system that supports the iFilter interface. This means that you can store things like Word, Excel, Acrobat, and other supported files in an image data type, but still perform full-text searches against that data! Indeed, you could even write your own extensions to support other document types if necessary. + +Personally, I find this latter point to be extremely cool.
Implementation of the iFilter interface allows you to separate what is text information versus what is formatting information, so you could, for example, write a custom iFilter that knows how to strip XML tags out of an XML file to allow full-text searching for a custom XML document type. + +In this chapter, we'll take a look at these Full-Text Search features and more. + +Among the sections we'll look at are: + + * Full-Text Search architecture + * Setting up full-text indexes and catalogs + * Full-text query syntax + * Full-text quirks + * Noise words + +In addition, we'll see how there are now two ways of completing most full-text-related operations. By the time we're done, you should be prepared for the hassles that FTS creates for you, but you should also be ready to utilize what can be some wonderful functionality in return. + +Full-Text Search Architecture + +The architecture of FTS got a major overhaul with this release. While some of the fundamental concepts (such as word-breakers, filters, catalogs, and indexes) still apply, the way these items are utilized has changed somewhat. A map of the new (and rather complex) architecture is shown in Figure 18.1. + +In prior versions of SQL Server, the core of Full-Text Search wasn't really part of SQL Server at all. It was a shared technology item that originally came from Microsoft Index Server. You would see the separate process installed with SQL Server under the service name of MSFTESQL. With SQL Server 2008, Full-Text is now a fundamental part of the main SQL Server process. The full-text engine is excellent at examining raw text data and aggregating word lists. It maintains an association between the individual words and phrases and the places where FTS has encountered them. + +Full-Text is now part of the core SQL Server process. Individual filters are, however, instantiated in their own process for security reasons. + +Figure 18.1 + +To perform full-text queries against any SQL Server table, you must build a full-text index for that table. The construction and maintenance of this full-text index—or the population of the index—is done by SQL Server instantiating an instance of a filter daemon, which is passed a text stream; the words in the stream are cataloged, and an association is made between each catalog entry and the row the word was sourced from. + +By default, tables have no full-text functionality at all. The fact that there is a table and that it has text data types is no guarantee that there is a full-text index on the table. If you want it, you need to create it. Even after you create the full-text index, the index will have nothing in it. To make the index fully functional, you need to populate the index. + +The population process looks over the columns specified by the index and builds the word list that is going to be used. Much like standard indexes in SQL Server, only the columns you specify to include in the index will become part of the index. Unlike normal indexes in SQL Server, however, you are allowed only one full-text index per table—so every column you want to have participate in full-text queries needs to be part of that one index. + +The differences don't stop there though. Actually, there are several. The major differences include: + + * Internal Structure: Typical SQL Server indexes are stored as a balanced tree structure. Full-text indexes, however, utilize a token-based structure that is inverted (essentially storing things backwards) and compressed.
+ * Method of Creation: SQL Server indexes are created using the CREATE INDEX command in T-SQL, SQL Management Objects (SMO), or Windows Management Instrumentation (you can use the Management Studio, but it just uses SMO). Full-text indexes are created either through the use of special system stored procedures or through the use of the CREATE FULLTEXT INDEX command. + * Method of Update: SQL Server indexes are automatically updated in the normal course of changes to the underlying SQL Server data. Full-text indexes can either be populated on demand or through a "change tracking" mechanism with an on-demand cleanup. + +So that's the quick lesson in Full-Text Architecture 101. As we move through the rest of the chapter, the impact of these differences from the more "normal" way things are implemented in SQL Server should become apparent. + +Setting Up Full-Text Indexes and Catalogs + +As we saw in the last section, each table in a SQL Server database can have zero or one full-text indexes. For SQL Server 2008, these full-text indexes are stored with the rest of the database (you can, if you wish, specify a particular filegroup if you want the full-text items to be on separate storage). A catalog can store multiple full-text indexes. The indexes must be from the same database; you may, however, want to store indexes from one database in multiple catalogs, so you can manage the population of those indexes on separate schedules or store them in separate filegroups. + +Enabling Full-Text for Your Database + +Prior to SQL Server 2008, there was the concept of Full-Text being "enabled" for a database. In SQL Server 2008, all databases are always full-text enabled. + +Creating, Altering, Dropping, and Manipulating a Full-Text Catalog + +The CREATE syntax for Full-Text looks much like other CREATE syntaxes, but with a few additional twists: + +CREATE FULLTEXT CATALOG <catalog name> + +[ON FILEGROUP <filegroup>] + +[IN PATH <'root path'>] + +[WITH ACCENT_SENSITIVITY = {ON|OFF}] + +[AS DEFAULT] + +[AUTHORIZATION <owner name>] + +Most of this should be fairly self-explanatory, but let's take a look anyway: + +ON FILEGROUP | This is here for backward compatibility with SQL Server 2005 only (the CREATE FULLTEXT CATALOG command didn't exist in SQL Server 2000). It has no effect under SQL Server 2008. +---|--- +IN PATH | Again, this is a backward-compatibility-only item. In prior releases, the actual full-text catalogs were created not inside the database but rather as a separate file on disk. This option told SQL Server what path you wanted that file created in. In SQL Server 2008, this option has no effect. +WITH ACCENT_SENSITIVITY | Pretty much what it sounds like. This determines whether searches will take accents into account or not (for example, is "e" the same as "é"). By default, the full-text catalog will use whatever accent sensitivity the database is set to. Keep in mind that, if you change this setting after the catalog is created, the entire catalog will need to be repopulated. +AS DEFAULT | Another one that is what it sounds like; this one sets the full-text catalog you're creating to be the default catalog for any new full-text indexes you create. +AUTHORIZATION | Mildly more complex. As you might imagine, this one is about security and rights. It changes the ownership of the full-text catalog to be the user or role specified instead of the default (which would be the user that actually creates the catalog). This one has gotten muddled quite a bit by SQL Server's change from ownership to schemas.
Ownership has largely morphed into schemas, but the nature of this particular setting more closely fits with the older ownership notion. The key thing to realize here is that a role can be the owner of a full-text catalog—not just a user. If you're changing the ownership to a specific role, then the user creating the full-text catalog must be a member of that role at the time that he or she creates the catalog. + +So, let's create a full-text catalog for AdventureWorks2008. We'll simply call it MainCatalog: + +USE AdventureWorks2008; + +CREATE FULLTEXT CATALOG MainCatalog; + +This is another one of those commands where you don't get much feedback. As long as you don't see an error, the catalog should be created just fine. + +And, just that quick, we have a full-text catalog available for AdventureWorks2008. I did not specify this full-text catalog as the default, so any full-text indexes that want to make use of this catalog will need to explicitly state this catalog as their destination. + +Altering Full-Text Catalogs + +Altering full-text catalogs works pretty much the same as creating them, save for the fact that you are really limited in what can be altered. The syntax is: + +ALTER FULLTEXT CATALOG <catalog name> + +{ REBUILD [WITH ACCENT_SENSITIVITY = {ON|OFF}] + +| REORGANIZE + +| AS DEFAULT + +} + +There are three top-level options you can set with this ALTER. Let's take a look at them. + +REBUILD + +Does what it says it does—completely rebuilds the full-text catalog in question. By default, it will be re-created with exactly the same settings the catalog had before (owner and whether it is the default or not). + +Keep in mind that your full-text catalog, and every index that catalog contains, will be offline while the rebuild is in progress. + +In addition to the simple rebuild that you would typically do just to compact the catalog (for deleted rows and such), you can also rebuild to change the accent sensitivity. If you want to reset the accent sensitivity, just specify whether you want it on or off as you issue the REBUILD command. + +Any rebuild implies that all indexes in the catalog will be repopulated. + +REORGANIZE + +This is similar to REBUILD, but with some pros and cons. + +REORGANIZE cleans up your catalog for you, but in an online fashion. The result is like most situations where you rearrange things instead of moving things all the way out and starting over. It looks pretty good, but perhaps not as good as if you had started from scratch. + +You can think of REORGANIZE as being like a defragmentation process. It merges what may well be several different index structures internal to the catalog. (For performance reasons at the time the full-text was analyzed, some items may have been kept in their own substructure in the index rather than merged into the master index for the catalog.) This command attempts to rectify that. Unlike REBUILD, REORGANIZE also reorganizes the internal structures for your full-text catalog (the ones that store metadata) without taking anything offline. + +AS DEFAULT + +This works just like it did under CREATE. It establishes this particular catalog as being the default full-text catalog for new full-text indexes you create for this database. + +Dropping Full-Text Catalogs + +I know you can see this one coming—after all, it's that same core DROP syntax we've been using all along: + +DROP FULLTEXT CATALOG <catalog name> + +And, of course, it's gone.
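Pulling the ALTER options together before we move on: a routine maintenance pass on the catalog we just created might look like the following (don't actually drop anything—we'll keep using MainCatalog): + +-- Online defragmentation of the catalog's internal structures + +ALTER FULLTEXT CATALOG MainCatalog REORGANIZE; + +And, if fragmentation has gotten bad enough to justify taking the catalog's indexes offline for a while, the heavier-handed version is: + +-- Full offline rebuild; all indexes in the catalog will be repopulated + +ALTER FULLTEXT CATALOG MainCatalog REBUILD;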
Creating, Altering, Dropping, and Manipulating Full-Text Indexes + +Okay, so what we had with a full-text catalog was largely just a container. A full-text catalog, by itself, is nothing at all—think of it like a gas can with no gas in it. What we need are the actual full-text indexes. Whereas a full-text catalog is the place to store full-text indexes, the indexes themselves are what provide the actual reference information that allows your full-text queries to operate quickly and efficiently. + +Creating Full-Text Indexes + +When you go to create a full-text index, the core items of the command are not all that different from regular indexes; however, much as regular indexes have properties such as whether they are clustered or non-clustered, full-text indexes also have their own properties. + +The syntax for creating a full-text index looks like this: + +CREATE FULLTEXT INDEX ON <table name>
+ +[( <column name> [TYPE COLUMN <type column name>] + +[LANGUAGE <language term>] [,...n] )] + +KEY INDEX <index name> + +[ON <full-text catalog name>] + +[WITH + +{ CHANGE_TRACKING [=] { MANUAL | AUTO | OFF } + +[, NO POPULATION] } + +| { STOPLIST [=] { OFF | SYSTEM | <stoplist name> } } + +] + +Note that what is optional is a bit atypical here. Most of the time, required items are listed first, but the quirks of this syntax give us an optional parameter (a column list) before a required parameter (the key index). Let's start with a quick example and then take a look at the parts: + +CREATE FULLTEXT INDEX ON Production.ProductModel + +( Name LANGUAGE English) + +KEY INDEX PK_ProductModel_ProductModelID + +ON MainCatalog + +WITH CHANGE_TRACKING OFF, NO POPULATION; + +So, what we've created here is a full-text index for the Production.ProductModel table. We've explicitly stated that the language used in that column is U.S. English. If we had wanted, we could have added a comma followed by another column name and potentially a TYPE COLUMN or another LANGUAGE identifier. After the language, we specifically stated what full-text catalog we wanted this index stored in, as well as that we wanted change tracking turned off and no initial population of the index. + +That's a lot to think about, so let's take a look at those parts a bit closer. + +Notice that I did not supply a name for my full-text index. There can only be one full-text index for any given table, so there is no need to name it. (It is essentially identified by the table it's built on.) Be sure what you define includes all the columns you want to perform full-text searches on. + +Column List + +This is probably the trickiest part of the whole thing. Even though it says "column name" in the preceding syntax, you're really working on a column list. The issue is that for each column you list you need to include everything about that column before you move on to the next column. That is, you need to include the TYPE COLUMN and LANGUAGE parameters (if you're going to) before you name the next column. + +So, for example, if we had also wanted to include the catalog description, we could have done that, too, by adding it at the end of the first column definition: + +CREATE FULLTEXT INDEX ON Production.ProductModel + +( Name LANGUAGE English, + +CatalogDescription) + +KEY INDEX PK_ProductModel_ProductModelID + +ON MainCatalog + +WITH CHANGE_TRACKING OFF, NO POPULATION; + +This example is purely for reference. It will not run since we already created a full-text index on the Production.ProductModel table. + +LANGUAGE + +This specifies what language the column we've just identified is in. This is important for the determination of noise words (words that occur frequently but add little to your search—we'll see more about these later in this chapter), as well as things like collation. Any language that SQL Server has localization support for (33 localizations as of this writing) is valid. To get a list of the aliases you would use, you can query the sys.syslanguages metadata view in the master database: + +SELECT name, alias FROM master.sys.syslanguages; + +TYPE COLUMN + +This option is for use when you want to do full-text indexing against documents stored in an image or a varbinary column. AdventureWorks2008 has a full-text index established that makes use of this. (It is on the Production.Document table.) We'll check it out a bit later in the chapter. For now though, imagine that you're doing document management using SQL Server (not at all an uncommon use for SQL Server).
If you are storing documents written in a mix of one or more applications, such as Microsoft Word (.DOC), Acrobat (.PDF), Excel (.XLS), or a text editor (.TXT), then Full-Text Search will need to know what kind of document is stored for each row it analyzes, so it knows what analysis plug-in to use. + +In this case, you need to add another column to your table (in addition to the image or varbinary column) that contains the extension (.DOC, .PDF, and so on) of the document stored in the binary column. This column becomes the parameter value for the TYPE COLUMN property in the CREATE FULLTEXT INDEX command. + +KEY INDEX + +Unlike all the other options in the CREATE FULLTEXT INDEX command, this one is required. + +Any table that Full-Text is indexing must have a column that uniquely identifies each row. This can be a primary key or a unique constraint. The thing to remember on this point is that you are supplying the name of the index associated with the unique identifier, not the column or constraint name. Since this is used repeatedly to associate data in the full-text index, I would suggest you use the smallest primary key or unique index available. + +ON + +This is simply the name of the full-text catalog you want this index stored in. This is optional if your database has a default full-text catalog, and required if no default catalog has been established. + +WITH + +This supplies instructions regarding how your index is populated with data and how it copes with changes to the table that the index is built over. + +CHANGE_TRACKING + +Change tracking is all about how your full-text index deals with changes to the underlying table. + +The dilemma here is how you want to balance the accuracy of your full-text searches against the amount of overhead you incur by keeping a higher-overhead system (as compared to maintaining standard B-Tree indexes) up to date. + +Change tracking gives us three levels of support for changes: + +OFF | The full-text index is updated only when you perform a full population of the index. Essentially, you need to rebuild from scratch each time you populate. This means there is no ongoing maintenance overhead, but it also means that there may be rows in the table that will not be returned in your full-text queries or, perhaps worse, that rows may come back as containing the word you are interested in when, due to changes, they no longer do. This option is great when your data is slow moving (doesn't change often) and/or you don't require perfect accuracy in your results. In return for giving up that accuracy, it means you have no ongoing overhead and that your indexes are always as compact as they can be because they have no issues with fragmentation. It does mean, however, that when you do repopulate, you have a period of downtime and the overall process takes longer. +---|--- +AUTO | Under this model, SQL Server is constantly updating the index for things happening in the table. While there still may be a lag between when the change is made and when it is reflected in full text, that lag is minimal and you are getting something approximating real-time updates. This is the way to go when you have fast-moving data or your need for accuracy is very high. You are enduring a high degree of overhead since SQL Server will use smaller, intermediate structures to keep track of the changes. These can become inefficient over time and may hurt search performance but are not that big of a deal in the short run.
If you use this option, consider still performing a reorganization or full repopulation regularly. +MANUAL | This is something of a middle ground. It does tracking to be able to identify changes but does not update the full-text index until explicitly told to do so. You can then manually perform updates that apply the changes to the existing index without a full repopulation. + +NO POPULATION + +This applies only if you have chosen OFF for change tracking. + +By default, when you create a full-text index, SQL Server starts a background process to populate that index. If you turn off change tracking and specify NO POPULATION, then you are limiting yourself solely to defining the full-text index but not actually putting any data in it to start. You can then schedule your own index population job to run later (presumably in low-demand hours of the day). + +STOPLIST + +A stoplist replaces what was known in previous versions as a noise word list. Noise words are now called stop words. They are words that are explicitly exempt from being included in the index. In general, these equate to words that are so common that they rarely add any real value to the content (in English, these might include "the," "and," "or," and other words that occur at abnormally high frequencies). While noise words were kept in a separate file in previous releases, SQL Server 2008 stores stop words in a stoplist. For each language you can define for full-text indexing, there is an associated system stoplist, but you can also create your own custom stoplist. You can also turn off stoplist utilization if you want all words included regardless. + +Altering Full-Text Indexes + +Okay, so now you have an index, and you want to make changes to it. As you might expect, the new full-text syntax supports the notion of an ALTER statement. It is in the form of: + +ALTER FULLTEXT INDEX ON <table name>
+ +{ ENABLE + +| DISABLE + +| SET CHANGE_TRACKING { MANUAL | AUTO | OFF } + +| ADD ( <column name> + +[TYPE COLUMN <type column name>] + +[LANGUAGE <language term>] [,...n] ) + +| DROP ( <column name> [,...n] ) + +| START { FULL | INCREMENTAL | UPDATE } POPULATION + +| { STOP | PAUSE | RESUME } POPULATION + +| SET STOPLIST { OFF | SYSTEM | <stoplist name> } + +[WITH NO POPULATION] + +} + +This ALTER has some substantial differences from previous ALTER statements we've dealt with! See how verbs like START and STOP are in there? This ALTER not only changes the definition of our full-text index but also can be used to manage the index somewhat. Keep this difference in mind, as it is not very intuitive when you compare it to the other ALTER statements we use in SQL Server. + +Several elements of these work exactly as they did for the CREATE statement. We are merely changing a chosen option from one thing to another. However, some of this is totally new. Let's start with the more traditional ALTER statement items and then move on to the portions of this statement that are more management-oriented. + +ENABLE/DISABLE + +These do what they say. If you disable a full-text index, the index is kept in place and all data remains intact. What changes is that the index is not available for full-text queries, and the index data is not updated (any updates that were in process when the DISABLE was issued will be stopped immediately). + +When you ENABLE, it picks up where the index left off. (It likely has catching up to do, but any data already there is kept intact, and you do not need to do a full repopulation.) + +ADD + +This works just like the initial definition of columns. For example, if we wanted to add the Instructions column to our full-text index on Production.ProductModel, it would look like: + +ALTER FULLTEXT INDEX ON Production.ProductModel + +ADD ( Instructions ) + +The LANGUAGE and TYPE COLUMN properties also work just as they did in our earlier CREATE. + +DROP + +Again, this works much as you would expect. If we were dropping the Instructions column we just added, that would look like: + +ALTER FULLTEXT INDEX ON Production.ProductModel + +DROP ( Instructions ) + +START... POPULATION + +START gives us three options as to what kind of population we want to use. + +FULL + +The nice simple one—think of this as the command to "start over!" Every row will be reexamined, and the index will be rebuilt from scratch. + +INCREMENTAL + +This one is valid only if you have a timestamp column in your table (otherwise it will default back to a FULL population) and will start a population based on the rows changed since the last time a population was performed for the table. Think of this one as the "catch up on your work please!" version of populating. Incremental population does not require that change tracking be turned on. + +UPDATE + +This one addresses the scenario where you have turned AUTO population off for the index, but want all tracked inserts, updates, and deletes applied to the index. It does require that change tracking be turned on. + +STOP, PAUSE, RESUME + +These perform the specific action on any population that is currently running against this full-text index. The STOP option does not stop automatic change tracking—only full or incremental updates. PAUSE and RESUME operate exactly as one would expect. + +Dropping Full-Text Indexes + +I'm sure by this point that you could figure this one out for yourself, but for the sake of completeness, here we go: + +DROP FULLTEXT INDEX ON <table name>
+ +So, were we to run the command (don't actually run this, as we'll be using this index in our next example!): + +DROP FULLTEXT INDEX ON Production.ProductModel + +the full-text index would be gone! + +A Note Regarding the Older Syntax + +Prior to SQL Server 2005, we used a special system stored procedure called sp_fulltext_catalog. We likewise used other system stored procs to address other full-text functionality. + +These have now been deprecated for two releases and are significantly out of touch with the new full-text architecture. I will not cover them in depth here, but I do want you to be aware of them in case you bump into them in production settings. If you do, I recommend migrating them to the new syntax as fast as reasonably possible (basically, as long as SQL Server 2000 support is no longer required). + +More on Index Population + +Unlike "normal" SQL Server indexes, which are naturally kept up to date by the very nature of SQL Server and the way it stores data, full-text indexes operate with a different storage structure and require substantially more overhead to populate. As such, they require a certain degree of intervention before the index will be up to date with the actual data it is supposed to represent. + +Population comes in three—well, more like two and a half—flavors. Let's look at each: + + * Full: Is what it sounds like. With this kind of population, SQL Server basically forgets anything that it knew about the data previously and starts over. Every row is rescanned, and the index is rebuilt from scratch. + * Incremental: Under this option, SQL Server utilizes a column of type timestamp in order to keep track of what rows have changed since the last population. In this scenario, SQL Server only needs to record the changes for those rows that have changed in some manner. This option requires that the table in question have a timestamp column. Any updates that do not cause a change in the timestamp (nonlogged operations—usually BLOB activity) will not be detected unless something else in the same row changed. + * Change Tracking: Tracks the actual changes since the last population. This option can help you keep your full-text indexes up to date in near real time; however, keep in mind that full-text population is very CPU and memory intensive, and can bog down your server. Weigh the notion of immediate updates against the notion that you may be able to hold your updates to off-peak hours for your server. + +Unless you're using change tracking, population of your full-text indexes will occur only when you specifically start the process or according to a population schedule that you establish. + +Obviously, whenever you first create a full-text index or change the list of columns participating in the index, you need to completely repopulate the index (an incremental change of a previously empty index would mean that every row would have to be scanned in—right?). SQL Server will now do this automatically unless you explicitly tell it not to. We can manually perform this repopulation at either the catalog or the table level. Typically, you'll perform repopulation at the table level for newly added or changed indexes, and repopulate at the catalog level when you are performing routine maintenance. + +So, with this in mind, we should be ready to populate the full-text index we have created on our Production.ProductModel table.
Had we not specifically stated NO POPULATION, then SQL Server would have populated the index automatically; however, since we did tell it not to populate, we have to order up our population. Since this is the first population, we probably want a full population (frankly, an incremental would have the same result, so it doesn't really matter, but it reads more logically this way). Using the new syntax, this would look like: + +ALTER FULLTEXT INDEX ON Production.ProductModel + +START FULL POPULATION; + +Full-text population runs as a background process. As such, your command will return a "completed successfully" message as soon as the population job is started. Do not take this message to mean that your index is done populating—a population against a large table could potentially take hours to complete. + +If you need to know the status of your full-text population process, right-click the name of your full-text index under the Storage⇒Full Text Catalogs node of your database, and then check the property called "Population Status." + +Since this table is relatively small, you shouldn't have to wait terribly long before you can run a query against it and get results: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'Frame'); + +This should get back something on the order of 10 rows: + +ProductModelID Name + +-------------- -------------------------------------------------- + +5 HL Mountain Frame + +6 HL Road Frame + +7 HL Touring Frame + +8 LL Mountain Frame + +9 LL Road Frame + +10 LL Touring Frame + +14 ML Mountain Frame + +15 ML Mountain Frame-W + +16 ML Road Frame + +17 ML Road Frame-W + +(10 row(s) affected) + +We have a full-text index, and it works! Time to move on to what that query we just ran is supposed to do and what other options we have available. + +Full-Text Query Syntax + +Full-Text Search has its own brand of query syntax. It adds special commands to extend T-SQL and to clearly indicate that we want the full-text engine to support our query rather than the regular SQL Server engine. + +Fortunately, the basics of full-text queries are just that—basic. There are only four base statements to work with the full-text engine. They actually fall into two overlapping categories of two statements each: + +| Exact Term | Inflectional Term (Meaning) +---|---|--- +Conditional | CONTAINS | FREETEXT +Ranked Table | CONTAINSTABLE | FREETEXTTABLE + +The conditional predicates both work an awful lot like an EXISTS operator. Essentially, for each row, they provide a simple yes or no as to whether the row qualifies against the search condition provided. You use both of these in the WHERE clause of your queries. On the other hand, the two ranked queries do not provide conditions at all. Instead, they return a tabular result set (which you can join to) that includes the key value of all the rows that found matches (that's what you join to) as well as a ranking to indicate the strength of the match. + +Let's look more closely at each of the four keywords. + +CONTAINS + +This term looks for a match based on a particular word or phrase. By default, it's looking for an exact match (that is, swim must be swim—not swam), but it can also use modifiers to look for what are called inflectional matches (words that have the same root—such as swim and swam). CONTAINS also recognizes certain special keywords within the search condition. + +For now, we're going to stick with the simple form of CONTAINS.
We will look at the advanced features after we have the basics of our four statements down (since they share certain modifiers, we'll look at those all at once). + +The basic syntax, then, looks like this: + +CONTAINS({<column> | *}, '<search condition>') + +You can name a specific column to check, or use *, in which case the condition will be compared for matches against any of the indexed columns. In its simplest form, the search condition should contain only a word or phrase. + +There are two things worth pointing out here. First, remember that you will only get back results against columns that were included in the full-text index. In the final index we created on the ProductModel table, that means the search includes only the Name column. Columns like Instructions are not included in the search because they are not part of the index. (You may recall that we added and then dropped that column in a test of our ALTER syntax.) Second, the search condition can be far more complex than the simple condition that we've shown here, but we'll get to that after you have the basic operations down. + +For an example, let's go back to the query we used to prove that our population exercise had worked: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'Frame'); + +What we've said we want here is the ProductModelID and Name columns for all the rows where the Name column in the index includes the word Frame. + +If you check out the Name column for the results, you'll see that every row has an exact match. + +Let's quickly look at another example. This time, we're going to run pretty much the same query, but we're going to look for the word Sport: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'Sport'); + +This time we get back just one row: + +ProductModelID Name + +-------------- -------------------------------------------------- + +33 Sport-100 + +(1 row(s) affected) + +Again, we got back all the rows where the Name column had an exact match with the word Sport. Were you to look through the other rows in the table, however, you would find that there were other variations of the word Sport (a plural in this case), but they were not returned. + +Again—the default behavior of CONTAINS is exact match behavior. + +FREETEXT + +FREETEXT is an incredibly close cousin to CONTAINS. Indeed, their syntax is nearly identical: + +FREETEXT({<column> | *}, '<search condition>') + +So, the only real difference is in the results you get back. You see, FREETEXT is a lot more forgiving in just how exact a match it looks for. It is more interested in the meaning of the word than in the exact letter-for-letter spelling. + +To illustrate my point rather quickly here, let's look at our Sport query from the previous section, but modify it to use FREETEXT instead of CONTAINS: + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE FREETEXT(Name, 'Sport'); + +When we execute this, we get back slightly different results than we did with CONTAINS: + +ProductModelID Name + +-------------- -------------------------------------------------- + +13 Men's Sports Shorts + +33 Sport-100 + +(2 row(s) affected) + +The difference in this case comes in the interpretation of plurals—our FREETEXT query has picked up the row that contains the word Sports, not just those with the word Sport. FREETEXT can also handle things like swim versus swam and other word variations.
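If you like FREETEXT's inflectional behavior but want the more explicit control of CONTAINS, CONTAINS supports a FORMSOF modifier inside the search condition. We'll cover the full condition syntax shortly, but as a quick, hedged preview (it should pick up the same plural-inclusive matches our FREETEXT query found): + +SELECT ProductModelID, Name + +FROM Production.ProductModel + +WHERE CONTAINS(Name, 'FORMSOF(INFLECTIONAL, Sport)');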
CONTAINSTABLE

CONTAINSTABLE, in terms of figuring out which rows would be a match, works identically to CONTAINS. The difference is how the results are dealt with.

The syntax is similar, but with the twist of identifying which table the CONTAINSTABLE is going to operate against plus an optional limitation to just a top set of matches:

CONTAINSTABLE (<table_name>, {<column_name> | *}, '<search_condition>' [, <top_n_by_rank>])

Where CONTAINS returns a simple Boolean response suitable for use in a WHERE clause, CONTAINSTABLE returns a table—complete with rankings of how well the search phrase matched the row being returned.

Let's see what I mean here by running our original query, but with a CONTAINSTABLE this time:

SELECT *
FROM CONTAINSTABLE(Production.ProductModel, Name, 'Sport');

This gets us back one row—just like with CONTAINS—but the information provided by the returned values is somewhat different:

KEY RANK
----------- -----------
33 128

(1 row(s) affected)

We are provided with two columns:

* KEY: Remember when we said that our full-text index had to be able to relate to a single column key in the indexed table? Well, the KEY returned by CONTAINSTABLE relates exactly to that key column. That is, the value output in the column called KEY matches with a single unique row, as identified by the key, in the indexed table.
* RANK: A value from 0 to 1000 that indicates just how well the search result matched the row being returned—the higher the value, the better the match.

To make use of CONTAINSTABLE, we simply join our original table back to the CONTAINSTABLE result. For example:

SELECT Rank, ProductModelID, Name
FROM Production.ProductModel p
JOIN CONTAINSTABLE(Production.ProductModel, Name, 'Sport') ct
ON p.ProductModelID = ct.[KEY];

Notice the use of brackets around the KEY column name. The reason is that KEY is also a keyword. Remember from our rules of naming that, if you use a keyword for a column or table name (which you shouldn't do), you need to enclose it in square brackets.

This gets us back our original row, but this time we have the extra information from the underlying table:

Rank ProductModelID Name
----------- -------------- --------------------------------------------------
128 33 Sport-100

(1 row(s) affected)

In this case, the values in the Rank column are the same, but, given more diverse values, we could have done things like:

* Filter based on some arbitrary Rank value. For example, we could want to return only the best matches based on score.
* Order by the rank (sort the rankings—most likely highest to lowest).
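Both of those ideas tie in to the optional fourth argument shown in the syntax above. A minimal sketch of the top-n-by-rank limiter, assuming we only care about the five strongest matches:

-- A sketch only: the fourth argument caps the result at the n best-ranked
-- matches before the join, which can save work against large tables.
SELECT Rank, ProductModelID, Name
FROM Production.ProductModel p
JOIN CONTAINSTABLE(Production.ProductModel, Name, 'Frame', 5) ct
ON p.ProductModelID = ct.[KEY]
ORDER BY Rank DESC;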
FREETEXTTABLE

Much as FREETEXT was the close cousin to CONTAINS, so too is FREETEXTTABLE the close cousin to CONTAINSTABLE. FREETEXTTABLE simply combines the more inexact word matching of FREETEXT with the tabular presentation found in CONTAINSTABLE.

We can then combine some of our previous examples to see how FREETEXTTABLE changes things:

SELECT Rank, ProductModelID, Name
FROM Production.ProductModel p
JOIN FREETEXTTABLE(Production.ProductModel, Name, 'Sport') ct
ON p.ProductModelID = ct.[KEY];

This gets us the same two rows we had with our original FREETEXT query, but with the kind of rankings we had with our CONTAINSTABLE:

Rank ProductModelID Name
----------- -------------- --------------------------------------------------
102 13 Men's Sports Shorts
102 33 Sport-100

(2 row(s) affected)

Experiment with this some in your full-text efforts, and you'll see how rankings can give you a lot to work with.

Dealing with Phrases

All of our various full-text keywords can deal with the concept of phrases. How the phrases are parsed and handled, however, is somewhat different.

Let's start off with the simplest of examples—a simple two-word phrase. This time we'll say that the phrase we want to look for is damaged seats. To add a twist to things, we want it no matter what column it is in (as long as the column is part of our full-text index).

SELECT DocumentNode, DocumentSummary, Document
FROM Production.Document
WHERE CONTAINS(*, '"damaged seats"');

Notice that the phrase was included in double quotation marks. We need to do this any time we want a set of words to be considered as a single unit. This does, however, get us back one row. The result is a little large (due to the size of the Document and DocumentSummary columns) to put in this text, but the relevant section is:

DocumentNode DocumentSummary Document
------------ -------------------- --------------------------------
0x7C20 Worn or damaged se... 0xD0CF11E0A1B11AE100000000000...

(1 row(s) affected)

Our CONTAINS will check for rows that exactly match the phrase, as long as we enclose that phrase in double quotation marks (within the single quotes we always need around our search condition). FREETEXT works in the same way.

Booleans

SQL Server also supports the use of Booleans in your searches. The following Boolean keywords apply:

* AND
* OR
* AND NOT

There really isn't a whole lot of rocket science to these, so I'll launch right into a simple example and point out one caveat. Let's go with a variation on an example we used earlier:

SELECT DocumentNode, DocumentSummary, Document
FROM Production.Document
WHERE CONTAINS(*, '"damaged" OR "seats"');

What we've done here is change from searching for the exact phrase damaged seats to a search that looks for either word, without worrying about whether the words are used together or not. Execute this, and you'll see we get back two rows instead of just one.

The caveat that I mentioned earlier is that NOT cannot be used on its own. NOT is relevant only to full-text searches when used in conjunction with AND.

Proximity

Full-Text Search also allows us to make use of proximity terms. Currently, the list of supported proximity terms is a whopping one term long—NEAR. NEAR works a lot like it sounds. It says that the terms on either side of the NEAR keyword must be close to each other. Microsoft hasn't told us how close the words have to be to be considered NEAR, but figure around eight to ten words for most situations.

Technically, there is one more "word" on the proximity keyword list, but it isn't a "word" at all—rather a symbol. You can, if you choose, use a tilde (~) instead of the NEAR keyword. It works just the same. Personally, I recommend against this for readability reasons. Not too many readers of your code are going to recognize what ~ means, but most of them will at least make a guess at NEAR.

For examples on how NEAR works, we're going to stick with CONTAINSTABLE. NEAR works much the same in the other full-text query operators, so we're just going to focus on what happens to the rankings in a NEAR query as well as what does and doesn't get included in the query.

For this example, we'll look at the words repair and instructions:

SELECT Rank, DocumentNode, DocumentSummary
FROM Production.Document pd
JOIN CONTAINSTABLE(Production.Document, *, 'repair NEAR instructions') ct
ON pd.DocumentNode = ct.[KEY];

I include only the first two columns here for brevity, but notice that we have different rankings on the two rows returned.
Rank DocumentNode
----------- --------------------------
3 0x5B40
2 0x7B40

(2 row(s) affected)

If you look carefully at the DocumentSummary column in your results (again, for brevity's sake, I haven't included all of the column here), you'll see that both rows do indeed have both words, but that the word repair occurs twice in the DocumentNode 0x5B40 row; thus it receives a higher ranking.

Don't be surprised to see situations where a record that has your search criteria closer together gets ranked lower than one where the search criteria are not as close. Remember that, even when you use the NEAR keyword, nearness is only one of several criteria that SQL Server uses to rank the rows. Other considerations such as percentage of words that match, case values, and more can play with the numbers on you.

Weighting

So, these rankings are all cool and whatnot, but what would we do if one of the words in our search criteria was more important than another?

To deal with situations where you need to give precedence to one or more words, Full-Text provides us with the ISABOUT() function and WEIGHT keyword. The syntax looks like this:

ISABOUT(<term> WEIGHT (<weight_value>) [, <term> WEIGHT (<weight_value>)] [, ...n])

Let's say that you want to allow customers to select among several kinds of bikes, but to further allow for selecting "preferred" options. For our example, let's say our customer is most interested in mountain bikes but is also interested in touring and road bikes—in that order. You could get a ranked listing using the following:

SELECT Rank, ProductModelID, Name
FROM Production.ProductModel pm
JOIN CONTAINSTABLE(
Production.ProductModel,
Name,
'ISABOUT (Road WEIGHT (.2), Touring WEIGHT (.4), Mountain WEIGHT (.8) )'
) ct
ON pm.ProductModelID = ct.[KEY]
ORDER BY Rank DESC;

Now take a look at the results:

Rank ProductModelID Name
----------- -------------- --------------------------------------------------
31 5 HL Mountain Frame
31 7 HL Touring Frame
31 8 LL Mountain Frame
...
31 123 LL Mountain Rear Wheel
31 124 ML Mountain Rear Wheel
31 125 HL Mountain Rear Wheel
7 126 LL Road Rear Wheel
7 113 Road Bottle Cage
7 93 Road Tire Tube
...
7 16 ML Road Frame
7 17 ML Road Frame-W
7 9 LL Road Frame
7 6 HL Road Frame

(89 row(s) affected)

Note that not everything is perfect in our world—some touring entries come before our more heavily weighted mountain options, but if you look the list over, you will see we have indeed created a very heavy bias toward mountain bikes in our rankings.

Inflectional

This one doesn't really apply to FREETEXT, as FREETEXT is inherently inflectional. What is INFLECTIONAL, you ask? Well, it's basically telling SQL Server that different forms of the word have the same general meaning. The syntax looks like this:

FORMSOF(INFLECTIONAL, <term> [, <term>] [, ...n])

An inflectional form of a word is one that has the same general meaning. For example, swam is just the past tense of swim. The underlying meaning is the same.
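To see FORMSOF in action, we can revisit our Sport query. A minimal sketch: this should give CONTAINS the same plural-matching behavior we saw from FREETEXT, while keeping CONTAINS-style control over the rest of the search condition:

-- A sketch only: FORMSOF(INFLECTIONAL, ...) matches word forms that share
-- a root, so this picks up both Sport and Sports.
SELECT ProductModelID, Name
FROM Production.ProductModel
WHERE CONTAINS(Name, 'FORMSOF(INFLECTIONAL, Sport)');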
Stop Words

As we discussed earlier, there are tons and tons of words in use in different languages (Full-Text supports more than just U.S. English!). Most languages have certain words that appear over and over again with little intrinsic meaning to them. In the English language, for example, pronouns (you, she, he, and so on), articles (the, a, an), and conjunctions (and, but, or) are just a few examples of words that appear in many, many sentences but are not integral to the meaning of that sentence. If SQL Server paid attention to those words, and we did searches based on them, then we would drown in the results that SQL Server gave us in our queries. Quite often, every single row in the table would be returned! The solution comes in the form of what is called a stoplist (called a noise word list in previous releases). This is a list of words (individual words are referred to as stop words) that SQL Server ignores when considering matches.

SQL Server includes a default stoplist for each language it supports. You can either use this system-supplied stoplist (usually referred to as SYSTEM if you need to explicitly reference it in a command), or you can create your own using the CREATE FULLTEXT STOPLIST command. The full syntax looks like this:

CREATE FULLTEXT STOPLIST <stoplist_name>
[FROM { [<database_name>.]<source_stoplist_name> | SYSTEM STOPLIST }]
[AUTHORIZATION <owner_name>]
[;]

In general, you'll want a well-populated stoplist, and thus will want to prepopulate your list from some existing stoplist. So, for example, I could create a stoplist for AdventureWorks2008 that starts with the same stop words in the SYSTEM stoplist:

CREATE FULLTEXT STOPLIST ADStopList
FROM SYSTEM STOPLIST;

Stoplists you create are not automatically associated with any full-text index—you need to manually attach the new stoplist to the full-text index via the ALTER FULLTEXT INDEX command.

You can add and delete words from this list as suits the particular needs of your application. For example, if you are in the business of selling tractor-trailer rigs, then you might want to add words like hauling to your noise word list. More than likely, a huge percentage of your customers have that word in their name, so it is relatively unhelpful in searches. To make additions or subtractions from a stoplist, you use the ALTER FULLTEXT STOPLIST command. The full syntax looks like this:

ALTER FULLTEXT STOPLIST <stoplist_name>
{
ADD '<stopword>' LANGUAGE <language_term>
| DROP
{
'<stopword>' LANGUAGE <language_term>
| ALL LANGUAGE <language_term>
| ALL
}
}
[;]

Let's try this out by adding a stop word to the ADStopList we just created:

ALTER FULLTEXT STOPLIST ADStopList
ADD 'bicycle' LANGUAGE 1033;

Were we to repopulate our full-text index, the word bicycle (which may be a worthless search term in a business where every document is going to discuss bicycles) would be ignored.

Adding and removing words from a stoplist is something of a double-edged sword. When you add a word to the list, it means that searches involving that word are no longer going to return the results that users are more than likely going to expect. By the same token, it also, depending on the frequency with which the word is used, can dramatically shrink the processing time and size of your catalogs.

Summary

Full-Text is now core to the SQL Server engine (it was a separate service in prior releases), but a separate process is spawned by the Full-Text daemon manager each time a search is issued.

When you implement Full-Text, also consider the load the population process is going to place on your server, and balance that against how quickly you need changes reflected in search results.
If possible, delay repopulation of full-text indexes until the non-peak hours on your system.

Full-Text Search is a powerful and fast way of referencing the contents of most any character-based columns. It is substantially more efficient and powerful than a LIKE clause but comes with additional overhead in terms of both space and processing time.

19

Feeling Secure

There are probably as many ideas on security as there are programmers. It's one of those things where there isn't necessarily a right way to do it, but there are definitely plenty of wrong ones.

The first thing to understand about security is that there is no such thing as a totally secure application. No matter how secure you make it, rest assured that someone, somewhere, can defeat your efforts and "hack" into the system. Even with this knowledge, the goal still needs to be to keep unwanted intruders out of your system. The good news about security is that, for most instances, you can fairly easily make it such a hassle that 99.999 percent of people out there won't want to bother with it. For the other .001 percent, I can only encourage you to make sure that all your employees have a life so they fall into the 99.999 percent. The .001 percent will hopefully find someplace else to go.

SQL Server 2005 marked the start of a very concerted effort by Microsoft to raise the level of security in SQL Server. For those who have been around long enough, you may remember the hubbub surrounding the "slammer" virus that happened during the SQL Server 2000 lifespan. Microsoft radically altered the security profile of SQL Server in a service pack that followed the slammer scare, but SQL Server 2005 marked the first full release after the advent of the slammer virus, and it was just the beginning of a series of features focused not so much on deterring hackers as on a more far-reaching protection of the safety and privacy of data in SQL Server. A ton of new features were added in SQL Server 2005, more were added in SQL Server 2008, and there are more to come in the next version of SQL Server. Needless to say, all this leaves us with a lot to cover in the security realm.

In this chapter, we're going to cover:

* Security basics
* SQL Server security options
* Database and server roles
* Application roles
* Credentials
* Certificates
* Schema management
* XML integration security issues
* More advanced security

What we'll discover is that there are a lot of different ways to approach the security problem. Security goes way beyond giving someone a user ID and a password—we'll see many of the things that you need to think about.

Before beginning any of the examples in this chapter, you'll need to load and execute the script called NorthwindSecure.sql. This builds a special database we'll use throughout this chapter. You can download what you need for this at the book's Web site at www.wrox.com or at www.professionalsql.com.

Okay, so this is a chapter where I have to make you create a working database in order for the examples to work—my apologies for that. What we're going to utilize is the old Northwind database but with any changes to permissions removed. The NorthwindSecure database that we'll use throughout this chapter is a more typical database scenario—that is, it has absolutely no permissions added to it beyond what comes naturally with creating tables and objects (which means NONE). We'll learn how to deal with this and explicitly add what permissions we want as the chapter progresses.
Security Basics

I'm sure that a fair amount of what we're going to look into in this section is going to seem exceedingly stupid—I mean, won't everyone know this stuff? Judging by how often I see violations of even the simplest of these rules, I would say, "No, apparently they don't." All I can ask is that you bear with me, and don't skip ahead. As seemingly obvious as some of this stuff is, you'd be amazed how often it gets forgotten or just plain ignored.

Among the different basics that we'll look at here are:

* One person, one login ID, one password
* Password expirations
* Password length and makeup
* Number of attempts to log in
* Storage of user ID and password information

One Person, One Login, One Password

It never ceases to shock me how, everywhere I go, I almost always find that the establishment has at least one "global" user—some login into the network or particular applications that is usually known by nearly everyone in the department or even the whole company. Often, this "global" user has carte blanche (in other words, complete) access. For SQL Server, it used to be common that installations hadn't even bothered to set the sa password to something other than a blank password. This is a very bad scenario indeed.

Prior to SQL Server 2000, the default password for the sa account was null—that is, it didn't have one. Thankfully, SQL Server 2000 changed this default. SQL Server will now, by default, not allow you to use a weak password (depending on your Windows policy settings), and, assuming your Windows policy settings allow a blank password, SQL Server will proactively tell you that you are effectively being an idiot if you insist on making it blank. The thing to watch out for is that, while you're developing, it's really common to still set it to something "easy." You still need to remember to change it before you go into production or to make it something hard from the beginning if your development server is going to be exposed directly to the Internet or some other non-trustworthy access.

Even now, when most installations do have something other than a null password, it is very common for lots of people to know what that password is.

The first basic, then, is that if everyone has access to a user ID that is essentially anonymous (if everyone knows it, it could be that anyone has used it) and has access to everything, then you've defeated your security model entirely. Likewise, if you give every user a login that has full access to everything, you've again severely damaged your security prospects. The only real benefit that's left is being able to tell who's who as far as who is connected at any point in time (assuming that they are really using their individual login rather than the global login).

Users that have carte blanche access should be limited to just one or two people. Ideally, if you need passwords for such carte blanche access, then you would want separate logins that each have the access, but only one person would know the password for each login.

You'll find that users will often share their passwords with someone else in order to let someone temporarily gain some level of access (usually because the owner of the login ID is either out of the office or doesn't have time to bother with doing it themselves at the time). You should make this nothing short of a hanging offense if possible.

The problem created by password sharing is multifold.
First, some users are getting access to something that you previously decided not to give them (otherwise, why don't they have the necessary rights for themselves?). If you didn't want them to have that access before, why do you want them to have it now? Second, a user that's not supposed to have access probably will now have that access semi-permanently. Since users almost never change their passwords (unless forced to), the person they gave the password to will probably be able to use that login ID indefinitely and, I assure you, they will! Third, you again lose auditing. You may have something that tracks which user did what based on the login ID. If more than one person has the password for that login ID, how can you be sure which person was logged in to that login ID at the time?

This means that if someone is going to be out of the office for some time, perhaps because he is sick or on vacation, and someone else is temporarily going to be doing his job, a new login ID and password should be created specifically for that replacement person (or the access rights of the replacement's existing login ID should be modified), and it should be deleted as soon as the original person has returned.

To summarize, stay away from global user accounts whenever possible. If you must have them, keep their use limited to as few people as possible. Usually this should be kept to just two (one to be a main user, and one person as a backup if the first person isn't available). If you really must have more than one person with significant access, then consider creating multiple accounts (one per user) that have the necessary level of access. By following these simple steps, you'll do a lot for both the security and auditability of the system.

Password Expiration

Using expiration of passwords tends to be either abused or ignored. That's because it's a good idea that often goes bad.

The principle behind password expiration is to set up your system to have passwords that automatically expire after a certain period of time. After that time, the user must change the password to continue to have access to the account. The concept has been around many years, and if you work in a larger corporation, there's a good chance that the auditors from your accounting firm are already insisting that you implement some form of password expiration (no, it's not just your IT department being controlling—they may well have been forced to a given policy by the same people who audit your financial statements).

With SQL Server 2005 and later, you can enforce your Windows password policies even for your SQL Server–specific logins. Alternatively, you can just use Windows-based security (more on that in the next section).

What Do You Get for Your Effort?

So, what does password expiration get you? Well, remember that, in the final part of a previous section, I said that once a password is shared, the user would have that access forever? Well, this is the exception. If you expire passwords, then you refresh the level of your security—at least temporarily. The password would have to be shared a second time in order for the user to regain access. While this is far from foolproof (often, the owner of the login ID will be more than happy to share it again), it does deal with the situation where the sharing of the password was really just intended for one-time use.
Often, users who share their passwords don't even realize that months later the other user still has the password and may be using it on occasion to gain access to something they would not otherwise have access to, based on their own security.

Now the Bad News

It is very possible to get too much of a good thing. I mentioned earlier how many audit firms will expect their clients to implement a model where a user's password regularly expires, say, every 30 days. This is a very bad idea indeed.

Every installation that I've seen that does this—without exception—has worse security after implementing a 30-day expiration policy. The problem is, as you might expect, multifold in nature.

* First, technical support calls go way up. When users change passwords that often, they simply can't memorize them all. They can't remember which month's password they are supposed to use, so they are constantly calling for support to reset the password because they forgot what it is.
* Second, and much more important, the users get tired of both thinking of new passwords and remembering them. Experience has shown me that more than 90 percent of the users I've worked with in installations that use a 30-day expiration change their passwords to incredibly predictable (and therefore hackable) words or word/number combinations. Indeed, this often gets to a level where perhaps 50 percent or more of your users will have the same password—they are all using things like MMMYY where MMM is the month and YY is the year. For example, for January 1996 they might have used JAN96 for their password. Pretty soon, everyone in the place is doing something like that.

I've seen some companies try and deal with this by implementing something of a password sniffer; it checks the password when you go to change it. The sniffing process looks for passwords that incorporate your name or start with a month prefix. These mechanisms are weak at best.

Users are far smarter than you often give them credit for. It took about a week for most users to circumvent the first one of these password sniffers I saw; they simply changed their passwords to have an "X" prefix on them, and otherwise stayed with the same MMMYY format they had been using before. In short, the sniffer wound up doing next to nothing. It doesn't stop there, though: users share their newfound algorithm with coworkers so they can get around the "problem" too.

The bottom line here is to not get carried away with your expiration policy. Make it short enough to get reasonable turnover and deal with shared or stolen passwords, but don't make it so frequent that users rebel and start using weak passwords. Personally, I suggest nothing more frequent than 90 days and nothing longer than 180 days.

Password Length and Makeup

Ah, an era of rejoicing for SQL Server in this area. In previous versions, you really didn't have much control over this if you were using SQL Server security. You can now have SQL Server enforce your Windows password policy (which you can adjust using utilities in Windows).

Password Length

Realize that, for each alphanumeric character the user includes in the password, they are increasing the number of possible passwords by a factor of at least 36 (really a few more given special characters, but even 36 is enough to make the point here). That means there are only 36 possible single-character passwords, but 1,296 possible two-character passwords. Go up to three characters, and you increase the possibilities to 46,656.
By the time you add a fourth character, you're well over a million possibilities. The permutations just keep going up as you require more and more characters. The downside, though, is that it becomes more and more difficult for your users to think up passwords and actually remember them. Indeed, I suspect that you'll find that requiring anything more than 5 or 6 characters will generate a full-scale revolt from your end users.

Password Makeup

All right, so I've pointed out that, if you make it a requirement to use at least four alphanumeric characters, you've created a situation where there are over a million possible password combinations. The problem comes when you realize that people aren't really going to use all those combinations; they are going to use words or names that they are familiar with. Considering that the average person only uses about 5,000 words on a regular basis, that doesn't leave you with very many words to try out if you're a hacker.

If you're implementing something other than the default Windows password policy, then consider requiring that at least one character be alphabetic in nature (no numbers, just letters) and that at least one character be numeric. This rules out simple numbers that are easy to guess (people really like to use their Social Security number, telephone number, or birthdays) and all words. The users can still create things that are easy to remember for them—say "77pizzas"—but the password can't be pulled out of a dictionary. Any hacker is forced to truly try each permutation in order to try and break in.

Number of Tries to Log In

Regardless of how you're physically storing the user and password information, your login screen should have logic to it that limits the number of tries that someone gets to log in. The response if they go over the limit can range in strength, but you want to make sure you throw in some sort of device that makes it difficult to set up a routine to try out all the passwords programmatically.

How many tries to allow isn't really that important as long as it's a reasonably small number. I usually use three times, but I've seen four and five in some places and that's fine too.

If you're utilizing the Windows password policy enforcement, then SQL Server will check the login attempts versus a bad password limit and enforce that policy.

Storage of User and Password Information

This obviously applies only if you are cooking your own security system rather than using the built-in Windows and/or SQL Server security systems (but many Web applications will do that), and, for the most part, there's no rocket science in how to store user profile and password information. There are, however, a few things to think about:

* Since you need to be able to get at the information initially, you will have to do one of the following three things:
  * Compile a password right into the client application or component (and then make sure that the proper login and password are created on any server that you install your application on).
  * Utilize SQL Server's encryption technologies to encrypt and decrypt the data in the database.
  * Require something of a double password situation—one to get the user as far as the regular password information, and one to get them to the real application. Forcing a user into two logins is generally unacceptable, which pushes you back to one of the other two options in most cases.
* If you go with a double password scenario, you'll want the access for the first login to be limited to just a stored procedure execution if possible. By doing this, you can allow the first login to obtain the validation that it needs while not revealing anything to anyone that tries to log in through Management Studio. Have your stored procedure (sproc) accept a user ID and password, and simply pass back either a Boolean (true/false that they can log in) or a recordset that lists what screens and functions the user can see at the client end. If you use a raw SELECT statement, then you won't be able to restrict what they can see.

One solution I've implemented close to this scenario was to have a view that mapped the current SQL Server login to other login information. In this case, an application role was used that gave the application complete access to everything. The application had to know what the user could and couldn't do. All the user's login had a right to do was execute a stored procedure to request a listing of their rights. The sproc looked something like this (this is just pseudo-code, so don't try and actually execute this):

CREATE PROC GetUserRights
AS
-- Figure out who is actually connected...
DECLARE @User varchar(128)
SELECT @User = USER_NAME()
-- ...and hand back only that user's rights.
SELECT * FROM UserPermissions WHERE LoginID = @User

* If you're going to store password information in the system—encrypt it!!! I can't say enough about the importance of this. Most users will use their passwords for more than one thing; it just makes life a lot easier when you have less to remember. By encrypting the data before you put it in the database, you ensure that no one is going to stumble across a user's password information—even accidentally. They may see it, but what they see is not usable unless they have the key to decrypt it.

What form of encryption to use is up to you. You can utilize the built-in encryption methods (we'll discuss some of these later in the chapter), or you can implement your own encryption at the application level. One way or the other, there is little excuse for not properly protecting password information.

Personally, I am a big believer in one-way encryption. That is, once it's encrypted, there really isn't any reasonable way to decrypt it. If a user loses their password, then they need to go through some form of reset mechanism and choose a new password. Why do I feel this way? Well, realize that most users will reuse the same password for many applications, so the password they use to get into your system may very well be the same password they use to get into their personal online banking system. Creating a one-way encryption system minimizes the risk that an administrator of your system is able to get at users' passwords for nefarious use.
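To make the one-way idea concrete, here is a minimal sketch using the built-in HASHBYTES function. Everything here other than HASHBYTES itself (the variable, the algorithm choice, how you would store the result) is an illustrative assumption rather than a production design; in particular, a real system would also mix a per-user salt into the value before hashing:

-- A sketch only: compute a one-way hash to store instead of the password.
-- SHA1 is what HASHBYTES supports through SQL Server 2008; later versions
-- add stronger algorithms such as SHA2_256.
DECLARE @Password nvarchar(128)
SET @Password = N'77pizzas'
SELECT HASHBYTES('SHA1', @Password) AS PasswordHash

At login time, you hash whatever the user typed and compare it to the stored hash; the plain-text password never needs to be stored or decrypted.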
Security Options

As far as built-in options go, you have two choices in how to set up security under SQL Server.

* Windows Integrated Security: The user logs in to Windows, not SQL Server. Authentication is done via Windows with trusted connections.
* Standard Security: The user logs in to SQL Server separately from logging in to Windows. Authentication is done using SQL Server.

Let's take a look at both.

SQL Server Security

We'll start with SQL Server's built-in login model. This was a security black hole for a very long time, but got substantially more robust in SQL Server 2005. The relatively simplistic model is still available, but there is now tons more you can do to add extra touches to just how secure your server and databases are.

With SQL Server security, you create a login ID that is completely separate from your network login information. Some of the pros for using SQL Server security include:

* The user doesn't necessarily have to be a domain user in order to gain access to the system.
* It's easier to gain programmatic control over the user information.

Some of the cons are:

* Your users may have to log in twice or more—once into whatever network access they have, and once into the SQL Server for each connection they create from a separate application.
* Two logins mean more maintenance for your DBA.
* If multiple passwords are required, they can easily get out of synch, and that leads to an awful lot of failed logins or forgotten passwords. (Does this sound familiar, "Let's see now, which one was it for this login?")

An example of logging in using SQL Server security would be the use of the sa account that you've probably been using for much of this book. It doesn't matter how you've logged in to your network, you log in to the SQL Server using a login ID of sa and a separate password (which you've hopefully set to something very secure).

On an ongoing basis, you really don't want to be doing things day-to-day logged in as sa. Why? Well, it will probably only take you a minute or two of thought to figure out many of the terrible things you can do by sheer accident when you're using the sa account (or any other account with system administrator access for that matter). Using sa means you have complete access to everything; that means the DROP TABLE statement you execute when you are in the wrong database will actually do what you told it—drop that table!!! About all you'll be left to say is "oops!" Your boss will probably be saying something completely different.

Even if you do want to always have carte blanche access, just use the sa account to make your regular user account a member of the sysadmin server role. That gives you the power of sa, but gains you the extra security of separate passwords and the audit trail (in Profiler or when looking at system activity) of who is currently logged in to the system.

Creating and Managing Logins

There are currently four major ways to create logins on a SQL Server:

* By using CREATE LOGIN
* By using the Management Studio
* SQL Management Objects (SMO)
* By using one of the several other options that remain solely for backward compatibility

CREATE LOGIN

CREATE LOGIN was added in SQL Server 2005 as part of a general effort by Microsoft to standardize the syntax used to create database and server objects. It deprecated the older sp_addlogin, which was the procedural way of adding logins in prior versions, and looks like the CREATE syntax that we've seen repeatedly in SQL but with some of the extra option requirements that we've seen with things like stored procedures.

The most basic syntax is straightforward, but how the options can be mixed can become something of a pain to understand. The overall syntax looks like this:

CREATE LOGIN <login_name>
[ { WITH
PASSWORD = '<password>' [ HASHED ] [ MUST_CHANGE ]
[, SID = <sid>
| DEFAULT_DATABASE = <database>
| DEFAULT_LANGUAGE = <language>
| CHECK_EXPIRATION = { ON | OFF }
| CHECK_POLICY = { ON | OFF }
| CREDENTIAL = <credential_name>
[, ...n ] ]
}
| { FROM
WINDOWS
[ WITH DEFAULT_DATABASE = <database>
| DEFAULT_LANGUAGE = <language> ]
| CERTIFICATE <certificate_name>
| ASYMMETRIC KEY <asym_key_name>
}
]

The key part that sets the tone for things is the choice of a FROM versus a WITH clause immediately following the login name, so let's look at those along with the options as they are relevant to either the FROM or WITH clause they belong to.

CREATE LOGIN... WITH

The WITH clause immediately puts you into defining options that go with SQL Server authentication–based logins as opposed to any other authentication method. It is only relevant if you have SQL Server security enabled (as opposed to just Windows authentication). The number of options here can seem daunting, so let's break them down.

Option | Description
---|---
PASSWORD | This is, of course, just what it sounds like. The tricky part of this is the question of whether the password is in clear text (in which case SQL Server will encrypt it as it adds it) or whether it is already hashed (in which case you need to supply the HASHED keyword that is covered next).
HASHED | This follows your password, and is used only if the password you supplied was already hashed (encrypted). In that case, SQL Server adds the password without re-encrypting it.
MUST_CHANGE | This is another one of those "is what it sounds like" things. In short, if you supply this option, then the users will be prompted to change their password the first time they log in.
SID | Allows you to manually specify what GUID SQL Server will use to identify this login. If you don't supply this (and doing so is something I would consider to be an extreme case), then SQL Server will generate one for you.
DEFAULT_DATABASE | This is the database that will be made current each time the user logs in.
DEFAULT_LANGUAGE | This is the language that things like errors and other system messages will be delivered in for the user.
CHECK_EXPIRATION | Sets whether SQL Server will enforce the password expiration policy. By default, the password will not expire. Setting this to ON will enforce policy.
CHECK_POLICY | Sets whether SQL Server will enforce the password policy (length, character requirements, and so on). By default, the password must meet the Windows password policy. Setting this to OFF will allow virtually any password to be used.
CREDENTIAL | This names a credential (and we'll cover what these are later) for this login to be mapped to. In short, this maps this login to a set of permissions that may allow them to perform actions outside of SQL Server (such as network access and such).

Any of these can be mixed together, and the order in which you provide them matters only in the case of HASHED and MUST_CHANGE (which must follow the PASSWORD option if you're going to utilize them at all).

CREATE LOGIN... FROM

The FROM clause implies that this login isn't SQL Server–specific. The FROM clause specifies the source of that login. The source falls into a few different categories:

* WINDOWS: In this case, we are mapping to an existing Windows login or group. This is basically saying "Take this existing Windows user or group, and give them rights to my SQL Server." Notice that I say "or group." You can map SQL Server to a Windows group, and that implies that any member of that group will be granted that level of access to your SQL Server. This is really handy for managing users in your network.
For example, if you want everyone in accounting to have a certain set of rights in SQL Server, you could create a Windows group called Accounting and map that to a SQL Server login. If you hire someone new, then as soon as you add them to the Accounting group they will have access not only to whatever Windows resources the Accounting group has, but also to all the SQL Server permissions that the Accounting group has.

If you use Windows as your FROM source, then you can also supply a WITH clause similar to a SQL Server–based login, but limited to just the default database and language.

* CERTIFICATE: This kind of login is based on an X.509 certificate that you've already associated with your server by using the CREATE CERTIFICATE command. Certificates can be used in several different ways, but in the end, they essentially serve as a recognized secure encryption key. SQL Server has its own "certificate authority" or can import those generated from other sources. Essentially, presentation of this certificate serves as authorization to log in to the SQL Server.
* ASYMMETRIC KEY: Asymmetric keys are a different flavor of the same general notion that certificates work under. Essentially, it is a key that is presented that SQL Server trusts, and therefore it grants access. Asymmetric keys are merely a different method of presenting a secure key.

To prepare for the examples we'll use throughout the rest of this chapter, you'll need to set up a user in Windows that we'll grant and remove access for over the course of the chapter. I've named my test user TestAccount, but you can substitute another name as you see fit (just make sure you remember to also substitute it in the chapter examples). Once you have an account to test with set up in Windows, try adding it to SQL Server (again, you'll need to change "HOBBES" to the name of your system):

CREATE LOGIN [HOBBES\TestAccount] FROM WINDOWS
WITH DEFAULT_DATABASE = NorthwindSecure;

And our test account now has login rights to the SQL Server. Note, however, that even though we've defaulted our TestAccount to the NorthwindSecure database, the account still does not have access to that database (we'll get to that shortly).

ALTER LOGIN

As with most CREATE statements we've seen in SQL, CREATE LOGIN has a complementing statement in the form of ALTER LOGIN. As with most ALTER statements, the syntax is primarily a subset of the options found in the related CREATE statement:

ALTER LOGIN <login_name>
[ { ENABLE | DISABLE } ]
[ { WITH
PASSWORD = '<password>'
[ { OLD_PASSWORD = '<old_password>'
| [ UNLOCK ] [ MUST_CHANGE ] } ]
| DEFAULT_DATABASE = <database>
| DEFAULT_LANGUAGE = <language>
| NAME = <new_login_name>
| CHECK_EXPIRATION = { ON | OFF }
| CHECK_POLICY = { ON | OFF }
| CREDENTIAL = <credential_name>
| NO CREDENTIAL
} ]

Most of these are exactly the same as they were with the CREATE statement, but let's look at the few differences.

Option | Description
---|---
ENABLE / DISABLE | Enables or disables the login. This is something of an indicator of whether or not the login is considered active in the system, and ENABLE should not be confused with UNLOCK (they are different things). Disabling a login leaves it in place but disallows use of the login. Enabling reactivates the login.
OLD_PASSWORD | This one applies only if a given login is utilizing ALTER LOGIN to change its own password. Security administrators with rights to change passwords are unlikely to know the old password, and they have the right to set a new password without knowing the old one.
UNLOCK | This allows a user to attempt to log in again after the login has been locked out due to exceeding the bad password count.
NAME | This allows you to change the login name, while otherwise retaining all of the old rights and other properties of the login.
NO CREDENTIAL | This disassociates the login with whatever credential it may have previously been mapped to.
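Since the examples so far have used a Windows login, here is a quick sketch of the SQL Server authentication side, tying several of the WITH options together. The login name and passwords are illustrative assumptions only (and this presumes SQL Server authentication is enabled on your server):

-- Illustrative only: a SQL Server authentication login with
-- Windows policy enforcement turned on.
CREATE LOGIN TestSQLAccount
WITH PASSWORD = 'Str0ng!Passw0rd',
DEFAULT_DATABASE = NorthwindSecure,
CHECK_EXPIRATION = ON,
CHECK_POLICY = ON;

-- Later, reset the password and force a change at the next login.
-- MUST_CHANGE requires CHECK_EXPIRATION (and CHECK_POLICY) to be ON.
ALTER LOGIN TestSQLAccount
WITH PASSWORD = 'An0ther!Passw0rd' MUST_CHANGE,
CHECK_EXPIRATION = ON,
CHECK_POLICY = ON;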
DROP LOGIN

This works just like any other DROP statement in SQL Server:

DROP LOGIN <login_name>

And it's gone.

Creating a Login Using the Management Studio

Creating a login using Management Studio is fairly straightforward and is much the same as it is for most other objects in SQL Server. Just navigate to the appropriate node in the Object Explorer (in this case, Security⇒Logins), right-click, and choose New Login. This gets us the typical CREATE dialog that we've seen repeatedly in this book, but adjusted for the properties that are appropriate for a login (all the same things we reviewed in the "CREATE LOGIN" section earlier in the chapter, plus a number of additional areas we have yet to take a look at), as shown in Figure 19.1.

Figure 19.1

Only this first set of properties (the General properties) maps to the CREATE LOGIN syntax. The additional tabs map to other objects we will be creating as we continue through the chapter.

We will be reviewing several other kinds of objects that get associated with logins in some fashion. For now, the thing to notice is how the user interface in Management Studio lets you do everything at once. As we'll see as we continue the chapter, when creating these objects using code, we have to do each step separately rather than all at once as Management Studio offers. (As you might imagine, it's really just collecting all the necessary information in advance and then issuing all those individual programmatic steps for us.)

SQL Management Objects

This is largely out of scope for this chapter (we cover SMO in its own chapter later on), but I did want to specifically point out that SMO can create logins for you using a straightforward object model as opposed to the CREATE statement approach. See Chapter 23 for more information.

Legacy Options

There are a couple of older options of significance when considering the way that logins have been created in past versions of SQL Server.

* sp_addlogin and Related Sprocs: This was a stored procedure that essentially maps to CREATE LOGIN except that several parts of the CREATE LOGIN statement implement things that were not supported prior to SQL Server 2005. The basics (creating the typical login as opposed to the certificate or asymmetric key approach) are all there though. We'll take a more detailed look at sp_addlogin shortly.
* WMI: Windows Management Instrumentation is an implementation of an industry-standard Web management protocol. When SQL Server 2000 first came out, the thinking was that a WMI-based model was going to take over as the primary way of automating SQL Server management. In the end, there was no WMI-based model implemented that came anywhere close to being up to the task of exposing all the things we need in SQL Server, and that effort would seem to have been largely junked. WMI is now outside the scope of this book, but realize that it's out there and remains an option if you need to manage older versions of SQL Server or are familiar with WMI for other purposes and want to add SQL Server scripting into your larger WMI plan.
A Quick Look at sp_addlogin

This sproc does exactly what it says, and it was the old way of implementing the things that CREATE LOGIN does for us today. While I highly recommend avoiding sp_addlogin for new development, it is still in wide use in legacy code. It requires only one parameter, but most of the time you'll use two or three. There are a couple of additional parameters, but you'll find that you use those far more rarely. The syntax looks like this:

EXEC sp_addlogin [@loginame =] <'login'>
[,[@passwd =] <'password'>]
[,[@defdb =] <'database'>]
[,[@deflanguage =] <'language'>]
[,[@sid =] 'sid']
[,[@encryptopt =] <'encryption_option'>]

Parameter | Description
---|---
@loginame | Just what it sounds like—this is the login ID that will be used.
@passwd | Even more what it sounds like—the password that is used to log in using the aforementioned login ID.
@defdb | The default database. This defines what is the first "current" database when the user logs in. Normally, this will be the main database your application uses. If left unspecified, the default will be the master database (you usually don't want that, so be sure to provide this parameter).
@deflanguage | The default language for this user. You can use this to override the system default if you are supporting localization.
@sid | A binary number that becomes the security identifier (SID) for your login ID. If you don't supply an SID, SQL Server generates one for you. Since SIDs must be unique, any SID you supply must not already exist in the system. Using a specific SID can be handy when you are restoring your database to a different server or are otherwise migrating login information.
@encryptopt | The user's login ID and password information is stored in the master database. The @encryptopt determines whether or not the password stored in the master database is encrypted. By default (or if you provide a NULL in this parameter), the password is indeed encrypted. The other options are skip_encryption, which does just what it says—the password is not encrypted, and skip_encryption_old, which is there only for backward compatibility, and should not be used.

As you can see, most of the items here map directly to CREATE LOGIN, and that is the way I recommend doing things unless you need to utilize sp_addlogin for backward-compatibility reasons.

sp_password

Since we've looked at sp_addlogin, we ought to look at sp_password. While ALTER LOGIN gives you the ability to address password maintenance on a login (and it is what you should be using), sp_addlogin had no such functionality—sp_password takes care of that. The syntax is pretty straightforward:

sp_password [[@old =] <'old password'>,]
[@new =] <'new password'>
[,[@loginame =] <'login'>]

The new and old password parameters work, of course, just exactly as you would expect. You need to accept those from the user and pass them into the sproc. Note, however, that the login is an optional parameter. If you don't supply it, then it will assume that you want to change the password on the login used for the current connection. Note that sp_password cannot be executed as part of a transaction.

You might be thinking something like, "Don't most systems require you to enter the new password twice?" Indeed they do. So the follow-up question is, "How come sp_password doesn't do that?" The answer is a simple one—because SQL Server leaves that up to you.
You would include the logic to check for a double entry of the new password in your client application before you ever got as far as using sp_password. This same issue exists for ALTER LOGIN.

sp_grantlogin

This simulates the CREATE LOGIN...FROM functionality as it relates to Windows logins (prior to SQL Server 2005, mapping from certificates and asymmetric keys did not exist as it does now). The syntax is straightforward:

sp_grantlogin [@loginame =] '<domain>\<user>'

Again, this is for backward compatibility only. Use the CREATE LOGIN...FROM syntax for 2005 and later installations (which should be the vast majority of new code at this point).

Windows Authentication

Windows authentication gives us the capability to map logins from trusted Windows domains into our SQL Server.

It is simply a model where you take existing Windows domain user accounts or groups and provide SQL Server rights to them directly rather than forcing users to keep separate passwords and make separate logins.

Windows authentication allows:

* Maintenance of a user's access from just one place
* Granting of SQL Server rights simply by adding a user to a Windows group (this means that you often don't have to even go into SQL Server in order to grant access to a user)
* Users to remember only one password and login

That being said, let's take a look at how to grant specific rights to specific users.

User Permissions

The simplest definition of what a user permission is would be something like, "what a user can and can't do." In this case, the simple definition is a pretty good one.

User permissions fall into three categories:

* Permission to log in
* Permission to access a specific database
* Permission to perform specific actions on particular objects within that database

Since we've already looked at creating logins, we'll focus here on the specific permissions that a login can have.

Granting Access to a Specific Database

The first thing that you need to do if you want a user to have access to a database is to grant the user permission to access that database. This can be done in Management Studio by adding the user to the Users node of the database in question (under the Databases node of your server). To add a user using T-SQL, you should use CREATE USER. Similar to sp_addlogin, there is also, for backward compatibility, the sp_grantdbaccess stored procedure.

Note that as you CREATE a user in the database, those permissions are actually stored in the database and mapped to the server's identifier for that user. As you restore a database, you may have to remap user rights to the server identifiers on the server where you restored the database.

CREATE USER

The CREATE USER command adds a new user to the database. That user can be sourced from an existing login, certificate, or asymmetric key, or can be local to just the current database. The syntax looks like this:

CREATE USER <user_name>
[ { { FOR | FROM }
{
LOGIN <login_name>
| CERTIFICATE <certificate_name>
| ASYMMETRIC KEY <asym_key_name>
}
| WITHOUT LOGIN } ]
[ WITH DEFAULT_SCHEMA = <schema_name> ]

Let's take a quick look at what some of these elements mean:

Option | Description
---|---
LOGIN | The name of the login you want to grant access to for the current database.
CERTIFICATE | Logical name of the certificate to be associated with this user. Note that the certificate must have already been created using the CREATE CERTIFICATE command.
ASYMMETRIC KEY | Logical name of the asymmetric key to be associated with this user. Note that the key must have already been created using the CREATE ASYMMETRIC KEY command.
WITHOUT LOGIN | Creates a user that is local to the current database. It can be used to set up a specific security context but cannot be mapped to a login outside of the current database, nor can it access any other database.
WITH DEFAULT_SCHEMA | Establishes a schema other than the default "dbo" as being the default schema for the current user.

So, to grant access to our NorthwindSecure database for our TestAccount, we would issue a command such as:

CREATE USER [HOBBES\TestAccount]
FOR LOGIN [HOBBES\TestAccount]
WITH DEFAULT_SCHEMA = dbo;

This grants our login access to the specified database (NorthwindSecure in this case) and sets that user's default schema to dbo.
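The WITHOUT LOGIN option from the table above is easy to overlook, so here's a minimal sketch of it as well. The user name is an illustrative assumption; the typical use is to give a piece of code a restricted security context to run under:

-- Illustrative only: a database user with no server login behind it.
USE NorthwindSecure;
CREATE USER ReportingContext WITHOUT LOGIN
WITH DEFAULT_SCHEMA = dbo;
-- Code can then impersonate it for a tightly scoped set of rights:
-- EXECUTE AS USER = 'ReportingContext'; ... REVERT;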
Granting Object Permissions within the Database

Okay, so the user has a login and access to the database you want him or her to have access to, so now everything's done—right? Ah, if only it were that simple! We are, of course, not done yet.

SQL Server gives us a pretty fine degree of control over what our users can access. Most of the time, you have some information that you want your users to be able to get to, but you also have other information in the database to which you don't want them to have access. For example, you might have a customer service person who has to be able to look at and maintain order information—but you probably don't want them messing around with the salary information. The opposite is also probably true—you need your human resource people to be able to edit employee records, but you probably don't want them giving somebody a major discount on a sale.

SQL Server allows you to assign a separate set of rights to some of the different objects within SQL Server. The objects you can assign rights to include tables, views, and stored procedures. Triggers are implied to have the rights of the person that created them.

User rights on objects fall into six different types:

User Right | Description
---|---
SELECT | Allows a user to "see" the data. If a user has this permission, the user has the right to run a SELECT statement against the table or view on which the permission is granted.
INSERT | Allows a user to create new data. Users with this permission can run an INSERT statement. Note that, unlike many systems, having INSERT capability does not necessarily mean that you have SELECT rights.
UPDATE | Allows a user to modify existing data. Users with this permission can run an UPDATE statement. Like the INSERT statement, having UPDATE capability does not necessarily mean that you have SELECT rights.
DELETE | Allows a user to delete data. Users with this permission can run a DELETE statement. Again, having DELETE capability does not necessarily mean that you have SELECT rights.
REFERENCES | Allows a user to insert rows into a table that has a foreign key constraint referencing another table to which that user doesn't have SELECT rights.
EXECUTE | Allows a user to EXECUTE a specified stored procedure.

You can mix and match these rights as needed on the particular table, view, or sproc to which you're assigning rights.

You can assign these rights in Management Studio simply by navigating to the Logins option of the Security node of your server. Just right-click the user and choose Properties. You'll be presented with a different dialog depending on whether you're in the database or security node, but, in either case, you'll have the option of setting permissions. Assigning rights using T-SQL uses three commands that are good to know even if you're only going to assign rights through Management Studio (the terminology is the same).

GRANT

GRANT gives the specified user or role the access specified for the object that is the subject of the GRANT statement.

The syntax for a GRANT statement looks like this:

GRANT
   ALL [PRIVILEGES] | <permission>[,...n]
ON
   <table or view>[(<column>[,...n])]
   | <stored procedure>
TO <login ID or role>[,...n]
[WITH GRANT OPTION]
[AS <role>]
The ALL keyword indicates that you want to grant all the rights that are applicable for that object type (EXECUTE never applies to a table). If you don't use the ALL keyword, then you need to supply one or more specific permissions that you want granted for that object.

PRIVILEGES is a keyword that has no real function other than to provide ANSI/ISO compatibility.

The ON keyword serves as a placeholder to say that what comes next is the object for which you want the permissions granted. Note that, if you are granting rights on a table, you can specify permissions down to the column level by supplying a column list to be affected—if you don't supply specific columns, then it's assumed to affect all columns.

Microsoft appears to have done something of an about-face in their opinion of column-level permissions. Being able to say that a user can do a SELECT on a particular table but only on certain columns seems like a cool idea, but it really convolutes the security process, both in its use and in the work it takes Microsoft to implement it. As such, literature on the subject over the last several years has sometimes said little and sometimes seemed to indicate that Microsoft wishes column-level security would go away. They have occasionally recommended against its use (and other times seemed to offer no opinion)—if you need to restrict a user to seeing particular columns, consider using a view instead.
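To make the alternatives concrete, here is a hedged sketch of both approaches (the view name and column choices are hypothetical; adjust them to your schema). The first statement grants SELECT on just two columns; the second pair hides the remaining columns behind a view instead:

-- Column-level grant (works, but convolutes the security picture)
GRANT SELECT ON Employees (EmployeeID, LastName) TO [HOBBES\TestAccount];

-- The view alternative: expose only the safe columns, then grant on the view
CREATE VIEW EmployeesPublic
AS
SELECT EmployeeID, LastName, FirstName, Title
FROM Employees;
GO
GRANT SELECT ON EmployeesPublic TO [HOBBES\TestAccount];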
The TO statement does what you would expect: it specifies those to whom you want this access granted. It can be a login ID or a role name.

WITH GRANT OPTION allows the user that you're granting access to, in turn, to also grant access to other users.

I recommend against the use of this option, since it can quickly become a pain to keep track of who has granted access to what. Sure, you can always go into Management Studio and look at the permissions for that object, but then you're in a reactive mode rather than a proactive one—you're looking for what's wrong with the current access levels rather than stopping unwanted access up front.

Last, but not least, is the AS keyword. This one deals with the issue of a login belonging to multiple roles.

Now, we can go ahead and move on to an example or two. We'll see later that the TestAccount that we created already has some access based on being a member of the Public role—something that every database user belongs to, and from which you can't remove them. There are, however, a large number of items to which TestAccount doesn't have access (because Public is the only role it belongs to, and Public doesn't have rights either).

Start by logging in with the TestAccount user. Then try a SELECT statement against the Region table:

SELECT * FROM Region;

You'll quickly get a message from SQL Server telling you that you are a scoundrel and are attempting to go places that you shouldn't be going:

Server: Msg 229, Level 14, State 5, Line 1
SELECT permission denied on object 'Region', database 'NorthwindSecure', owner 'dbo'.

Log in separately as sa—you can do this in the same instance of Management Studio if you like by choosing the File ⇒ Connect menu choice. Then select SQL Server authentication for the new connection and log in as sa with the appropriate password. Now execute a GRANT statement:

USE NorthwindSecure;
GRANT SELECT ON Region TO [HOBBES\TestAccount];

Note that you'll need to replace "HOBBES" with the name of your computer or domain as appropriate.

Now switch back to the TestAccount connection (remember, the information for which user you're connected as is in the title bar of the connection window), and try that SELECT statement again. This time, you get better results:

RegionID RegionDescription
-------- -----------------
1 Eastern
2 Western
3 Northern
4 Southern

(4 row(s) affected)

Let's go ahead and try another one. This time, let's run the same tests and commands against the EmployeeTerritories table:

SELECT * FROM EmployeeTerritories;

This one fails—again, you don't have rights to it, so let's grant the rights to this table:

USE NorthwindSecure;
GRANT SELECT ON EmployeeTerritories TO [HOBBES\TestAccount];

Now, if you re-run the SELECT statement, things work just fine:

EmployeeID TerritoryID
---------- -----------
1 06897
1 19713
...
...
...
9 48304
9 55113
9 55439

(49 row(s) affected)

To add an additional twist, however, let's try an INSERT into this table:

INSERT INTO EmployeeTerritories
VALUES
(1, '01581');

SQL Server wastes no time in telling us to get lost. We don't have the required permissions, so let's grant them (using the sa connection):

USE NorthwindSecure;
GRANT INSERT ON EmployeeTerritories TO [HOBBES\TestAccount];

Now try that INSERT statement again:

INSERT INTO EmployeeTerritories
VALUES
(1, '01581');

Everything works great.

DENY

DENY explicitly prevents the user from getting the access specified on the targeted object. The key to DENY is that it overrides any GRANT statements. Since a user can belong to multiple roles (discussed shortly), it's possible for a user to be part of a role that's granted access but also have a DENY in effect. If a DENY and a GRANT both exist in a user's mix of individual and role-based rights, then the DENY wins every time. In short, if the user or any role the user belongs to has a DENY for the right in question, then the user will not be able to make use of that access on that object.

The syntax looks an awful lot like the GRANT statement:

DENY
   [ALL] [PRIVILEGES] | <permission>[,...n]
ON
   <table or view>[(<column>[,...n])]
   | <stored procedure>
TO <login ID or role>[,...n]
[CASCADE]
Again, the ALL keyword indicates that you want to deny all the rights that are applicable for that object type (EXECUTE never applies to a table). If you don't use the ALL keyword, then you need to supply one or more specific permissions that you want to be denied for that object.

Note that the ALL keyword is now included solely for backward compatibility. It's also important to understand that ALL no longer truly affects "all" privileges. While it does affect most mainstream privileges (such as SELECT), there is, as ALL becomes more out of date, an ever-increasing list of privileges not affected by ALL.

PRIVILEGES is again a keyword with no real function other than to provide ISO compatibility.

The ON keyword serves as a placeholder to say that what comes next is the object on which you want the permissions denied.

Everything has worked pretty much the same as with a GRANT statement until now. The CASCADE keyword matches up with the WITH GRANT OPTION that was in the GRANT statement. CASCADE tells SQL Server that you want to also deny access to anyone that this user has granted access to under the rules of the WITH GRANT OPTION.

To run an example of DENY, let's try a simple SELECT statement using the TestAccount login:

USE NorthwindSecure;
SELECT * FROM Employees;

This should get you nine records or so. How did you get access when we haven't granted it to TestAccount? TestAccount belongs to Public, and Public has been granted access to Employees.

Let's say that we don't want TestAccount to have access. For whatever reason, TestAccount is the exception, and we don't want that user snooping in that data—we just issue our DENY statement (remember to issue the DENY using the sa login):

USE NorthwindSecure;
DENY ALL ON Employees TO [HOBBES\TestAccount];

When you run the SELECT statement again using TestAccount, you'll get an error. You no longer have access. Note also that, since we used the ALL keyword, the INSERT, DELETE, and UPDATE access that Public has is now also denied to TestAccount.

Again, note that ALL is deprecated, so you will receive a warning when running the previous example code. I have kept this example so you understand the breadth of the ALL keyword, which you may still find in your legacy code.

REVOKE

REVOKE eliminates the effects of a previously issued GRANT or DENY statement. Think of this one as a targeted "undo" statement.

The syntax is a mix of the GRANT and DENY statements:

REVOKE [GRANT OPTION FOR]
   [ALL] [PRIVILEGES] | <permission>[,...n]
ON
   <table or view>[(<column>[,...n])]
   | <stored procedure>
TO | FROM <login ID or role>[,...n]
[CASCADE]
[AS <role>]
The explanations here are virtually identical to those of the GRANT and DENY statements. I put them here again in case you're pulling the book back off the shelf for a quick lookup on REVOKE.

Once again, the ALL keyword indicates that you want to revoke all the rights that are applicable for that object type. If you don't use the ALL keyword, then you need to supply one or more specific permissions that you want to be revoked for that object.

PRIVILEGES still has no real function other than to provide ANSI/ISO compatibility.

The ON keyword serves as a placeholder to say that what comes next is the object on which you want the permissions revoked.

The CASCADE keyword matches up with the WITH GRANT OPTION that was in the GRANT statement. CASCADE tells SQL Server that you want to also revoke access from anyone that this user granted access to under the rules of the WITH GRANT OPTION.

The AS keyword again just specifies which role you want to issue this command based on.

Using the sa connection, let's undo the access that we granted to the Region table in NorthwindSecure:

REVOKE ALL ON Region FROM [HOBBES\TestAccount];

After executing this, our TestAccount can no longer run a SELECT statement against the Region table.

In order to remove a DENY, we also issue a REVOKE statement. This time, we'll regain access to the Employees table:

USE NorthwindSecure;
REVOKE ALL ON Employees TO [HOBBES\TestAccount];

Now that we've seen how all the commands to control access work for individual users, let's take a look at the way we can greatly simplify management of these rights by managing in groupings.

User Rights and Statement-Level Permissions

User permissions don't just stop with the objects in your database—they also extend to certain statements that aren't immediately tied to any particular object. SQL Server gives you control over permissions to run several different statements, including:

 * CREATE DATABASE
 * CREATE DEFAULT
 * CREATE PROCEDURE
 * CREATE RULE
 * CREATE TABLE
 * CREATE VIEW
 * BACKUP DATABASE
 * BACKUP LOG

At this point, we've already seen all of these commands at work except for the two backup commands—what those are about is pretty self-explanatory, so I'm not going to spend any time on them here (we'll look at them in Chapter 22)—just keep in mind that they are something you can control at the statement level.

Okay, so how do we assign these permissions? Actually, now that you've already seen GRANT, REVOKE, and DENY in action for objects, you're pretty much already schooled on statement-level permissions, too. Syntactically speaking, they work just the same as object-level permissions, except that they are even simpler (you don't have to fill in as much). The syntax looks like this:

GRANT {ALL | <statement>[,...n]} TO <login ID>[,...n]

Easy, hey? To do a quick test, let's start by verifying that our test user doesn't already have authority to CREATE. Make sure you are logged in as your TestAccount, and then run the following command. Don't forget to substitute your domain name for HOBBES in the following:

USE NorthwindSecure;

CREATE TABLE TestCreate
(
Col1 int PRIMARY KEY
);

This gets us nowhere fast:

Server: Msg 262, Level 14, State 1, Line 2
CREATE TABLE permission denied, database 'NorthwindSecure', owner 'dbo'.

Now log in to SQL Server using the sa account (or another account with dbo authority for NorthwindSecure).
Then run our command to grant permissions:

GRANT CREATE TABLE TO [HOBBES\TestAccount];

You should get confirmation that your command completed successfully. Then just try running the CREATE statement again. Remember to log back in using the TestAccount:

USE NorthwindSecure;

CREATE TABLE TestCreate
(
Col1 int PRIMARY KEY
);

This time everything works.

DENY and REVOKE also work the same way as they did for object-level permissions.

Server and Database Roles

A role is, in the most general sense, the same thing as a group in Windows; that is, it is a collection of access rights (or denials) that are automatically associated with a user when they are assigned that role.

A role is a collection of access rights that can be assigned to a user en masse simply by assigning a user to that role.

A user can belong to as few as one role or potentially to several roles at one time. This can be incredibly handy, since you can group access rights into smaller and more logical groups and then mix and match them into the formula that best fits a user.

Roles fall into two categories:

 * Server roles
 * Database roles

We'll soon see a third thing that's also called a role—though I wish that Microsoft had chosen another name—application roles. These are a special way to alias a user into a different set of permissions. An application role isn't something you assign a user to; it's a way of letting an application have a different set of rights from the user. For this reason, I don't usually think of application roles as a "role" in the true sense of the word.

Server roles are limited to those that are already built into SQL Server when it ships and are primarily there for the maintenance of the system as well as granting the capability to do non-database-specific things like creating login accounts and creating linked servers.

Much like server roles, there are a number of built-in (or "fixed") database roles, but you can also define your own database roles to meet your particular needs. Database roles are for setting up and grouping specific user rights within a single given database.

Let's look at both of these types of roles individually.

Server Roles

All server roles are "fixed" roles and are there right from the beginning; every server role you're ever going to have existed from the moment your SQL Server was installed.

Role | Nature
---|---
sysadmin | This role can perform any activity on your SQL Server. Anyone with this role is essentially the sa for that server. The creation of this server role provides Microsoft with the capability to one day eliminate the sa login—indeed, the Books Online refers to sa as being legacy in nature. It's worth noting that the Windows Administrators group on the SQL Server is automatically mapped into the sysadmin role. This means that anyone who is a member of your server's Administrators group also has sa-level access to your SQL data. You can, if you need to, remove the Windows Administrators group from the sysadmin role to tighten that security loophole.
serveradmin | This one can set server-wide configuration options or shut down the server. It's rather limited in scope, yet the functions controlled by members of this role can have a very significant impact on the performance of your server.
setupadmin | This one is limited to managing linked servers and startup procedures.
securityadmin | This one is very handy for logins that you create specifically to manage logins, read error logs, and grant CREATE DATABASE permissions. In many ways, this one is the classic system operator role—it can handle most of the day-to-day stuff but doesn't have the kind of global access that a true omnipotent superuser would have.
processadmin | Has the capability to manage processes running in SQL Server—this one can kill long-running processes if necessary.
dbcreator | Is limited to creating and altering databases.
diskadmin | Manages disk files (which filegroup things are assigned to, attaching and detaching databases, and so on).
bulkadmin | This one is something of an oddity. It is created explicitly to give rights to execute the BULK INSERT statement, which otherwise is executable only by someone with sysadmin rights. Frankly, I don't understand why this statement isn't granted with the GRANT command like everything else, but it isn't. Keep in mind that, even if a user has been added to the bulkadmin role, that just gives them access to the statement, not to the table they want to run it against. This means that, in addition to adding the user to the bulkadmin role, you need to GRANT them INSERT permissions on any table you want them to be able to perform the BULK INSERT against. In addition, you'll need to make sure they have proper SELECT access to any tables that they will be referencing in their BULK INSERT statement.

You can mix and match these roles to individual users that are responsible for administrative roles on your server. In general, I suspect that only the very largest of database shops will use more than the sysadmin and securityadmin roles, but they're still handy to have around.

Earlier in this chapter, I got into a lengthy soapbox diatribe on the evils of global users. It probably comes as no surprise to you to learn that I was positively ecstatic when the sysadmin role was added back in version 7.0. The existence of this role means that, on an ongoing basis, no one should need to use the sa login. Just let the users that need that level of access become members of the sysadmin role, and they shouldn't ever need to log in as sa. Be careful, though; having a user always have that level of access can lead to accidents (it won't, on the basis of security, stop you from dropping objects and the like). I've known many IT shops that give their administrators more than one login: one for full sysadmin access, and another "day to day" login that has the privileges they need to get most things done but limits privileges that have a high risk of being destructive. The admin can still do what they need to do, but they need to make the conscious effort to log in with the special high-access account to do the more risky activities (which means they are much more likely to be thinking about it as they do it).
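Adding a login to a server role is a one-liner. Here is a hedged sketch using the sp_addsrvrolemember system sproc (the login name is hypothetical; more recent versions also offer ALTER SERVER ROLE ... ADD MEMBER):

EXEC sp_addsrvrolemember 'HOBBES\DayToDayAdmin', 'securityadmin';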
Database Roles

Database roles are limited in scope to just one database—just because a user belongs to the db_datareader role in one database doesn't mean that it belongs to that role in another database. Database roles fall into two subcategories: fixed and user-defined.

Fixed Database Roles

Much as there are several fixed server roles, there are also a number of fixed database roles. Some of them have a special predefined purpose that cannot be duplicated using normal statements (that is, you cannot create a user-defined database role that has the same functionality). However, most exist to deal with the more common situations and make things easier for you.

Role | Nature
---|---
db_owner | This role performs as if it were a member of all the other database roles. Using this role, you can create a situation where multiple users can perform the same functions and tasks as if they were the database owner.
db_accessadmin | Performs a portion of the functions similar to the securityadmin server role, except this role is limited to the individual database where it is assigned and the creation of users (not individual rights). It cannot create new SQL Server logins, but members of this role can add Windows users and groups as well as existing SQL Server logins into the database.
db_datareader | Can issue a SELECT statement on all user tables in the database.
db_datawriter | Can issue INSERT, UPDATE, and DELETE statements on all user tables in the database.
db_ddladmin | Can add, modify, or drop objects in the database.
db_securityadmin | The other part of the database-level equivalent of the securityadmin server role. This database role cannot create new users in the database, but it does manage roles and members of database roles as well as manage statement and object permissions in the database.
db_backupoperator | Backs up the database (gee, bet you wouldn't have guessed that one!).
db_denydatareader | Provides the equivalent of a DENY SELECT on every table and view in the database.
db_denydatawriter | Similar to db_denydatareader, only affects INSERT, UPDATE, and DELETE statements.

Much as with the fixed server roles, you're probably not going to see all of these used in anything but the largest of database shops. Some of the roles are not replaceable with your own database roles, and others are just very handy for dealing with the quick-and-dirty situations that seem to come up frequently.

User-Defined Database Roles

The fixed roles that are available are really only meant to help you get started. The real mainstay of your security is going to be the creation and assignment of user-defined database roles. For these roles, you decide what permissions they include.

With user-defined roles, you can GRANT, DENY, and REVOKE in exactly the same way as we did for individual users. The nice thing about using roles is that users tend to fall into categories of access needs. By using roles, you can make a change in one place and have it propagate to all the similar users (at least the ones that you have assigned to that role).

We have two means of creating a user-defined role:

 * CREATE ROLE (the preferred choice)
 * sp_addrole (for backward compatibility)

Let's take a look at each.

Creating a User-Defined Role Using CREATE ROLE

To create our own role, the preferred option is to use the CREATE ROLE command. Much like many of the other commands we've looked at in this chapter, the functionality of this command has been migrated to a more ANSI/ISO-compliant syntax, but it was previously supported by a system stored procedure—in this case, the sp_addrole system sproc. As with the others, the syntax is pretty straightforward:

CREATE ROLE <role name> [AUTHORIZATION <owner>][;]

The role name is simply what you want to call that role. Examples of common naming schemes would include by department (Accounting, Sales, Marketing, and so on) or by specific job (CustomerService, Salesperson, President, and so on). Using roles like this can make it really easy to add new users to the system.
If your accounting department hires someone new, you can just add him or her to the Accounting role (or, if you're being more specific, it might even be the AccountsPayable role) and forget it—no researching "What should this person have for rights?"

The AUTHORIZATION parameter is optional and allows you to override which database user or role owns this new role. (By default, it will be owned by whoever ran the CREATE command, usually someone in the db_owner role.)

Let's go ahead and create ourselves a role:

USE NorthwindSecure;
CREATE ROLE OurTestRole;

When you execute this, you should get back a nice friendly message telling you that the new role has been added.

Now what we need is to add some value to this role in the form of it actually having some rights assigned to it. To do this, we just use our GRANT, DENY, or REVOKE statements just as we did for actual users earlier in the chapter:

USE NorthwindSecure;
GRANT SELECT ON Territories TO OurTestRole;

Anyone who belongs to our role now has SELECT access to the Territories table (unless they have a DENY somewhere else in their security information).

Using sp_addrole

As I mentioned earlier, there is an older, system stored procedure–based command that remains for backward compatibility.

The syntax is again pretty simple:

sp_addrole [@rolename =] <'role name'>
[,[@ownername =] <'owner'>]

The owner is the same thing as it is for all other objects in the system. The default is the database owner, and I strongly suggest leaving it that way (in other words, just ignore this optional parameter). If we were going to add our special test role using the older syntax, it would look something like:

USE NorthwindSecure;
EXEC sp_addrole 'OurTestRole';

Regardless of which syntax you use, you should, at this point, be ready to start adding users.

Adding Users to a Role

Having all these roles around is great, but they are of no use if they don't have anyone assigned to them. Surprisingly, there isn't, as yet anyway, a new command that addresses this. Instead, we go back to the older system stored procedure model, calling the sp_addrolemember system sproc and providing the role name and login ID:

sp_addrolemember [@rolename =] <'role name'>,
[@membername =] <'login ID'>[;]

Everything is pretty self-explanatory on the parameters for this one, so let's move right into an example.

Let's start off by verifying that our TestAccount doesn't have access to the Territories table:

SELECT * FROM Territories;

Sure enough, we are rejected (no access yet):

Server: Msg 229, Level 14, State 5, Line 1
SELECT permission denied on object 'Territories', database 'NorthwindSecure', owner 'dbo'.

Now we'll go ahead and add our TestAccount Windows user to our OurTestRole role:

USE NorthwindSecure;
EXEC sp_addrolemember OurTestRole, [HOBBES\TestAccount];

It's time to try to run the SELECT statement again—this time with much more success (you should get about 53 rows back).

Removing a User from a Role

What goes up must come down, and users that are added to a role will also inevitably be removed from roles.
Removing a user from a role works almost exactly as adding them does, except we use a different system sproc called sp_droprolemember in the form of:

sp_droprolemember [@rolename =] <'role name'>,
[@membername =] <'login ID'>[;]

So, let's go right back to our example and remove the TestAccount from the OurTestRole database role:

USE NorthwindSecure;
EXEC sp_droprolemember OurTestRole, [HOBBES\TestAccount];

You should receive another friendly confirmation that things have gone well. Now try our SELECT statement again:

SELECT * FROM Territories;

And, sure enough, we are again given the error that we don't have access.

You can add and drop users from any role this way. It doesn't matter whether the role is user-defined or fixed, or whether it's a server or database role—in any case, they work pretty much the same.

Note also that you can do all of this through Management Studio. To change the rights associated with a role, just click the Roles member of the Security node (under your specific database) and assign permissions by using the checkboxes. When you want to add a user to a role, go to the Users node (again, under the specific database) and right-click to select Properties. Then select either the server or database roles by putting a check mark next to all the roles you want that user to have.

Dropping Roles

Dropping a role is as easy as adding one. The syntax is simply:

EXEC sp_droprole <'role name'>[;]

And it's gone.

Application Roles

Application roles are something of a different animal than database and server roles. Indeed, the fact that the term role is used would make you think that they are closely related. They aren't.

Application roles are really much more like a security alias for the user. Application roles allow you to define an access list (made up of individual rights or groupings of databases). They are also similar to a user in that they have their own password. They are, however, different from a user login because they cannot "log in" as such. A user account must first log in; then he or she can activate the application role.

So what do we need application roles for? For applications—what else? Time and time again, you'll run into the situation where you would like a user to have a separate set of rights depending on the context in which he or she is accessing the database. With an application role, you can do things like grant users no more than read-only access to the database (SELECT statements only) but still allow them to modify data when they do so within the confines of your application.

The process works like this:

1. The user logs in (presumably using a login screen provided by your application).

2. The login is validated, and the user receives his or her access rights.

3. The application executes a system sproc called sp_setapprole and provides a role name and password.

4. The application role is validated, and the connection is switched to the context of that application role (all the rights the user had are gone—he or she now has the rights of the application role).

5. The user continues with access based on the application role rather than his or her personal login throughout the duration of the connection; the user cannot go back to his or her own access information.

You would only want to use application roles as part of a true application situation, and you would build the code to set the application role right into the application.
You would also compile the required password into the application or store the information in some local file to be accessed when it is needed.

Creating Application Roles

To create an application role, we use a variation on the CREATE ROLE theme—CREATE APPLICATION ROLE. This is another pretty easy one to use; its syntax looks like this:

CREATE APPLICATION ROLE <app role name>
WITH PASSWORD = <'password'> [, DEFAULT_SCHEMA = <schema name>][;]

Much like the other flavors of CREATE in this chapter, the parameters are pretty self-explanatory, so let's move right on to using it by creating ourselves an application role:

CREATE APPLICATION ROLE OurAppRole WITH PASSWORD = 'P@ssw0rd';

Just that quick, our application role is created. Like most of the security items thus far, there is a system stored procedure that used to serve this functionality and is still supported, but, again, only for backward compatibility. It is very similar to the CREATE syntax but looks like this:

sp_addapprole [@rolename =] <'role name'>,
[@password =] <'password'>[;]

So creating the previous example using the system stored procedure instead would look like:

EXEC sp_addapprole OurAppRole, 'P@ssw0rd';

Adding Permissions to the Application Role

Adding permissions to application roles works just like adding permissions to anything else. Just substitute the application role name anywhere that you would use a login ID or regular server or database role.

Again, we'll move to the quick example:

GRANT SELECT ON Region TO OurAppRole;

Our application role now has SELECT rights on the Region table—it doesn't, as yet, have access to anything else.

Using the Application Role

Using the application role is a matter of calling a system sproc (sp_setapprole) and providing both the application role name and the password for that application role. The syntax looks like this:

sp_setapprole [@rolename =] <'role name'>,
[@password =] {Encrypt N<'password'>} | <'password'>
[,[@encrypt =] {'none' | 'odbc'}]
[,[@fCreateCookie =] {true | false}]
[,[@cookie =] <cookie variable> OUTPUT][;]

The role name is simply the name of whatever application role you want to activate.

The password can be either supplied as is or encrypted using the ODBC encrypt function. If you're going to encrypt the password, then you need to enclose the password in quotes after the Encrypt keyword and precede the password with a capital N—this indicates to SQL Server that you're dealing with a Unicode string (which the password must be in if you're going to encrypt it), and it will be treated accordingly. If you don't want encryption, then just supply the password without using the Encrypt keyword.

It's worth noting that encryption is only an option with ODBC and OLE DB clients. Thus you cannot test it inside the Query window (which uses SqlClient). Furthermore, if you're not using encryption, realize that the password you supply is going to be plainly viewable to anyone sniffing packets on your network. In short, if you're not using ODBC encryption for sending your password, then you'll want to use SSL or IPSec (two secure transport methods) for the connection.

This takes us to the cookie side of things. Setting a cookie (and storing the value you get back in the @cookie output variable) provides a bookmark of sorts for the permission set that was active before you activated the application role. You can then use the sp_unsetapprole stored procedure to revert back to the previous security context (the one indicated by the cookie). The syntax for sp_unsetapprole looks like this:

sp_unsetapprole <cookie>

Execute this, and your security context should return to the previous state.
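Putting the pieces together, a minimal sketch of a cookie-based round trip might look like this (the variable name is arbitrary; the role and password are the ones we created above):

DECLARE @cookie varbinary(8000);
EXEC sp_setapprole 'OurAppRole', 'P@ssw0rd',
    @fCreateCookie = true, @cookie = @cookie OUTPUT;
-- ...do work under the application role's rights...
EXEC sp_unsetapprole @cookie;  -- revert to the original security context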
Moving right into a simple example, let's start by verifying a couple of things about the status of our TestAccount user. At this point in the chapter (assuming you've been following along with all the examples), your TestAccount user should not be able to access the Region table but should be able to access the EmployeeTerritories table. You can verify this to be the case by running a couple of SELECT statements:

SELECT * FROM Region;
SELECT * FROM EmployeeTerritories;

The first SELECT should give you an error, and the second should return around 50 rows or so.

Now let's activate the application role that we created a short time ago; type this in using the TestAccount user:

EXEC sp_setapprole OurAppRole, 'P@ssw0rd';

When you execute this, you should get back a confirmation that your application role is now "active."

Try it out by running our two SELECT statements. You'll find that what does and doesn't work has been exactly reversed. That is, TestAccount had access to EmployeeTerritories, but that was lost when we went to the application role. TestAccount did not have access to the Region table, but the application role now provides that access.

Since we didn't store a cookie (I'm deliberately making a point here...), there is no way to terminate the application role for the current connection. We're stuck with few options other than, perhaps, switching to yet another application role. We have no way of returning to our original security context without the cookie.

Go ahead and terminate your TestAccount connection. Then create a new connection with Windows authentication for your TestAccount. Try running those SELECT statements again, and you'll find that your original set of rights has been restored.

Getting Rid of Application Roles

When you no longer need the application role on your server, you can use the same DROP command that you should, by now, be very familiar with:

DROP APPLICATION ROLE <app role name>

There is, of course, also a system stored procedure version of this (again, backward compatibility only, please!) called sp_dropapprole. The syntax is as follows:

sp_dropapprole [@rolename =] <'role name'>

To eliminate our application role from the system using the DROP syntax, we would just issue the command (from sa):

DROP APPLICATION ROLE OurAppRole;

More Advanced Security

This section is really nothing more than an "extra things to think about" section. All of these fall outside the realm of the basic rules we defined at the beginning of the chapter, but they address ways around some problems and also how to close some common loopholes in your system.

What to Do About the Guest Account

The guest account provides a way of having default access. When you have the guest account active, several things happen:

 * Logins gain guest-level access to any database to which they are not explicitly given access.
 * Outside users can log in through the guest account to gain access. This requires that they know the password for guest, but they'll already know the user exists (although they probably also know that the sa account exists).

Personally, one of the first things I do with my SQL Server is to eliminate every ounce of access the guest account has (by default, it has zero, so there should be little to do). It's a loophole, and it winds up providing access in a way you don't intuitively think of.
(You probably think that when you assign rights to someone, those are all the rights they have. With guest active, that isn't necessarily so.)

There is, however, one use that I'm aware of where the guest account actually serves a fairly slick purpose—when it is used with application roles. In this scenario, you leave the guest account with access to a database but without any rights beyond simply logging in to that database—that is, the guest account only makes the logged-on database "current." You can then use sp_setapprole to activate an application role, and, boom, you now have a way for otherwise anonymous users to log in to your server with appropriate rights. They can, however, only perform any useful work if they are using your application.

This is definitely a scenario where you want to be protecting that application role password as if your job depended on it (it probably does). Use the ODBC encryption option, and I would not allow this kind of access via the Internet!

TCP/IP Port Settings

By default, when using TCP/IP, SQL Server uses port number 1433. A port can be thought of as something like a radio channel: it doesn't matter what channel you're broadcasting on, it won't do you any good if no one is listening to that channel.

Leaving things with the default value of 1433 can be very convenient. All of your clients will automatically use port 1433 unless you specify otherwise, so this means that you have one less thing to worry about being set right if you just leave well enough alone.

The problem, however, is that just about any potential SQL Server hacker also knows that port 1433 is the one to which 99 percent of all SQL Servers are listening. If your SQL Server has a direct connection to the Internet, I strongly recommend changing to a non-standard port number. Check with your network administrator for what he or she recommends as an available port. Just remember that, when you change what the server is "listening" to, you'll also need to change what all the IP-based clients are using. For example, if we were going to change to using port 1402, we would go into the Client Network Utility and set up a specific entry for our server with 1402 as the IP port to use.

We also have the option of telling the client to dynamically determine the port by checking the "Dynamically determine port" box.

Note that this isn't really that huge of a security gain. The reality is that a hacker is probably going to use a port scanner or other tool to determine every open port on your firewall and, based on the responses received, make a fairly accurate guess as to what kind of software is utilizing each port. That said, every little thing you do can make it just a little more difficult for the would-be hacker.

Don't Use the sa Account

Everyone who's studied SQL Server for more than about 10 minutes knows about the system administrator account. SQL Server has the sysadmin fixed server role to simulate the sa user's level of access, so I strongly suggest adding true logins to that role, then changing the sa password to something very long and very incomprehensible—something not worth spending the time to hack into. If you only need Windows authentication, then turn SQL Server security off, and that will deal with the sa account issue once and for all.

Keep xp_cmdshell under Wraps

Remember to be careful about who you grant access to use xp_cmdshell. It will run any Windows command prompt command. The amount of authority that it grants to your users depends on what account SQL Server is running under. If it is a system or administrator account (as the majority are), then the users of xp_cmdshell will have very significant access to your server. (They could, for example, copy files onto the server from elsewhere on the network, then execute those files.) Let's raise the stakes a bit, though—there are also a fair number of servers running out there under the context of a Windows domain administrator account—anyone using xp_cmdshell now has fairly open access to your entire network!

The short rendition here is not to give anyone access to xp_cmdshell that you wouldn't give administrative rights to for your server or possibly even your domain.
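As a hedged aside (assuming SQL Server 2005 or later, where xp_cmdshell ships disabled), the feature is toggled through sp_configure, so it's easy to verify that it is off:

EXEC sp_configure 'show advanced options', 1;
RECONFIGURE;
EXEC sp_configure 'xp_cmdshell', 0;  -- 0 = disabled (the default), 1 = enabled
RECONFIGURE;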
Don't Forget Views, Stored Procedures, and UDFs as Security Tools

Remember that views, sprocs, and UDFs all have a lot to offer in terms of hiding data. Views can usually take the place of column-level security. They can do wonders to make users think they have access to an entire table when, in reality, they have access to only a subset of the entire data (remember our example of filtering out sensitive employee information, such as salary?). Sprocs and UDFs can do much the same. You can grant execute rights to a sproc or UDF, but that doesn't mean users get all the data from a table (they only get what the sproc or UDF gives them)—the end user may not even know what underlying table is supplying the data. In addition, views, sprocs, and UDFs have their own implied authority—that is, just because a view or sproc uses a table, it doesn't mean that the user needs access rights to that table.

Certificates and Asymmetric Keys

We have, at a few different points in the book (including earlier in this chapter), mentioned the notion of encryption. Certificates and asymmetric keys are the primary mechanism for defining the encryption keys for the different levels of your server architecture. Both of these are different methods of doing the same basic thing, and they are largely interchangeable. Whether you use certificates or asymmetric keys, you need to keep in mind that these are much like the keys to your house—if you let everyone have them, then they quickly lose their value (now anyone can get in, so why bother locking anyone out?).

SQL Server supports the notion of keys at several different levels based on the notion that you may want to separate several different silos of control under different encryption keys. SQL Server maintains a Service Master Key that goes with each server installation; it is encrypted at the Windows level (by the machine key, via the Windows Data Protection API). Likewise, each database contains a Database Master Key, which can, if you choose, itself be encrypted based on the Service Master Key. Then, within each database, you can define certificates and/or asymmetric keys (both of which are a form of key). Overall, the hierarchy looks something like Figure 19.2.

Figure 19.2

Certificates

Since SQL Server 2000, SQL Server has included its own certificate authority, or CA. Third-party CAs are also supported. A CA issues a certificate, which includes an encryption key along with some basic information to go with the certificate, such as the date range the certificate is valid for (a starting and expiration date), the name of the holder, and information on the authority that issued the certificate. A certificate is added to a server using the CREATE CERTIFICATE command.
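By way of illustration, a minimal sketch (the certificate name, subject, and expiration date are hypothetical, and this assumes the database already has a master key with which to protect the certificate's private key):

USE NorthwindSecure;
CREATE CERTIFICATE NWSecureCert
   WITH SUBJECT = 'Sample certificate for NorthwindSecure',
   EXPIRY_DATE = '2030-12-31';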
Asymmetric Keys

An asymmetric key works much as a certificate does but is specified directly and is not validated by any issuing authority. Like a certificate, the encryption key is specified and then utilized to encrypt sensitive information. Asymmetric keys are added using the CREATE ASYMMETRIC KEY command.

Database Encryption

Most of the encryption functions that were added in SQL Server 2005 are oriented around the idea of encrypting a particular piece of data. They require you to utilize special functions (which specific functions depends on the type of encryption being used) to encrypt the data, and then another set of functions to decrypt the data.

Beginning with SQL Server 2008, we also have the option of encrypting the entire database. Note that the idea here is not to password-protect the data in the database but rather to protect against the wholesale theft of the entire database. Using database-level encryption, the database file and any backups made of it are effectively keyed to the server the database is on (unless you copy the server's certificate, so make sure you have a backup of that, or your backups of the database will become effectively useless in the event of total server failure).
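To make that idea concrete, here is a hedged sketch of enabling this feature (transparent data encryption) on our sample database; the object names and password are hypothetical, and this assumes SQL Server 2008 or later:

USE master;
CREATE MASTER KEY ENCRYPTION BY PASSWORD = '<a strong password>';
CREATE CERTIFICATE TDECert WITH SUBJECT = 'TDE certificate';
USE NorthwindSecure;
CREATE DATABASE ENCRYPTION KEY
   WITH ALGORITHM = AES_256
   ENCRYPTION BY SERVER CERTIFICATE TDECert;
ALTER DATABASE NorthwindSecure SET ENCRYPTION ON;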
Summary

Security is one of those areas that tend to be ignored by developers. Unfortunately, the security of your system is going to be determined by how your client application handles things, so there's only so much a DBA can do after you've shipped your application.

Treat security as if it is the lifeblood of the success or failure of your system at your customer site (which, if you're building internal projects, may be your site)—it probably is a critical factor.

20

A Grand Performance: Designing a Database That Performs Well

This, and the chapter that follows, are probably the toughest chapters in the book from my perspective as the author, but not for the normal reasons. Usually, the issue is how to relate complex information in a manner that's easy to understand. As we're getting near the end of the book, I hope that I've succeeded there—even if there is still more to come. At this point, you should, from prior experience and the topics covered in this book, have a solid foundation in everything we're going to discuss in this chapter. That means I'm relatively free to get to the nitty-gritty and not worry quite as much about confusion.

Why then would this be a tough chapter for me to write? Well, because deciding exactly what to put into this and the sibling chapter that follows is difficult. You see, this isn't a book on performance tuning—that can easily be a book unto itself. It is, however, a book about making you successful in your experience developing with SQL Server. Having a well-performing system is critical to that success. The problem lies in a line from Bob Seger: "What to leave in, what to leave out." What can we focus on here that's going to get you the most bang for your buck?

Perhaps the most important thing to understand about performance tuning is that you are never going to know everything there is to know about it. If you're the average SQL developer, you're going to be lucky if you know 20 percent of what there is to know. Fortunately, performance tuning is one of those areas where the old 80-20 rule (80 percent of the benefit comes from the right 20 percent of the work) definitely applies.

For this edition of the book, I've decided to expand this topic a bit, maintaining coverage of the structural decisions and adding additional content on how to figure out where performance opportunities exist. This chapter will largely cover topics that have been around for a while, including such things as:

 * Index choices
 * Client- vs. server-side processing
 * Strategic de-normalization
 * Organizing your sprocs
 * Uses for temporary tables
 * Small gains in repetitive processes vs. big gains in long-running processes

The focus of this chapter is really going to be the things you should be thinking about in the area of design—those that are somewhat structural in nature. In many cases, it will be a subject we've already covered, but with a particular eye on performance. In the next chapter, we'll take a look at what to do once the system is already in place (maintenance, locating problems, and planning future changes).

There is, however, a common theme that one should get out of both chapters: This is only the beginning. The biggest thing in performance is really just to stop and think about it. There is, for some strange reason, a tendency when working with SQL to use the first thing that comes to mind that will work. You need to give the same kind of thought to your queries, sprocs, database designs—whatever—that you would give to any other development work that you're doing. Also, keep in mind that your T-SQL code is only one part of the picture—hardware, client code, SQL Server configuration, and network issues are examples of things that are "outside the code" that can have a dramatic impact on your system.

Performance means a lot of different things to a lot of different people. For example, many will think in terms of simple response time (how fast does my query finish?). There is also the notion of perceived performance (many users will think in terms of how fast they receive enough to start working on, rather than how fast it actually finishes). Yet another perspective might focus on scalability (for example, how much load can I put on the system before my response time suffers or until users start colliding with each other?).

Many of the examples and suggestions in the two performance chapters are about raw speed—how fast do I return results—but we do touch on perceived performance and scalability issues where appropriate. Make sure that all facets of performance are considered in your designs—not just time to completion.

When to Tune

Okay, so this is probably going to seem a little obvious, but performance starts much earlier in the process than when you are writing your code. Indeed, it really should start in the requirements-gathering process and then never end.

What's the big deal about performance tuning in the requirements-gathering stage? Well, while you obviously can't do anything yet to physically tune your system, you can do a lot to logically tune your system. For example, is the concern of the customer more toward the side of perceived performance or actual completion of the job? For interactive processes, users will generally be more satisfied and think the system is faster if you do something to show them that something is happening (even if it's just a progress bar). In addition, sometimes it's worth having a process that completes a little more slowly as long as the "first response"—that is, when it starts outputting something—is faster.
Which of these is preferable is something you should know in the requirements-gathering stage. Finally, you should, in the requirements-gathering process, determine what your performance requirements are for the system.

Many is the time that I have seen the system that the developer thought was "fast enough" only to find out that the performance was unacceptable to the user. This can happen for a lot of reasons, though the most common is certainly the developer having his or her head buried in the sand.

Find out what's expected! Also, remember to test whether you've met expectations under a realistic load on something resembling the real live hardware—not a load based on one or two developers sitting at their development system.

Performance obviously also continues into design. If you design for performance, then you will generally greatly reduce the effort required to tune at completion. What's more, you'll find that you've greatly enhanced the "best" numbers you can achieve.

I'm starting to drone on here, but performance never stops—when you're actually coding, get it working, but then STOP! Stop and take a look at your code. Once an entire system is together, the actual code will almost never be looked at again unless:

 * Something breaks (there's a bug).
 * You need to upgrade that part of the system.
 * There is an overt performance problem (usually, a very bad one).

In the first two of these instances, you probably won't be looking at the performance issues, just at how to get things fixed or the additional functionality added. The point here is that an extra few minutes of looking at your code and asking yourself "Could I have done it better?" or "Hey, have I done anything stupid here?" can shave a little bit here and a little bit there and, occasionally, a whole lot in some other place.

Simply put: I make stupid mistakes, and so will you. It is, however, amazing how often you can step back from your code for a minute or two, then look at it again with a critical eye and say, "Geez, I can't believe I did that!" Hopefully, those moments will be rare, but, if you take the time to be critical of your own code, you'll find most of those critical gaffes that could really bog your system down. As for the ones you don't find, well, that's what the next chapter is for!

The next big testing milestone is the quality assurance process. At this juncture you should be establishing general system benchmarks and comparing those against the performance requirements established during the requirements phase.

Last, but not least—never stop. Ask end users where their pain is from a performance perspective. Is there something they say is slow? Don't wait for them to tell you (often, they think "that's just the way it is" and say nothing—except to your boss, of course); go ask.

Index Choices

Again, this is something that was covered in extreme depth previously, but the topic still deserves more than a mention here because of its sheer importance to query performance.

People tend to go to extremes with indexes—I'm encouraging you not to follow any one rule but instead to think about the full range of items that your index choices impact.

Any table that has a primary key (and, with very rare exception, all tables should have a primary key) has at least one index. This doesn't mean, however, that it is a very useful index from a performance perspective.
Indexes should be considered for any column that you're going to be using frequently as a target in a WHERE or JOIN clause and, to a lesser extent, an ORDER BY clause.

Remember, though, that the more indexes you have, the slower your inserts, updates, and deletes are going to be. When you modify a record, one or more entries may (depending on what's going on in the non-leaf levels of the B-Tree) have to be modified for each index (certainly true in the case of an insert or delete, and true for updates on any column participating in the index). More indexes means more for SQL Server to do on modification statements. In an Online Transaction Processing (OLTP) environment (where you tend to have a lot of inserts, updates, and deletes), this can be a killer. In an Online Analytical Processing (OLAP) environment, this is probably no big deal, since your OLAP data is usually relatively stable (few inserts), and what inserts are made are usually done through a highly repetitive batch process (which doesn't have quite the unpredictability that users have).

Technically speaking, the problem is smaller on updates and deletes. For updates, your indexes need to be updated only if the column that was changed is part of the key for that index. If you do indeed need to update the index, though, think about it as a delete and an insert—that means that you're exposed to page splits again.

So, what, then, about deletes? Well, again, when you delete a record you're going to need to delete all the entries from your indexes too, so you do add some additional overhead, but you don't have to worry about page splits and having to physically move data around.

The bottom line here is that, if you're doing a lot more querying than modifying, then more indexes are okay. However, if you're doing lots of modifications to your data, keep your indexes limited to high-use columns.
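As a quick, hedged illustration (the table and index names are hypothetical), a frequently joined foreign key column is a classic candidate:

-- Orders are constantly joined and filtered by customer; reads benefit,
-- but every INSERT/UPDATE/DELETE on Orders now maintains this index too
CREATE NONCLUSTERED INDEX IX_Orders_CustomerID
   ON Orders (CustomerID);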
Since the data isn't going to change, you want to package it up and send it all to the client in one pass—thus limiting round-trips and network impact. The obvious exception is if the cursor is generated for the sole purpose of modifying other records. In such a case, you should try to do the entire process server-side (most likely in the form of a stored procedure)—again eliminating round-trips.
+---|---
+Forward-only, read-only cursors | Client-side again. ODBC and other libraries can take special advantage of the FAST_FORWARD cursor type to gain maximum performance. Just let the server spew the records into the client cursor, and then move on with life.
+HOLDLOCK situations | Most transactioning works much better on the server than on the client.
+Processes that require working tables | This is another of those situations where you want to try to have the finished product created before you attempt to move records to the client. If you keep all of the data server-side until it is really ready to be used, you minimize round-trips to the server and speed up performance.
+Minimizing client installations | Okay, so this isn't "performance" as such, but it can be a significant cost factor. If you want to minimize the number of client installations you have to do, then keep as much of the business logic out of the client as possible. Either perform that logic in sprocs, or look at using component-based development with .NET. In an ideal world, you'll have what I like to call "data logic" (logic that exists only for the purpose of figuring out how to get the final data) in sprocs and "business logic" in components.
+Significant filtering and/or resorting | Use ADO.NET or LINQ. They have a great set of tools for receiving the data from the server just once (fewer round-trips!), then applying filters and sorts locally. If you wanted the data filtered or sorted differently by SQL Server, it would run an entirely new query using the new criteria. It doesn't take a rocket scientist to figure out that the overhead on that can get rather expensive. Both ADO.NET and LINQ also have some cool things built in to allow you to join different data sets (including heterogeneous data sets) right at the client. Note, however, that with very large result sets, your client computer may not have the wherewithal to deal with the filters and sorts effectively—you may be forced to go back to the server.
+
+These really just scratch the surface. The big thing to remember is that round-trips are a killer even in this age of gigabit Ethernet (keep in mind that connection overhead is often more of the issue than raw bandwidth). What you need to do is move the smallest amount of data back and forth—and only move it once. Usually, this means that you'll preprocess the data as much as possible on the server side, and then move the entire result to the client if possible.
+
+Keep in mind, though, that you need to be sure that your client is going to be able to handle what you give it. Servers are usually much better equipped to handle the resource demands of larger queries. By the same token, you also have to remember that the server is going to be doing this for multiple users—that means the server needs to have adequate resources to support all of the server-side activity for that number of users.
If you take a process that was too big for the client to handle and move it server-side for resource reasons, just remember that you may also run out of resources on the server if more than one client uses that process at one time. The best thing is to try to keep result sets and processes to the smallest size possible.
+
+Realize that the term "client" has more than one possible meaning. The client, from a data connection perspective, may not be where the end user sits. If it is a browser-based application, then the client that is truly handling the data is more likely the Web server. While a Web server is likely on some very solid hardware, it may be dealing with multiple such queries at the same time (multiple large data sets), so plan accordingly.
+
+Strategic De-Normalization
+
+This could also be called, "When following the rules can kill you." Normalized data tends to work for both data integrity and performance in an OLTP environment. The problem is that not everything that goes on in an OLTP database is necessarily transaction-processing related. Even OLTP systems have to do a little bit of reporting (a summary of transactions entered that day, for example).
+
+Often, adding just one extra column to a table can prevent a large join, or worse, a join involving several tables. I've seen situations where adding one column made the difference between a two-table join and a nine-table join. We're talking the difference between 100,000 records being involved and several million. That one change took a query from a runtime of several minutes down to just seconds.
+
+Like most things, however, this isn't something with which you should get carried away. Normalization is the way that most things are implemented for a reason. It adds a lot to data integrity and can make a big positive difference performance-wise in many situations. Don't de-normalize just for the sake of it. Know exactly what you're trying to accomplish, and test to make sure that it had the expected impact. If it didn't, then look at going back to the original way of doing things.
+
+Organizing Your Sprocs Well
+
+I'm not talking from the outside (naming conventions and such are important, but that's not what I'm getting at here) but rather from a "how they operate" standpoint. The next few sections discuss this.
+
+Keeping Transactions Short
+
+Long transactions can not only cause deadlock situations but also basic blocking (where someone else's process has to wait for yours because you haven't finished with the locks yet). Anytime you have a process that is blocked—even if it will eventually be able to continue after the blocking transaction is complete—you are delaying, and therefore hurting the performance of, that blocked procedure. Nothing has a more immediate effect on performance than a process that simply has to stop and wait.
+
+Using the Least Restrictive Transaction Isolation Level Possible
+
+The tighter you hold those locks, the more likely that you're going to wind up blocking another process. Be sure to take the locks you really need to ensure data integrity—but try not to take any more than that.
+
+If you need more information on isolation levels, check out transactions and locks in Chapter 11.
+
+Implementing Multiple Solutions if Necessary
+
+An example here is a search query that accepts multiple parameters but doesn't require all of them.
It's quite possible to write your sproc so that it just uses one query, regardless of how many parameters were actually supplied—a "one-size-fits-all" kind of approach. This can be a real timesaver from a development perspective, but it is really deadly from a performance point of view. More than likely, it means that you are joining several unnecessary tables for every run of the sproc!
+
+The thing to do here is to add a few IF...ELSE statements to check things out. This is more of a "look before you leap" kind of approach. It means that you will have to write multiple queries to deal with each possible mix of supplied parameters, but once you have the first one written, the others can often be cloned and then altered from the first one.
+
+This is a real problem area in lots of code out there. Developers are a fickle bunch. We generally only like doing things as long as they are interesting. If you take the preceding example, you can probably see that it would get very boring very quickly to be writing what amounts to a very similar query over and over to deal with the nuances of what parameters were supplied.
+
+All I can say about this is—well, not everything can be fun, or everyone would want to be a software developer! Sometimes you just have to grin and bear it for the sake of the finished product.
+
+Avoiding Cursors if Possible
+
+If you're a programmer who has come from an ISAM or VSAM environment (these were older database storage methods), doing things by cursor is probably going to be something toward which you'll naturally gravitate. After all, the cursor process works an awful lot more like what you're used to in those environments (such looping structures are also common in many non-database data handling constructs).
+
+Don't go there!
+
+Almost anything that at first seems like something you can only do with a cursor can actually be done as a set operation. Sometimes it takes some pretty careful thought, but it usually can be done.
+
+By way of illustration, I was asked several years ago for a way to take a multiline cursor-based operation and make it into a single statement if possible. The existing process ran something like 20 minutes. The runtime was definitely problematic, but the customer wasn't really looking to do this for performance reasons (they had accepted that the process was going to take that long). Instead, they were just trying to simplify the code.
+
+They had a large product database, and they were trying to set things up to automatically price their available products based on cost. If the markup had been a flat percentage (say 10 percent), then the UPDATE statement would have been easy—say something like:
+
+UPDATE Products
+SET UnitPrice = UnitCost * 1.1
+
+The problem was that it wasn't a straight markup—there was a logic pattern to it. The logic went something like this:
+
+ * If the pennies on the product after the markup are greater than or equal to .50, then price it at .95.
+ * If the pennies are below .50, then mark it at .49.
+
+The pseudocode to do this by cursor would look something like:
+
+Declare and open the cursor
+Fetch the first record
+Begin Loop Until the end of the result set
+    Multiply cost * 1.1
+    If result has cents of < .50
+        Change cents to .49
+    Else
+        Change cents to .95
+    Update the row
+    Fetch the next record
+Loop
+
+This is, of course, an extremely simplified version of things. There would actually be about 30–40 lines of code to get this done.
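+
+Fleshed out in T-SQL, the cursor version might look something like the following sketch (the Products table and its UnitCost and UnitPrice columns are the hypothetical ones from this example):
+
+DECLARE @ProductID int,
+        @NewPrice money;
+
+DECLARE PriceCursor CURSOR FOR
+    SELECT ProductID FROM Products;
+
+OPEN PriceCursor;
+FETCH NEXT FROM PriceCursor INTO @ProductID;
+
+WHILE @@FETCH_STATUS = 0
+BEGIN
+    -- Apply the flat markup first
+    SELECT @NewPrice = UnitCost * 1.1
+    FROM Products
+    WHERE ProductID = @ProductID;
+
+    -- Pennies of .50 or more become .95; anything less becomes .49
+    IF @NewPrice - FLOOR(@NewPrice) >= 0.50
+        SET @NewPrice = FLOOR(@NewPrice) + 0.95;
+    ELSE
+        SET @NewPrice = FLOOR(@NewPrice) + 0.49;
+
+    UPDATE Products
+    SET UnitPrice = @NewPrice
+    WHERE ProductID = @ProductID;
+
+    FETCH NEXT FROM PriceCursor INTO @ProductID;
+END;
+
+CLOSE PriceCursor;
+DEALLOCATE PriceCursor;
+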
Instead, we changed it around to work with one single correlated subquery (which had a CASE statement embedded in it). The runtime dropped down to something like 12 seconds.
+
+The point here, of course, is that, by eliminating cursors wherever reasonably possible, we get a real boost not only in reduced complexity (which was the original goal here) but also in performance.
+
+Uses for Temporary Tables
+
+The use of temporary tables can sometimes help performance—usually by allowing the elimination of cursors or by allowing working data to be indexed while it is needed.
+
+Using Temp Tables to Break Apart Complex Problems
+
+As we've seen before, cursors can be the very bane of our existence. Using temporary tables, we can sometimes eliminate the cursor by processing the operation as a series of two or more set operations. An initial query creates a working data set. Then another process comes along and operates on that working data.
+
+We can actually make use of the pricing example we laid out in the last section to illustrate the temporary table concept, too. This solution wouldn't be quite as good as the correlated subquery, but it is still quite workable and much faster than the cursor option. The steps would look something like:
+
+SELECT ProductID, FLOOR(UnitCost * 1.1) + .49 AS TempUnitPrice
+INTO #WorkingData
+FROM Products
+WHERE (UnitCost * 1.1) - FLOOR(UnitCost * 1.1) < .50
+
+INSERT INTO #WorkingData
+SELECT ProductID, FLOOR(UnitCost * 1.1) + .95 AS TempUnitPrice
+FROM Products
+WHERE (UnitCost * 1.1) - FLOOR(UnitCost * 1.1) >= .50
+
+UPDATE p
+SET p.UnitPrice = t.TempUnitPrice
+FROM Products p
+JOIN #WorkingData t
+    ON p.ProductID = t.ProductID
+
+With this, we wind up with three steps instead of thirty or forty. This won't operate quite as fast as the correlated subquery would, but it still positively screams in comparison to the cursor solution.
+
+Keep this little interim step using temporary tables in mind when you run into complex problems that you think are going to require cursors. Try to avoid the temptation of just automatically taking this route—look for the single-statement query before choosing this option—but if all else fails, this can really save you a lot of time versus the cursor option.
+
+Using Temp Tables to Allow Indexing on Working Data
+
+Often we will run into a process in which we are performing many different operations on what is fundamentally the same data. This is characterized by a situation in which you are running different kinds of updates (perhaps to totally different tables), but utilizing the same source data to figure out what to change or what values to change things to. I've seen many scenarios where the same fundamental data is reused—in the same procedure—hundreds or even thousands of times.
+
+Under such "reuse" situations, consider querying the data once and placing it into a temp table. Also consider applying indexes to this data as warranted by the queries you're going to be performing against it.
+
+Even for data you're only going to be hitting twice, I've seen a temp table solution make a huge difference if the original query for the source data was, for whatever reason, inefficient. Sometimes this is due to a lack of suitable indexing on the source data, but, more often, it is a scenario with a multi-table join against a large data set. Sucking it into a temp table often allows you to explicitly filter down a large data set early in the overall process.
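+
+In practice, the pattern looks something like this sketch (the tables here are from the AdventureWorks2008 sample used elsewhere in this chapter; what you actually run against the working table will, of course, be driven by your own process):
+
+SELECT soh.SalesOrderID, soh.CustomerID, soh.OrderDate,
+       sod.ProductID, sod.LineTotal
+INTO #WorkingSet
+FROM Sales.SalesOrderHeader soh
+JOIN Sales.SalesOrderDetail sod
+    ON sod.SalesOrderID = soh.SalesOrderID
+WHERE soh.OrderDate >= '20080101';  -- filter the big join down once, early
+
+-- Index the working data to suit the queries that will reuse it
+CREATE INDEX IX_WorkingSet_ProductID ON #WorkingSet (ProductID);
+
+-- ...run the various updates and aggregations against #WorkingSet here...
+
+DROP TABLE #WorkingSet;
+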
Again, try to avoid the temptation of automatically taking this approach, but keep it in mind as an option.
+
+Update Your Code In a Timely Fashion
+
+Are you still supporting SQL Server 2000? How about 7.0? OK, so you most definitely shouldn't be supporting 7.0 by now, and even 2000 support should be gone (or at least in the late stages of sunsetting). So, if you're no longer supporting those older versions, why does your system code and design look like you still are?
+
+OK, OK, I understand it isn't as simple as all that, but with each release of your application, make sure that you have time set aside (I recommend 10%–25%) that is oriented around improving existing performance and features. If you only need to support SQL Server 2008, look for special code you may have written to address situations that are now handled natively by SQL Server 2008, such as:
+
+ * Procedures or code streams that handle INSERT, UPDATE, and DELETE scenarios into a specific table; these can use the new MERGE command to make all three modifications, as indicated, in a single pass over the data. It also has the advantage of being a single statement, which means you may be able to avoid explicitly defining transactions around the three separate statements.
+ * Special hierarchy handling: SQL Server now has native constructs for something that is actually very common. The functionality includes not only hierarchy-specific functions (such as pruning or grafting), but both vertical and horizontal index functionality (very cool stuff!).
+ * Date and Time data type handling.
+
+Sometimes, It's the Little Things
+
+A common mistake in all "programming for performance" efforts is to ignore the small things. Whenever you're trying to squeeze performance, the natural line of thinking is that you want to work on the long-running stuff.
+
+It's true that the long-running processes are the ones for which you stand the biggest chance of getting big one-time performance gains. It's too bad that this often leads people to forget that it's the total time saved that they're interested in—that is, how much time is saved across every execution once the process is really live.
+
+While it's definitely true that a single change in a query can often turn a several-minute query into seconds (I've actually seen a few that took literally days trimmed to just seconds by index and query tuning), the biggest gains for your application often lie in getting just a little bit more out of what already seems like a fast query. These are usually tied to often-repeated functions or items that are executed within a loop.
+
+Think about this for a bit. Say you have a query that currently takes three seconds to run, and this query is used every time an order taker looks up a part for possible sale—say 5,000 items looked up a day. Now imagine that you are able to squeeze one second off the query time. That's 5,000 seconds, or over an hour and 20 minutes!
+
+Hardware Considerations
+
+Forgive me if I get too bland here—I'll try to keep it interesting, but if you're like the average developer, you'll probably already know enough about this to make it very boring, yet not enough about it to save yourself a degree of grief.
+
+Hardware prices have been falling like a rock over the years—unfortunately, so has what your manager or customer is probably budgeting for your hardware purchases. When deciding on a budget for your hardware, remember:
+
+ * Once you've deployed, the hardware is what's keeping your data safe—just how much is that data worth?
+ * Once you've deployed, you're likely to have many users—if you're creating a public website, it's possible that you'll have tens of thousands of users active on your system 24 hours per day. What is it going to cost you in terms of productivity loss, lost sales, loss of face, and just general credibility loss if that server is unavailable or—worse—you lose some of your data? + * Maintaining your system will quickly cost more than the system itself. Dollars spent early on a mainstream system that is going to have fewer quirks may save you a ton of money in the long run. + +There's a lot to think about when deciding from whom to purchase and what specific equipment to buy. Forgetting the budget for a moment, some of the questions to ask yourself include: + + * Will the box be used exclusively as a database server? + * Will the activity on the system be processor or I/O intensive? (For databases, it's almost always the latter, but there are exceptions.) + * Am I going to be running more than one production database? If so, is the other database of a different type (OLTP versus OLAP)? + * Will the server be on-site at my location, or do I have to travel to do maintenance on it? + * What are my risks if the system goes down? + * What are my risks if I lose data? + * Is performance "everything"? + * What kind of long-term driver support can I expect as my O/S and supporting systems are upgraded? + +Again, we're just scratching the surface of things—but we've got a good start. Let's look at what these issues mean to us. + +Exclusive Use of the Server + +I suppose it doesn't take a rocket scientist to figure out that, in most cases, having your SQL Server hardware dedicated to just SQL Server and having other applications reside on totally separate system(s) is the best way to go. Note, however, that this isn't always the case. + +If you're running a relatively small and simple application that works with other sub-systems (say IIS as a Web server, for example), then you may actually be better off, performance-wise, to stay with one box. Why? Well, if there are large amounts of data going back and forth between the two sub-systems (your database in SQL Server and your Web pages or whatever in a separate process), then memory space to memory space communications are going to be much faster than the bottleneck that the network can create—even in a relatively dedicated network backbone environment. + +Remember that this is the exception, though, not the rule. The instance where this works best usually meets the following criteria: + + * The systems have a very high level of interaction. + * The systems have little to do beyond their interactions (the activity that's causing all the interaction is the main thing that the systems do). + * Only one of the two processes is CPU intensive and only one is I/O intensive. + +If in doubt, go with conventional thinking on this and separate the processing into two or more systems. + +I/O vs. CPU Intensive + +I can just hear a bunch of you out there yelling "Both!" If that's the case, then I hope you have a very large budget—but we'll talk about that scenario, too. Assuming you haven't installed yet, it's guesswork. 
While almost anything you do in SQL Server is data-based and will, therefore, certainly require a degree of I/O, how much of a burden your CPU is under varies widely depending on the types of queries you're running:
+
+Low CPU Load | High CPU Load
+---|---
+Simple, single-table queries and updates | Large joins
+Joined queries over relatively small tables | Aggregations (SUM, AVG, etc.)
+ | Sorting of large result sets
+
+With this in mind, let's focus a little closer on each situation.
+
+I/O Intensive
+
+I/O-intensive tasks should cause you to focus your budget more on the drive array than on the CPU(s). Notice that I said the drive "array"—I'm not laying that out as an option. In my not-so-humble opinion on this matter, if you don't have some sort of redundancy arrangement on your database storage mechanism, then you have certainly lost your mind. Any data worth saving at all is worth protecting—we'll talk about the options there in just a moment.
+
+Before we get into talking about the options on I/O, let's look briefly into what I mean by I/O intensive. In short, I mean that a lot of data retrieval is going on, but the processes being run on the system are almost exclusively queries (not complex business processes), and those do not include updates that require wild calculations. Remember—your hard drives are, more than likely, the slowest thing in your system (short of a CD-ROM) in terms of moving data around.
+
+A Brief Look at RAID
+
+RAID—the name brings up images of barbarian tribes raining terror down on the masses. Actually, most of the RAID levels are there for creating something of a fail-safe mechanism against the attack of the barbarian called "lost data." If you're not a RAID aficionado, then it might surprise you to learn that not all RAID levels provide protection against lost data.
+
+RAID originally stood for Redundant Array of Inexpensive Disks. The notion was fairly simple—at the time, using a lot of little disks was cheaper than using one great big one. In addition, an array of disks meant that you had multiple drive heads at work and could also build in (if desired) redundancy.
+
+Since drive prices have come down so much (I'd be guessing, but I'd bet that drive prices are, dollar per meg, far less than 1 percent of what they were when the term RAID was coined), I've heard other renditions of what RAID stands for. The most common are Random Array of Independent Disks (this one seems like a contradiction in terms to me) and Random Array of Individual Disks (this one's not that bad). The thing to remember, no matter what you think it's an acronym for, is that you have two or more drives working together—usually for the goal of some balance between performance and safety.
+
+There are lots of places you can get information on RAID, but let's take a look at the handful of levels that are most commonly considered:
+
+RAID Level | Description
+---|---
+RAID 0 | a.k.a. Disk Striping without Parity. Out of the levels you are examining here, this is the one you are least likely to know. This requires at least three drives to work just as RAID 5 does. Unlike RAID 5, however, you get no safety net from lost data. (Parity is a special checksum value that allows reconstruction of lost data in some circumstances—as indicated by the name, RAID 0 doesn't have parity.) RAID 0's big claim to fame is giving you maximum performance without losing any drive space.
With RAID 0, the data you store is spread across all the drives in the array (at least 3). While this may seem odd, it has the advantage of meaning that you always have three or more disk drives reading or writing your data for you at once. Under mirroring, the data is all on one drive (with a copy stored on a separate drive). This means you'll just have to wait for that one head to do the work for you.
+RAID 1 | a.k.a. Mirroring. For each active drive in the system, there is a second drive that "mirrors" (keeps an exact copy of) the information. The two drives are usually identical in size and type, and store all the information to each drive at the same time. (Windows NT has software-based RAID that can mirror any two volumes as long as they are the same size.) Mirroring provides no performance increase when writing data (you still have to write to both drives) but can, depending on your controller arrangement, double your read performance since it will use both drives for the read. What's nice about mirroring is that as long as only one of the two mirrored drives fails, the other will go on running with no loss of data or performance (well, reads may be slower if you have a controller that does parallel reads). The biggest knock on mirroring is that you have to buy two drives for every one in order to have the disk space you need.
+RAID 5 | The most commonly used. Although, technically speaking, mirroring is a RAID (RAID 1), when people refer to using RAID, they usually mean RAID 5. RAID 5 works exactly as RAID 0 does with one very significant exception—parity information is kept for all the data in the array. Say, for example, that you have a five-drive array. For any given write, data is stored across all five of the drives, but a percentage of each drive (the sum of which adds up to the space of one drive) is set aside to store parity information. Contrary to popular belief, no one drive is the parity drive. Instead, some of the parity information is written to all the drives—it's just that the parity information for a given byte of data is not stored on the same drive as the actual data is. If any one drive is lost, then the parity information from the other drives can be used to reconstruct the data that was lost. The great thing about RAID 5 is that you get the multi-drive read performance. The downside is that you lose one drive's worth of space (if you have a three-drive array, you'll see the space of two; if it's a seven-drive array, you'll see the space of six). It's not as bad as mirroring in the price-per-megabyte category, and you still see great performance.
+RAID 6 | RAID 6 can be considered something of an extension of RAID 5 and is generally only used in very large arrays (where the overhead of the algorithm required to provide the extra redundancy can be spread out and therefore wastes less space on a per-disk basis). RAID 6 provides extra parity encoding versus RAID 5, and the extra information can be utilized to recover from multiple drive loss. RAID 5 is generally less expensive at lower array sizes, but RAID 6 maintains a level of redundancy even while rebuilding a single failed drive.
+RAID 10 (a.k.a. RAID 1 + 0) or RAID 0 + 1 | RAID 10 offers the best of both RAID 0 and RAID 1 in terms of performance and data protection. It is, however, far and away the most expensive of the options discussed here. RAID 10 is implemented as a coupling of both RAID 1 (mirroring) and RAID 0 (striping without parity). The end result is striped sets of mirrored data.
You will also hear of RAID 0 + 1. These are mirrored sets of striped data. The end result in total drive count is the same, but RAID 10 performs better in recovery scenarios and is therefore what is typically implemented.
+RAID 50 | This is implemented by striping data across two (or more) RAID 5 arrays. While it is arguably among the most redundant, it is still at risk of failure if two drives happen to fail in the same array. It is among the most expensive of the options provided here, and generally only implemented in the most extreme of environments.
+
+The long and the short of it is that RAID 5 is the de facto minimum for database installations. That being said, if you have a loose budget, then I'd actually suggest mixing things up a bit.
+
+RAID 10 has become the standard in larger installations. For the average shop, however, RAID 5 will likely continue to rule the day for a while yet—perhaps that will change as we get into the era where even server-level drives are measured in multi-tera-, peta-, and even exabytes. We certainly are getting there fast.
+
+What you'd like to have is at least a RAID 5 setup for your main databases but a completely separate mirrored set for your logs. People who manage to do both usually put both Windows and the logs on the mirror set and the physical databases on the RAID 5 array, but those with a little more cash to spend often put the O/S on a separate mirror set from the logs (with the data files still on their own RAID 5 array). Since I'm sure inquiring minds want to know why you would want to do this, let's make a brief digression into how log data is read and written.
+
+Unlike database information, which can be read in parallel (thus why RAID 5 or 10 works so well performance-wise), the transaction log is chronology dependent—that is, it needs to be written and read serially to be certain of integrity. I'm not necessarily saying that physically ordering the data in a constant stream is required; rather, I'm saying that everything needs to be logically done in a stream. As such, it actually works quite well if you can get the logs into their own drive situation where the head of the drive will only seldom have to move away from the stream it is currently reading and writing. The upshot of this is that you really want your logs to be on a different physical device from your data, so the reading and writing of data won't upset the reading and writing of the log.
+
+Note that this sequential read/write performance of the mirror set disappears if you are keeping logs for multiple databases on the same mirror set (it has to jump around between the separate logs!).
+
+Logs, however, don't usually take up nearly as much space as the data itself does. With mirroring, we can just buy two drives and have our redundancy. With RAID 5, we would have to buy three, but we don't see any real benefit from the parallel-read nature of RAID 5. When you look at these facts together, it doesn't make much sense to go with RAID 5 for the logs or O/S.
+
+You can have all the RAID arrays in the world, and they still wouldn't surpass a good backup in terms of long-term safety of your data. Backups are easy to take off-site, and are not subject to mechanical failure. RAID units, while redundant and very reliable, can also become worthless if two (instead of just one) drives fail. Another issue—what if there's a fire? Probably all the drives will burn up—again, without a backup, you're in serious trouble. We'll look into how to back up your databases in Chapter 22.
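+
+For what it's worth, once the arrays exist, getting SQL Server to use them the way just described is simply a matter of file placement. Here's a minimal sketch (the Accounting database name, paths, drive letters, and sizes are all hypothetical—say E: is the RAID 5/10 data array and L: is the log mirror set):
+
+CREATE DATABASE Accounting
+ON PRIMARY
+    (NAME = 'AccountingData',
+     FILENAME = 'E:\Data\Accounting.mdf',  -- data files on the RAID 5/10 array
+     SIZE = 500MB,
+     FILEGROWTH = 100MB)
+LOG ON
+    (NAME = 'AccountingLog',
+     FILENAME = 'L:\Logs\Accounting.ldf',  -- log on its own mirror set
+     SIZE = 100MB,
+     FILEGROWTH = 50MB);
+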
+ +CPU Intensive + +On a SQL Server box, you'll almost always want to make sure that you go multiprocessor (yes, even in these days of multi-core processors), even for a relatively low-utilization machine. This goes a long way to preventing little "pauses" in the system that will drive your users positively nuts, so consider this part of things to be a given—particularly in this day of dual core processors. Keep in mind that the Workgroup version of SQL Server supports only up to two processors—if you need to go higher than that, you'll need to go up to either Standard (four processors) or the Enterprise edition (which is limited only by your hardware and budget). + +Even if you're only running SQL Server Express—which supports only one processor—you'll want to stick with the dual-proc box if at all possible. Remember, there is more going on in your system than SQL Server, so having that other proc available to perform external operations cuts down on lag on your SQL Server. + +Perhaps the biggest issue of all, though, is memory. This is definitely one area that you don't want to short change. In addition, remember that if you are in a multiprocessor environment (and you should be), then you are going to have more things going on at once in memory. In these days of cheap memory, no SQL Server worth installing should ever be configured with less than 512MB of RAM—even in a development environment. Production servers should be equipped with no less than 2GB of RAM—quite likely more. + +Things to think about when deciding how much RAM to use include: + + * How many user connections will there be at one time (each one takes up space)? Each connection takes up about 24K of memory (it used to be even higher). This isn't really a killer since 1,000 users would only take up 24MB, but it's still something to think about. + * Will you be doing a lot of aggregations and/or sorts? These can be killers depending on the size of the data set you're working with in your query. + * How large is your largest database? If you have only one database, and it is only 1GB (and, actually, most databases are much smaller than people think), then having 4GB of RAM probably doesn't make much sense depending on how many queries you're running simultaneously and exactly what actions they are taking. + * The Workgroup edition of SQL Server 2008 only supports addressing of memory up to 3GB. If you need more than this, you'll need to go with at least the Standard edition. + +In addition, once you're in operation—or when you get a fully populated test system up and running—you may want to take a look at your cache-hit ratio in perfmon. We'll talk about how this number is calculated a little bit in Chapter 21. For now, it's sufficient to say that this can serve as something of a measurement for how often we are succeeding at getting things out of memory rather than off disk (memory is going to run much, much faster than disk). A low cache-hit ratio is usually a certain indication that more memory is needed. Keep in mind though, that a high ratio does not necessarily mean that you shouldn't add more memory. The read-ahead feature of SQL Server may create what is an artificially high cache-hit ratio and may disguise the need for additional memory. + +OLTP vs. OLAP + +The needs between these two systems are often at odds with each other. We discuss some of the design differences in Chapter 24, so I hope you will come to have a concept of just how different the design considerations can be. 
+ +In any case, I'm going to keep my "from a hardware perspective" recommendation short here: + +If you are running databases to support both of these kinds of needs, run them on different servers—it's just that simple. + +I can't stress enough the need to separate these two. A large data warehouse import, export, or even a large report run can cause significant turnover in your OLTP procedure and/or data caches and simply decimate the performance of your system for what can be many users (and, therefore, a whole lot of cost). + +On-Site vs. Off-Site + +It used to be that anything that would be SQL Server–based would be running on-site with those who were responsible for its care and upkeep. If the system went down, people were right there to worry about reloads and to troubleshoot. + +In the Internet era, many installations are co-located with an Internet service provider (ISP). The ISP is responsible for making sure that the entire system is backed up—they will even restore according to your directions—but they do not take responsibility for your code. This can be very problematic when you run into a catastrophic bug in your system. While you can always connect remotely to work on it, you're going to run into several configuration and performance issues, including: + + * Security—Remote access being open to you means that you're also making it somewhat more open to others who you may not be interested in having access. My two bits' worth on this is to make sure that you have very tight routing and port restrictions in place. For those of you not all that network savvy (which includes me), this means that you restrict what IP addresses are allowed to be routed to the remote server, what ports they have available, and even what protocols (SSL vs. non-SSL) are allowed through. + * Performance—You're probably going to be used to the 100 Mbps to 1 Gbps network speeds that you have around the home office. Now you're communicating via virtual private network (VPN) over the Internet or, worse, dialup, and you are starting to hate life (things are SLOW!). + * Responsiveness—It's a bit upsetting when you're running some e-commerce site or whatever and you can't get someone at your ISP to answer the phone, or they say that they will get on it right away and hours later you're still down. Make sure you investigate your remote hosting company very closely—don't assume that they'll still think you're important after the sale. + * Hardware Maintenance—Many co-hosting facilities will not do hardware work for you. If you have a failure that requires more than a reloading, you may have to travel to the site yourself or call yet another party to do the maintenance—that means that your application will be offline for hours or possibly days. + +If you're a small shop doing this with an Internet site, then off-site can actually be something of a saving grace. It's expensive, but you'll usually get lots of bandwidth plus someone to make sure that the backups actually get done—just make sure that you really check out your ISP. Many of them don't know anything about SQL Server, so make sure that expertise is there. + +One recent trend in major ISPs has been to locate major hosting facilities in far more remote locations than you might, at first, expect. This is usually done for accessibility to water (for cooling), cheap power, or both (near hydroelectric facilities seems to be popular). 
In many ways, this shouldn't matter, but think about it if you're using a third-party hardware support company—does that support company have appropriate staff located near the facility where you will be hosted? + +If you were thinking of your hosting company as being located in a major metropolitan area, then you would reasonably assume that your hosting company had a large number of support staff within 30–60 minutes' response time of your ISP location. If, however, your ISP is, let's say, "outside Portland, Oregon," you may want to make sure that "outside" doesn't mean 60 or 80 miles away. If it is, check with your support company about just how many people they keep on staff truly close to your ISP location. + +The Risks of Being Down + +How long and how often can I afford to be down? This may seem like a silly question. When I ask it, I often get this incredulous look. For some installations, the answer is obvious—they can't afford to be down, period. This number is not, however, as high as it might seem. You see, the only true life-and-death kinds of applications are the ones that are in acute medical applications or are immediately tied to safety operations. Other installations may lose money—they may even cause bankruptcy if they go down—but that's not life and death either. + +That being said, it's really not as black and white as all that. There is really something of a continuum in how critical downtime is. It ranges from the aforementioned medical applications at the high end to data-mining operations on old legacy systems at the low end (usually—for some companies, it may be all they have). The thing that pretty much everyone can agree on for every system is that downtime is highly undesirable. + +So, the question becomes one of just how undesirable is it? How do we quantify that? + +If you have a bunch of bean counters (I can get away with saying that since I was one) working for you, it shouldn't take you all that long to figure out that there are a lot of measurable costs to downtime. For example, if you have a bunch of employees sitting around saying that they can't do anything until the system comes back up, then the number of affected employees times their hourly cost (remember, the cost of an employee is more than just his or her wages) equals the cost of the system being down from a productivity standpoint. But wait, there's more. If you're running something that has online sales—how many sales did you lose because you couldn't be properly responsive to your customers? Oops—more cost. If you're running a plant with your system, then how many goods couldn't be produced because the system was down—or, even if you could still build them, did you lose quality assurance or other information that might cost you down the line? + +I think by now you should be able to both see and sell to your boss the notion that downtime is very expensive—how expensive depends on your specific situation. Now the thing to do is to determine just how much you're willing to spend to make sure that it doesn't happen. + +Lost Data + +There's probably no measuring this one. In some cases, you can quantify this by the amount of cost you're going to incur reconstructing the data. Sometimes you simply can't reconstruct it, in which case you'll probably never know for sure just how much it cost you. + +Again, how much you want to prevent this should affect your budget for redundant systems as well as things like backup tape drives and off-site archival services. + +Is Performance Everything? 
+
+More often than not, the answer is no. It's important, but just how important has something of diminishing returns to it. For example, if buying that extra 10 percent of CPU power is going to save you two seconds per transaction—that may be a big deal if you have 50 data entry clerks trying to enter as much as they can each day. Over the course of a day, seemingly small amounts of time saved can add up. If each of those 50 clerks is performing 500 transactions a day, then saving two seconds per transaction adds up to over 13 man-hours (that's over one person working all day!). Saving that time may allow you to delay a little longer in adding staff. The savings in wages will probably easily pay for the extra computing power.
+
+The company next door may look at the situation a little differently, though—they may only have one or two employees; furthermore, the process that they are working in might be one where they spend a lengthy period of time just filling out the form—the actual transaction that stores it isn't that big of a deal. In such a case, their extra dollars for the additional speed may not be worth it.
+
+Driver Support
+
+Let's start off by cutting to the chase—I don't at all recommend that you save a few dollars (or even a lot of dollars) when buying your server by purchasing it from some company like "Bob's Pretty Fine Computers." Remember all those risks? Now, try introducing a strange mix of hardware and driver sets. Now imagine what happens when you have a problem—you're quickly going to find all those companies pointing the finger at each other saying, "It's their fault!" Do you really want to be stuck in the middle?
+
+What you want is the tried and true—the tested—the known. Servers—particularly data servers—are an area to stick with well-known, trusted names. I'm not advocating anyone in particular (no ads in this book!), but I'm talking very mainstream people like Dell, IBM, HP, and so on. Note that, when I say well-known, trusted names, I mean names that are known in servers. Just because someone sells a billion desktops a year doesn't mean they know anything about servers—it's almost like apples and oranges. They are terribly different.
+
+Staying with well-known equipment not only makes sure that you have proper support when something fails, it also means that you're more likely to have that equipment survive upgrades well into the future. Each new version of the O/S only explicitly supports just so many pieces of equipment—you want to be sure that yours is one of them.
+
+The Ideal System
+
+Let me preface this by saying that there is no one ideal system. That being said, there is a general configuration (size excluded) that I and a very large number of other so-called "experts" seem to almost universally push as where you'd like to be if you had the budget for it. What we're talking about is drive arrangements here (the CPU and memory tend to be relative chicken feed budget- and setup-wise).
+
+What you'd like to have is a mix of mirroring and RAID 5 or 10. You place the O/S and the logs on the mirrored drives (ideally on separate mirror sets). You place the data on the RAID 5/10 array. That way, the O/S and logs—which both tend to do a lot of serial operations—have a drive setup all of their own without being interfered with by the reads and writes of the actual data. The data has a multi-head read/write arrangement for maximum performance, while maintaining a level of redundancy.
+
+Summary
+
+Performance could be, and probably should be, a book all by itself (indeed, there is a Wrox title on the very subject). There's simply too much to cover and get acquainted with to do it all in one or even several chapters. The way I've tried to address this is by pointing out performance issues throughout the book, so you could take them on a piece at a time. This chapter is all about the first of two different slants I'm taking on it—design (addressing performance before it is a problem). In our next chapter, we'll look at how we can identify and address performance issues when our system is already live. It's important to note that the techniques discussed there are ones you may want to also utilize while you're still in test so you can tweak your design accordingly.
+21
+
+What Comes After: Forensic Performance Tuning
+
+Well, wouldn't it be nice if we could just develop the software, get paid for it, and forget it...? Yeah, well.... You can stop dreaming now—it just doesn't work that way.
+
+At some point, any software we consider to be part of a successful development project is going to get rolled out in front of some user base. Even if it's just a prototype, we're going to be analyzing how the prototype matched our original goals. Part of assessing whether we met our goals is taking a look at performance and asking ourselves what we could be doing better.
+
+In the previous chapter, I suggested that the most important thing to understand about performance tuning is that you are never going to know everything there is to know about it. If I were to come up with a competing idea for "most important thing to understand," it would be that you are never really done with performance tuning. The content of your system will change, the state of your server will change, the use of your system will change. In short, the overall system will change, and that will affect performance. The trick is to understand what's working poorly, what's working well, and what's working "well enough."
+
+Just as we did in the previous chapter, we're going to be roaming around quite a bit in terms of the topics covered. Everything we talk about is going to be performance related in some fashion, but this time we'll be more focused on figuring out what is hurting performance. If you did your job in design and development, you should already have a great design in place, but the reality of software is that the design requirements rarely exactly match the reality of a live system. So, this chapter will be all about figuring out what's already occurring in our system and deciding what we can do better. Topics we'll cover in this chapter include:
+
+ * Routine maintenance
+ * Hardware configuration issues
+ * The SQL Server Profiler
+ * Data Collector
+
+When to Tune (Mark Two)
+
+So, I had a section in the previous chapter named this very thing—When to Tune. If you paid attention at all, you know the process should have started well before the "in test or production" mode that we're in with this chapter. That said, the new answer for this chapter is simply "regularly." Don't wait until users are screaming at you about something—instead, plan on a regular optimization process.
+
+Much of the post-release maintenance is thought of as being in the realm of the DBA, and I'm not necessarily going to dispute that, save for a few problems with that philosophy:
+
+ * You are producing a product that is used by many (are you going to expect every customer's DBA to individually deal with the problem you handed them?).
+ * What if there isn't a DBA? (Depending on your install, there may not be a DBA on staff, so what are your system and/or your recommendations doing to prevent trouble for your end users?)
+ * What if you are the DBA?
+
+This is all oversimplified, but the real key here is that you should be thinking about performance even after the product has been released and gone live. Whether it's how to build it better for the next release or simply trying to keep your paying customers happy, you should always be looking for problems (best if you know about them before your customer does) or simple ways of making your system a bit better.
+
+Routine Maintenance
+
+I hate it when good systems go bad. It happens on a regular basis though. It usually happens when people buy or build systems, put them into operation, and then forget about them.
+
+Maintenance is as much about performance as it is about system integrity. Query plans get out of date, index pages get full (so you have a lot of page splits), fragmentation happens, and the best indexes need to change as usage and the amount of data in various tables change.
+
+Watch the newsgroups. Talk to a few people who have older systems running. Visit some of the many SQL Server support sites on the Web. You'll hear the same story over and over again. "My system used to run great, but it just keeps getting slower and slower—I haven't changed anything, so what happened?" Well, systems will naturally become slower as the amount of data they have to search through increases; however, the change doesn't have to be all that remarkable, and usually it shouldn't be. Instead, the cause is usually that the performance enhancements you put in place when you first installed the system don't really apply anymore; as the way your users use the system and the amount of data have changed, so has the mix of things that will give you the best performance.
+
+We'll be looking at maintenance quite a bit in the next chapter; however, we've discussed it here for two reasons. First, it will help if you are checking out this chapter because you have a specific performance problem; second, and perhaps more importantly, because there is a tendency to just think about maintenance as being something you do to prevent the system from going down and to ensure backups are available should the worst happen. This simply isn't the case. Maintenance is also key from a performance perspective.
+
+Troubleshooting
+
+SQL Server offers a number of options to help with the prevention, detection, and measurement of long-running queries. The options range from a passive approach of measuring actual performance, so you know what's doing what, to a more active approach of employing a query "governor" to automatically kill queries that run over a length of time you choose. These tools are very often ignored or used only sparingly—which is something of a tragedy, as they can save hours of troubleshooting by often leading you right to the problem query and even to the specific portion of your query that is creating the performance issues.
+
+Tools to take a look at include:
+
+ * The Data Collector
+ * SHOWPLAN TEXT|ALL and Graphical showplan
+ * STATISTICS IO
+ * Database Console Commands (DBCC)
+ * The sys.sysprocesses system view
+ * The Activity Monitor
+ * The SQL Server Profiler
+ * PerfMon
+
+Many people get caught up in using just one of these, but the reality is that there is little to no (depending on which two you're comparing) overlap between them.
This means that developers and DBAs who try to rely on just one of them are actually missing out on a lot of potentially important information. + +Also, keep in mind that many of these are still useful in some form even if you are writing in a client-side language and sending the queries to the server (no sprocs). You can either watch the query come through to your server using the SQL Server Profiler, or you could even test the query in QA before moving it back to your client code. + +The Data Collector + +The Data Collector is new with SQL Server 2008 and provides a framework that pulls together the collection of data about your system's data and activity and performs analysis, troubleshooting (yes, SQL Server can use data to actually troubleshoot some of its own problems!), as well as persistence of the results for further analysis and diagnostics. + +Things included in the Data Collector include: + + * The actual data collection engine + * Active performance monitoring, troubleshooting, and tuning + * Reporting + +This is a quantum leap in diagnostic possibilities over what we had in previous releases. Data collection can be aggregated on an enterprise-wide basis and reporting and analysis can span multiple servers. + +Setup and configuration of the Data Collector requires significant thought and analysis in its own right, and is largely deemed beyond the scope of this book (very much an administrator sort of thing), but some of the key elements include: + + * Setting up logins to have appropriate rights to collect data and monitor collected data + * Creation of collection sets (groups of objects that collect data using one or more collection providers) + * Scheduling of data collection + +This is obviously far from comprehensive, but it gives a taste of the idea that setting up the data collection is non-trivial. Still, it can provide a wealth of information and is very worthwhile for test systems when doing scalability analysis and for larger production environments. + +The Data Collector and its associated framework of tools are domain aware, and can collect and warehouse data from multiple servers for comparison and overall enterprise analysis. Setup of enterprise-wide data collection is in the realm of the DBA and is considered outside the scope of this book (but it's a great thing to be aware is available!). + +The Various Showplans and STATISTICS + +SQL Server gives you a few different options for showing the specific plan being used by any given query. The information that they provide varies a bit depending on what option you choose, but this is one area where there is a fair amount of overlap between your options; however, each one definitely has its own unique thing that it brings to the picture. In addition, there are a number of options available to show query statistics. + +Let's take a look at the options and what they do. + +SHOWPLAN TEXT|ALL + +When either of these two SHOWPLAN options (they are mutually exclusive) is executed, SQL Server changes what results you get for your query. Indeed, the NOEXEC option (which says, "Figure out the query plan but don't actually perform the query") is put in place, and you receive no results other than those put out by the SHOWPLAN. + +The syntax for turning the SHOWPLAN on and off is pretty straightforward: + +SET SHOWPLAN TEXT|ALL ON|OFF + +When you use the TEXT option, you get back the query plan along with the estimated costs of running that plan. 
Since the NOEXEC option automatically goes with SHOWPLAN, you won't see any query results.
+
+When you use the ALL option, you receive everything you received with the TEXT option, plus a slew of additional statistical information, including such things as:
+
+ * The actual physical and logical operations planned
+ * Estimated row counts
+ * Estimated CPU usage
+ * Estimated I/O
+ * Average row size
+ * Whether or not the query will be run in parallel
+
+Let's run a very brief query utilizing (one at a time) both of these options:
+
+USE AdventureWorks2008;
+GO
+SET SHOWPLAN_TEXT ON;
+GO
+SELECT *
+FROM Sales.SalesOrderHeader;
+GO
+SET SHOWPLAN_TEXT OFF;
+GO
+SET SHOWPLAN_ALL ON;
+GO
+SELECT *
+FROM Sales.SalesOrderHeader;
+GO
+SET SHOWPLAN_ALL OFF;
+GO
+
+Notice that every statement is followed by a GO—thus making it part of its own batch. The batches that contain the actual query could have had an unlimited number of statements, but the batches setting the SHOWPLAN option have to be in a batch by themselves.
+
+The SHOWPLAN_TEXT portion of the results should look something like this:
+
+StmtText
+-------------------------------------------
+SELECT *
+FROM Sales.SalesOrderHeader
+
+(1 row(s) affected)
+
+StmtText
+------------------------------------------------------------------------------
+  |--Compute Scalar(DEFINE:([AdventureWorks2008]....
+       |--Compute Scalar(DEFINE:([AdventureWorks2008]...
+            |--Clustered Index Scan(OBJECT:([AdventureWorks2008]...
+
+(3 row(s) affected)
+
+Unfortunately, the results are far too wide to fit all of it gracefully in the pages of this book, but there are a couple of key things I want you to notice about what was produced:
+
+ * There are multiple steps displayed.
+ * At each step, you can see what object is being addressed and what kind of operation is being performed.
+
+If we had been running a larger query—say something with several joins—then even more sub-processes would have been listed, with indentations to indicate hierarchy.
+
+I'm not going to include the ALL results here since they simply will not fit in a book format (it's about 800 characters wide and won't fit in any readable form in a book—even if we flipped things sideways), but it includes a host of other information. Which one of these to use is essentially dependent on just how much information you want to be flooded with. If you just want to know the basic plan—such as whether it is using a merge or hash join—you probably just want to use the TEXT option. If you really want to know where the costs are and such, then you want the ALL option.
+
+Since the SHOWPLAN options imply NOEXEC, that means nothing in your query is actually being executed. Before you do anything else, you need to set the option back to off; that even includes switching from one showplan option to the other (for example, SET SHOWPLAN_ALL ON wouldn't have any effect if you had already run SET SHOWPLAN_TEXT ON and hadn't yet turned it off).
+
+I like to make sure that every script I run that has a SET SHOWPLAN statement in it has both the on and off within that same script. It goes a long way toward keeping me from forgetting that I have it turned on and being confused when things aren't working the way I expect.
+
+Graphical Showplan
+
+The graphical showplan tool combines bits and pieces of SHOWPLAN_ALL and wraps them up into a single graphical format. Graphical showplan is a Management Studio–only tool.
Graphical Showplan

The graphical showplan tool combines bits and pieces of the SHOWPLAN_ALL output and wraps them up into a single graphical format. It is selected through options in Management Studio rather than through T-SQL syntax, which means it is available only when you're using Management Studio.

The graphical showplan comes in two versions: estimated and actual. The estimated version is the one more like the SHOWPLAN in T-SQL; the query plan is developed, but the query is not actually executed. The actual version, by contrast, essentially waits until the query is done and shows you the way the query was actually performed in the end.

Why are these different? Well, SQL Server is smart enough to recognize when it starts down a given query plan based on an estimated cost and then finds the reality to be something other than what its estimates were based on. SQL Server uses statistics it keeps on tables and indexes to estimate cost. Those statistics can sometimes become skewed or downright out of date. The Query Optimizer will adjust on the fly if it starts down one path and finds something other than what it expected.

For most things we do, the estimated execution plan is just fine. We have three options for activating the graphical showplan (each of these just shows us the plan with the NOEXEC option active):

 * Select the Display Estimated Execution Plan option from the Query menu
 * Press Ctrl+L on your keyboard
 * Click the Display Estimated Execution Plan button on the toolbar

Personally, I like the option of having the graphical showplan (the Include Actual Execution Plan option) in addition to my normal query run. While it means that I have to put the actual hit of the query on my system, it also means that the numbers I get are no longer just estimates but are based on the actual cost numbers. Indeed, if you run the showplan both ways and wind up with wildly different results, then you may want to take a look at the last time your statistics were updated on the tables on which the query is based. If necessary, you can then update them manually and try the process again.

The hierarchy of the different subprocesses is then shown graphically. In order to see the costs and other specifics about any subprocess, just hover your mouse over that part of the graphical showplan and a tooltip will come up with the information.

This arrangement, as shown in Figure 21.1, can often make it much easier to sort out the different pieces of the plan. The downside is that you can't print it out for reporting the way that you can with the text versions.

Figure 21.1

STATISTICS

In addition to using the graphical showplan with actual execution of the query, you have a couple of other options for retrieving the "real" information on the statistics of your query: using SQL Server Profiler (discussed later in this chapter) and turning on STATISTICS PROFILE.

STATISTICS actually has a couple of options that can be very handy in troubleshooting query performance, including those discussed in the following sections.

SET STATISTICS IO ON|OFF

This one is a very commonly used tool to figure out where and how the query is performing. STATISTICS IO provides several key pieces of information regarding the actual work necessary to perform your query. Information provided includes:

 * Physical Reads: This represents the actual physical pages read from disk. It is never any more than, and is usually smaller than, the number for logical reads. This one can be very misleading in the sense that it will usually change (be less than the first run) the second time that you run your query.
Any page that is already in the buffer cache will not have a physical read done on it, so, the second time you run the query in reasonably short succession, the pages involved will, more than likely, still be in cache. In addition, this number will not be incremented if the page has already been read via the read-ahead mechanism that is part of SQL Server. This means that your query may be responsible for loading the page physically into cache, but it still may not show up as part of the physical reads.
 * Logical Reads: This is the number of times that the page was actually looked at—regardless of where it came from. That is, any page already in the memory cache still creates a logical read if the query makes use of it. Note that I said it is how many times the page was looked at. That means that you may have several logical reads for a single page if the page is needed several times (say, for a nested loop that affects a page that has several rows on it).
 * Read-Ahead Reads: This is the number of pages that SQL Server reads into the cache as a result of the read-ahead mechanism anticipating that the pages will be needed. The page may actually be used—or it may not. In either case, the read still counts as a read-ahead. Read-aheads are very similar to physical reads in the sense that they represent data being physically read from disk. The problem is that the number you get is based on the optimistic nature of the read-ahead mechanism and does not necessarily mean that all that work was actually put to use.
 * Scan Count: The scan count represents the number of times that a table was accessed. This is somewhat different from logical reads, which focus on page access. A nested loop is again a good example here. The outer table that forms the basis for the condition on the inner query may have a scan count of only 1, whereas the inner-loop table would have a scan count added for every time through the loop—that is, for every record in the outer table.

Some of the same information that forms the basis for STATISTICS IO feeds your cache-hit ratio if you look in PerfMon. The cache-hit ratio is the number of logical reads minus the physical reads, divided by the total number of logical reads.

The thing to look for with STATISTICS IO is any one table that seems disproportionately high in either physical or logical reads.

A very high physical read count could indicate that the data from the table is being pushed out of the buffer cache by other processes. If this is a table that you are going to be accessing with some regularity, then you may want to look at purchasing (or, if you're an ISV developing a SQL Server product, recommending) more memory for your system.

If the logical reads are very high, then the issue may be more one of proper indexing. I'll give an example here from a client I had some time back. A query was taking approximately 15 seconds to run on an otherwise unloaded system. Since the system was to be a true OLTP system, this was an unacceptable time for the user to have to wait for information. (The query was actually a fairly simple lookup that happened to require a four-table join.) In order to find the problem, I used what amounted to STATISTICS IO. It happened to be the old graphical version that came with 6.5, but the data was much the same. After running the query just once, I could see that the process was requiring fewer than 20 logical reads from three of the tables, but it was performing over 45,000 logical reads from the fourth table. This is what I liked about the old graphical version; it took about half a second to see that the bar on one table stretched all the way across the screen when the others were just a few pixels! From there, I knew right where to focus—in about two minutes, I had an index built to support a foreign key (remember, they aren't built by default), and the response time dropped to less than a second. The entire troubleshooting process on this one took literally minutes. Not every performance troubleshooting effort is that easy (indeed, most aren't), but using the right tools can often help a lot.
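Trying this yourself is just a matter of bracketing the query with the option. A quick sketch against AdventureWorks2008 (the read counts in the comments are purely illustrative—yours will differ):

SET STATISTICS IO ON;

SELECT *
FROM Sales.SalesOrderHeader;

SET STATISTICS IO OFF;

-- The Messages tab then shows something along these lines:
--   Table 'SalesOrderHeader'. Scan count 1, logical reads 689,
--   physical reads 3, read-ahead reads 685, ...
-- Run it a second time and the physical and read-ahead reads will
-- typically drop to 0 (the pages are now in the buffer cache), while
-- the logical reads stay the same.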
SET STATISTICS TIME ON|OFF

This one is amazingly little known. It shows the actual CPU time required to execute the query. Personally, I often use a simple SELECT GETDATE() before and after the query I'm testing—as we've done throughout most of the book—but this one can be handy because it separates out the time to parse and plan the query from the time required to actually execute it. It's also nice not to have to figure things out for yourself. (It calculates the time in milliseconds; using GETDATE(), you have to do that yourself.)

Include Client Statistics

You also have the ability to show statistical information about your connection as part of your query run. To make use of this, just select Include Client Statistics from the Query menu. As long as that option is set, every execution you make will produce a Client Statistics tab in the results pane of the Query window, as shown in Figure 21.2.

Figure 21.2

Database Console Commands (DBCC)

The Database Console Commands (DBCC) offer a number of different options to allow you to check the integrity and structural makeup of your database. This is far more the realm of the DBA than the developer, so I am, for the most part, considering DBCC to be out of scope for this book.

You may also hear DBCC referred to as the Database Consistency Checker. This is what DBCC used to stand for. To be honest, I have no idea when what DBCC stood for changed, but, if you hear the other term, now you know why.

Dynamic Management Views

Over the last edition or two of SQL Server, Microsoft has been adding an increasing number of what are called dynamic management views—or DMVs. Description and usage information for these is provided in Appendix B. They can provide a wide range of information on the current state of your server and/or database in a very code-readable fashion (they can be wonderful for automating administrative tasks). To get a quick example of how powerful these can be, however, let's take a look at one DMV that might be of interest.

I can't stress enough that what I'm showing you in this section is really just a very small taste of what is possible with the various metadata and dynamic management views now available in SQL Server. You can get a solid start on learning them by checking out Appendix B in this book, but if you're looking to build a robust support tool, you may want to look for a book that is highly focused on this growing toolset in SQL Server.

We will start by reviewing one that we first visited back in Chapter 13. We'll make a variation on a query we used in a cursor example:

SELECT SCHEMA_NAME(CAST(OBJECTPROPERTYEX(i.object_id, 'SchemaId') AS int))
       + '.'
       + OBJECT_NAME(i.object_id)
       + '.'
       + i.name AS Name,
       ps.avg_fragmentation_in_percent
FROM sys.dm_db_index_physical_stats (DB_ID(), NULL, NULL, NULL, NULL) AS ps
JOIN sys.indexes AS i
    ON ps.object_id = i.object_id
    AND ps.index_id = i.index_id
WHERE SCHEMA_NAME(CAST(OBJECTPROPERTYEX(i.object_id, 'SchemaId') AS int)) =
    'Purchasing'
AND avg_fragmentation_in_percent > 30;

This gives us all the indexes—regardless of what specific table they belong to—that are associated with a table in the Purchasing schema but, more importantly, have index fragmentation in excess of 30%.

What's powerful here is that we can easily script maintenance tasks based on the condition of our table. This is a major advance versus the older Database Console Commands options we previously used to view fragmentation.

This is, as I suggested earlier, a relatively simple example. As is the case with many SQL Server topics, I'm sure there will be entire books written purely around the dynamic management views now available in SQL Server. Again, check out Appendix B for more information.
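To make that claim about scripting maintenance tasks a little more concrete, here's a rough sketch that builds on the same DMV to generate (and run) ALTER INDEX statements for fragmented indexes. The 10% and 30% thresholds are common rules of thumb rather than requirements, and on anything resembling a production system you'd want to review the generated commands before executing them:

-- Build ALTER INDEX commands based on measured fragmentation
DECLARE @sql nvarchar(max) = N'';

SELECT @sql = @sql
    + N'ALTER INDEX ' + QUOTENAME(i.name)
    + N' ON ' + QUOTENAME(SCHEMA_NAME(o.schema_id))
    + N'.' + QUOTENAME(o.name)
    + CASE WHEN ps.avg_fragmentation_in_percent > 30
           THEN N' REBUILD;'      -- heavily fragmented: rebuild
           ELSE N' REORGANIZE;'   -- moderately fragmented: lighter touch
      END + NCHAR(13)
FROM sys.dm_db_index_physical_stats (DB_ID(), NULL, NULL, NULL, NULL) AS ps
JOIN sys.indexes AS i
    ON ps.object_id = i.object_id
    AND ps.index_id = i.index_id
JOIN sys.objects AS o
    ON i.object_id = o.object_id
WHERE SCHEMA_NAME(o.schema_id) = 'Purchasing'
    AND ps.avg_fragmentation_in_percent > 10
    AND i.name IS NOT NULL;  -- skip heaps

PRINT @sql;               -- review what we're about to do...
EXEC sp_executesql @sql;  -- ...and then do it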
The Activity Monitor

The Activity Monitor has received a major facelift and some extra muscle with SQL Server 2008. All the old process information is there, but there is now a host of other information collected from a variety of other sources, such as PerfMon (a Windows tool for monitoring your system) and the Data Collector.

The Activity Monitor can be found by right-clicking the server node in Management Studio. Open it up and you get five major subject areas:

 * Overview
 * Processes
 * Resource Waits
 * Data File I/O
 * Recent Expensive Queries

Let's take a quick look at each of these.

Overview

This section is the one that will most remind you of PerfMon. It provides a relatively straightforward graph (as shown in Figure 21.3) of system activity as sampled on an adjustable interval (the default is every 10 seconds). Note that the values presented here are information on what SQL Server is utilizing—not your entire system.

Figure 21.3

Processes

This largely maps, as shown in Figure 21.4, to the Activity Monitor as you would have seen it in SQL Server 2005. It provides information about what processes are running, the command each is currently executing, and metrics on the resources used and blocking incurred by each process.

Figure 21.4

Resource Waits

Much like the Overview, this should remind you of PerfMon, providing metrics on wait times for a number of different counters (as shown in Figure 21.5).

Figure 21.5

Data File I/O

Still providing largely PerfMon-based numbers, this one provides information on the physical files being utilized by SQL Server. Prior to this information being gathered in one place (as shown in Figure 21.6), you would have had to set each file up individually in PerfMon. SQL Server now pulls that kind of metric together for you automatically.

Figure 21.6

Recent Expensive Queries

This section, as shown in Figure 21.7, provides information we didn't really have prior to SQL Server 2008. We could map out some of this by using the SQL Server Profiler (discussed shortly), but it was tedious at best and very likely to be flooded with information we didn't really want or need (thus masking the information we were really after).

Figure 21.7

It's definitely worth noting that the expensive query information is among the information that can be logged to the Performance Data Warehouse, which means that you can use the warehouse to gather metrics not only for the last few minutes, but for days or even weeks depending on the retention rules you've set up for your warehouse.
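You aren't limited to the UI for this kind of information, either. Much of what the Recent Expensive Queries pane shows is essentially derived from the plan cache, which you can query directly yourself. A rough sketch (the offset arithmetic follows the usual Books Online pattern for pulling a single statement out of a batch):

-- Top five cached statements by total CPU consumed
SELECT TOP 5
    qs.total_worker_time / 1000 AS total_cpu_ms,  -- worker time is in microseconds
    qs.execution_count,
    SUBSTRING(st.text, (qs.statement_start_offset / 2) + 1,
        ((CASE qs.statement_end_offset
              WHEN -1 THEN DATALENGTH(st.text)   -- -1 means "to end of batch"
              ELSE qs.statement_end_offset
          END - qs.statement_start_offset) / 2) + 1) AS statement_text
FROM sys.dm_exec_query_stats AS qs
CROSS APPLY sys.dm_exec_sql_text(qs.sql_handle) AS st
ORDER BY qs.total_worker_time DESC;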
The SQL Server Profiler

The true lifesaver among the tools provided with SQL Server, this one is about letting you "sniff out" what's really going on with the server.

Profiler can be started from the Start menu in Windows. You can also run it from the Tools menu in Management Studio. When you first start it up, you can either load an existing profile template or create a new one.

Let's take a look at some of the key points of the main Profiler by walking through a brief example.

Start by choosing New⇒Trace from the File menu. Log in to the server you've been working with, and you should be presented with the dialog box in Figure 21.8.

Figure 21.8

The trace name is probably obvious enough, but the template information might not be. A template is a set of pre-established events, data columns, and filters that you want to see in a trace, and the templates provided with SQL Server are named for the kind of situation in which you might want to use them. Any templates that are stored in the default profiler template directory (which is under the tools subdirectory of wherever you installed SQL Server) are included in the Use the Template drop-down box.

Pay particular attention to what template you choose. It determines exactly how much is available to you on the next tab. If the template you choose is too restrictive, you can select Show All Events and Show All Columns to expose all possible choices.

Next up, you can choose whether to capture the trace to a file on disk or a table in the database. If you save to a file, then that file will be available only on the system that you store it on (or to anyone who has access to a network share, if that's where you save it). If you save it to a table, then everyone who can connect to the server and has appropriate permissions will be able to examine the trace.

Last, but not least, on this dialog is the stop time feature. This allows you to leave a trace running (for example, for a workload file or some other long-running trace need) and have it shut down automatically at a later time.

Things get somewhat more interesting on the tab that comes next (Events Selection), as shown in Figure 21.9.

Figure 21.9

I've chosen the "blank" template here, and have scrolled down to the Performance area and expanded it. This tab is all about what events you are going to track, and, as you can see, there's quite a range. If, for example, you chose the Tuning trace template, then the initial setup is one that tracks what's needed for the Database Engine Tuning Advisor plus a bit more. In addition, you use the table to select what information you want collected for each class of event.

The temptation here is just to select everything under the sun, so you'll be sure to have all the information. There are a couple of reasons not to do this. First, it means that a lot of additional data has to come back down the pipe from your server. Remember that SQL Server Profiler has to place some audits in the system, and this means that your system has an additional burden placed on it whenever the Profiler is running. The bigger the trace, the bigger the burden. Second, it often means lower productivity for you, since you have to wade through a huge morass of data—much of which you probably won't need.
I want to point out a couple of key fields here before we move on:

 * TextData: This is the actual text of the statement that the Profiler happens to have added to the trace at that moment in time.
 * Application Name: Another of those highly underutilized features. The application name is something you can set when you create the connection from the client. If you're using ADO.NET or some other data object model and underlying connection method, you can pass the application name as a parameter in your connection string. It can be quite handy for your DBAs when they are trying to troubleshoot problems in the system.
 * NT User Name: This one is what it sounds like. What's great about this is that it can provide a level of accountability.
 * Login Name: Same as NT User Name, only used when operating under SQL Server Security rather than Windows Security.
 * CPU: The actual CPU cycles used.
 * Duration: How long the query ran. This includes time spent waiting for locks and such—time during which the CPU may not have been doing anything for the query, so the CPU column doesn't reflect that load.
 * SPID (SQL Process ID): This one can be nice if your trace reveals something where you want to kill a process. This is the number you would use with your KILL statement.

Moving right along, let's take a look at what I consider to be one of the most important options—Column Filters.

This is the one that makes sure that, on a production or load-test server, you don't get buried in several thousand pages of garbage just by opening a trace up for a few minutes.

With Column Filters, you can select from a number of different options to use to filter out data and limit the size of your result set. By default, Profiler automatically sets up to exclude its own activity in order to try to reduce the Profiler's impact on the end numbers. For the example in Figure 21.10, I'm adding in a Duration value where I've set the minimum to 3,000 milliseconds with no maximum.

Odds are that, if you run this with a query against the Sales.SalesOrderHeader table, you're not going to see it appear in the trace. Why is that? Because that query will probably run very fast and not meet the criteria for being included in our trace—this is an example of how you might set up a trace to capture the query text and username of someone who has been running very long-running queries on the system. Now try running something a little longer—such as a query that joins many large tables. There's a good chance that you'll now exceed the duration threshold, and your query will show up in the Profiler (if not, then try adjusting down the duration expectation that you set in Profiler).

Figure 21.10

I can't say enough about how important this tool is in solving performance and other problems. There have been too many times to count in which I've thought that my sproc was running down one logic path only to find that a totally different branch was being executed. How did I originally find out? I watched it execute in Profiler.
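Incidentally, once a client does pass an application name in its connection string, it's visible from T-SQL as well as in Profiler's Application Name column. A quick sketch (the "InventoryService" name is, of course, hypothetical):

-- What the current connection reported as its application name
SELECT APP_NAME();

-- Find all sessions for a given application
SELECT session_id, login_name, status
FROM sys.dm_exec_sessions
WHERE program_name = 'InventoryService';  -- hypothetical app name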
The Performance Monitor (PerfMon)

When you install SQL Server on Windows, SQL Server adds several counters to the Reliability and Performance Monitor (sometimes called PerfMon because of the executable's filename—perfmon.msc). This can be an excellent tool for finding where problems are happening and even determining the nature of some problems.

Prior to Windows Vista and Windows Server 2008, the Reliability and Performance Monitor was known simply as Performance Monitor.

While many of the relevant counters are now in the Activity Monitor within Management Studio, the Reliability and Performance Monitor can be accessed through the Administrative Tools menu in Windows. SQL Server has a number of different Performance Objects, and, within each of these, you will find a series of counters related to that object. Historically, some of the important ones have included:

 * SQLServer Buffer Manager: Buffer Cache Hit Ratio: This is the proportion of pages that were read from the buffer cache rather than via a physical read from disk. The thing to watch out for here is that this number can be thrown off depending on how effective the read-ahead mechanism was—anything that the read-ahead mechanism got to and put in cache before the query actually needed it is counted as a buffer-cache hit, even though there really was a physical read related to the query. Still, this one is going to give you a decent idea of how efficient your memory usage is. You want to see really high numbers here (in the 90+ percent range) for maximum performance. Generally speaking, a low buffer cache hit ratio is indicative of needing more memory.
 * SQLServer General Statistics: User Connections: Pretty much as it sounds, this is the number of user connections currently active in the system.
 * SQLServer Memory Manager: Total Server Memory: The total amount of dynamic memory that the SQL Server is currently using. As you might expect, when this number is high relative to the amount of memory available in your system (remember to leave some for the OS!), you need to seriously consider adding more RAM.
 * SQLServer SQL Statistics: SQL Compilations/sec: This tells you how often SQL Server needs to compile things (sprocs, triggers). Keep in mind that this number will also include recompiles (due to changes in index statistics or because a recompile was explicitly requested). When your server is first getting started, this number may spike for a bit, but it should become stable after your server has been running for a while at a constant set and rate of activities.
 * SQLServer Buffer Manager: Page Reads/sec: The number of physical reads from disk for your server. You'd like to see a relatively low number here. Unfortunately, because the requirements and activities of each system are different, I can't give you a benchmark to work from here.
 * SQLServer Buffer Manager: Page Writes/sec: The number of physical writes performed to disk for your server. Again, you'd like a low number here.

If you want to add or change any of these, just click the plus (+) sign up on the toolbar. You'll be presented with a dialog, as shown in Figure 21.11, that lets you choose between all the different objects and counters available on your system (not just those related to SQL Server).

Figure 21.11

The big thing here is to realize that you can mix and match a wide variety of counters to be able to reach a better understanding of what's going on with your server and make the appropriate adjustments. Much of the time, this kind of task is going to have more to do with the DBA than the developer, but many of these stats can be helpful to you when you are doing load testing for your application.
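If you'd rather grab a counter from code than from the PerfMon UI, SQL Server also exposes its own counters through the sys.dm_os_performance_counters DMV. A quick sketch for the buffer cache hit ratio (ratio counters have to be computed against their matching "base" counter, so the raw cntr_value alone isn't meaningful):

-- Buffer cache hit ratio = ratio counter / its "base" counter
SELECT CAST(v.cntr_value AS float) / NULLIF(b.cntr_value, 0) * 100
           AS buffer_cache_hit_ratio
FROM sys.dm_os_performance_counters AS v
JOIN sys.dm_os_performance_counters AS b
    ON v.object_name = b.object_name
WHERE v.counter_name = 'Buffer cache hit ratio'
  AND b.counter_name = 'Buffer cache hit ratio base';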
Summary

Performance could be, and should be, in a book by itself. There's simply too much to cover and get acquainted with to do it all in one or even several chapters. The way I've tried to address this is by pointing out performance issues throughout the book, so you could take them on a piece at a time.

The biggest thing is to have a plan—a performance plan. Make performance an issue from the first stages of your project. Set benchmarks early on, and continually measure your system against those benchmarks to know where you are improving and what problems you might need to address.

In this chapter, we've reviewed a number of the performance considerations touched on throughout the book, plus added several new tools and ideas to consider.

In the next chapter, we'll be taking a look at administration issues. As you've seen through some portions of this chapter, proper administration can also be a key ingredient to performance.

22

Administration

So, at this point we've covered all of the core database topics and then some. We still have a chapter or two to clean up the edges around our development effort, but we've mostly covered everything—heh, NOT!!! For the developer, we like to think our job is done, but for the application we're building, it's just beginning. And so, it's time to talk a bit about maintenance and administration of the databases you develop.

If there is anything I hope to instill in you in your database development efforts, it's to avoid the "hey, I just build 'em—now it's your problem" attitude that is all too common in the world of database-driven applications. Far too many developers are guilty of building code that is relatively bug-free and calling it good. Well, just because it runs doesn't mean your end user is going to be successful with your software over the long haul. It is, therefore, important for you to look at how your system is going to be used and what will be necessary to keep it functioning properly.

In this chapter, we're going to take a look at some of the tasks that are necessary to make sure that your end users can not only recover from problems and disasters but also perform some basic maintenance that will help things keep running smoothly.

Among the things we'll touch on are:

 * Scheduling jobs
 * Backing up and recovering
 * Basic defragmenting and index rebuilding
 * Setting alerts
 * Archiving
 * Using PowerShell
 * Considering Policy-Based Management

While these are far from the only administration tasks available, they do represent something of "the minimum" you should expect to address in the deployment plans for your app. We'll also take a further look at monitoring (several items in that area were discussed as part of the performance tuning coverage in the preceding chapter) through the use of the Policy-Based Management framework that was added with SQL Server 2008.

This is one of those chapters where I feel that overlap with some of the coverage in my Beginning title is an unfortunate necessity. The reality is that most developers I know—even relative experts in SQL Server—know precious little about job scheduling, index fragmentation, or even backup and recovery. Be careful, however, about assuming that you've seen everything this chapter has to offer just because you may have read the Beginning title. I've added more advanced coverage of several of these topics, and I also include code-driven handling of many administrative tasks.

Scheduling Jobs

Many of the tasks that we'll go over in the remainder of the chapter can be scheduled.
Scheduling jobs allows you to run tasks that place a load on the system at off-peak hours. It also ensures that you don't forget to take care of things. From index rebuilds to backups, you'll hear horror stories over and over about shops that "forgot" to do that, or thought they had set up a scheduled job but never checked on it.

If your background is in Windows Server, and you have scheduled other jobs using the Windows Scheduler service, you could utilize that scheduling engine to support SQL Server. Doing things all in the Windows Scheduler allows you to have everything in one place, but SQL Server's own scheduler has some more robust branching options.

There are basically two terms to think about: jobs and tasks.

 * Tasks: These are single processes that are to be executed, or batches of commands that are to be run. Tasks are not independent—they exist only as members of jobs.
 * Jobs: These are a grouping of one or more tasks that should be run together. You can, however, set up dependencies and branching depending on the success or failure of individual tasks (for example, task A runs if the previous task succeeds, but task B runs if the previous task fails).

Jobs can be scheduled based on:

 * A daily, weekly, or monthly basis
 * A specific time of the day
 * A specific frequency (say, every 10 minutes, or every hour)
 * When the CPU becomes idle for a period of time
 * When the SQL Server Agent starts
 * In response to an alert

Tasks are run by virtue of being part of a job and based on the branching rules you define for your job. Just because a job runs doesn't mean that all the tasks that are part of that job will run. Some may be executed and others not, depending on the success or failure of previous tasks in the job and what branching rules you have established. SQL Server not only allows one task to automatically fire when another finishes, but it also allows for doing something entirely different (such as running some sort of recovery task) if the current task fails.

In addition to branching, you can, depending on what happens, also tell SQL Server to:

 * Provide notification of the success or failure of a job to an operator. You're allowed to send a separate notification—a network message (which pops up on the user's screen as long as they're logged in), a page, and an e-mail—to one operator each.
 * Write the information to the event log.
 * Automatically delete the job (to prevent executing it later and generally "clean up").

Let's take a quick look at how to create operators in Management Studio, and then we'll move on to creating the other objects needed to get jobs scheduled.

Creating an Operator

If you're going to make use of the notification features of the SQL Server Agent, then you must have an operator set up to define the specifics for who is notified. This side of things—the creation of operators—isn't typically done through any kind of automated process or as part of the developed code. These are usually created manually by the DBA. We'll go ahead and take a rather brief look at creating operators here just to understand how it works in relation to the scheduling of tasks.

Creating an Operator Using Management Studio

To create an operator using Management Studio, you need to navigate to the SQL Server Agent node of the server for which you're creating the operator. Expand the SQL Server Agent node, right-click the Operators member, and choose New Operator.
Be aware that, depending on your particular installation, the SQL Server Agent service may not be set to start automatically. If you run into any issues, or if you notice that the SQL Server Agent icon in Management Studio has a little red square in it, then the service is probably set to manual or even disabled—you will probably want to change the service to start automatically. Regardless, make sure that it is running for the examples found in this chapter. You can do this by right-clicking the Agent node and selecting Start.

You should be presented with the dialog box shown in Figure 22.1 (mine is partially filled in).

Figure 22.1

You can then fill out a schedule for the times this operator is to receive e-mail notifications for certain kinds of errors that we'll see on the Notifications tab.

Speaking of that Notifications tab, go ahead and click over to it. It should appear as in Figure 22.2.

Figure 22.2

Until you have more alerts in your system (we'll get to those later in this chapter), this page may not make a lot of sense. What it is about is setting up which notifications you want this operator to receive depending on what defined alerts get triggered. Again, it's hard to understand this concept before we've gotten to alerts, but suffice to say that alerts are triggered when certain things happen in your database, and this page defines which alerts this particular operator receives.

Creating an Operator Using T-SQL

If you do decide to create operators programmatically, you can make use of the sp_add_operator sproc found in msdb.

Note that sp_add_operator and most other SQL Server Agent–related stored procedures are managed through the msdb database rather than being true system stored procedures. As such, you need to either have msdb current when you call them or use three-part naming.

After seeing all the different things you need to choose in Management Studio, it probably won't surprise you to find out that this sproc has a ton of different parameters. Fortunately, a number of them are optional, so you need to supply them only if you're going to make use of them. The syntax looks like this:

sp_add_operator [@name =] '<operator name>'
    [, [@enabled =] <0 for no, 1 for yes>]
    [, [@email_address =] '<email address>']
    [, [@pager_address =] '<pager address>']
    [, [@weekday_pager_start_time =] <HHMMSS time>]
    [, [@weekday_pager_end_time =] <HHMMSS time>]
    [, [@saturday_pager_start_time =] <HHMMSS time>]
    [, [@saturday_pager_end_time =] <HHMMSS time>]
    [, [@sunday_pager_start_time =] <HHMMSS time>]
    [, [@sunday_pager_end_time =] <HHMMSS time>]
    [, [@pager_days =] <days bit mask>]
    [, [@netsend_address =] '<netsend address>']
    [, [@category_name =] '<category name>']

Most of the parameters in this sproc are self-explanatory, but there are a few we need to take a closer look at:

 * @enabled: This is a Boolean value and works just the way you would typically use a bit flag—0 means disable this operator and 1 means enable the operator.
 * @email_address: This one is just a little tricky. In order to use e-mail with your SQL Server, you need to configure Database Mail to be operational using a specific mail server. This parameter assumes that whatever value you supply is an alias on that mail server. If you are providing the more classic e-mail address type (somebody@SomeDomain.com), then you need to enclose it in square brackets—like [somebody@SomeDomain.com]. Note that the entire address—including the brackets—must still be enclosed in quotation marks.
 * @pager_days: This is a number that indicates the days that the operator is available for pages.
This is probably the toughest of all the parameters. It uses a single-byte bit-flag approach similar to what we saw with the @@OPTIONS global variable (described in the system functions appendix at the back of the book). You simply add together the values for all the days that you want to set as active days for this operator. The options are:

Day of Week | Value
---|---
Sunday | 1
Monday | 2
Tuesday | 4
Wednesday | 8
Thursday | 16
Friday | 32
Saturday | 64

Okay, so let's go ahead and create our operator using sp_add_operator. We'll keep our use of parameters down, since many of them are redundant:

USE msdb;

DECLARE @PageDays int;
SELECT @PageDays = 2 + 8 + 32; -- Monday, Wednesday, and Friday

EXEC sp_add_operator @name = 'TSQLOperator',
    @enabled = 1,
    @pager_address = 'YourEmail@YourDomain.com',
    @weekday_pager_start_time = 080000,
    @weekday_pager_end_time = 170000,
    @pager_days = @PageDays;

If you go back into Management Studio and refresh your Operators list, you should see your new operator there.

There are three other sprocs (plus one to retrieve information) that you need to make use of in order to have power over your operator from T-SQL (a quick sketch of them in action follows this list):

 * sp_help_operator: Provides information on the current settings for the operator.
 * sp_update_operator: Accepts all the same information as sp_add_operator; the new information completely replaces the old information.
 * sp_delete_operator: Removes the specified operator from the system.
 * sp_add_notification: Accepts an alert name, an operator name, and a method of notification (e-mail, pager, netsend). Adds a notification such that, if the alert is triggered, then the specified operator will be notified via the specified method.
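Here's a minimal sketch against the operator we just created. Given the complete-replacement behavior noted for sp_update_operator, we re-supply the settings we still want to keep:

USE msdb;

-- Check what we just created
EXEC sp_help_operator @operator_name = 'TSQLOperator';

-- Shift the weekday pager window to start an hour earlier
EXEC sp_update_operator @name = 'TSQLOperator',
    @enabled = 1,
    @pager_address = 'YourEmail@YourDomain.com',
    @weekday_pager_start_time = 070000,
    @weekday_pager_end_time = 170000;

-- And, when we're done with it:
-- EXEC sp_delete_operator @name = 'TSQLOperator';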
Now that you've seen how to create operators, let's take a look at creating actual jobs and tasks.

Creating Jobs and Tasks

As I mentioned earlier, jobs are a collection of one or more tasks. A task is a logical unit of work, such as backing up one database or running a T-SQL script to meet a specific need, such as rebuilding all your indexes.

Even though a job can contain several tasks, this is no guarantee that every task in a job will run. They will either run or not run depending on the success or failure of other tasks in the job and what you've defined as the response for each case of success or failure. For example, you might cancel the remainder of the job if one of the tasks fails.

Like operators, jobs can be created in Management Studio as well as programmatically.

Creating Jobs and Tasks Using Management Studio

The SQL Server Management Studio makes it very easy to create scheduled jobs. Just navigate to the SQL Server Agent node of your server. Then right-click the Jobs member and select New Job. You should get a multinode dialog box, shown in Figure 22.3, that will help you build the job one step at a time.

Figure 22.3

The name can be whatever you like as long as it adheres to the SQL Server rules for naming, as discussed early in this book.

Most of the rest of the information is, again, self-explanatory with the exception of Category—which is just one way of grouping jobs together. Many of the jobs that are specific to your application are going to be Uncategorized, although you will probably on occasion run into instances where you want to create Web Assistant, Database Maintenance, Full Text, or Replication jobs. Those each go into their own category for easy identification.

We can then move on to Steps, as shown in Figure 22.4. This is the place where we tell SQL Server to start creating our new tasks that will be part of this job.

Figure 22.4

To add a new step to our job, we just click the New button and fill in the new dialog box, shown in Figure 22.5. We'll use a T-SQL statement to raise a bogus error just so we can see that things are really happening when we schedule this job. Note, however, that there is an Open button to the left of the command box—you can use this to import SQL scripts that you have saved in files.

Figure 22.5

Let's go ahead and move on to the Advanced tab for this dialog, shown in Figure 22.6—it's here that we really start to see some of the cool functionality that our job scheduler offers.

Notice several things in this dialog:

 * You can automatically set the job to retry at a specific interval if the task fails.
 * You can choose what to do if the job succeeds or fails. For each result (success or failure), you can:
   * Quit the job reporting success
   * Quit the job reporting failure
   * Go to the next step
 * You can output results to a file. (This is very nice for auditing.)
 * You can impersonate another user (for rights purposes). Note that you have to have the rights for that user. Because we're logged in as a sysadmin, we can run the job as the dbo or just about anyone. The average user would probably only have, at most, the guest account available (unless they were the database owner)—but, hey, in most cases a general user shouldn't be scheduling his or her own jobs this way anyway (let your client application provide that functionality).

Figure 22.6

Okay, so there's little chance that our RAISERROR statement is going to fail, so we'll just take the default of "Quit the job reporting failure" on this one (we'll see other possibilities later in the chapter when we come to backups).

That moves us back to the main New Job dialog, and we're now ready to move on to the Schedules node, shown in Figure 22.7.

Figure 22.7

In this dialog, we can manage one or more scheduled times for this job to run. To actually create a new scheduled time for the job to run, we need to click the New button. That brings up yet another dialog, shown in Figure 22.8.

Figure 22.8

I've largely filled this one out already (lest you get buried in a sea of screenshots), but it is from this dialog that we create a new schedule for this job. Recurrence and frequency are set here.

The frequency side of things can be a bit confusing because of the funny way that they've worded things. If you want something to run multiple times every day, then you need to set the job to Occur Daily—every 1 day. This seems like it would run only once a day, but then you also have the option of setting whether it runs once or on an interval. In our case, we want to set our job to run every 5 minutes.

Now we're ready to move on to the next node of our job properties—Alerts, shown in Figure 22.9.

Figure 22.9

From here, we can select which alerts we want to make depending on what happens. Choose Add and we get yet another rich dialog, shown in Figure 22.10.

Figure 22.10

Our first node—General—is going to let us fill out some of the basics. We can, for example, limit this notification to one particular database. We also define just how severe the condition needs to be before the alert will fire (in terms of severity of the error).

From there, it is on to the Response node (see Figure 22.11).
Figure 22.11

Notice that I was able to choose either of the operators that we created earlier in the chapter. (I've just stuck with the one we created using Management Studio.) It is through the definitions of these operators that the SQL Server Agent knows what e-mail address or netsend address to send the notification to. Also notice that we have control, on the right-hand side, over how our operator is notified.

Last, but not least, we have the Options node (see Figure 22.12), to complete the creation of our new alert.

Figure 22.12

With the new alert created, we can go back to the Notifications node of the main New Job dialog (see Figure 22.13).

Figure 22.13

This window lets you bypass the older alerts model and define a response that is specific to this one job—we'll just stick with what we already have for now, but you could define specific additional notifications in this dialog.

At this point, you are ready to say OK and exit the dialog. You'll need to wait a few minutes before the task will fire, but you should start to see log entries appear every five minutes in the Windows event log. You can look at this by navigating to the Event Viewer in the Computer Management utility for your system (where to find this varies a bit depending on what version and edition of Windows you are running). You'll need to switch the view to use the Application log (under Windows Logs).

Don't forget that, if you're going to be running scheduled tasks like this one, you need to have the SQL Server Agent running in order for them to be executed. You can check the status of the SQL Server Agent by running the SQL Server Configuration Manager and selecting the SQL Server Agent service, or by navigating to the SQL Server Agent node of the Object Explorer in Management Studio.

Also, don't forget to disable this job (right-click the job in Management Studio after you've seen that it's working the way you expect). Otherwise, it will just continue to sit there and create entries in your Application log. Eventually, the Application log will fill up, and you can have problems with your system.

Creating Jobs and Tasks Using T-SQL

Before we get started, I want to point out that using T-SQL for this kind of stuff (creating scheduled jobs and tasks) is not usually the way things are done on a day-to-day basis. Most jobs wind up being scheduled by the DBA based on a specific need and a specific schedule that is required. If you're not in a situation where you need to script the installation of tasks, then you may want to just skip this section (it's a lot to learn if you aren't going to use it!). That being said, there can be times when your end users won't have a DBA handy (small shops, for example, often don't have anything even remotely resembling a DBA), so you'll want to script some jobs to help out unsophisticated users.

Automating the creation of certain jobs is very frequently overlooked in installation procedures—particularly for shrink-wrap software. If you're working in some form of consulting or private IS shop environment, then there's a good chance that you are going to need to take care of scheduling all the needed tasks when you do the install. With shrink-wrap software, however, you often aren't at all in control of the installation process—indeed, you may be hundreds or thousands of miles away from the install and may not even know that it's happening.

How then do you make sure that basic tasks (like backups, for example) get done?
You can make it part of your installation process.

Jobs can be added to SQL Server using T-SQL by using three different stored procedures:

 * sp_add_job: This creates the actual job.
 * sp_add_jobstep: This creates a task within the job.
 * sp_add_jobschedule: This determines when the job will run.

Each of these builds a piece of the overall execution of the scheduled task, much as the different tabs in Management Studio did. The next sections take a look at each individually.

All jobs and tasks are stored in the msdb database. As such, you'll need to make sure that msdb is the current database (utilizing the USE command) when calling any of these sprocs.

sp_add_job

This one creates the top level of the hierarchy and establishes who owns the job and how notifications should be handled. There are quite a few parameters, but most of them are fairly easy to figure out:

sp_add_job [@job_name =] '<job name>'
    [,[@enabled =] <0 for no, 1 for yes>]
    [,[@description =] '<description>']
    [,[@start_step_id =] <step id>]
    [,[@category_name =] '<category name>']
    [,[@category_id =] <category id>]
    [,[@owner_login_name =] '<login name>']
    [,[@notify_level_eventlog =] <notify level>]
    [,[@notify_level_email =] <notify level>]
    [,[@notify_level_netsend =] <notify level>]
    [,[@notify_level_page =] <notify level>]
    [,[@notify_email_operator_name =] '<operator name>']
    [,[@notify_netsend_operator_name =] '<operator name>']
    [,[@notify_page_operator_name =] '<operator name>']
    [,[@delete_level =] <delete level>]
    [,[@job_id =] <job id variable> OUTPUT]

Again, most of the parameters here are self-explanatory, but let's again touch on some of the stickier ones.

 * @start_step_id: This one is going to default to 1, and that's almost always going to be the place to leave it. We'll be adding steps shortly, but those steps will have identifiers to them, and this just lets the SQL Server Agent know where to begin the job.
 * @category_name: This one equates directly to the category we saw in Management Studio. It will often be none (in which case, see @category_id) but could be Database Maintenance (another common choice), Full Text, Web Assistant, Replication, or a category that you add yourself using sp_add_category.
 * @category_id: This is just a way of providing a category without being dependent on a particular language. If you don't want to assign any particular category, then I recommend using this option instead of the name and supplying a value of either 0 (Uncategorized, but runs local) or 1 (Uncategorized Multi-Server).
 * @notify_level_eventlog (and the other notify levels): For each type of notification, this determines under what condition the notification occurs. To use this sproc, though, we need to supply some constant values to indicate when we want the notification to happen. The constants are:

Constant Value | When the Notification Occurs
---|---
0 | Never
1 | When the task succeeds
2 | When the task fails (this is the default)
3 | Every time the task runs

 * @job_id: This is just a way of finding out what job ID was assigned to your newly created job. You'll need this value when you go to create job steps and the job schedule(s). The big things on this one are:
   * Remember to receive the value into a variable so you can reuse it.
   * The variable needs to be of type uniqueidentifier rather than the types you might be more familiar with at this point.

Note that all the non-level "notify" parameters are expecting an operator name. You should create your operators before running this sproc.

So, let's create a job to test this process out.
What we're going to do here is create a job that's nearly identical to the job we created in Management Studio.

First, we need to create our top-level job. All we're going to do for notifications is send a message to the Windows event log every time the job runs. If you have Database Mail set up, then feel free to add in notification parameters for your operator.

USE msdb;

DECLARE @JobID uniqueidentifier;

EXEC sp_add_job
    @job_name = 'TSQLCreatedTestJob',
    @enabled = 1,
    @notify_level_eventlog = 3,
    @job_id = @JobID OUTPUT;

SELECT 'JobID is ' + CONVERT(varchar(128), @JobID);

Now, execute this, and you should wind up with something like this:

---------------------------------------------------------------------
JobID is 83369994-6C5B-45FA-A702-3511214A2F8A

(1 row(s) affected)

Note that your particular GUID will be different from the one I got here. (Remember that GUIDs are effectively guaranteed to be unique across time and space.) You can either use this value or the job name to refer to the job later. (I happen to find the name a lot easier, but it can create problems when dealing with multiple servers.)

sp_add_jobserver

This is a quick-and-dirty one. We've now got ourselves a job, but we don't have anything assigned for it to run against. You see, you can create a job on one server but still run it against a completely different server if you choose.

In order to target a particular server, we'll use a sproc (in msdb still) called sp_add_jobserver. The syntax is the easiest by far of any we'll be looking at in this section, and looks like this:

sp_add_jobserver [@job_id =] <job id> | [@job_name =] '<job name>',
    [@server_name =] '<server name>'

Note that you supply either the job ID or the job name—not both.

So, to assign a target server for our job, we need to run a quick command:

USE msdb;

EXEC sp_add_jobserver
    @job_name = 'TSQLCreatedTestJob',
    @server_name = '(local)';

Note that this will just point at the local server, regardless of what that server is named. We could also have put in the name of another valid SQL Server to be targeted.

sp_add_jobstep

The second step in the process is to tell the job specifically what it is going to do. At the moment, all we have in our example is the shell. The job doesn't have any tasks to perform, and that makes it a very useless job indeed. There is a flip side to this, though—a step can't even be created without some job to assign it to.

The next step, then, is to run sp_add_jobstep. This is essentially adding a task to the job. If we had multiple steps we wanted the job to perform, then we would run this particular sproc several times.

The syntax looks like this:

sp_add_jobstep [@job_id =] <job id> | [@job_name =] '<job name>'
    [,[@step_id =] <step id>]
    [,[@step_name =] '<step name>']
    [,[@subsystem =] '<subsystem>']
    [,[@command =] '<command>']
    [,[@additional_parameters =] '<parameters>']
    [,[@cmdexec_success_code =] <code>]
    [,[@on_success_action =] <action>]
    [,[@on_success_step_id =] <step id>]
    [,[@on_fail_action =] <action>]
    [,[@on_fail_step_id =] <step id>]
    [,[@server =] '<server>']
    [,[@database_name =] '<database>']
    [,[@database_user_name =] '<user>']
    [,[@retry_attempts =] <attempts>]
    [,[@retry_interval =] <minutes>]
    [,[@os_run_priority =] <priority>]
    [,[@output_file_name =] '<file name>']
    [,[@flags =] <flags>]

Not as many of the parameters are self-explanatory here, so let's look at the more confusing ones in the list:

 * @job_id vs. @job_name: This is actually a rather odd sproc in the sense that it expects you to enter one of the first two parameters, but not both.
You can either attach this step to a job by its GUID (as you saved from the last sproc run) or by the job name.
 * @step_id: All the steps in any job have an ID. SQL Server assigns these IDs automatically as you insert the steps. So why, if it does this automatically, do we have a parameter for it? That's in case we want to insert a step in the middle of a job. If there are already steps 1–5 in the job, and we insert a new step with a step ID of 3, then our new step will be assigned to position number 3. The previous step 3 will be moved to position 4, with each succeeding step incremented by 1 to make room.
 * @step_name: Is what it says—the name of that particular task. Just be aware that there is no default here. You must provide a step name.
 * @subsystem: This ties in very closely to job categories and determines which subsystem within SQL Server (such as the replication engine, the command line—the command prompt—or Integration Services) is responsible for executing the script. The default is that you're running a set of T-SQL statements. The possible subsystems are:

SubSystem | Description
---|---
ACTIVESCRIPTING | The scripting engine (VB Script). Note that this one is considered deprecated, and Microsoft will remove it from the product at some point.
ANALYSISQUERY | Analysis Services query (MDX, DMX).
ANALYSISCOMMAND | Analysis Services command (XMLA).
CMDEXEC | Gives you the capability to execute compiled programs or batch files from a command (DOS) prompt.
DISTRIBUTION | The Replication Distribution Agent.
'Dts' | Integration Services package execution.
LOGREADER | Replication Log Reader Agent.
MERGE | The Replication Merge Agent.
'PowerShell' | PowerShell script.
'QueueReader' | Replication Queue Reader Agent job.
SNAPSHOT | The Replication Snapshot Agent.
TSQL | A T-SQL batch. This is the default.

 * @command: This is the actual command you're issuing to a specific subsystem. In our example, this is going to be the RAISERROR command, just as we issued when using Management Studio, but it could be almost any T-SQL command. What's cool here is that there are some system-supplied values you can use in your commands. You place these in the middle of your scripts as needed, and they are replaced at runtime (we'll make use of this in our example). The possible system-supplied values are:

Tag | Description
---|---
A-DBN | Substitutes in the database name.
A-SVR | Substitutes the server name in the place of the tag.
A-ERR | Error number.
A-SEV | Error severity.
A-MSG | The message text from the error.
DATE | Supplies the current date (in YYYYMMDD format).
INST | Provides the name of the current instance of SQL Server (it's blank if it is the default instance).
JOBID | Supplies the current job ID.
MACH | The current computer name.
MSSA | Master SQL Server Agent name.
OSCMD | The program that runs CmdExec steps.
SQLDIR | The directory in which SQL Server is installed (usually C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL).
STEPCT | A count of the number of times this step has executed (excluding retries). You could use this one to keep count of the number of executions and force the termination of a multistep loop.
STEPID | Step ID.
SVR | The name of the computer the job is running on, including the SQL Server instance name if applicable.
TIME | The current time in HHMMSS format.
STRTTM | The start time for the job in HHMMSS format.
STRTDT | The start date for the job in YYYYMMDD format.
Note that all of these tokens must be wrapped in parentheses. This is somewhat different from what was required through SQL Server 2005 RTM (which, like SQL Server 2000, required square brackets instead). Beginning with SQL Server 2005 SP1, parentheses replaced the earlier square-bracket requirement, and an escape sequence is also required (we'll look at that in a bit).

Beginning with SQL Server 2005 SP1, you must wrap any of the previous tokens used in the @command parameter in an escape clause. The escape functions include:

Escape Function | Description
---|---
$(ESCAPE_SQUOTE(token name)) | Replaces any single quotation mark with two single quotation marks in the token replacement string.
$(ESCAPE_DQUOTE(token name)) | Replaces any single instance of a double quotation mark with two double quotation marks in the token replacement string.
$(ESCAPE_RBRACKET(token name)) | Replaces any single instance of a right bracket in the token replacement string with two right brackets.
$(ESCAPE_NONE(token name)) | Provided solely for backward compatibility, this performs the token replacement without escaping any characters in the string.

 * @cmdexec_success_code: This is the value you expect to be returned by whatever command interpreter ran your job if the job ran successfully (applies only to the command prompt subsystem). The default is zero.
 * @on_success_action and @on_fail_action: This is where you say what to actually do at the success or failure of your step. Remember that, at the job level, we define what notifications we want to happen, but, at the step level, we can define how we want processing to continue (or end). For this parameter, you need to supply one of the following constant values:

Value | Description
---|---
1 | Quit with success. This is the default for successful task executions.
2 | Quit with failure. This is the default for failed tasks.
3 | Go to the next step.
4 | Go to a specific step as defined in on_success_step_id or on_fail_step_id.

 * @on_success_step_id and @on_fail_step_id: The step you want to run next if you've selected option 4 in the preceding table.
 * @server: The server the task is to be run against (you can run tasks on multiple target servers from a single master server).
 * @database_name: The database to be set as current when the task runs.
 * @retry_interval: This is set in minutes.
 * @os_run_priority: Ah, an undocumented feature. The default here is normal, but you can adjust how important Windows is going to think your CmdExec (command line) scheduled task is. The possible values are:

Value | Priority
---|---
−15 | Run at idle only
−1 thru −14 | Increasingly below normal
0 | Normal (this is the default)
1 thru 14 | Increasingly above normal
15 | Time critical

I just can't help but think of the old Lost in Space TV show here and the robot saying "DANGER Will Robinson—DANGER!" Don't take messing with these values lightly. If you're not familiar with the issues surrounding Windows thread priorities, I'd suggest staying as far away from this one as possible. Going with the higher values, in particular, can have a very detrimental impact on your system—including creating significant instabilities. When you say that this is the most important thing, remember that you are taking away some of the importance of things like operating system functions—not something that's smart to do. Stay clear of this unless you really know what you're doing.
* @flags: This one relates to the Output File parameter, and indicates whether to overwrite or append your output information to the existing file. The options are:

Value | Description
---|---
0 | No option specified (currently, this means your file will be overwritten every time).
2 | Append information to the existing file (if one exists).
4 | Explicitly overwrite the file.

Okay, now that we've looked at the parameters, let's add a step to the job we created a short time ago:

EXEC sp_add_jobstep
    @job_name = 'TSQLCreatedTestJob',
    @step_name = 'This Is The Step',
    @command = 'RAISERROR ("TSQL Task is Job ID $(ESCAPE_SQUOTE(JOBID)).",10,1) WITH LOG',
    @database_name = 'AdventureWorks2008',
    @retry_attempts = 3,
    @retry_interval = 5;

Note the requirement for the escape function. Without the escape function (in this case, any one of the four would have worked), the JOBID would not be treated as a substitution token, and would have been left as a literal string.

Technically speaking, our job should be runnable at this point. The reason I say "technically speaking" is that we haven't scheduled the job yet, so the only way to run it is to manually tell the job to run. Let's take care of the scheduling issue, and then we'll be done.

sp_add_jobschedule

This is the last piece of the puzzle. We need to tell our job when to run. To do this, we'll make use of sp_add_jobschedule, which, like all the other sprocs we've worked on in this section, can only be found in the msdb database. Note that we could call this sproc multiple times to create multiple schedules for our job. Keep in mind, though, that getting too many schedules in place can lead to a great deal of confusion, so schedule jobs wisely. (For example, don't schedule one job for every day of the week when you can schedule a single job to run daily.)

The syntax has some similarities to what we've already been working with, but adds some new pieces to the puzzle:

sp_add_jobschedule
    [@job_id =] <job id> | [@job_name =] '<job name>',
    [@name =] '<schedule name>'
    [,[@enabled =] <0 for no, 1 for yes>]
    [,[@freq_type =] <frequency type>]
    [,[@freq_interval =] <frequency interval>]
    [,[@freq_subday_type =] <frequency subday type>]
    [,[@freq_subday_interval =] <frequency subday interval>]
    [,[@freq_relative_interval =] <frequency relative interval>]
    [,[@freq_recurrence_factor =] <frequency recurrence factor>]
    [,[@active_start_date =] <date to begin running>]
    [,[@active_end_date =] <date to stop running>]
    [,[@active_start_time =] <time to begin running>]
    [,[@active_end_time =] <time to stop running>]

Again, let's look at some of these parameters:

* @freq_type: Defines the nature of the intervals that are set up in the following parameters. This is another of those parameters that uses bit flags (although you should only use one at a time). Some of the choices are clear, but some aren't until you get to @freq_interval (which is next). Your choices are:

Value | Frequency
---|---
1 | Once
4 | Daily
8 | Weekly
16 | Monthly (fixed day)
32 | Monthly (relative to @freq_interval)
64 | Run at start of SQL Server Agent
128 | Run when CPU is idle

* @freq_interval: Decides the exact days that the job is executed, but the nature of this value depends entirely on @freq_type (see the preceding point). This one can get kind of confusing; just keep in mind that it works with both @freq_type and @freq_relative_interval.
The interpretation works like this:

freq_type Value | Matching freq_interval Values
---|---
1 (once) | Not Used
4 (daily) | Runs every x days, where x is the value in the frequency interval
8 (weekly) | The frequency interval is one or more of the following (add the values together to combine days): 1 (Sunday), 2 (Monday), 4 (Tuesday), 8 (Wednesday), 16 (Thursday), 32 (Friday), 64 (Saturday)
16 (monthly - fixed) | Runs on the exact day of the month specified in the frequency interval
32 (monthly - relative) | Runs on exactly one of the following: 1 (Sunday), 2 (Monday), 3 (Tuesday), 4 (Wednesday), 5 (Thursday), 6 (Friday), 7 (Saturday), 8 (Specific Day), 9 (Every Weekday), 10 (Every Weekend Day)
64 (run at Agent startup) | Not Used
128 (run at CPU idle) | Not Used

* @freq_subday_type: Specifies the units for @freq_subday_interval. If you're running daily, then you can set a frequency to run within a given day. The possible values here are:

Value | Description
---|---
1 | At the specified time
4 | Every x minutes, where x is the value of the frequency sub-day interval
8 | Every x hours, where x is the value of the frequency sub-day interval

* @freq_subday_interval: This is the number of @freq_subday_type periods to occur between each execution of the job (x in the preceding table).
* @freq_relative_interval: This is used only if the frequency type is monthly (relative) (32). If this is the case, then this value determines in which week a specific day-of-week job is run, or flags things to be run on the last day of the month. The possible values are:

Value | Description
---|---
1 | First Week
2 | Second Week
4 | Third Week
8 | Fourth Week
16 | Last Week or Day

* @freq_recurrence_factor: How many weeks or months between executions. The exact treatment depends on the frequency type and is applicable only if the type was weekly or monthly (fixed or relative). This is an integer value; for example, if your frequency type is 8 (weekly) and the frequency recurrence factor is 3, then the job would run on the specified day of the week every third week.

The default for each of these parameters is 0.

Okay, so let's move on to getting that job scheduled to run every five minutes as we did when using Management Studio:

EXEC sp_add_jobschedule
    @job_name = 'TSQLCreatedTestJob',
    @name = 'Every 5 Minutes',
    @freq_type = 4,
    @freq_interval = 1,
    @freq_subday_type = 4,
    @freq_subday_interval = 5,
    @active_start_date = 20080731;

Now, if you go and take a look at the job in Management Studio, you'll find that you have a job that is (other than the name) identical to the job we created directly in Management Studio. Our job has been fully implemented using T-SQL this time.

Maintaining and Deleting Jobs and Tasks

Maintaining jobs in Management Studio is pretty simple. Just double-click the job and edit it just as if you were creating a new job. Deleting jobs and tasks in Management Studio is simpler still: Just highlight the job and press the Delete key. After one confirmation, your job is gone.

Checking out what you have, editing it, and deleting it are all slightly trickier in T-SQL. The good news, however, is that maintaining jobs, tasks, and schedules works pretty much as creating did, and deleting any of them is a snap.
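Incidentally, you don't have to wait for a schedule to fire to test a job. As I mentioned earlier, you can always tell a job to run manually, and the T-SQL way to do that is sp_start_job, yet another msdb sproc. A minimal sketch using the job we just built:

EXEC msdb.dbo.sp_start_job
    @job_name = 'TSQLCreatedTestJob';

Note that sp_start_job returns as soon as SQL Server Agent has been asked to start the job (it does not wait for the job to finish), so check the job's history if you want to verify the outcome.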
Editing and Deleting Jobs with T-SQL

To edit or delete each of the four sprocs' handiwork we just covered in T-SQL, you just use the corresponding update sproc (with one exception) or delete sproc. The parameters are the same as for each add sproc, and the information provided to an update sproc completely replaces that of the original add (or any prior updates):

If the Add Was | Then Update With | And Delete With
---|---|---
sp_add_job | sp_update_job | sp_delete_job
sp_add_jobserver | None (drop and add) | sp_delete_jobserver
sp_add_jobstep | sp_update_jobstep | sp_delete_jobstep
sp_add_jobschedule | sp_update_jobschedule | sp_delete_jobschedule

Backup and Recovery

No database-driven app should ever be deployed or sold to a customer without a mechanism for dealing with backup and recovery. As I've probably told people at least 1,000 times: You would truly be amazed at the percentage of database operations that I've gone into that do not have any kind of reliable backup. In a word: EEEeeeeeek!

There is one simple rule to follow regarding backups—do them early and often. The follow-up to this is to not just back up to a file on the same disk and forget it—you need to make sure that a copy moves to a completely separate place (ideally off-site) to be sure that it's safe. I've personally seen servers catch fire (the stench was terrible, as were all the freaked-out staff). You don't want to find out that your backups went up in the same smoke that your original data did.

For apps being done by the relative beginner, then, you're probably going to stick with referring the customer or on-site administrator to SQL Server's own backup and recovery tools, but, even if you do, you should be prepared to support them as they come up to speed in its use. In addition, there is no excuse for not understanding what it is the customer needs to do.

Creating a Backup—a.k.a. "A Dump"

Creating a backup file of a given database in Management Studio is actually pretty easy. Simply navigate in the Object Explorer to the database you're interested in, and right-click.

Now choose Tasks and Back Up, as shown in Figure 22.14.

Figure 22.14

And you'll get a dialog that lets you define pretty much all of the backup process, as in Figure 22.15.

Figure 22.15

The first setting here is pretty self-explanatory. Here you indicate which database you want to back up. From there, however, things get a bit trickier.

Getting into the items that may not yet make sense, first up is the Recovery Model. The Recovery Model field here is just notifying you of what the database you've selected for backup is set to; it is actually a database-level setting. We're going to defer discussion of what this is for a bit—we'll get to it in the next section when we talk about backing up transaction logs.

Now, those are the simple parts, but let's break down some of the rest of the options that are available.

Backup Type

First of the choices to be made is the Backup Type. Depending on the recovery model for your database (again, be patient with me, we'll get there on what this is!), you'll have either two or three types of backups available:

* Full: This is just what it sounds like—a full backup of your actual database file as it is as of the last transaction that was committed prior to you issuing the Backup command.
* Differential: This might be referred to as a "backup since" backup.
When you take a differential backup, it only writes out a copy of the extents (see Chapter 6 if you've forgotten!) that have changed since you did the last full backup. These typically run much faster than a Full backup and will take up less space. How much less? Well, that depends on how much your data actually changes. For very large databases where backups can take a very long time to run, it is very common to have a strategy where you take a full backup only once a week or even only once a month, and then take differential backups in between to save both space and time.
* Transaction Log: This is again just what it sounds like—a copy of the transaction log. This option will only show up if your database is set to the Full or Bulk-Logged recovery model (it is hidden if you are using the Simple recovery model). Again, a full discussion of what these are is coming up shortly.

A subtopic of the Backup Type is the Backup Component, which applies only to Full and Differential backups.

For purposes of this book, we should pretty much just be focused on backing up the whole database. That said, you'll notice another option titled "Files and Filegroups." Back in Chapter 1, we touched briefly on the idea of filegroups and individual files for data to be stored in. This option lets you select just one file or filegroup to participate in this backup. I highly recommend avoiding this option until you have graduated to the "expert" class of SQL Server user.

Again, I want to stress avoiding this particular option until you've got yourself something just short of a doctorate in SQL Server backups. These are special use—designed to help with very large database installations (figure terabytes) that are in high-availability scenarios. There are major consistency issues to be considered when taking and restoring from this style of backup, and they are not for the faint of heart.

Backup Set

A backup set is basically a single name used to refer to one or more destinations for your backup.

SQL Server allows for the idea that your backup may be particularly large or that you may otherwise have reason to back up across multiple devices—be it drives or tapes. When you do this, however, you need to have all of the devices you used as a destination available in order to recover from any of them—that is, they are a "set." The backup set essentially holds the definition of what destinations were involved in your particular backup. In addition, a backup set contains some property information for your backup. You can, for example, identify an expiration date for the backup. Creating a backup set is as easy as naming multiple file or tape destinations at the time you define your backup.

Destination

This is where your data is going to be backed up to. Here is where you define the potentially several destinations to be utilized for one backup set. For most installations this will be a file location, but it can also be any valid UNC path (which may wind up being something other than a disk—SQL Server doesn't care, as long as it's valid storage).

Options

In addition to those items we just covered from the General node of the dialog, you also have a node that lets you set other miscellaneous options. Most of these are fairly self-explanatory. Of particular note, however, is the Transaction Log area.

Schedule

With all this set up, wouldn't it be nice to set up a job to run this backup on a regular basis? Well, the Schedule button up at the top of the dialog is meant to facilitate your doing just that.
Click it, and it will bring up the Job Schedule dialog you saw earlier in the chapter. You can then define a regular schedule to run the backup you just defined.

Backing Up Using T-SQL

To back up the database or the log in T-SQL, we make use of the BACKUP command. The syntax for BACKUP works almost, but not quite, the same depending on whether you're backing up the database or the log. The syntax looks like this:

BACKUP DATABASE|LOG <database name>
    {WITH NO_LOG|TRUNCATE_ONLY}
    | TO {DISK|TAPE} = <'backup device'> [,...n]
    [MIRROR TO <'backup device'> [,...n]]
    [WITH
        [BLOCKSIZE = <block size>]
        [[,] CHECKSUM | NO_CHECKSUM]
        [[,] COMPRESSION | NO_COMPRESSION]
        [[,] STOP_ON_ERROR | CONTINUE_AFTER_ERROR]
        [[,] DESCRIPTION = <'text'>]
        [[,] DIFFERENTIAL]
        [[,] EXPIREDATE = <date> | RETAINDAYS = <days>]
        [[,] PASSWORD = <password>]
        [[,] FORMAT|NOFORMAT]
        [[,] INIT|NOINIT]
        [[,] MEDIADESCRIPTION = <'text'>]
        [[,] MEDIANAME = <media name>]
        [[,] MEDIAPASSWORD = <password>]
        [[,] NAME = <backup set name>]
        [[,] REWIND|NOREWIND]
        [[,] NOSKIP|SKIP]
        [[,] NOUNLOAD|UNLOAD]
        [[,] RESTART]
        [[,] STATS [= <percentage>]]
        [[,] COPY_ONLY]]

Let's look at some of the parameters:

* <backup device>: That's right; you can back up to more than one device. This creates what's called a media set. These can really speed up your backups if the media are spread over several disks, as it creates a parallel load situation, and you're not bound by the I/O limitations of any of the individual devices. However, beware—you must have the entire media set intact to restore from this kind of backup.

Also note that the TAPE option is only provided for backward compatibility—all backups should now appear to SQL Server as being to DISK (even if the actual device does happen to be a tape).

* BLOCKSIZE: This is automatically determined in a hard drive backup, but, for tape, you need to provide the correct block size. Contact your vendor for help on this one.
* COMPRESSION: This is what it sounds like: an indication of whether or not you want compression used in the backup. The default is no compression, but this can be changed at a server-wide level.
* DIFFERENTIAL: This is to perform a differential backup. A differential backup only backs up the data that has changed since your last full backup. Any log or other differential backup is ignored. Any row/column changed, added, or deleted since the last full backup is included in the new backup. Differential backups have the advantage of being much faster to create than a full backup and much faster to restore than applying each individual log when restoring.
* EXPIREDATE/RETAINDAYS: You can have your backup media expire after a certain time. Doing so lets SQL Server know when it can overwrite the older media.
* FORMAT/NOFORMAT: Determines whether or not the media header (required for tapes) should be rewritten. Be aware that formatting affects the entire device—this means that formatting for one backup on a device destroys all the other backups on that device as well.
* INIT/NOINIT: Overwrites the device data but leaves the header intact.
* MEDIADESCRIPTION and MEDIANAME: Just describes and names the media—maximum of 255 characters for a description and 128 for a name.
* SKIP/NOSKIP: Decides whether or not to pay attention to the expiration information from previous backups on the tape. If SKIP is active, then the expiration is ignored so the tape can be overwritten.
* UNLOAD/NOUNLOAD: Used for tape only. This determines whether to rewind and eject the tape (UNLOAD) or leave it in its current position (NOUNLOAD) after the backup is complete.
* RESTART: Picks up where a previously interrupted backup left off.
* STATS: Displays progress messages as the backup runs (by default, one for every 10 percent completed).
* COPY_ONLY: Creates a backup but does not affect any other backup sequence you have in any way. For example, logs and differential backups will continue as if the copy backup had never occurred.

Now let's try one out for a true backup (the TO DISK path should appear on one line, and you may need to change it to a different location depending on the specifics of your particular installation):

BACKUP DATABASE AdventureWorks2008
TO DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLDataBackup.bck'
WITH
    DESCRIPTION = 'My what a nice backup!',
    STATS;

We now have a backup of our AdventureWorks2008 database.

SQL Server is even nice enough to provide progress messages as it processes the backup:

10 percent processed.
20 percent processed.
30 percent processed.
40 percent processed.
50 percent processed.
60 percent processed.
70 percent processed.
80 percent processed.
90 percent processed.
Processed 25448 pages for database 'AdventureWorks2008', file 'AdventureWorks2008_Data' on file 1.
Processed 36 pages for database 'AdventureWorks2008', file 'FileStreamDocuments' on file 1.
Processed 1 pages for database 'AdventureWorks2008', file 'AdventureWorks2008_Log' on file 1.
100 percent processed.
BACKUP DATABASE successfully processed 25484 pages in 10.825 seconds (18.391 MB/sec).

It's that simple, so let's follow it up with a simple backup of the log (again, the TO DISK path belongs on one line):

BACKUP LOG AdventureWorks2008
TO DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLLogBackup.bck'
WITH
    DESCRIPTION = 'My what a nice backup of a log!',
    STATS;

It's worth noting that you can't back up a log while the database recovery model is set to Simple. To change to a different recovery model, right-click the AdventureWorks2008 database and select Properties and the Options tab—or, in T-SQL, use ALTER DATABASE with the SET RECOVERY option (ALTER DATABASE AdventureWorks2008 SET RECOVERY FULL;, for example). If you think about it, this restriction makes sense given that, under Simple, your log is always going to be essentially free of any committed transactions.

It's also worth noting that backups work just fine while there are users in your database. SQL Server is able to reconcile the changes that are being made by knowing the exact point in the log that the backup was begun, and using that as a reference point for the rest of the backup.
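While we're at it, the DIFFERENTIAL option we discussed a moment ago is just one more entry in the WITH list. A quick sketch building on the full backup we just took (the same path caveats apply):

BACKUP DATABASE AdventureWorks2008
TO DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLDataBackup.bck'
WITH
    DIFFERENTIAL,
    DESCRIPTION = 'Changes since the last full backup',
    STATS;

Since we took the default of NOINIT, this backup is appended to the same media set as our full backup rather than overwriting it.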
Recovery Models

Well, I spent most of the last section promising that we would discuss them, so it's time to ask: What is a recovery model?

Well, back in Chapter 11, we talked about the transaction log. In addition to keeping track of transactions to deal with transaction rollback and atomicity of data, transaction logs are also critical to being able to recover data right up to the point of system failure.

Imagine for a moment that you're running a bank. Let's say you've been taking deposits and withdrawals for the last six hours—the time since your last full backup was done. Now, if your system went down, I'm guessing you're not going to like the idea of going to last night's backup and losing all track of what money went out the door or came in during the interim. See where I'm going here? You really need every moment's worth of data.
Keeping the transaction log around gives us the ability to "roll forward" any transactions that happened since the last full or differential backup was done. Assuming both the data backup and the transaction logs are available, you should be able to recover right up to the point of failure.

The recovery model determines how long and what types of log records are kept. There are three options:

* Full: This is what it says. Everything is logged. Under this model, you should have no data loss in the event of system failure, assuming you had a backup of the data available and have all transaction logs since that backup. If you are missing a log or have one that is damaged, then you'll be able to recover all data up through the last intact log you have available. Keep in mind, however, that as "keeping everything" suggests, this can take up a fair amount of space in a system that receives a lot of changes or new data.
* Bulk-Logged: This is like "Full recovery light." Under this option, regular transactions are logged just as they are with the Full recovery method, but bulk operations are not. The result is that, in the event of system failure, a restored backup will contain any changes to data pages that did not participate in bulk operations (bulk import of data or index creation, for example), but any bulk operations must be redone. The good news on this one is that bulk operations perform much better. That performance comes with risk attached, so your mileage may vary....
* Simple: Under this model, the transaction log essentially exists to support transactions as they happen. The transaction log is regularly truncated, with any completed or rolled-back transactions essentially being removed from the log (not quite that simple, but that is the effect). This gives us a nice, tight log that is smaller and often performs a bit better, but the log is of zero use for recovery from system failure.

For most installations, Full recovery is going to be what you want to have for a production-level database—end of story.

Recovery

This is something of the reverse of the backup side of things. You've done your backups religiously, and now you want to restore one—either for recovery purposes or merely to make a copy of a database somewhere.

Once you have a backup of your database, it's fairly easy to restore it to the original location. Getting started works much as it did for backup: Navigate to the database you want to restore to, right-click, and select Tasks⇒Restore. Up comes your Restore dialog, as in Figure 22.16.

Figure 22.16

As long as what you're after is to take your old backup and slam it over the top of the database you made the backup of, this is pretty straightforward. Simply say OK, and it should restore for you without issue.

Restoring to a Different Location

Things get tricky when you want to change something about where you're restoring to. As part of the backup process, the backup knows the name of the database that was backed up, and, perhaps more important, it knows the path(s) to the physical files that it was supposed to be using.

Changing the destination database name is right there—no biggie—the problem is that changing the destination database name does nothing to change which physical files (the .MDF and .LDF files) the restore is going to try to write to. To deal with this, go to the Options node of the Restore dialog.

Again, most of the options here are self-explanatory, but, in particular, notice the "Restore As" column.
In this part of the dialog, you can replace every original file's destination, location, and name, which provides you with a way to deal with restoring multiple copies of a database to the same server (perhaps for test purposes) or installing your database on a new volume or even a new system.

Recovery Status

This one is merely about the state you want to have the database in when you are done with this restore. This has particular relevance when you are restoring a database and still have logs to apply to the database later.

If you go with the default option (which translates to using the WITH RECOVERY option if you were using T-SQL), then the database will immediately be in a full online status when the restore operation is complete. If, for example, you wanted to restore logs after your initial restore was done, you would want to select one of the two other options. Both of these prevent updates from happening to the database and leave it in a state where more recovery can be done. The difference is merely one of whether users are allowed to access the database in a "read-only" mode or whether the database should appear as still being offline.

The issue of availability is a larger one than you probably think it is. As big of a deal as I'm sure it already seems, it's really amazing how quickly users will find their way into your system when the restore operation suddenly marks the database as available. Quite often, even if you know that you will be "done" after the current restore is done, you'd like a chance to look over the database prior to actual users being in there. If this is the case, then be sure to use the NORECOVERY method of restoring. You can later run a restore that is purely there to apply the WITH RECOVERY option, and get the database fully back online once you're certain you have things just as you want them.

Restoring Data Using T-SQL

We use the RESTORE command to recover the data that we have in our backups. The basic syntax looks like this (there are a ton of variations on this, so, if you need every nuance, I'd suggest a book oriented toward administration, which will investigate backup and recovery as a chapter unto itself):

RESTORE DATABASE|LOG <database name>
    [FROM <backup device> [,...n]]
    [WITH
        [DBO_ONLY]
        [[,] FILE = <file number>]
        [[,] MEDIANAME = <media name>]
        [[,] MOVE '<logical file name>' TO '<operating system file name>'] [,...n]
        [[,] {NORECOVERY|RECOVERY|STANDBY = <undo file name>}]
        [[,] {NOUNLOAD|UNLOAD}]
        [[,] REPLACE]
        [[,] RESTART]
        [[,] STATS [= <percentage>]]
        [[,] {STOPAT = <date/time>
            | STOPATMARK = '<mark name>' [AFTER <date/time>]
            | STOPBEFOREMARK = '<mark name>' [AFTER <date/time>]}]]

Let's look at some of these options:

* DBO_ONLY: When the restore is done, the database will be set with the dbo use only database option turned on. This gives the dbo a chance to look around and test things out before allowing users back onto the system.

This is a biggie, and I very strongly recommend that you always use it. You would be amazed at how quickly users will be back on the system once it's back up for even a moment. When a system is down, you'll find users very impatient to get back to work. They'll constantly be trying to log in, and they won't bother to ask if it's okay or not. They'll assume that when it's up, it's okay to go into it.

* FILE: You can back up multiple times to the same media. This option lets you select a specific version to restore. If this one isn't supplied, SQL Server will assume that you want to restore from the most recent backup.
* MOVE: Allows you to restore the database to a different physical file than the one the database was using when it was originally backed up.
* NORECOVERY/RECOVERY/STANDBY: RECOVERY and NORECOVERY are mutually exclusive. STANDBY works in conjunction with NORECOVERY. They work as follows:

Option | Description
---|---
NORECOVERY | Restores the database but keeps it marked as offline. Uncommitted transactions are left intact. This allows you to continue with the recovery process—for example, if you still have additional logs to apply.
RECOVERY | As soon as the restore command is done successfully, the database is marked as active again. Data can again be changed. Any uncommitted transactions are rolled back. This is the default if none of the options are specified.
STANDBY | STANDBY allows you to create an undo file so that the effects of a recovery can be undone. STANDBY allows you to bring the database up for read-only access before you have issued a RECOVERY (which means at least part of your data's been restored, but you aren't considering the restoration process complete yet). This allows users to make use of the system in a read-only mode while you verify the restoration process.

* REPLACE: Overrides the safety feature that prevents you from restoring over the top of an existing database.
* RESTART: Tells SQL Server to continue a previously interrupted restoration process.

Let's go ahead and look at an example run of restoring the AdventureWorks2008 database. Do not run these statements unless you are absolutely certain that your backup was successful and is intact.

First, we drop the existing AdventureWorks2008 database:

USE master;

DROP DATABASE AdventureWorks2008;

Once that's done, we'll try to restore it using our RESTORE command (as with the backups, the FROM DISK path should appear on one line):

RESTORE DATABASE AdventureWorks2008
FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLDataBackup.bck'
WITH
    DBO_ONLY,
    NORECOVERY,
    STATS;

We restored with NORECOVERY because we want to add another piece to the puzzle. Our log will contain any transactions that happened between when our database or log was last backed up and when this log was backed up. "Apply" this log, and that should bring the database as up to date as we can make it:

RESTORE LOG AdventureWorks2008
FROM DISK = 'C:\Program Files\Microsoft SQL Server\MSSQL10.MSSQLSERVER\MSSQL\Backup\TSQLLogBackup.bck'
WITH
    DBO_ONLY,
    NORECOVERY,
    STATS;

Note that if we had several logs to apply from this one device, then we would have to name them as we wanted to apply them. They would also need to be applied in the order in which they were backed up.

Now, we could have turned everything on there, but we want to hold off for a bit before making the database active again. Even though we don't have any more logs to apply, we still need to re-run the RESTORE statement to make the database active again:

RESTORE LOG AdventureWorks2008 WITH RECOVERY;

We should now be able to test our database:

USE AdventureWorks2008;

SELECT * FROM Production.Location;

And, sure enough, we get the results we're looking for. Run a few SELECT statements to see that, indeed, our database was restored properly.

After you've checked things out, remember that we chose the DBO_ONLY option for all this.
If we run sp_dboption, we can see the option that's keeping everyone else out:

EXEC sp_dboption;

Look for dbo use only in the list:

Settable database options:
-----------------------------------
ANSI null default
ANSI nulls
ANSI padding
ANSI warnings
arithabort
auto create statistics
auto update statistics
autoclose
autoshrink
concat null yields null
cursor close on commit
db chaining
dbo use only
default to local cursor
merge publish
numeric roundabort
offline
published
quoted identifier
read only
recursive triggers
select into/bulkcopy
single user
subscribed
torn page detection
trunc. log on chkpt.

Remember to turn that option off or your users won't be able to get into the system:

EXEC sp_dboption AdventureWorks2008, 'dbo use only', 'false';

We now have a restored and active database.

Index Maintenance

Back in Chapter 6, we talked about how indexes can become fragmented. This can become a major impediment to the performance of your database over time, and it's something that you need to have a strategy in place to deal with. Fortunately, SQL Server has commands that will reorganize your data and indexes to clean things up. Couple that with the job scheduling that we've already learned about, and you can automate routine defragmentation.

ALTER INDEX is the workhorse of database maintenance. It is simultaneously much easier and slightly harder to use than the previous maintenance mainstay—DBCC—used to be. Let's take a look at this one real quick, and then at how to get it scheduled.

ALTER INDEX

The command ALTER INDEX is somewhat deceptive in what it does. Up until now, ALTER commands have always been about changing the definition of our object. We ALTER tables to add or disable constraints and columns, for example. ALTER INDEX is different; it is all about maintenance and zero about structure. If you need to change the makeup of your index, you still need to either DROP and CREATE it, or you need to CREATE it and use the DROP_EXISTING=ON option.

The ALTER INDEX syntax looks like this:

ALTER INDEX { <index name> | ALL }
    ON <table or view name>
    { REBUILD
        [ [ WITH ( <rebuild index option> [ ,...n ] ) ]
        | [ PARTITION = <partition number>
            [ WITH ( <single partition rebuild index option> [ ,...n ] ) ] ] ]
    | DISABLE
    | REORGANIZE
        [ PARTITION = <partition number> ]
        [ WITH ( LOB_COMPACTION = { ON | OFF } ) ]
    | SET ( <set index option> [ ,...n ] )
    }
[ ; ]
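Before we break the syntax down, it's fair to ask how you'd know an index needs attention in the first place. One way (a minimal sketch; the 10 and 30 percent figures below are common rules of thumb, not anything this chapter establishes as hard limits) is to query the sys.dm_db_index_physical_stats function:

USE AdventureWorks2008;

-- Report fragmentation for every index on Production.TransactionHistory.
-- Rough guidance: under about 10 percent, leave it alone; 10 to 30
-- percent, REORGANIZE; over about 30 percent, consider REBUILD.
SELECT i.name AS IndexName,
       ps.avg_fragmentation_in_percent
FROM sys.dm_db_index_physical_stats(
         DB_ID('AdventureWorks2008'),
         OBJECT_ID('Production.TransactionHistory'),
         NULL, NULL, 'LIMITED') AS ps
JOIN sys.indexes AS i
    ON i.object_id = ps.object_id
    AND i.index_id = ps.index_id;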
A decent amount of this is fairly detailed "realm of the advanced DBA" stuff—usually used on an ad hoc basis to deal with very specific problems. But there are some core elements here that should be part of our regular maintenance planning. We'll start by looking at a couple of the top parameters, and then look at the options that are part of our larger maintenance planning needs.

Index Name

You can name a specific index if you want to maintain one specific index, or use ALL to indicate that you want to perform this maintenance on every index associated with the named table.

Table or View Name

Pretty much just what it sounds like—the name of the specific object (table or view) that you want to perform the maintenance on. Note that it needs to be one specific table. (You can't feed it a list and say "do all of these please!")

REBUILD

This is the "industrial strength" approach to fixing an index. If you run ALTER INDEX with this option, the old index is completely thrown away and reconstructed from scratch. The result is a truly optimized index, where every page in both the leaf and non-leaf levels of the index has been reconstructed as you have defined it (either with the defaults, or using switches to change things like the fill factor).

Careful on this one. As soon as you kick off a REBUILD, the index you are working on is essentially gone until the rebuild is complete. Any queries that relied on that index may become exceptionally slow (potentially by orders of magnitude). This is the sort of thing you want to test on an offline system first to have an idea how long it's going to take, and then schedule to run in off hours (preferably with someone monitoring it to be sure it's back online when peak hours come along).

This one can have major side effects while it runs, and thus it falls squarely in the domain of the database administrator in my not-so-humble opinion.

DISABLE

This one does what it says, only in somewhat drastic fashion. It would be nice if all this command did was take your index offline until you decided further what you want to do, but instead it essentially marks the index as unusable. Once an index has been disabled, it must be rebuilt (not reorganized, but rebuilt) before it will be active again.

This is one you're very, very rarely going to do yourself. (You would more likely just drop the index.) It is far more likely to happen during a SQL Server upgrade or some other oddball situation.

Yet another BE CAREFUL!!! warning on this one: If you disable the clustered index for your table, it has the effect of disabling the table. The data will remain but will be inaccessible by all indexes (since they all depend on the clustered index) until you rebuild the clustered index.

REORGANIZE

BINGO!!! from the developer perspective. With REORGANIZE, we hit much more of a happy medium in life. When you reorganize your index, you get a slightly less complete optimization than you get with a full rebuild, but one that occurs online (users can still utilize the index).

This should, if you're paying attention, bring about the question "What exactly do you mean by 'slightly less complete'?" Well, REORGANIZE only works on the leaf level of your index—non-leaf levels of the index go untouched.
This means that we're not quite getting a full optimization, but, for the lion's share of indexes, that is not where your real cost of fragmentation is (though it can happen, and your mileage may vary).

Given its much lower impact on users, this is usually the tool you'll want to use as part of your regular maintenance plan. Let's take a look at running an index reorganization command.

To run this through its paces, we're going to do a reorg on a table in the AdventureWorks2008 database. The Production.TransactionHistory table is an excellent example of a table that is likely to have many rows inserted over time and then have rows purged back out of it as the transactions become old enough to delete. In this case, we'll reorganize all the indexes on the table in one simple command:

USE AdventureWorks2008;

ALTER INDEX ALL
    ON Production.TransactionHistory
    REORGANIZE;

The ALTER INDEX command sees that ALL was supplied instead of a specific index name, and looks up what indexes are available for our Production.TransactionHistory table (leaving out any that are disabled, since a reorganization will do nothing for them). It then enumerates each index behind the scenes and performs the reorganization on each—reorganizing just the leaf level of each index (including reorganizing the actual data, since the clustered index on this table will also be reorganized).

You should get back essentially nothing from the database—just a simple "Command(s) completed successfully."

Archiving of Data

Ooh—here's a tricky one. There are as many ways of archiving data as there are database engineers. If you're building an OLAP database—for example, to utilize with Analysis Services—then that will often address your archiving for long-term reporting needs. Regardless of how you're making sure the data you need long-term is available, there will likely come a day when you need to deal with the issue of your data becoming too voluminous for your system to perform well.

As I said, there are just too many ways to go about archiving, because every database is a little bit different. The key is to think about archiving needs at the time that you create your database. Realize that, as you start to delete records, you're going to be hitting referential integrity constraints and/or orphaning records—design in a logical path to delete or move records at archive time. Here are some things to think about as you write your archive scripts (a sketch of a simple archive pass follows this list):

* If you already have the data in an OLAP database, then you probably don't need to worry about saving it anywhere else. Talk to your boss and your attorney on that one.
* How often is the data really used? Is it worth keeping? Human beings are natural-born pack rats, just in a larger size. Simply put, we hate giving things up—and that includes our data. If you're only worried about legal requirements, think about just saving a copy of never- or rarely-used data to tape (I'd suggest multiple backups for archive data) and reducing the amount of data you have online—your users will love you for it when they see improved performance.
* Don't leave orphans. As you start deleting data, your referential integrity constraints should keep you from leaving that many orphans, but you'll wind up with some where referential integrity didn't apply. This situation can lead to serious system errors.
* Realize that your archive program will probably need a long time to run. The length of time it runs and the number of rows affected may create concurrency issues with the data your online users are trying to get at—plan on running it at a time when your system will not be used.
* TEST! TEST! TEST!
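To make those last points a bit more concrete, here is a purely illustrative sketch of a single archive pass. It assumes a hypothetical Production.TransactionArchive table with the same column list as Production.TransactionHistory but no IDENTITY property on TransactionID (your own archive design will almost certainly differ):

USE AdventureWorks2008;

-- Move everything older than four years in one atomic unit, so a failure
-- can't leave rows deleted but never archived.
BEGIN TRAN;

INSERT INTO Production.TransactionArchive  -- hypothetical archive table
SELECT *
FROM Production.TransactionHistory
WHERE TransactionDate < DATEADD(year, -4, GETDATE());

DELETE FROM Production.TransactionHistory
WHERE TransactionDate < DATEADD(year, -4, GETDATE());

COMMIT TRAN;

In real life, you'd likely run the DELETE in batches to keep the transaction (and the locks it holds) manageable, but the shape of the operation is the same.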
PowerShell

SQL Server now has support for a command environment known as PowerShell. For those who haven't heard of PowerShell before, it's worth a look well beyond what we'll go into here, so I recommend a good search on the Web.

What is PowerShell? At its most basic level, PowerShell is a classic command-line environment—and is not, on the surface, much different from a Windows Command window. PowerShell, however, is extensible through .NET integration and can be hosted within other applications (much as it is for SQL Server 2008). Examples of applications and operating systems that include special functionality for PowerShell include:

* SQL Server 2008 (why else would we be talking about it, eh?) and above
* Exchange 2007 and above
* Microsoft Office SharePoint Server (MOSS) 2007 and above
* Windows Vista, Windows XP, and Windows Server 2003 (through downloadable add-ons)
* Windows Server 2008 and later, which include it natively or as an option (depending on edition and version)

The extensibility of PowerShell is implemented via what are called cmdlets (pronounced commandlets). These are specialized .NET assemblies that implement functionality for a given application within the PowerShell environment. The real power here is that, through the mix of different cmdlets available to PowerShell, we can create powerful scripts utilizing a mix of operating system commands and functionality that is specific to one or more applications (for example, waiting for confirmation on a load script to the database before kicking off an application hosted in another environment).

PowerShell cmdlets have a standardized command structure based on a verb-noun combination, such as Get-Help or Get-ChildItem. PowerShell also includes a robust help mechanism that is updated regularly (via TechNet).

Trying Out PowerShell

To get a feel for how it works, we're going to take a fairly quick test drive of PowerShell. Start by opening a command prompt window (Start⇒Run and type cmd before hitting Enter). At the command line, type sqlps to start the SQL Server flavor of PowerShell:

C:\Users\Administrator.Kierkegaard>sqlps

There is relatively little indication that you've left the standard command prompt and entered the world of PowerShell. Indeed, the only significant indication (besides the PowerShell header) is the PS prefix on a line that otherwise looks just like your command prompt:

Microsoft SQL Server PowerShell
Version 10.0.1600.22
Microsoft Corp. All rights reserved.

PS SQLSERVER:\>

Let's go ahead and issue our first PowerShell command. We'll simply ask for the help page:

PS SQLSERVER:\> Get-Help

This spews forth a page or so worth of information:

TOPIC
Get-Help

SHORT DESCRIPTION
Displays help about PowerShell cmdlets and concepts.

LONG DESCRIPTION

SYNTAX
get-help {<topic> | <cmdlet name>}
help {<topic> | <cmdlet name>}
<cmdlet name> -?

"Get-help" and "-?" display help on one page.
"Help" displays help on multiple pages.

Examples:
get-help get-process : Displays help about the get-process cmdlet.
get-help about_signing : Displays help about the signing concept.
help where-object : Displays help about the where-object cmdlet.
help about_foreach : Displays help about foreach loops in PowerShell.
match-string -? : Displays help about the match-string cmdlet.
You can use wildcard characters in the help commands (not with -?). If multiple help topics match, PowerShell displays a list of matching topics. If only one help topic matches, PowerShell displays the topic.

Examples:
get-help * : Displays all help topics.
get-help get-* : Displays topics that begin with get-.
help *object* : Displays topics with "object" in the name.
get-help about* : Displays all conceptual topics.

For information about wildcards, type:
get-help about_wildcard

REMARKS
To learn about PowerShell, read the following help topics:
get-command : Displays a list of cmdlets.
about_object : Explains the use of objects in PowerShell.
get-member : Displays the properties of an object.

Conceptual help files are named "about_<topic>", such as:
about_regular_expression.

The help commands also display the aliases on the system.
For information about aliases, type:
get-help about_alias

PS SQLSERVER:\>

This is just basic information about getting help in PowerShell. Little if anything provided in this particular help window is SQL Server specific. We can, however, get help on a cmdlet that runs generic T-SQL commands:

PS SQLSERVER:\> Get-Help Invoke-Sqlcmd

This gets us helpful information about the SQL Server–specific cmdlet called Invoke-Sqlcmd:

NAME
Invoke-Sqlcmd

SYNOPSIS
Runs a script containing statements from the languages (Transact-SQL and XQuery) and commands supported by the SQL Server sqlcmd utility.

SYNTAX
Invoke-Sqlcmd [-ServerInstance <PSObject>] [-Database <String>] [-EncryptConnection] [-Username <String>] [-Password <String>] [[-Query] <String>] [-QueryTimeout <Int32>] [-ConnectionTimeout <Int32>] [-ErrorLevel <Int32>] [-SeverityLevel <Int32>] [-MaxCharLength <Int32>] [-MaxBinaryLength <Int32>] [-AbortOnError] [-DedicatedAdministratorConnection] [-DisableVariables] [-DisableCommands] [-HostName <String>] [-NewPassword <String>] [-Variable <String[]>] [-InputFile <String>] [-OutputSqlErrors] [-SuppressProviderContextWarning] [-IgnoreProviderContext] [<CommonParameters>]

DETAILED DESCRIPTION
Runs a script containing the languages and commands supported by the SQL Server sqlcmd utility. The languages supported are Transact-SQL and the XQuery syntax supported by the Database Engine. Invoke-Sqlcmd also accepts many of the commands supported by sqlcmd, such as GO and QUIT. Invoke-Sqlcmd accepts the sqlcmd scripting variables, such as SQLCMDUSER. Invoke-Sqlcmd does not set sqlcmd scripting variables by default.

Invoke-Sqlcmd does not support the sqlcmd commands primarily related to interactive script editing. The commands not supported include :!!, :connect, :error, :out, :ed, :list, :listvar, :reset, :perftrace, and :serverlist.

The first result set the script returns is displayed as a formatted table. Result sets after the first are not displayed if their column list is different from the column list of the first result set. If result sets after the first set have the same column list, their rows are appended to the formatted table that contains the rows that were returned by the first result set.

Invoke-Sqlcmd does not return message output, such as the output of PRINT statements, unless you use the PowerShell -Verbose parameter.

RELATED LINKS
SQL Server Books Online: Transact-SQL Reference
SQL Server Books Online: sqlcmd Utility
SQL Server Books Online: XQuery Reference

REMARKS
For more information, type: "get-help Invoke-Sqlcmd -detailed".
For technical information, type: "get-help Invoke-Sqlcmd -full".
PS SQLSERVER:\>

Let's take a quick look at this using a relatively simple system stored procedure (sp_helpdb):

PS SQLSERVER:\> Invoke-Sqlcmd -Query "EXEC sp_helpdb"

sp_helpdb provides a listing of all databases in the system. Normally we would see a column-oriented result set, but PowerShell has reoriented the output in a manner that is much more suitable to the limited number of characters a command window can display:

name : AdventureWorks2008
db_size : 245.81 MB
owner : sa
dbid : 7
created : Dec 6 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAnsiNullsEnabled, IsAnsiPaddingEnabled, IsAnsiWarningsEnabled, IsArithmeticAbortEnabled, IsAutoCreateStatistics, IsAutoUpdateStatistics, IsFullTextEnabled, IsNullConcat, IsQuotedIdentifiersEnabled, IsPublished
compatibility_level : 100

name : AdventureWorksDW2008
db_size : 71.06 MB
owner : sa
dbid : 8
created : Dec 6 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAnsiNullsEnabled, IsAnsiPaddingEnabled, IsAnsiWarningsEnabled, IsArithmeticAbortEnabled, IsAutoCreateStatistics, IsAutoUpdateStatistics, IsFullTextEnabled, IsNullConcat, IsQuotedIdentifiersEnabled
compatibility_level : 100

name : AdventureWorksLT2008
db_size : 7.13 MB
owner : sa
dbid : 9
created : Dec 6 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAnsiNullsEnabled, IsAnsiPaddingEnabled, IsAnsiWarningsEnabled, IsArithmeticAbortEnabled, IsAutoCreateStatistics, IsAutoUpdateStatistics, IsFullTextEnabled, IsNullConcat, IsQuotedIdentifiersEnabled
compatibility_level : 100

name : tempdb
db_size : 8.75 MB
owner : sa
dbid : 2
created : Dec 31 2008
status : Status=ONLINE, Updateability=READ_WRITE, UserAccess=MULTI_USER, Recovery=SIMPLE, Version=655, Collation=SQL_Latin1_General_CP1_CI_AS, SQLSortOrder=52, IsAutoCreateStatistics, IsAutoUpdateStatistics
compatibility_level : 100

PS SQLSERVER:\>

I have, for the sake of brevity, snipped a few databases out of the middle of the result sets here, but you can see how we were able to execute virtually any command from within PowerShell. Many commands will, over time, have specific cmdlets supporting them—supporting stronger typing and parameterization. For now, most implemented cmdlets support four major object models:

* The Database Engine: This allows you to navigate a given server.
* Policy-Based Management: The rules-based management tool that is new with SQL Server 2008 (we will discuss this in brief in our next major section).
* Database Collection: This contains the meat of manipulating a given database or set of databases.
* Server Registration: This is all about identifying servers and registering them locally to make them somewhat easier to access.

Through the use of these object models, PowerShell can provide scripted access to almost any administrative task. Watch for more specific support and help to be added via download over the life cycle of SQL Server 2008.
Navigating in PowerShell

PowerShell also provides the ability to navigate in a more directory-like fashion than we have previously experienced with SQL Server. Indeed, you can think of the SQL Server world as one large hierarchy (much as a domain/directory structure is). You can navigate from a collection of registered servers to specific servers, and, from there, to roles and users on that server, or perhaps to databases and objects within the database.

Let's check this out real quick by issuing a simple dir command, much as you would in a command window for the operating system:

PS SQLSERVER:\> dir

This may surprise you by providing a listing of the four object model areas I mentioned at the end of the previous section:

Name            Root                        Description
----            ----                        -----------
SQL             SQLSERVER:\SQL              SQL Server Database Engine
SQLPolicy       SQLSERVER:\SQLPolicy        SQL Server Policy Management
SQLRegistration SQLSERVER:\SQLRegistration  SQL Server Registrations
DataCollection  SQLSERVER:\DataCollection   SQL Server Data Collection

PS SQLSERVER:\>

We can actually navigate these just as we would a directory structure in Windows—for example:

PS SQLSERVER:\> cd SQL

You should quickly notice that we are moving down a directory structure:

PS SQLSERVER:\SQL>

Let's jump forward a bit, and navigate much deeper into the tree. We'll need to navigate through our specific server (mine is KIERKEGAARD; you should replace it with the name of your SQL Server system) and instance (mine is the default, so I'll refer to it as DEFAULT), and on into the DATABASES node (we could also go after other server-level objects, such as LOGINS):

PS SQLSERVER:\SQL> cd KIERKEGAARD\DEFAULT\DATABASES

We cut straight down to the databases node of the hierarchy just as if we were navigating a directory structure:

PS SQLSERVER:\SQL\KIERKEGAARD\DEFAULT\DATABASES>

But it gets better. We can issue a directory listing (in the form of the dir command) and get a list of databases, much like the one we created using sp_helpdb earlier in the chapter (albeit not quite as verbose):

PS SQLSERVER:\SQL\KIERKEGAARD\DEFAULT\DATABASES> dir

This gets us:

WARNING: column "Owner" does not fit into the display and was removed.

Name                 Status  Recovery Model  CompatLvl  Collation
----                 ------  --------------  ---------  ---------
AdventureWorks2008   Normal  Simple          100        SQL_Latin1_General_CP1_CI_AS
AdventureWorksDW2008 Normal  Simple          100        SQL_Latin1_General_CP1_CI_AS
AdventureWorksLT2008 Normal  Simple          100        SQL_Latin1_General_CP1_CI_AS
AWSubscriber         Normal  Full            100        SQL_Latin1_General_CP1_CI_AS
OurInsteadOfTest     Normal  Full            100        SQL_Latin1_General_CP1_CI_AS
ReportServer         Normal  Full            100        Latin1_General_CI_AS_KS_WS
ReportServerTempDB   Normal  Simple          100        Latin1_General_CI_AS_KS_WS
Test                 Normal  Full            100        SQL_Latin1_General_CP1_CI_AS

PS SQLSERVER:\SQL\KIERKEGAARD\DEFAULT\DATABASES>

This is, of course, a pretty simplistic example, but it can be taken much further. For example, PowerShell will allow you to enumerate a list such as the directory listing we just created. You can then script different behaviors depending on the contents of the list.

A Final Word on PowerShell

As I write this, PowerShell is, from a SQL Server point of view, just getting started.
The documentation on the cmdlets available is still rather sparse, but new items are being published regularly, and the nature of the PowerShell model is such that Microsoft will be able to continue extending the functionality within PowerShell even before Kilimanjaro (the code name for the next release of SQL Server) is released.

I highly recommend watching the SQL Server community on the Internet (or just Google "SQL Server PowerShell" from time to time) to keep an eye on what's new and where this new scripting tool is going. I can say, for example, that it is quickly becoming my preferred installation and upgrade scripting environment!

Policy-Based Management

Policy-Based Management—known during much of the beta phase of SQL Server 2008 as the Distributed Management Framework—is a rules-based management infrastructure primarily aimed at the management of SQL Server farms in larger enterprises. The concept is pretty simple: There are too many SQL Servers out there managed by too many different people (often completely different IT departments or DBAs that don't even know anything about the other servers and DBAs out there), so why not allow all your SQL Servers to police themselves according to a set of "policies"? What is enforced by the policies can vary, ranging from things as simple as object-naming guidelines to blocking specific changes to server settings. The management engine can just note violations of policy (simply being able to report on them), or it can actually block or reverse the change.

The full effect of Policy-Based Management on the developer community is yet to be seen. I suspect that there are going to be some good scripting applications for it, but how exactly Policy-Based Management is going to be rolled out and just how enforcement policies are going to be implemented is something still being explored in many companies. For now, all I can say is that all of the Policy-Based Management features are exposed through SMO (in the Microsoft.SqlServer.Management.Dmf library) and through PowerShell. The documentation of the object model outside of Management Studio leaves much to be desired, but a number of individual functions are documented in Books Online, and I suspect updates to Books Online over the life of SQL Server 2008 will help fill in the picture of the Policy-Based Management object model.

Summary

Well, that gives you a few things to think about. It's really easy as a developer to think about many administrative tasks and establish what the inaccurately named Hitchhiker's Guide to the Galaxy trilogy called an "SEP" field. That's something that makes things like administration seem invisible because it's "somebody else's problem." Don't go there!

A project I'm familiar with from several years ago is a wonderful example of what can happen when no one takes responsibility for these things. A wonderful system was developed for a nonprofit group that operates in the northwestern United States. After about eight months of operation, an emergency call was placed to the company that developed the software (it was a custom job). After some discussion, it was determined that the database had somehow become corrupted, and it was recommended to the customer that the database be restored from a backup. The response? "Backup?"
The development company in question missed something very important—they knew they had an inexperienced customer who would have no administration staff, and who was going to tell the customer to do backups and help set them up if the development company didn't? I'm happy to say that the development company in question learned from that experience—and so should you.

Think about administration issues as you're doing your design and especially in your deployment plan. If you plan ahead to simplify the administration of your system, you'll find that your system is much more successful—and that usually translates into rewards for the developer (that is, you!).

23

SMO: SQL Management Objects

It's been a long road, and we're getting closer and closer to the end of our walk through SQL Server. It is, of course, no coincidence that the chapter about how to manage your SQL Server programmatically has been held until very close to the end. Among other things, we needed to have a solid idea as to what objects we were managing and what administrative needs we had before we were ready to understand the SMO object model and talk about some of the reasons we might want to use SMO.

So, what exactly is SMO? Well, as the title of this chapter implies, SMO is an object model for managing SQL Server. Whereas connectivity models like ADO and LINQ are all about accessing data, SMO is all about access to the structure and health of your system.

In this chapter, we'll look at:

 * The convoluted history of SQL Server management object models
 * The basics of the SQL SMO object model
 * A simple SMO example project

As with many of the SQL Server topics we cover in this book, SQL SMO is a book unto itself, so please do not expect to come out of this chapter as an expert. That said, hopefully, you will have the fundamentals down at least to the point where you know what's possible and how much work is likely to be involved. From there, you can look for sources of more information as necessary.

The History of SQL Server Management Object Models

This is, to me—even as someone who genuinely loves the product—not an area where SQL Server shines. This is not to say that SMO is a bad thing but rather that the history of SQL Server management object models is a rather sordid one indeed. The team has had a tough time picking a horse and sticking with it.

As I write this, I've been working with SQL Server for just under 15 years. In that time, the methods of managing SQL Server have changed several different times. "A new release? A new management method!" could be the motto for SQL Server.

The good news is that SMO, at least so far, seems to be here to stay. It's on its second version as the primary object model for managing SQL Server (I know it's sad that two releases' worth seems like an accomplishment). Still, there are other models that remain out there, so let's look at the highlights from the last couple of releases. These are some of the different models and technologies you may bump into as you work on legacy code.

SQL Distributed Management Objects

Distributed Management Objects, or DMO, is the relative "old dog" of the management models. When you think of the old Enterprise Manager from SQL Server 2000 and earlier, most of its underlying functionality ended up in a DMO call.
The DMO model supported COM, and could perform all the basic tasks you might want management-wise, such as:

 * Start a backup
 * Restore from backup
 * Create a database
 * Create jobs and other agent-related tasks
 * Reverse engineer tables into SQL code

The list goes on.

So, what went wrong with DMO? Well, the object model was often deemed "clunky" at best. Indeed, parts of DMO often did not work well together, and the scripting engine was buggy. In short, most developers I know only used DMO after going through an electroshock therapy program to desensitize them to the pain of it (okay, it wasn't that bad, but not far from it).

SQL Namespaces

SQL Namespaces (SQL NS) is actually largely about providing UI-level functionality. SQL NS encapsulates all of the functionality that you would find in the old Enterprise Manager—complete with the UI elements. You instantiate the UI objects, and those objects utilize SQL DMO underneath, removing that layer of programming from the equation. In short, if you needed to build a tool that already had the UI to do management tasks, then SQL NS was your tool. The problem? Well, put it this way—EM? They decided they needed to replace it. DMO? They decided they needed to replace it, too. As you can guess, apparently not even Microsoft was all that impressed.

Now, lest I sound like all I am is a Microsoft basher or that I think EM was a bad product, I'll put it this way: EM was a fairly good "first shot at it." None of the RDBMS systems out there had anything remotely as powerful and useful as Enterprise Manager was when it first came out—it was a huge part of why SQL Server has been perceived as so much more usable than, say, Oracle (although Oracle has certainly made inroads in the management area). That usability, coupled with what used to be a very cheap price tag, is a big part of Microsoft's success with SQL Server.

EM did, however, have a number of flaws that became more and more obvious as the Windows era taught us what a Windows application should look and act like.

Windows Management Instrumentation

Windows Management Instrumentation (WMI) is very different from the other management objects we've talked about thus far in the sense that it is not SQL Server specific, but, rather, an implementation of a management scripting model that was already taking hold to manage servers across Windows and beyond.

WMI is an implementation of the industry open standard Web-Based Enterprise Management (WBEM) initiative. WBEM goes well beyond Microsoft products, and the idea was that server administrators would be able to learn one core scripting model and manage all of their servers with it. Exchange, SQL Server, Windows O/S features, and more—it was all going to be managed using WMI (and, indeed, most of it can be).

Going into SQL Server 2000, the message was clear: WMI was the future. Many of the SQL Server stalwarts (like me) were told over and over again—DMO would be going away (well, that much turned out to be true), and we should do any new management in WMI (that much turned out to be not so true).

The reality is that WMI was never fully implemented for SQL Server, but what there is of it will also not go away any time soon. WMI is, as I've said, an industry standard, and many other Windows servers use WMI for configuration management. Having WMI available for the configuration fundamentals makes a lot of sense, and, for that space, it's likely here to stay (with no complaints from me).
It's worth noting that WMI is now implemented as a layer over SMO—go figure.

SMO

It's unclear to me exactly when Microsoft decided to make the move to SMO. What I can say is that they knew they had a problem: DMO was clearly at the end of its useful life, and a complete rewrite of Enterprise Manager was already planned for SQL Server 2005. At the same time, WMI was clearly not going to address everything that needed to be done. (WMI is configuration oriented, but SQL Server needs more administrative love than WMI was likely to give in any kind of usable way.)

So, as SQL Server 2000 was coming to market, .NET was already clearly on the horizon. What would become Visual Studio 2005 was already in heavy design. C# was already being sold as the programming language of the future. The decision was made to use Visual Studio plug-ins as the management center (indeed, you still see that very clearly for Reporting, Integration, and somewhat for Analysis Services).

In the end, what we have in SMO is a very useful set of .NET assemblies. Management Studio has gone back to being its own thing (being too tied in to Visual Studio apparently didn't work out so well, but I like the decision to keep them separate), but it is based on Visual Studio, and leverages several Visual Studio notions right down to the IntelliSense that became part of the product in SQL Server 2008. The services that require the notion of a designer use Business Intelligence Development Studio, which is still basically a set of projects, controls, and templates for Visual Studio (indeed, it says Visual Studio as you start it up).

My guess? Well, depending on how long it is before SQL Server goes to a new version again, I think it's safe to say you can count on SMO as being the object model for no less than another 1–2 releases. There is no replacement on the horizon, and SMO looks very viable (no reason to replace it in the foreseeable future). In short, you should be able to count on it for at least 5–10 years, which is about as much as anyone can hope for anything in the software business.

Even though it's safe to assume SMO will be around for at least a few more releases, it's worth noting that SMO is not 100% code compatible from release to release. For example, certain classes that were part of the core Microsoft.SqlServer.Smo.dll have been moved to a new file called Microsoft.SqlServer.SmoExtended.dll. If you don't have the new reference as part of your project, then things will break when you compile using the SQL Server 2008 libraries.

The SMO Object Model

Server Management Objects, or SMO, replaces DMO. That said, SMO goes well beyond anything DMO was conceived to do. Beyond basic configuration or even statement execution, SMO has some truly advanced features such as:

 * Event handling: SMO supports the notion of trapping events that are happening on the server and injecting code to handle the event situation.
 * The ability to address types of objects in your server as collections (making it easy to enumerate them and provide consistent and complete treatment for all objects of that type).
 * The ability to address all of the various server objects that are part of SQL Server in a relatively consistent manner.

Like all object models, SMO establishes something of a hierarchy among objects. Because SQL Server is such a complex product, there are many, many objects to consider. Figure 23.1 includes an example of the hierarchy of what I would consider to be "core" objects in SQL Server.
Note that this is not at all a comprehensive list! If you want a diagram with everything, check Books Online (they have one that isn't bad, though it's not great either—at least it's complete). This is my attempt at giving you something that is more readable and has all the core objects plus a few.

Figure 23.1

Walking through Some Examples

This may well be the messiest section in the entire book in terms of hearing me "talk" about things, as it includes a ton of Visual Studio stuff that goes well beyond what is built into the base SQL Server Business Intelligence Studio.

You must have some version of Visual Studio .NET in order to actually build these examples yourself. Not to fear, however, if you don't—I do show all the lines of code here, so you can at least look them over.

Also, the following examples are done in C#, but the basic object references and method calls are the same—conversion to VB or C++ should be simple for those more comfortable in those languages.

What we're going to be doing in this section is building up a little application that does a number of different "basics" that you might be interested in. The things that will happen at least once across these various actions include:

 * Creating a reference to a specific server, including a connection to a server using a trusted connection
 * Creating an entirely new database
 * Creating tables in a database
 * Creating primary key constraints for those tables
 * Creating a foreign key referencing from one table to another
 * Dropping a database
 * Backing up a database
 * Scripting a database object

Each of these is a hyper-simplified version of what is required. Keep in mind that each of the objects I reference here has many more possible properties and methods to be set. For example, in the scripting example, we could play around with scripting options to change which property-related commands do and do not appear in the script.

Getting Started

Start by creating a new Windows Application project in Visual Studio. I called mine SQLSMOSample. In order to make use of the SMO assemblies, you'll need to set references in your project to at least five assemblies:

 * Microsoft.SqlServer.ConnectionInfo
 * Microsoft.SqlServer.Management.Sdk.Sfc
 * Microsoft.SqlServer.Smo
 * Microsoft.SqlServer.SmoExtended
 * Microsoft.SqlServer.SqlEnum

Setting a reference is as easy as right-clicking References in the Solution Explorer (or in the Project menu) and choosing Add Reference. Select the five assemblies in the preceding list, and click OK.

For my example, all of my code is, for simplicity's sake, done in a Form called frmMain. In most cases, you would want to set up separate component files for your methods and just call them from a form as needed.

Declarations

We need to add using declarations for a few of the management libraries to make it simple to utilize those objects in our code:

using Microsoft.SqlServer.Management.Smo;
using Microsoft.SqlServer.Management.Common;
using Microsoft.SqlServer.Management.Smo.SqlEnum;

This will allow us to reference several objects within these libraries without having to fully qualify them.

Basic Connection and Server References

There is a block of code you will see me reuse in every one of the methods we'll create in this chapter. The purpose of the code is to establish a connection and a server reference—everything we do will need these.
In practice, we would likely establish one or more connections that would be global to the application rather than to a specific method, but, again, I am trying to keep the code blocks somewhat independent, so that you can look at them individually.

The connection and server reference code looks like this:

// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Creating a Database

Creating a database is pretty straightforward. In the implementation that follows, I create a Database object and immediately initialize it with a reference to our svr Server object. Note, however, that all I am creating is a database definition object. The database itself is not actually created on the server until we call the Create() method of the database object. So, in short, we define the object, modify the various properties that define it, and then, and only then, do we call the Create() method to actually create the database on the server that is referenced in our Server object.

Drop a button onto the main form—I've called mine btnCreateDB—and you're ready to add some code. A simple method to create the database might look like this:

private void btnCreateDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = new Database();
db.Parent = svr;
db.Name = "SMODatabase";
db.Create();

txtResult.Text = "Database Created";
cn.Disconnect();
}

I've established a generic database object. I then associated it with a specific server, gave the logical name for the database, and then created it.

The result is really nothing different than if we had connected to our server and issued the command:

CREATE DATABASE SMODatabase

We wind up with an empty database that is created completely with defaults. We could, however, have set things like the physical file location (including creating it with multiple filegroups), default collation, growth and size properties—basically anything you normally think of as a property of the database. More importantly, however, we are operating in a native .NET environment, so any errors, success messages, or other notifications can be handled easily within our client language.

Creating Tables

In this example, I'm going to add a pair of tables to our empty SMODatabase. We'll add ParentTable and ChildTable. ChildTable will have a foreign key to ParentTable. Both will have primary keys.

First, we'll need to set a reference to the database we want to create our tables in:

private void btnCreateTables_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Get a reference to our test SMO Database
Database db = svr.Databases["SMODatabase"];

Notice that this time I did not create the Database object as "new." Instead, I associated it with an existing database object from our referenced Server object.

From there, I create a new table object. Much as when we created the Database object, all we are doing is creating an object definition in our application.
No table will be created in the database until after we've fully defined our Table object and called its Create() method.

// Create Table object, and begin defining said table
Table ParentTable = new Table(db, "ParentTable");

Now we're ready to start adding some meat to the definition of our table. Unlike a database, which has enough defaults that you really only need to specify a name to create one (the rest will just be copied from the model database), tables require a lot of specification—specifically, a table needs at least one column.

Let's add a column that will eventually serve as our primary key:

// Build up the table definition
Column ParentKey = new Column(ParentTable, "ParentKey");
ParentKey.DataType = DataType.Int;
ParentKey.Nullable = false;
ParentKey.Identity = true;

We've created a new column object. It has been templated from the ParentTable and named ParentKey. I've given it a data type of int, made it non-nullable, and defined it as an IDENTITY column.

Even though we've templated the column from the ParentTable, it is not yet associated directly with that table! The templating reference just helps establish what the initial property values are for the column (such as collation).

Now let's add another column called ParentDescription:

Column ParentDescription = new Column(ParentTable, "ParentDescription");
ParentDescription.DataType = DataType.NVarCharMax;
ParentDescription.Nullable = false;

Again, the column is created, but not directly associated with the Table object yet—let's take care of that now:

// Now actually add them to the table definition
ParentTable.Columns.Add(ParentKey);
ParentTable.Columns.Add(ParentDescription);

It is not until we add them to the Columns collection of the Table object that they become directly associated with that table.

So, we have a table object defined, and it has two columns associated with it. What we need now is a primary key.

// Add a Primary Key
Index PKParentKey = new Index(ParentTable, "PKParentKey");
PKParentKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKParentKey.IndexedColumns.Add(new IndexedColumn(PKParentKey, "ParentKey"));
ParentTable.Indexes.Add(PKParentKey);

Notice that we're defining the primary key as an index rather than as anything explicitly called a constraint. Instead, we define the index, and then tell the index (via its IndexKeyType) that it is a primary key. When the index is created, the constraint definition will also be added.

Primary and Unique constraints are not added specifically as constraints. They are, instead, added as indexes with an IndexKeyType that implies that they are to be added as a constraint rather than a raw index.

Much like our columns, the primary key is not directly associated with the table until we explicitly add it to the Indexes collection of our table.

With all that done, we're ready to create our table:

ParentTable.Create();

It is at this point that the table is physically created in the database.
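As a quick aside, a UNIQUE constraint works exactly the same way, just with a different IndexKeyType. This is a sketch of my own rather than part of the example project (the ParentCode column is hypothetical, and the code would need to run before the ParentTable.Create() call):

// Hypothetical fixed-width column to hold an alternate key
// (nvarchar(max) columns can't serve as index key columns, so we add one that can)
Column ParentCode = new Column(ParentTable, "ParentCode");
ParentCode.DataType = DataType.NVarChar(20);
ParentCode.Nullable = false;
ParentTable.Columns.Add(ParentCode);

// DriUniqueKey yields a UNIQUE constraint rather than a raw index
Index UQParentCode = new Index(ParentTable, "UQParentCode");
UQParentCode.IndexKeyType = IndexKeyType.DriUniqueKey;
UQParentCode.IndexedColumns.Add(new IndexedColumn(UQParentCode, "ParentCode"));
ParentTable.Indexes.Add(UQParentCode);

Okay, with our parent table created, we're ready to add our child table.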
The code up through the creation of the primary key looks pretty much just as it did for the ParentTable object:

// Create Table object for child, and begin defining said table
Table ChildTable = new Table(db, "ChildTable");

// Build up the Child table definition
Column ChildParentKey = new Column(ChildTable, "ParentKey");
ChildParentKey.DataType = DataType.Int;
ChildParentKey.Nullable = false;

Column ChildKey = new Column(ChildTable, "ChildKey");
ChildKey.DataType = DataType.Int;
ChildKey.Nullable = false;

Column ChildDescription = new Column(ChildTable, "ChildDescription");
ChildDescription.DataType = DataType.NVarCharMax;
ChildDescription.Nullable = false;

// Now actually add them to the table definition
ChildTable.Columns.Add(ChildParentKey);
ChildTable.Columns.Add(ChildKey);
ChildTable.Columns.Add(ChildDescription);

// Add a Primary Key that is a composite key
Index PKChildKey = new Index(ChildTable, "PKChildKey");
PKChildKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ParentKey"));
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ChildKey"));
ChildTable.Indexes.Add(PKChildKey);

But with ChildTable, we want to add a twist in the form of a foreign key. To do this, we create a ForeignKey object:

// Add a Foreign Key
ForeignKey FKParent = new ForeignKey(ChildTable, "FKParent");

And then create ForeignKeyColumn objects to add to the ForeignKey object.

// The first "ParentKey" in the definition below is the name in the current table
// The second is the name (of just the column) in the referenced table.
ForeignKeyColumn FKParentParentKey = new ForeignKeyColumn(FKParent,
"ParentKey", "ParentKey");
FKParent.Columns.Add(FKParentParentKey);

Next, set a reference to the table the foreign key refers to:

FKParent.ReferencedTable = "ParentTable";
// I could have also set a specific schema, but since the table was created
// using just a default schema, I'm leaving the table reference to the default
// also. Both would be resolved using whatever the user's default schema is.

/*
** Note that there are several other properties we could define here
** such as CASCADE actions. We're going to keep it simple for now.
*/

Then actually add the foreign key to the table and create it:

ChildTable.ForeignKeys.Add(FKParent);
ChildTable.Create();

cn.Disconnect();
txtResult.Text = "Tables Created";
}

I recognize that this probably seems convoluted compared to just connecting and issuing a CREATE TABLE statement, but there are several advantages:

 * If you are dynamically building a table, you can encapsulate the various parts of the table construction more easily than trying to do string manipulation.
 * Changes to the properties of the various objects involved are far less sensitive to specific order of execution than trying to build a string would be.
 * All the properties remain discrete, so they are easily addressed and edited without significant string manipulation.
 * It is the SMO way of doing things—if the other actions you're taking are already in SMO, then doing things consistently in SMO is probably going to yield less confusion than if you mix string-based commands with SMO commands.
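Before moving on, here is a quick sanity check of my own (a minimal sketch that reuses the db reference from above; run it before the Disconnect() call). It enumerates what we just built through the same object model:

// List the tables we just created, along with their columns
foreach (Table t in db.Tables)
{
    txtResult.Text += t.Name + "\r\n";
    foreach (Column c in t.Columns)
    {
        txtResult.Text += "    " + c.Name + " (" + c.DataType.SqlDataType + ")\r\n";
    }
}

Dropping a Database

As with most drop situations, this one is pretty straightforward.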
We start with our now-familiar server and connection info and then set a reference to the database we're interested in:

private void btnDropDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = svr.Databases["SMODatabase"];

Then just call the Drop() method and we're done:

db.Drop();
txtResult.Text = "Database Dropped";
cn.Disconnect();
}

Note that we do not have any error trapping added here (there really isn't anything different here from the error-trapping issues in your language of choice). You may run into some issues dropping the database if you still have connections open to that database elsewhere in this or other applications (such as Management Studio). I encourage you to experiment with this and what you might do in your error handler (remember, we have robust error handling in most .NET languages), such as identifying and killing all connections that have locks on the database we want to drop (the Server object's KillAllProcesses method is worth a look here).

Backing Up a Database

For this one, we're actually going to switch over and use the AdventureWorks database just to give us something meatier to back up.

As you might suspect from how many different objects we've seen so far, the Backup object is its own thing. It is considered a child of the Server object but has its own set of properties and methods.

To create a backup, you start with the same server connection code that we've seen several times now:

private void btnBackupDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

We're then ready to create a new Backup object. Note that, unlike the Database object, which we associated with a server early on, we don't need to reference a specific server for our Backup object until we actually go to execute the backup.

// Create and define backup object
Backup bkp = new Backup();
bkp.Action = BackupActionType.Database;
bkp.Database = "AdventureWorks2008";
bkp.Devices.AddDevice(@"c:\SMOSample.bak", DeviceType.File);

I've created the Backup object and told it what kind of a backup it should expect to do (a Database backup as opposed to, say, a Log backup). I've also set what database it's going to be backing up and defined a device for it to use.

Note that, while here I defined a file device and path on the fly, you could just as easily connect to the server and query what devices are already defined on the server and then select one of those for your backup. Similarly, the device could be of a different type—such as a tape.

Now we're ready to execute the backup. We have two different methods available for this:

 * SqlBackup: This is a synchronous backup—your code will not gain control again until the backup either completes or errors out.
 * SqlBackupAsync: This tells the server to start the backup and then returns control to your application as soon as the server accepts the backup request as being valid (the backup will then run in the background). It's important to note that you do have the ability to receive notifications as the backup reaches completion points (you can define the granularity of those completion points), as sketched below.

I've chosen the asynchronous backup method in my example.
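I haven't wired up those notifications in this example, but a rough sketch using the Backup object's PercentComplete and Complete events might look something like this (placed before the backup call itself):

// Hypothetical progress wiring: report every 10 percent
bkp.PercentCompleteNotification = 10;
bkp.PercentComplete += (s, args) =>
    Console.WriteLine("Backup " + args.Percent + "% complete");
bkp.Complete += (s, args) =>
    Console.WriteLine("Backup complete");

With or without that wiring in place, actually kicking off the backup looks like this: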
// Actually start the backup. Note that I've said to do this Asynchronously.
// I could easily have made it synchronous by choosing SqlBackup instead.
// Also note that I'm telling it to initialize (overwrite the old if it's
// there). Without the initialize, it would append onto the existing file if found.
bkp.Initialize = true;
bkp.SqlBackupAsync(svr);

cn.Disconnect();
}

After you've run this, go take a look for the SMOSample.bak file in the root of your C: drive and it should be there! Also try running the backup multiple times and notice that it is overwritten each time. If we removed the bkp.Initialize command, then each new backup would append to the existing file.

Scripting

Perhaps one of the most compelling abilities that SMO offers the true developer crowd is the ability to script out objects that are already in the database. Indeed, SMO can script out backups, reverse engineer tables, and even record the statements being sent to the server.

For our example, we're going to reverse engineer a script for the HumanResources.Employee table in the AdventureWorks database. We'll see just how easily even a relatively complex table definition can be scripted out for other use.

We start with the same server, connection, and database reference code we've used several times in this chapter:

private void btnScript_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Now define the database we want to reference the table from.
Database db = svr.Databases["AdventureWorks2008"];

Next, we set a reference to the table that we want to script out—we could just as easily be scripting out a different type of SQL Server object such as a stored procedure, a view, or even a database. Indeed, it can even be a server-level object such as a device or login.

// Get a reference to the table. Notice that schema is actually the *2nd*
// parameter, not the first.
Table Employee = db.Tables["Employee", "HumanResources"];

We're then ready to call the Script() method. The only real trick here is to realize that it returns not just a single string but rather a collection of strings. In order to receive this, we'll need to set up a variable of the proper StringCollection type, which is not defined in any of our using declarations; we will, therefore, need to fully qualify that variable declaration.

// Call the Script method. The issue with this is that it returns a string
// *collection* rather than a string. We'll enumerate it into a string
// shortly.
System.Collections.Specialized.StringCollection script = Employee.Script();

Okay, so we've received our script, but now we want to take a look. I'll define a holding variable and copy all of the separate strings into just one string to use in a MessageBox:

string MyScript = "";
foreach (string s in script)
{
MyScript = MyScript + s + "\r\n";
}

// Now show what we got out of it - very cool stuff.
MessageBox.Show(MyScript);

cn.Disconnect();
}

Execute this, and you get a very usable script returned, as shown in Figure 23.2.

Figure 23.2
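One more note on scripting: if you want to control what makes it into the script, Script() also accepts a ScriptingOptions object. The option properties below do exist in SMO, though treat the exact combination as an illustrative sketch of my own:

// Script the table along with its indexes and DRI (keys, checks, defaults)
ScriptingOptions options = new ScriptingOptions();
options.Indexes = true;
options.DriAll = true;
options.SchemaQualify = true;

System.Collections.Specialized.StringCollection fullScript = Employee.Script(options);

Pulling It All Together

Okay, we looked at the code in fragments, so I wanted to provide something of a reference section to show what all my code looked like when pulled together.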
How you choose to do your form is up to you, but mine looks like Figure 23.3. Which buttons are which should be self-evident from the button names you'll see in the code. The very bottom box is a text box that I called txtResult in the code.

Figure 23.3

Following is my entire form code:

using System;
using System.Text;
using System.Windows.Forms;
using Microsoft.SqlServer.Management.Smo;
using Microsoft.SqlServer.Management.Common;
using Microsoft.SqlServer.Management.Smo.SqlEnum;

namespace SQLSMOSample
{
public partial class frmMain : Form
{
public frmMain()
{
InitializeComponent();
}

private void btnBackupDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Create and define backup object
Backup bkp = new Backup();
bkp.Action = BackupActionType.Database;
bkp.Database = "AdventureWorks2008";
bkp.Devices.AddDevice(@"c:\SMOSample.bak", DeviceType.File);

// Actually start the backup. Note that I've said to do this Asynchronously.
// I could easily have made it synchronous by choosing SqlBackup instead.
// Also note that I'm telling it to initialize (overwrite the old if it's there).
// Without the initialize, it would append onto the existing file if found.
bkp.Initialize = true;
bkp.SqlBackupAsync(svr);

cn.Disconnect();
}

private void btnCreateDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = new Database();
db.Parent = svr;
db.Name = "SMODatabase";
db.Create();

txtResult.Text = "Database Created";
cn.Disconnect();
}

private void btnScript_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Now define the database we want to reference the table from.
Database db = svr.Databases["AdventureWorks2008"];

// Get a reference to the table. Notice that schema is actually the *2nd* parameter,
// not the first.
Table Employee = db.Tables["Employee", "HumanResources"];

// Call the Script method. The issue with this is that it returns a string
// *collection* rather than a string. We'll enumerate it into a string shortly.
System.Collections.Specialized.StringCollection script = Employee.Script();

string MyScript = "";
foreach (string s in script)
{
MyScript = MyScript + s + "\r\n";
}

// Now show what we got out of it - very cool stuff.
//MessageBox.Show(MyScript);
this.txtResult.Text = MyScript;

cn.Disconnect();
}

private void btnDropDB_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

Database db = svr.Databases["SMODatabase"];
db.Drop();

txtResult.Text = "Database Dropped";
cn.Disconnect();
}

private void btnCreateTables_Click(object sender, EventArgs e)
{
// Create the server and connect to it.
ServerConnection cn = new ServerConnection();
cn.LoginSecure = true;
Server svr = new Server(cn);
svr.ConnectionContext.Connect();

// Get a reference to our test SMO Database
Database db = svr.Databases["SMODatabase"];

// Create Table object, and begin defining said table
Table ParentTable = new Table(db, "ParentTable");

// Build up the table definition
Column ParentKey = new Column(ParentTable, "ParentKey");
ParentKey.DataType = DataType.Int;
ParentKey.Nullable = false;
ParentKey.Identity = true;

Column ParentDescription = new Column(ParentTable, "ParentDescription");
ParentDescription.DataType = DataType.NVarCharMax;
ParentDescription.Nullable = false;

// Now actually add them to the table definition
ParentTable.Columns.Add(ParentKey);
ParentTable.Columns.Add(ParentDescription);

// Add a Primary Key
Index PKParentKey = new Index(ParentTable, "PKParentKey");
PKParentKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKParentKey.IndexedColumns.Add(new IndexedColumn(PKParentKey, "ParentKey"));
ParentTable.Indexes.Add(PKParentKey);

ParentTable.Create();

// Create Table object for child, and begin defining said table
Table ChildTable = new Table(db, "ChildTable");

// Build up the Child table definition
Column ChildParentKey = new Column(ChildTable, "ParentKey");
ChildParentKey.DataType = DataType.Int;
ChildParentKey.Nullable = false;

Column ChildKey = new Column(ChildTable, "ChildKey");
ChildKey.DataType = DataType.Int;
ChildKey.Nullable = false;

Column ChildDescription = new Column(ChildTable, "ChildDescription");
ChildDescription.DataType = DataType.NVarCharMax;
ChildDescription.Nullable = false;

// Now actually add them to the table definition
ChildTable.Columns.Add(ChildParentKey);
ChildTable.Columns.Add(ChildKey);
ChildTable.Columns.Add(ChildDescription);

// Add a Primary Key that is a composite key
Index PKChildKey = new Index(ChildTable, "PKChildKey");
PKChildKey.IndexKeyType = IndexKeyType.DriPrimaryKey;
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ParentKey"));
PKChildKey.IndexedColumns.Add(new IndexedColumn(PKChildKey, "ChildKey"));
ChildTable.Indexes.Add(PKChildKey);

// Add a Foreign Key
ForeignKey FKParent = new ForeignKey(ChildTable, "FKParent");

// The first "ParentKey" in the definition below is the name in the current table
// The second is the name (of just the column) in the referenced table.
ForeignKeyColumn FKParentParentKey = new ForeignKeyColumn(FKParent,
"ParentKey", "ParentKey");
FKParent.Columns.Add(FKParentParentKey);

FKParent.ReferencedTable = "ParentTable";
// I could have also set a specific schema, but since the table was created
// using just a default schema, I'm leaving the table reference to the default
// also. Both would be resolved using whatever the user's default schema is.

/*
** Note that there are several other properties we could define here
** such as CASCADE actions. We're going to keep it simple for now.
*/

ChildTable.ForeignKeys.Add(FKParent);
ChildTable.Create();

cn.Disconnect();
txtResult.Text = "Tables Created";
}

private void frmMain_Load(object sender, EventArgs e)
{
}
}
}

Summary

Well, all I can say is "Wow!" Okay, so, in a way, this is nothing all that new—after all, DMO used to do a lot of this stuff (indeed, most everything we've looked at here with actual code). SMO has, however, made things simpler.
The "Wow!" is about thinking of the possibilities: + + * Imagine issuing commands asynchronously. + * Imagine still being able to monitor the progress of those commands by receiving events as progress continues. + * Imagine being able to generate script code to support most anything you might want to do. + * Imagine being able to register event handlers on your SQL Server and being notified when custom events occur on the server. + +The list goes on and on. + +Most of the concepts in this chapter are nothing new. We've already looked at ways to create tables, as well as create, back up, and drop databases. The power, then, is in how discretely you can manage those tasks using SMO. We have the prospect for very robust event and error handling. We can far more easily receive configuration information about objects already in the server in a form that yields separate properties as opposed to trying to parse those values out of system-stored procedures. + +This chapter truly just scratches the surface of what you can do. If I've piqued your interest at all, I encourage you to consider the use of SMO in your design work, and, of course, go get a book specific to SMO if you need one (you probably will!). +24 + +Data Warehousing + +Well, while it may seem that we've already roamed all over the realm of SQL Server, we have, up to this point, been working safely within the type of databases that are the most common, and that most database developers are the most comfortable with: The Online Transaction Processing—or OLTP—database. + +This chapter, however, will turn things somewhat upside down (in terms of the traditional "rules" that determine how we do things). When, for example, we talked about design earlier in this book or in my Beginning title, we were talking mostly in terms of a normalized database. In this chapter, we'll be largely tossing that out the window. Instead of the transaction-oriented databases we've focused on up to this point, we're going to focus on databases and models that are oriented around the notion of data analysis. We will, for now, focus primarily on data warehousing and the special needs relating to its storage ramifications and reporting in data warehousing situations. We'll explore a new sea of terms that you may not have heard before—the lingo of data warehousing and analytics—the language of Business Intelligence (often referred to simply as BI). We'll also explore the world of multidimensional modeling by taking a quick look at yet another service included with SQL Server—Analysis Services. + +In this chapter we will: + + * Discuss the differences between the needs of transaction processing versus analysis processing + * Discuss how these differences necessarily lead to substantially different solutions + * Explore the problems with the idea of using your OLTP solution as your OLAP solution + * Define the concept of a data cube, and indicate how they can help provide a solution to the special requirements of an analytics environment + * Look at some other aspects of Analysis Services that come as part of SQL Server 2008 + +Considering Differing Requirements + +As corporations build increasingly complex applications and store their daily data in the databases that support those applications, the databases grow in size. As the size of each database increases, there are typically negative impacts on the system performance of the applications that utilize it. 
Left unchecked, databases can grow to sizes that seriously impact response times, increase contention (conflict between users trying to get at the same data), or even cause the entire system to go offline.

End users may use data sources differently from one another. From a "how they use it" perspective, users fall into four significant categories:

 * Those who want to access the data sources on a daily basis, retrieving certain records, adding new records, updating, or deleting existing records
 * Those who want to make sense of the enormous amounts of data piling up in the database, generating reports that will help them come up with the right decisions for the corporation and give it the competitive edge that will make it succeed in the marketplace
 * Those who want to take the knowledge they gained from their analytical or transactional systems a step further by predicting business performance and analyzing trends for the future
 * Those who want to make use of highly focused "at a glance" information to obtain fast indications of where they should focus their time

The separate OLTP and OLAP systems help satisfy the different requirements of the first two categories of users. Data mining and cube analysis (through pivot tables and other "What if?" analysis) help satisfy the requirements of the third category. The final item listed tends to be served by targeted screens—or "Dashboards"—that are typically presented when someone first logs in to their system. The following sections present the characteristics of these systems and technologies, and how and when each of them can be used.

Online Transaction Processing (OLTP)

As previously mentioned, the OLTP systems we have focused on until now are designed to allow for high concurrency, making it possible for many users to access the same data source and conduct the processing they need. They also tend to be the "system of authority" for most data, so they place an exceptionally high value on data integrity. In addition, they tend to store data at the detail level, so they implement strategies that minimize the amount of space required to store the data.

As the "transaction processing" in the name implies, OLTP systems are oriented around the idea of transaction processing against the database. Transactions further imply controlled changes to the data in the tables, due to inserts, updates, and deletes during the operation of your business. Typically, an OLTP system will have numerous client applications accessing the database to address small pieces of information in a variety of ways (inserts, updates, deletes—virtually anything).

Examples of OLTP systems include data-entry programs such as banking, ticket reservation, online sales, and inventory management systems (such as AdventureWorks2008), but, no matter what the application is, OLTP systems are usually built with the following objectives in mind:

 * Process data generated by transactions
 * Maintain a high degree of accuracy by eliminating data redundancy
 * Ensure data and information integrity
 * Produce timely (generally "real time") documents and reports, such as receipts and invoices
 * Increase work efficiency

In focusing on these particular objectives, the design of the database is usually in the third normal form we discussed back in Chapter 5, eliminating redundancy and maximizing the power of relationships between tables.
Online Analytical Processing (OLAP)

Online Analytical Processing (or OLAP) systems fall under the broader scope of Decision Support Systems (DSS), or, as is becoming more popular these days, Business Intelligence (BI). The goal of BI systems is to analyze huge amounts of data, generating summaries and aggregations in many different ways, ranging from daily, weekly, quarterly, and annual reports to highly focused scorecards and dashboards typically aimed at very specific users who are prepared to act on that data to gain a competitive edge.

With OLAP and BI, we generally forget about keeping our data normalized. Instead, we deliberately de-normalize the database (or flatten it) to some extent, allowing some redundancy to avoid joins and focus performance specifically on data retrieval rather than modification. Why is this okay in a data warehouse? Well, once the data arrives in the data warehouse, it is rarely changed. The data is kept there for query purposes: to generate reports that help decision makers plan the future of their enterprise. Since it is usually viewed as history by the time it arrives in a data warehouse environment, we don't need to concern ourselves with inserts, updates, or deletes. Instead of a highly normalized, transactional database, we wind up with what is usually called a dimensional database that follows a specific structure or schema. Dimensional databases can be used to build data cubes, which are multidimensional representations of the data that facilitate online business analysis and query performance. The dimensions of a cube represent distinct categories for analyzing business data. The dimensions found in a typical cube will almost always include time, and will usually also include geography and something akin to a product line. From there, the possibilities are endless, depending on the specific characteristics of your organization.

Just because it is called a "cube," don't allow yourself to fall into the trap of considering it limited to three dimensions. Cubes allow for queries in n dimensions. The "cube" representation is merely meant to get across that we are beyond the typical tabular representation seen in OLTP systems.

A Brief Word on Data Mining

Traditional querying techniques, such as the queries we've largely focused on in this book and queries into a data warehouse, help you find information from your data that is based on relationships you likely already know. (Heck, they are probably declared in your transactional system.) For instance, you can use queries or even a cube to find the number of customers who bought a certain product in a certain period of time per state or city. The information you are seeking is already in your database, and the query to retrieve it is usually based on a question you know intuitively.

Data mining, on the other hand, shows its power by helping you discover hidden relationships in your data. You might use it for discovering new trends, speculating on causes for certain events, or even forecasting the performance or direction of certain aspects of your data. For example, data mining might help you find out why a certain product is selling more than another product in a certain region. Data mining makes use of algorithms that bring non-intuitive relationships to our attention. For example, data mining done many years ago discovered that people who bought beer were more inclined to also purchase cheese.
Retailers picked up on this, and, for a time, it wasn't uncommon to see cheese located very near the beer aisle to facilitate and encourage the sales of those products as a pair rather than just a single sale at a time.

SQL Server 2008 continues SQL Server's strong support for data mining. The complexities of data mining are, however, well beyond the scope of this book. I did, however, want to make sure you were aware of its availability should you get comfortable enough with analytics to explore data mining.

OLTP or OLAP?

Now that you have seen the general ideas behind the two systems of OLAP and OLTP, let's consider the banking business, for example. During the bank's working hours, bank tellers help customers perform transactions, like depositing funds into their accounts, transferring funds between accounts, and withdrawing funds from these accounts. The customers may also conduct their own transactions using an ATM (Automated Teller Machine), or a phone-based and/or computer-based banking service. In other words, such transactions are not limited to a particular part of the day but can take place around the clock. All of these operations lead to changes in the data stored in the database. These changes could be inserting new records, or updating or deleting existing records.

OLTP is built to allow these transactions to be made by a large number of users accessing the database concurrently. Databases serving OLTP systems are usually highly normalized relational databases, and their table indexes need to be selected carefully for the right fields. OLTP databases should be built to balance performance away from reporting and toward a high frequency of transactions. Queries executed in OLTP systems include a significant mix of inserts, updates, deletes, and selects.

Let's now look at a different scenario with the banking example. Suppose that the bank managers are conducting future planning. They need to look at both current and historical performance data of the bank. If they were to query the database that is used for the OLTP system, they would likely run into significant contention issues with employees who are conducting the day-to-day business of the bank. The kinds of reporting and analysis that bank management is likely to be looking for are often long running, and they can put a significant load on the transactional system, as many tables are joined to relate a wide range of information, which is then formatted in a way that is meant to summarize and aggregate the data. For example, they might want to know the total amount of transactions conducted by all customers in a certain region. Such a query would have to sift through large amounts of data that is fragmented and scattered over many joined tables. For example, an accounting general ledger transaction could be stored in a dozen different tables. The queries will have to pull fields from these joined tables to build the views needed by the management, grouping and performing aggregations as they do so. Now imagine this process being repeated over and over again as multiple managers all ask the same general questions and look at the same data.

To face these challenges, it is necessary to isolate the managers who use existing bank data to build their future outlook and planning, and have them use a different system based on OLAP principles. This means creating two different systems: an OLTP system for transaction processing by bank staff and customers, and an OLAP system to help with the decision making.
Now we have two different systems; should these systems use the same database with separate tables for each system, or should they use two completely different databases? The answer to this question depends on how much effect one of the systems will have on the performance of the other, and on how the management and administration plans of these systems work. It is very likely that the two systems will be used at the same time. This causes performance problems even if the tables are separate. This is because the two systems still share many resources on the database server, and these resources may be depleted quickly with the two systems in use. These two systems are usually optimized differently. If we optimize for OLAP, we may adversely affect the performance of the OLTP system, and vice versa. Also, the two systems may have to be administered differently, with different user accounts, backup and maintenance strategies, and so on. Therefore, even though it is theoretically possible to tap into the same database, it is a good idea to keep separate databases on separate database servers for the two systems. With this, each system will have its own resources, and optimizing one will not affect the other.

Dimensional Databases

The solution to the problems inherent in requesting complex queries from OLTP systems is to build a separate database that represents the business facts more concisely. The structure of this database will not be relational; instead, it will be dimensional.

The Fact Table

The central table of a dimensional database is called the fact table. Its rows are known as facts, and the central theme of a fact table is a set of measures for some kind of distinct instance of an activity or event.

For example, the AdventureWorksDW2008 data warehouse sample includes a table called FactInternetSales and several related tables (shown in Figure 24.1). It focuses on individual sales—more specifically, on the key metrics for the sale at a line-item level. It holds a set of measures (usually numeric)—in this case Order Quantity, Unit Price, and Extended Amount, among other measures—and relates them to a set of appropriate dimensions. (In this case, product information, relevant dates, customer information, and other dimensions on which we may want to base our analysis.)

Figure 24.1

The Dimension Tables

Dimensions help put the facts in context and represent such things as time, product, customer, and location. The dimensions describe the data in the fact table. Continuing with our AdventureWorksDW2008 example, it would make sense to have date, customer, and product dimensions, among other things.

The fact table, FactInternetSales, captures transactions on a daily level for all customers and all products. Since it has a row for every line of detail, this table will likely grow to be very large (or, at least, we hope so, since that means we made many sales!). Since storing every piece of customer data for every sale would take up a prohibitive amount of space, we go ahead and break out the items that don't change with every instance of a measure. These tables we link to the fact table are called dimension tables. They are used to create something of a group by which to determine the level of aggregations from the fact table. For instance, we could find the total monthly sales of all products in all sales territories if we were to query the FactInternetSales table grouping by month of the year.
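In T-SQL terms, such a monthly rollup might look something like the following sketch against AdventureWorksDW2008 (it assumes the standard DimDate column names, such as DateKey, CalendarYear, and EnglishMonthName):

SELECT d.CalendarYear,
       d.EnglishMonthName,
       SUM(f.SalesAmount) AS TotalSales
FROM FactInternetSales f
JOIN DimDate d
  ON f.OrderDateKey = d.DateKey
GROUP BY d.CalendarYear, d.EnglishMonthName;

Each row the query returns corresponds to one cell of aggregation: a month's worth of sales across all products, customers, and territories.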
Alternatively, we could find the total sales by sales territory at all times, for all customers, and for all products if we queried the FactInternetSales table grouping on sales territory. We can also have aggregations on a combination of the dimensions in FactInternetSales. For example, we could find the total sales for a particular product model by sales territory on a monthly basis for a specific type of customer by grouping on sales territory and month and adding the appropriate criteria in the WHERE clause for the customer and product.

The Star and Snowflake Schemas

The database schema in Figure 24.1, where there is a single fact table with a number of dimension tables linked directly to it, is an example of a star schema. In a star schema, all objects likely to be involved in a query are no more than one join away from the fact table. You may also hear of a snowflake schema. In a snowflake schema, multiple tables may relate to a dimension that, in turn, is the one that relates directly to the fact table. A snowflake schema can be considered an extension of the star schema, providing a bit more normalization, but also requiring additional tables be joined to relate all the data.

Data Cubes

Until now, we have seen that data is moved from the transactional system into a data warehouse—most likely in the form of a star or snowflake schema. In a dimensional model such as we've described here, the database is frequently used as the basis for constructing what are known as cubes. To understand what cubes are, think of the data in the dimensional database as the transformed raw data for your analysis. In other words, if you look at the example in the previous section, you notice that the fact table includes the transaction information and pointers (foreign keys) to the dimensions we wish to analyze. The reports we generate based on the schema in Figure 24.1 are usually something like total sales for customers in a particular territory over a particular period of time for a specific product or category of products. To obtain such a result, you have to aggregate the values in the fact table based on the dimensions you are using to construct the needed report. SQL Server's Analysis Services allows you to pre-calculate such results and store them in a cube. Hence, the cube is a structure that stores the data aggregations from the dimensional database by combining all possible dimension values with the Internet sales facts in the fact table. With this, retrieving the final reports becomes much more efficient, since no complex queries are evaluated at runtime.

To visualize what a cube looks like, look at Figure 24.2. The dimensions of the cube represent the dimensions of the fact table. Each cell in the cube represents a fact corresponding to a level of detail for the different dimensions of the cube. Although the graphical representation of the cube can only show three dimensions, a data cube can have many more dimensions when using Analysis Services. The figure shows a representation of a data cube for the FactInternetSales table, with the territory, product category, and time dimensions shown.

Figure 24.2

If you want to use this cube to find the total sales in the Michigan territory during 2002 for the bicycles category, you need to look at the shaded cell in the figure, which is the resulting cell from the intersection of those three dimensions.

Analysis Services allows you to build your cube from any source of data that has an OLE DB provider.
This source can be a relational database in any database management system that has an ODBC driver (such as Oracle, DB2, or even MySQL) or a native OLE DB provider (such as SQL Server, Oracle, or MS Access). The data source for the cube can also be a dimensional database, text files, or even a Lightweight Directory Access Protocol (LDAP) data source.

Data Warehouse Concepts

Now that we have seen what cubes and dimensional databases are, let's define the larger concept of what a data warehouse is and how it might be built in SQL Server 2008.

A data warehouse is a data store that holds the data collected during the company's conduct of business over a long period of time. The data warehouse may be made up of one or more data marts (smaller collections of summary or dimensional data that are generally focused on a subset of the data warehouse as a whole). The data warehouse typically uses the OLTP systems that collect the data from everyday activities and transactions as its source. The data warehouse concept also includes the processes that extract, scrub (see "Data Scrubbing" later in the chapter), and transform the data, making it ready for the data warehouse. Finally, it also includes the tools needed by the business analysts to present and use the data. These tools include BI tools (such as pivot tables in Excel, or PerformancePoint Server), as well as data mining and reporting tools. Figure 24.3 depicts the conceptual structure and components of a data warehouse solution.

Figure 24.3

Data Warehouse Characteristics

A data warehouse is usually built to support decision making and analytics because it is designed with the following unique characteristics:

 * Consolidated and Consistent Data: In a data warehouse, data is collected from different sources and consolidated and made consistent in many ways, including the use of naming conventions, measurements, physical attributes, and semantics. This is important because business analysts accessing the data warehouse and using its data for their decision-making processes have to use consistent standards. For example, date formats may all follow one standard, showing day, month, quarter, and year. Data should be stored in the data warehouse in a single, acceptable format. This allows for the referencing, consolidating, and cross-referencing of data from numerous heterogeneous sources, such as legacy data on mainframes, data in spreadsheets, or even data from the Internet, giving the analysts a better understanding of the business.

I can't stress enough the need to treat your data consistently, including the name you use to refer to it. Make sure that you don't use the same name to refer to different things in your database. If, for example, you have more than one type of sales you're going to refer to, then require *every* instance of sales to be name qualified—for example, "bicycle sales" versus "apparel sales," with a separate name for "aggregate sales." I strongly suggest keeping a data "dictionary" that defines the meaning of each name you use and the source of that data.

 * Subject-oriented Data: The data warehouse organizes key business information from OLTP sources so that it is available for business analysis. In the process, it weeds out irrelevant data that might exist in the source data store. The organization takes place based on the subject of the data, separating customer information from product information, which may have been intermingled in the source data store.
 * Historical Data: Unlike OLTP systems, the data warehouse represents historical data. In other words, when you query the data warehouse, you use data that was collected via the OLTP system in the past. The historical data can cover a long period of time, compared to the OLTP system, which contains current data that, for the most part, accurately describes the system right now.
 * Read-only Data: After data has been moved to the data warehouse, you may not be able to change it unless the data was incorrect in the first place. The data in the data warehouse cannot be updated because it represents historical data, which cannot be changed. Deletes, inserts, and updates (other than those involved in the data-loading process) are not applicable in a data warehouse. The only operations that occur in a data warehouse once it has been set up are loading of additional data and querying.

Data Marts

You may find out, after building your data warehouse, that many people in your organization access only certain portions of the data in the data warehouse. For instance, the sales managers may access only data relevant to their departments. Alternatively, they may access only data for the last year. In this case, it would be inefficient to have these people query the whole data warehouse to get their reports. Instead, it would be wise to partition the data warehouse into smaller units, called data marts, based on the users' business needs.

In addition, some people in your organization may want to be able to access the data in the data warehouse in remote areas far from the company buildings. For instance, a sales manager may want to access data about products and sales particular to his or her market area while on a sales venture. People such as this would benefit from a data mart, as they would be able to carry a section of the data warehouse on their laptop computers, allowing them to access the data they need at any time.

As often as not, this process actually works backwards, with a smaller data mart serving as the beginning of a larger data warehouse. Indeed, many enterprise data warehouses in use today were created through a process of unifying multiple disparate data marts under one data dictionary and consistent definition before providing additional data aggregation and rollup that takes data from all the various data marts.

Of course, with data marts, the data should be kept in synch with the data warehouse at all times. This can be done in a variety of ways, such as using SQL Server Integration Services, scripting (in T-SQL or other languages), or full-blown data management tools.

SQL Server Integration Services

We already looked at Integration Services extensively back in Chapter 16, but given its consistent use in association with data warehousing, it's worth mentioning again here.

Many organizations need to centralize data to improve decision making. The data being centralized is often stored in a large variety of formats and comes from a number of different sources. The raw data that exists in these sources has to be reconciled and transformed in many cases before it can be stored in the data warehouse. SSIS is a fabulous tool for performing this task, providing a means to move data from the source to the destination data warehouse while validating, cleaning up, consolidating, and transforming the data as needed.

Data Validation

Conducting data validation before the data is transferred to the destination data warehouse is extremely important.
If the data is not valid, the integrity of the business analysis conducted with it will be in question. For example, if one of the fields is a currency field, and the OLTP data sources exist in multiple countries around the globe, the data in this currency field must always be transferred in the currency of the destination data warehouse, and the values must always be properly adjusted for exchange rates as of the time the transaction took place (not just the rate that was current when the transfer took place).

Data Scrubbing

Often the degree or nature of "clean up" required is such that it can't be performed directly during the transformation process. You may, for example, need to reconcile data between multiple sources feeding the same data warehouse. The process of reconciling multiple data sources and applying other consistency rules to your data is referred to as data scrubbing. For example, if a bicycle is classified in one source as the mountain bike category, and in another source as the recreational category, aggregations in the data warehouse involving this category will yield inaccurate results unless the two data sources have been reconciled during the data transformation process.

Data scrubbing can be achieved in different ways. These methods are beyond the scope of this book, but are mentioned briefly here:

 * Using SSIS to modify data as it is copied from the source to the destination data store
 * Use of T-SQL scripts applied to a temporary "scrubbing" database or set of tables (see the sketch that follows this list)
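As a minimal, hypothetical sketch of the T-SQL approach (the staging table and category names here are invented purely for illustration), reconciling our bicycle example might look like this:

-- Hypothetical staging table holding data pulled from multiple sources
CREATE TABLE #StagingSales
(
    ProductName  nvarchar(50),
    CategoryName nvarchar(50)
);

INSERT #StagingSales VALUES ('Mountain-100', 'Recreational'); -- the second source's label

-- Reconcile the second source's label to the warehouse's standard category
UPDATE #StagingSales
SET    CategoryName = 'Mountain Bike'
WHERE  CategoryName = 'Recreational';

The real work, of course, is in deciding the mapping rules; the T-SQL itself is usually the easy part.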
Creating an Analysis Services Solution

In this section, we're going to take a quick look at what cubes are all about, and how to create them. Then we move on to a quick example of how to use them. This is going to be a simple walk-through meant to let you get a quick taste of what's possible. If, after you get done with this taste of Analysis Services, you want more, I would suggest picking up an Analysis Services book along with books on data warehousing, business intelligence, and dimensional modeling.

It really is important to realize that, just because you're a great database developer, you are not automatically a great developer of data warehouses or business intelligence systems. The way of thinking required to create a great decision support system is very different from that required to build a great transactional processing system. History is littered with dead projects created when a seemingly experienced database developer assured management that he or she knew all about data warehousing and analytics. Make sure you know what you're getting into before you make such a commitment.

The example shown in the remainder of this chapter requires the AdventureWorksDW2008 database.

Start by firing up the Business Intelligence Development Studio (a.k.a. BIDS). We discussed it in earlier chapters that used BIDS, but, again, it is just a special version of Visual Studio 2008 that is included with SQL Server. Go ahead and select New Project. What you see will vary somewhat depending on whether you have Visual Studio 2008 installed separately from SQL Server and, if so, what edition of Visual Studio you have.

In any case, you should wind up with a dialog that looks something like Figure 24.4. The exact set of project types may vary from mine somewhat (again, depending on what edition of Visual Studio you're working with). I have already selected Business Intelligence Projects and, more specifically, the Analysis Services template.

Figure 24.4

After you select a name for your project (I've chosen the oh-so-descriptive "AnalysisServicesProject"), click OK to create the project. Visual Studio will give you an empty Analysis Services project, but notice the various folders created for you. While we won't work with every one of these in this book, it does give you a feel for how broad the work on an Analysis Services project can be.

Let's move right along and create a new data source. To do this, simply right-click the Data Sources folder and select New Data Source, as shown in Figure 24.5.

Figure 24.5

This should give you a Welcome dialog (unless you've had it up before and selected the "Don't show this page again" option). Click Next to get to a dialog that allows you to choose a method for defining the data source. Stick with the default of "Create a data source based on an existing or new connection" and then click New to bring up the dialog shown in Figure 24.6.

Figure 24.6

I have already filled in several key fields to fit my particular need. (You may want to choose a remote server or to use Windows Authentication.) Click OK to create the data source and go back to the previous dialog. (The new data source should now show up in the Data Connections list.) Click Next to move on to the Impersonation Information dialog shown in Figure 24.7. We can utilize one of four security options here to determine what credentials Analysis Services will pass when it needs to connect to the data source we're defining. I've told it to use the service account, which equates to whatever Windows account Analysis Services is running under. (So, if you use this option, make sure that account has rights to your source data.)

Figure 24.7

Clicking Next should take you to the Completing the Wizard dialog, where you can name your data source and click Finish.

Next, right-click the Data Source Views folder and choose New Data Source View. This should bring up the dialog shown in Figure 24.8. As you can see, the data source we created a few moments ago is listed and chosen by default. (It also gives us a shortcut to create a new data source if we so choose.) Click Next to select the tables and views you want to work with. I've selected all the tables we saw in our star schema example earlier in the chapter, as shown in Figure 24.9.

Figure 24.8

Figure 24.9

Again click Next to get the Completing the Wizard dialog. Choose a name (I'm going with the default of Adventure Works 2008) and click Finish. This time we get a more dramatic result, as our main project window (shown in Figure 24.10) opens up with a view designer for our new data source view.

Figure 24.10

Notice that it has figured out that our tables are related, and has even mapped the visual into a decent representation of the "star" idea.

We're going to briefly skip down to the Dimensions folder. Again, right-click and select New Dimension. Click Next to go past the Welcome dialog and get to the Select Creation Method dialog shown in Figure 24.11.

Figure 24.11

Notice that there are utilities here for producing a time dimension table if we needed one (AdventureWorksDW2008 comes with one already). Keep the default and again click Next to see the Specify Source Information dialog shown in Figure 24.12. I've left it at the default table of DimCurrency (this was chosen alphabetically). It has chosen the correct column as the key, so we'll again click Next to get to the Dimension Attributes dialog shown in Figure 24.13.
Note that I've added Currency Name as an attribute.

Figure 24.12

Figure 24.13

Again click Next for the Completing the Wizard dialog. Change the name to Currency, and click Finish to complete the wizard and create the dimension.

Now repeat the New Dimension process for the rest of the dimension tables in our data source view (all of the tables that start with Dim), selecting all attributes for each dimension. You should wind up with a Dimensions node in the Solution Explorer that looks like Figure 24.14.

Figure 24.14

Okay, so our dimensions are created, but we're not quite ready to build a cube yet. The issue we need to take care of first is the construction of a time dimension. "But wait!" you say, "We already have a time dimension." If you said that, you would be correct. There is, however, a small problem: SQL Server doesn't know that it's a time dimension. To fix this, select the Date.dim entry (if you didn't rename it as you created it, it would still be called DimDate.dim) under Dimensions, then look at the Attributes list on the left as shown in Figure 24.15.

Figure 24.15

Right-click the Date node and select Properties. In the Properties pane, scroll down to the Basic section and notice the entry for Type. We need to change that to Time as shown in Figure 24.16.

Figure 24.16

With all this created, we're ready to build our cube. Simply right-click the project and select Deploy (you could also choose to limit things to a build), as shown in Figure 24.17.

Figure 24.17

This should get us a fully realized cube. From there, we just need to take a quick look at what exactly a cube gives us.

Accessing a Cube

So, given the example we just created, we're ready to actually make use of our cube. We can do this in several ways:

 * Microsoft Excel (if you connect, as we will momentarily, you'll automatically get a pivot table)
 * Direct connection and query using Multidimensional Expressions, or MDX (the Analysis Services equivalent of T-SQL)
 * Other tools that are analytics centric, such as PerformancePoint Server

As a quick example, we're going to connect to the cube we just built using a pivot table in Excel 2007. Excel has a rich set of functionality for asking "What if?" questions, and the pivot table and pivot chart features integrate fairly easily with Analysis Services cubes.

Let's check this out by firing up Excel 2007 and navigating to the Data ribbon as shown in Figure 24.18. Note that I've clicked the From Other Sources tab and selected the From Analysis Services option. (This is built in, and requires no special configuration!) This will bring up the Data Connection Wizard dialog, which is very similar to many other connection dialogs we've seen throughout this book. Enter the name of your server (or simply (local) if the cube is on the same server on which you are running Excel) and click Next to move on to the Select Database and Table dialog shown in Figure 24.19. Now go ahead and click Finish to bring up the Import Data dialog shown in Figure 24.20. This allows us to position where the data goes on our sheet and to confirm what we want to do with the data (in this case, create a pivot table). Go ahead and click OK here to accept the defaults.

Figure 24.18

Figure 24.19

Figure 24.20

If you're new to pivot tables, the sheet that appears (shown in Figure 24.21) may seem a bit anticlimactic. After all, there are no numbers and no real report. Looks, however, are deceiving.
The secret to the horsepower in Excel pivot tables is found in the panes along the right-hand side of the workbook, as we see in Figure 24.21.

Figure 24.21

At first it appears you have no report. However, the template makes it easy for you to manipulate the kind of information you want on your report and, more importantly, explore the results. To check this out, let's manipulate the data a bit. You can do this by dragging fields you are interested in from the PivotTable Field List into the areas listed below it (as shown in Figure 24.22). As you do this, notice the effect that dropping a field in each box has on the main PivotTable area.

Be careful as you click around in the main sheet area. If you click outside of the PivotTable, all the PivotTable fields will vanish. If this happens, just click in the area of the PivotTable and they should all reappear.

Figure 24.22

I'm going to leave full exploration of what PivotTables can do to a book on Excel, but hopefully you've gotten a taste of just how easily an Analysis Services cube can enable you to explore your data. Keep in mind that this was just an example of one easy way to connect to your data. You can also issue complex queries against the cube using MDX. Such queries can compare multiple dimensions and allow for special functions for such things as comparing year-over-year results. What's more, the data coming out of the cube is highly optimized for just this kind of comparison.

Summary

What we covered in this chapter was not really meant to make you an expert in data warehousing, Analysis Services, or business intelligence. Instead, the idea is to give you a concept of what is involved in creating cubes and perhaps a little taste of what they might do for you. I can't stress enough just how surface-level our coverage of the product was. Analysis Services is a full book unto itself. Hopefully the information covered here has given you enough of a feel for Analysis Services to know whether you want to pursue it further.

25

Being Well Connected

Having a SQL Server but not allowing programs to connect to it is almost the same as not having a SQL Server at all. Sure, we may log in to Management Studio and write queries, but the reality is that the vast majority of our users out there never actually see the database directly. They are just using input and reporting screens in some system we've written. (OK, in today's massively multiplayer online world, and with other large applications out there, they could be on some other highly scalable system too, but not too many of us are going to work on one of those.)

With this in mind, it probably makes sense to figure out how your application is actually going to talk to the database. There are tons of books out there that cover this topic directly (and, outside of a basic connection, it really is a huge topic unto itself), so I'm not even going to attempt to discuss every fine point of every access model in every language. Instead, we're going to explore basic concepts and some fundamental issues of performance, memory use, and general best practices. As I've done with some of the other broad topics we've spent time on in this book, the idea is to get you some fundamentals in a quick but useful way and give you something of a taste of what's involved and what kinds of questions you should be asking.

So, having tempted you with a teaser, it's time for what may seem like bad news (but it's not, and we'll get to why in a moment).
This particular chapter is a "Web release only" chapter, which is a long-winded way of saying, "You need to go download it off the Web." You can fetch it from either the p2p.wrox.com support site or my personal site at www.professionalsql.com. Why did we do that? Well, it's a multifold thing. Some of it was, I'll admit, time constraints on the book. There is, however, another reason—timeliness. Connectivity has been one of the most rapidly changing areas of database work over the last decade or so. As we'll discuss to some extent in the downloaded copy of this chapter, history is littered with various access models that have come and gone. (Heck, there are quite a few still in use.) As I write this, the .NET world is largely using ADO.NET and LINQ. Up and coming, however, is Microsoft's whole Entity Framework initiative—and who knows what else by the time Kilimanjaro (the code name for the next version of SQL Server) is out and we publish another book. Going to a Web release makes it far more realistic that we can update this chapter if there are enough changes to warrant it. (While we still need to go through all the editing, we don't have to typeset or deal with page numbers.)

Once downloaded, you'll find information such as:

 * Various data access object models past and present (a little history)
 * Some basic best practices for data access
 * Some brief examples of connecting to your database in .NET

A

System Functions

SQL Server includes a number of "System Functions" as well as more typical functions with the product. Some of these are used often and are fairly clear right from the beginning in terms of how to use them. Others, though, are both rarer in use and more cryptic in nature.

In this appendix, we'll try to clarify the use of most of these functions in a short, concise manner.

Just as an FYI, in prior releases, many system functions were often referred to as "Global Variables." This was a misnomer, and Microsoft has striven to fix it over the last few releases—changing the documentation to refer to them by the more proper "System Function" name. Just keep the old terminology in mind in case any old fogies (such as myself) find themselves referring to them as Globals.

The T-SQL functions available in SQL Server 2008 fall into 14 categories:

 * Legacy "system" functions
 * Aggregate functions
 * Configuration functions
 * Cryptographic functions
 * Cursor functions
 * Date and time functions
 * Mathematical functions
 * Metadata functions
 * Ranking functions
 * Rowset functions
 * Security functions
 * String functions
 * System functions
 * Text and image functions

In addition, we have the OVER operator, which largely works as a ranking tool and can be applied to other forms of T-SQL functions (most notably aggregates). While I only discuss it as part of the ranking functions, you may see it referenced in several other places in this appendix.

Legacy System Functions (a.k.a. Global Variables)

@@CONNECTIONS

Returns the number of connections attempted since the last time your SQL Server was started.

This one is the total of all connection attempts made since the last time your SQL Server was started. The key thing to remember here is that we are talking about attempts, not actual connections, and that we are talking about connections as opposed to users.

Every attempt made to create a connection increments this counter regardless of whether or not that connection was successful.
The only catch with this is that the connection attempt has to have made it as far as the server. If the connection failed because of NetLib differences or some other network issue, then your SQL Server wouldn't even know that it needed to increase the count; it only counts if the server saw the connection attempt. Whether the attempt succeeded or failed does not matter.

It's also important to understand that we're talking about connections rather than login attempts. Depending on your application, you may create several connections to your server, but you'll probably only ask the user for login information once. Indeed, even Query Analyzer does this. When you click for a new window, it automatically creates another connection based on the same login information.

This, like a number of other system functions, is often better served by a system stored procedure, sp_monitor. This procedure produces, in one command, information ranging from the number of connections and CPU busy time through to the total number of writes by SQL Server. So, if basic information is what you're after, sp_monitor may be better. If you need discrete data that you can manipulate, then @@CONNECTIONS provides a nice, neat, scalar piece of data.

@@CPU_BUSY

Returns the time in milliseconds that the CPU has been actively doing work since SQL Server was last started. This number is based on the resolution of the system timer, which can vary, and can therefore vary in accuracy.

This is another of the "since the server started" kind of functions. This means that you can't always count on the number going up as your application runs. It's possible, based on this number, to figure out a percentage of the CPU that your SQL Server is taking up. Realistically though, I'd rather tap right into the Performance Monitor for that if I had some dire need for it. The bottom line is that this is one of those really cool things from a "gee, isn't it swell to know that" point of view, but it doesn't have all that many practical uses in most applications.

@@IDLE

Returns the time in milliseconds (based on the resolution of the system timer) that SQL Server has been idle since it was last started.

You can think of this one as being something of the inverse of @@CPU_BUSY. Essentially, it tells you how much time your SQL Server has spent doing nothing. If anyone finds a programmatic use for this one, send me an e-mail (robv@professionalsql.com). I'd love to hear about it (I can't think of one).

@@IO_BUSY

Returns the time in milliseconds (based on the resolution of the system timer) that SQL Server has spent doing input and output operations since it was last started. This value is reset every time SQL Server is started.

This one doesn't really have any rocket science to it, and it is another one of those that I find falls into the "no real programmatic use" category.

@@PACK_RECEIVED and @@PACK_SENT

Respectively return the number of packets read from and written to the network by SQL Server since it was last started.

Primarily, these are network troubleshooting tools.

@@PACKET_ERRORS

Returns the number of network packet errors that have occurred on connections to your SQL Server since the last time the SQL Server was started.

Primarily a network troubleshooting tool.

@@TIMETICKS

Returns the number of microseconds per tick. This varies by machine and is another of those that falls under the category of "no real programmatic use."
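If you just want to eyeball several of these counters at once, a quick query such as the following does the trick (a minimal sketch; the column aliases are mine), and EXEC sp_monitor presents much of the same information in a preformatted report:

SELECT @@CONNECTIONS   AS ConnectionAttempts,
       @@CPU_BUSY      AS CpuBusy,
       @@IDLE          AS IdleTime,
       @@IO_BUSY       AS IoBusy,
       @@PACK_RECEIVED AS PacketsReceived,
       @@PACK_SENT     AS PacketsSent;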
@@TOTAL_ERRORS

Returns the number of disk read/write errors encountered by the SQL Server since it was last started.

Don't confuse this with runtime errors or as having any relation to @@ERROR. This is about problems with physical I/O. This one is another of those of the "no real programmatic use" variety. The primary use here would be more along the lines of system diagnostic scripts. Generally speaking, I would use the Windows Reliability and Performance Monitor for this instead.

@@TOTAL_READ and @@TOTAL_WRITE

Respectively return the total number of disk reads/writes by SQL Server since it was last started.

The names here are a little misleading, as these do not include any reads from cache. They are only physical I/O.

@@TRANCOUNT

Returns the number of active transactions—essentially the transaction nesting level—for the current connection.

This is a very big one when you are doing transactioning. I'm not normally a big fan of nested transactions, but there are times when they are difficult to avoid. As such, it can be important to know just where you are in the transaction-nesting side of things. (For example, you may have logic that only starts a transaction if you're not already in one.)

If you're not in a transaction, then @@TRANCOUNT is 0. From there, let's look at a brief example:

SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be zero at this point
BEGIN TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be one at this point
BEGIN TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be two at this point
COMMIT TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be back to one at this point
ROLLBACK TRAN
SELECT @@TRANCOUNT AS TransactionNestLevel  -- This will be back to zero at this point

Note that, in this example, the @@TRANCOUNT at the end would also have reached zero if we had a COMMIT as our last statement.

Aggregate Functions

Aggregate functions are applied to sets of records rather than to a single record. The information in the multiple records is processed in a particular manner and then is displayed in a single record answer. Aggregate functions are often used in conjunction with the GROUP BY clause.

The aggregate functions are:

 * AVG
 * CHECKSUM
 * CHECKSUM_AGG
 * COUNT
 * COUNT_BIG
 * GROUPING
 * MAX
 * MIN
 * STDEV
 * STDEVP
 * SUM
 * VAR
 * VARP

In most aggregate functions, the ALL or DISTINCT keywords can be used. The ALL argument is the default and will apply the function to all the values in the expression, even if a value appears numerous times. The DISTINCT argument means that a value will only be included in the function once, even if it occurs several times.

Aggregate functions cannot be nested. The expression cannot be a subquery.

AVG

AVG returns the average of the values in expression. The syntax is as follows:

AVG([ALL | DISTINCT] <expression>)

The expression must contain numeric values. NULL values are ignored. This function supports the OVER operator described in the ranking functions section of this appendix.

CHECKSUM

This is a basic hash algorithm usually used to detect changes or consistency in data. This particular function accepts either an expression as an argument or a * (which implies that you want all columns in all the joined tables to be included).
The basic syntax is:

CHECKSUM(<expression> [, ...n] | *)

Note that the order of your expressions, or, in the case of a *, the join order, will affect the checksum value, so, for example:

CHECKSUM(SalesOrderID, OrderDate)

would not give the same result as:

CHECKSUM(OrderDate, SalesOrderID)

This function is not compatible with the OVER operator.

CHECKSUM_AGG

Like CHECKSUM, this is a basic hash algorithm usually used to detect changes or consistency in data. The primary difference is that CHECKSUM is oriented around rows, whereas CHECKSUM_AGG is oriented around columns. The basic syntax is:

CHECKSUM_AGG([ALL | DISTINCT] <expression>)

The expression value can be virtually anything, including, if you wish, concatenation of columns (just remember to cast as necessary); however, remember that expression order does matter, so if you're concatenating, Col1 + Col2 does not equal Col2 + Col1.

COUNT

COUNT returns the number of items in expression. The data type returned is of type int. The syntax is as follows:

COUNT([ALL | DISTINCT] <expression> | *)

The expression cannot be of the uniqueidentifier, text, image, or ntext data types. The * argument returns the number of rows in the table; it does not eliminate duplicate or NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

COUNT_BIG

COUNT_BIG returns the number of items in a group. This is very similar to the COUNT function, with the exception that the return value has a data type of bigint. The syntax is as follows:

COUNT_BIG([ALL | DISTINCT] <expression> | *)

Like COUNT, this function supports the OVER operator described in the ranking functions section of this appendix.

GROUPING

GROUPING adds an extra column to the output of a SELECT statement. The GROUPING function is used in conjunction with CUBE or ROLLUP to distinguish between normal NULL values and those added as a result of CUBE and ROLLUP operations. Its syntax is:

GROUPING(<column_name>)

GROUPING is used only in the SELECT list. Its argument is a column that is used in the GROUP BY clause and that is to be checked for NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

MAX

The MAX function returns the maximum value from expression. The syntax is as follows:

MAX([ALL | DISTINCT] <expression>)

MAX ignores any NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

MIN

The MIN function returns the smallest value from expression. The syntax is as follows:

MIN([ALL | DISTINCT] <expression>)

MIN ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

STDEV

The STDEV function returns the standard deviation of all values in expression. The syntax is as follows:

STDEV(<expression>)

STDEV ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

STDEVP

The STDEVP function returns the standard deviation for the population of all values in expression. The syntax is as follows:

STDEVP(<expression>)

STDEVP ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

SUM

The SUM function will return the total of all values in expression. The syntax is as follows:

SUM([ALL | DISTINCT] <expression>)

SUM ignores NULL values.
This function supports the OVER operator described in the ranking functions section of this appendix.

VAR

The VAR function returns the variance of all values in expression. The syntax is as follows:

VAR(<expression>)

VAR ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

VARP

The VARP function returns the variance for the population of all values in expression. The syntax is as follows:

VARP(<expression>)

VARP ignores NULL values.

This function supports the OVER operator described in the ranking functions section of this appendix.

Configuration Functions

Well, I'm sure it will come as a complete surprise (OK, not really...), but configuration functions are those functions that tell us about options as they are set for the current server or database (as appropriate).

@@DATEFIRST

Returns the numeric value that corresponds to the day of the week that the system considers the first day of the week.

The default in the United States is 7, which equates to Sunday. The values convert as follows:

 * 1—Monday (the first day for most of the world)
 * 2—Tuesday
 * 3—Wednesday
 * 4—Thursday
 * 5—Friday
 * 6—Saturday
 * 7—Sunday

This can be really handy when dealing with localization issues, so you can properly lay out any calendar or other day-of-week-dependent information you have.

Use SET DATEFIRST to alter this setting.

@@DBTS

Returns the last used timestamp for the current database.

At first look, this one seems to act an awful lot like @@IDENTITY in that it gives you the chance to get back the last value set by the system (this time, it's the last timestamp instead of the last identity value). The things to watch out for on this one include:

 * The value changes based on any change in the database, not just the table you're working on.
 * Any timestamp change in the database is reflected, not just those for the current connection.

Because you can't count on this value truly being the last one that you used (someone else may have done something that would change it), I personally find very little practical use for this one.

@@LANGID and @@LANGUAGE

Respectively return the ID and the name of the language currently in use.

These can be handy for figuring out whether your product has been installed in a localized environment and, if so, what language is the default.

For a full listing of the languages currently supported by SQL Server, use the system stored procedure sp_helplanguage.

@@LOCK_TIMEOUT

Returns the current amount of time in milliseconds before the system will time out waiting for a blocked resource.

If a resource (a page, a row, a table, whatever) is blocked, your process will stop and wait for the block to clear. This determines just how long your process will wait before the statement is canceled.

The default is to wait indefinitely unless someone has changed it at the system level (using sp_configure). Regardless of how the system default is set, you will get a value of -1 from this function unless you have manually set the value for the current connection using SET LOCK_TIMEOUT.

@@MAX_CONNECTIONS

Returns the maximum number of simultaneous user connections allowed on your SQL Server.

Don't mistake this one to mean the same thing as you would see under the Maximum Connections property in the Management Console.
This one is based on licensing and will show a very high number if you have selected "per seat" licensing.

Note that the actual number of user connections allowed also depends on the version of SQL Server you are using and the limits of your application(s) and hardware.

@@MAX_PRECISION

Returns the level of precision currently set for decimal and numeric data types.

The default is 38 places, but the value can be changed by using the /p option when you start your SQL Server. The /p can be added by starting SQL Server from a command line or by adding it to the Startup parameters for the MSSQLServer service in the Windows Services applet.

@@NESTLEVEL

Returns the current nesting level for nested stored procedures.

The first stored procedure (sproc) to run has an @@NESTLEVEL of 0. If that sproc calls another, then the second sproc is said to be nested in the first sproc (and @@NESTLEVEL is incremented to a value of 1). Likewise, the second sproc may call a third, and so on, up to a maximum of 32 levels deep. If you go past 32 levels deep, not only will the transaction be terminated, but you should also revisit the design of your application.

@@OPTIONS

Returns information about options that have been applied using the SET command.

Since you get back only one value, but can have many options set, SQL Server uses binary flags to indicate what values are set. In order to test whether the option you are interested in is set, you must use the option value together with a bitwise operator. For example:

IF (@@OPTIONS & 2)

If this evaluates to True, then you would know that IMPLICIT_TRANSACTIONS had been turned on for the current connection. The values are:

Bit | SET Option | Description
---|---|---
1 | DISABLE_DEF_CNST_CHK | Interim vs. deferred constraint checking.
2 | IMPLICIT_TRANSACTIONS | A transaction is started implicitly when a statement is executed.
4 | CURSOR_CLOSE_ON_COMMIT | Controls behavior of cursors after a COMMIT operation has been performed.
8 | ANSI_WARNINGS | Warns of truncation and NULL in aggregates.
16 | ANSI_PADDING | Controls padding of fixed-length variables.
32 | ANSI_NULLS | Determines handling of nulls when using equality operators.
64 | ARITHABORT | Terminates a query when an overflow or divide-by-zero error occurs during query execution.
128 | ARITHIGNORE | Returns NULL when an overflow or divide-by-zero error occurs during a query.
256 | QUOTED_IDENTIFIER | Differentiates between single and double quotation marks when evaluating an expression.
512 | NOCOUNT | Turns off the row(s) affected message returned at the end of each statement.
1024 | ANSI_NULL_DFLT_ON | Alters the session's behavior to use ANSI compatibility for nullability. Columns created with new tables or added to old tables without explicit null option settings are defined to allow nulls. Mutually exclusive with ANSI_NULL_DFLT_OFF.
2048 | ANSI_NULL_DFLT_OFF | Alters the session's behavior not to use ANSI compatibility for nullability. New columns defined without explicit nullability are defined not to allow nulls. Mutually exclusive with ANSI_NULL_DFLT_ON.
4096 | CONCAT_NULL_YIELDS_NULL | Returns a NULL when concatenating a NULL with a string.
8192 | NUMERIC_ROUNDABORT | Generates an error when a loss of precision occurs in an expression.

@@REMSERVER

Returns the value of the server (as it appears in the login record) that called the stored procedure.

Used only in stored procedures.
This one is handy when you want the sproc to behave differently depending on what remote server (often a geographic location) the sproc was called from.

@@SERVERNAME

Returns the name of the local server that the script is running from.

If you have multiple instances of SQL Server installed (a good example would be a Web hosting service that uses a separate SQL Server installation for each client), then @@SERVERNAME returns the following local server name information if the local server name has not been changed since setup:

Instance | Server Information
---|---
Default instance | <servername>
Named instance | <servername>\<instancename>
Virtual server—default instance | <virtualservername>
Virtual server—named instance | <virtualservername>\<instancename>

@@SERVICENAME

Returns the name of the registry key under which SQL Server is running.

This only returns something under Windows 2000/2003/XP, and (under any of these) should always return MSSQLSERVER for a default instance (or the instance name for a named instance) unless you've been playing games in the registry.

@@SPID

Returns the server process ID (SPID) of the current user process.

This equates to the same process ID that you see if you run sp_who. What's nice is that you can tell the SPID for your current connection, which can be used by the DBA to monitor, and if necessary terminate, that task.

@@TEXTSIZE

Returns the current value of the TEXTSIZE option of the SET statement, which specifies the maximum length, in bytes, returned by a SELECT statement when dealing with text or image data.

The default is 4096 bytes (4KB). You can change this value by using the SET TEXTSIZE statement.

@@VERSION

Returns the current version of SQL Server as well as the processor type and OS architecture.

For example, a run on a SQL Server 2008 box might look like this:

SELECT @@VERSION

and gives:

----------------------------------------------------------------------------------

Microsoft SQL Server 2008 (RTM) - 10.0.1600.22 (X64)

Jul 9 2008 14:17:44

Copyright (c) 1988-2008 Microsoft Corporation

Developer Edition (64-bit) on Windows NT 6.0 (Build 6001: Service Pack 1)

(1 row(s) affected)

Unfortunately, this doesn't return the information in any kind of structured field arrangement, so you have to parse it if you want to use it to test for specific information.

Consider using the xp_msver system sproc instead. It returns information in such a way that you can more easily retrieve specific information from the results.

Cryptographic Functions

These are functions that help support encryption, decryption, digital signing, and digital signature validation. Some of these are new with SQL Server 2008, and some came with SQL Server 2005. Notice that there are duplicates of most functions from a general-use point of view, but that they are different in that one supports a symmetric key and the duplicate (usually with an "Asym" in the name) supports an asymmetric key.

Now, you may ask, "Why would I need these?" The answer is as varied as the possible applications for SQL Server. The quick answer, though, is this: anytime you're sending or accepting data that you want to protect during transport. For example, since SQL Server supports HTTP endpoints, and, from that, hosting of its own Web services, you may want to accept or return encrypted information with a client of your Web service. Perhaps a more basic example is simply that you've chosen to encrypt the data in your database, and now you need to get it back out in a useful manner.
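Before walking the individual functions, here is a quick, minimal sketch of the simplest of the bunch, the passphrase-based pair covered later in this section, just to show the general round trip (the passphrase and string are, of course, made up for illustration):

DECLARE @Secret varbinary(8000);

-- Encrypt a string using nothing but a passphrase (no keys or certificates required)
SET @Secret = EncryptByPassPhrase('MyPassPhrase', 'Some sensitive data');

-- Decrypt it again; the result comes back as varbinary, so cast it to a string
SELECT CAST(DecryptByPassPhrase('MyPassPhrase', @Secret) AS varchar(8000)) AS Decrypted;

The key- and certificate-based variants follow the same basic pattern; they just differ in how the key material is supplied.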
AsymKey_ID

Given the name of an asymmetric key, this function returns an int that corresponds to the related ID from the database. The syntax is simple:

AsymKey_ID('<asym_key_name>')

You must have permissions to the key in question to use this function.

Cert_ID

Similar to AsymKey_ID, this returns an ID that relates to a given certificate name. The syntax is simple:

Cert_ID('<certificate_name>')

You must have permissions to the certificate in question to use this function.

CertProperty

Allows you to fetch various properties of a given certificate (as identified by the certificate's ID). Valid properties include the start date, expiration date, certificate issuer's name, serial number, security ID (the "SID," which can also be returned as a string), and the subject of the certificate (who or what is being certified). The syntax looks like this:

CertProperty(<Cert_ID>, 'Expiry_Date' | 'Start_Date' | 'Issuer_Name' | 'Cert_Serial_Number' | 'Subject' | 'SID' | 'String_SID')

The data type returned will vary depending on the specific property you're looking for (datetime, nvarchar, or varbinary as appropriate).

DecryptByAsymKey

As you can imagine by the name, this one decrypts a chunk of data utilizing an asymmetric key. It requires the key (by ID), the encrypted data (either as a literal string or a string coercible variable), and the password used to encrypt the asymmetric key in the database. The syntax is straightforward enough:

DecryptByAsymKey(<Asym_Key_ID>, {'<ciphertext>' | <ciphertext_variable>} [, '<password>'])

DecryptByCert

This is basically the same as DecryptByAsymKey, except that it expects a certificate rather than an asymmetric key. Like DecryptByAsymKey, this one decrypts a chunk of data utilizing a key. It requires the certificate (by ID), the encrypted data (either as a literal string or a string coercible variable), and the password used to encrypt the private key of the certificate (if one was used). The syntax looks almost just like DecryptByAsymKey:

DecryptByCert(<Cert_ID>, {'<ciphertext>' | <ciphertext_variable>} [, '<password>'])

Again, any password utilized when encrypting the private key of the certificate will be needed to properly decrypt it.

DecryptByKey

Like its asymmetric and certificate-based brethren, this one decrypts a chunk of data utilizing a key. What's different is that this one not only expects a symmetric key (instead of the other types of key), but it also expects that key to already be "open" (using the OPEN SYMMETRIC KEY command). Other than that, it is fairly similar in use, with the encrypted data (either as a literal string or a string coercible variable) fed in as a parameter and, in this case, a hash value optionally accepted as an authenticator:

DecryptByKey({'<ciphertext>' | <ciphertext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

Note that if you provide an add authenticator value (in the form of an int), that value must match the value supplied when the string was encrypted, and you must also supply a hash value that matches the hash supplied at encryption time.

DecryptByPassPhrase

Like the name says, this one decrypts data that was encrypted not by a formal key, but by a passphrase. Other than accepting a passphrase parameter instead of assuming an open key, DecryptByPassPhrase works almost exactly like DecryptByKey:

DecryptByPassPhrase({'<passphrase>' | <passphrase_variable>}, {'<ciphertext>' | <ciphertext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

As with DecryptByKey, if you provide an add authenticator value (in the form of an int), that value must match the value supplied when the string was encrypted, and you must also supply a hash value that matches the hash supplied at encryption time.
EncryptByAsymKey

Encrypts a chunk of data utilizing an asymmetric key. It requires the key (by ID) and the data to be encrypted (either as a literal string or a string coercible variable). The syntax is straightforward enough:

EncryptByAsymKey(<Asym_Key_ID>, {'<plaintext>' | <plaintext_variable>})

EncryptByCert

This is basically the same as EncryptByAsymKey, except that it expects a certificate rather than an asymmetric key. Like EncryptByAsymKey, this one encrypts a chunk of data utilizing the provided key. It requires the certificate (by ID) and the data to be encrypted (either as a literal string or a string coercible variable). The syntax looks almost just like EncryptByAsymKey:

EncryptByCert(<Cert_ID>, {'<plaintext>' | <plaintext_variable>})

EncryptByKey

This one not only expects a symmetric key (instead of the other types of key), but it also expects that key to already be "open" (using the OPEN SYMMETRIC KEY command) and a GUID to be available to reference that key by. Other than that, it is fairly similar in use, with the data to be encrypted (either as a literal string or a string coercible variable) fed in as a parameter and, in this case, a hash value optionally accepted as an authenticator:

EncryptByKey(<key_GUID>, {'<plaintext>' | <plaintext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

Note that if you provide an add authenticator value (in the form of an int), that value must be supplied when the string is decrypted, and you must also supply a hash value (which again will be needed at decryption time).

EncryptByPassPhrase

This one encrypts data not by using a formal key, but by a passphrase. Other than accepting a passphrase parameter instead of assuming an open key, EncryptByPassPhrase works almost exactly like EncryptByKey:

EncryptByPassPhrase({'<passphrase>' | <passphrase_variable>}, {'<plaintext>' | <plaintext_variable>} [, <add_authenticator>, {'<authenticator>' | <authenticator_variable>}])

As with EncryptByKey, if you provide an add authenticator value (in the form of an int), that value must be supplied when the string is decrypted, and you must also supply a hash value.

Key_GUID

Fetches the GUID for a given symmetric key in the current database:

Key_GUID('<key_name>')

Key_ID

Fetches the ID for a given symmetric key in the current database:

Key_ID('<key_name>')

SignByAsymKey

Adds an asymmetric key signature to a given plain text value:

SignByAsymKey(<Asym_Key_ID>, <plaintext> [, '<password>'])

SignByCert

Returns a varbinary(8000) containing the resulting signature, provided a given certificate and plain text value:

SignByCert(<Cert_ID>, <plaintext> [, '<password>'])

VerifySignedByAsymKey

Returns an int (though, personally, I think this odd since it is functionally a bit) indicating successful or failed validation of a signature against a given asymmetric key and plain text value:

VerifySignedByAsymKey(<Asym_Key_ID>, <signed_data>, <signature>)

VerifySignedByCert

Returns an int (though, personally, I think this odd since it is functionally a bit) indicating successful or failed validation of a signature against a given certificate and plain text value:

VerifySignedByCert(<Cert_ID>, <signed_data>, <signature>)

Cursor Functions

These provide various pieces of information on the status or nature of a given cursor.

@@CURSOR_ROWS

Returns how many rows are currently in the last cursor set opened on the current connection. Note that this is for cursors, not temporary tables.

Keep in mind that this number is reset every time you open a new cursor. If you need to open more than one cursor at a time, and you need to know the number of rows in the first cursor, then you'll need to move this value into a holding variable before opening subsequent cursors.

It's possible to use this to set up a counter to control your WHILE loop when dealing with cursors, but I strongly recommend against this practice.
The value contained in @@CURSOR_ROWS can change depending on the cursor type and whether or not SQL Server is populating the cursor asynchronously. Using @@FETCH_STATUS is going to be far more reliable and at least as easy to use.

If the value returned is a negative number other than -1, then you must be working with an asynchronous cursor, and the absolute value of that number is the number of records so far created in the cursor. If, however, the value is -1, then the cursor is a dynamic cursor, in that the number of rows is constantly changing. A returned value of 0 informs you that either no cursor has been opened or the last cursor opened is no longer open. Finally, any positive number indicates the number of rows within the cursor.

To create an asynchronous cursor, set the cursor threshold option (via sp_configure) to a value greater than 0. Then, when the number of rows in the cursor exceeds this setting, the cursor is returned immediately, while the remaining records are placed into the cursor asynchronously.

@@FETCH_STATUS

Returns an indicator of the status of the last cursor FETCH operation.

If you're using cursors, you're going to be using @@FETCH_STATUS. This one is how you know the success or failure of your attempt to navigate to a record in your cursor. It will return a constant depending on whether or not SQL Server succeeded in your last FETCH operation, and, if the FETCH failed, why. The constants are:

 * 0—Success
 * -1—Failed. Usually because you are beyond either the beginning or end of the cursorset.
 * -2—Failed. The row you were fetching wasn't found, usually because it was deleted between the time when the cursorset was created and when you navigated to the current row. Should only occur in scrollable, non-dynamic cursors.

For purposes of readability, I often will set up some constants prior to using @@FETCH_STATUS. For example:

DECLARE @NOTFOUND int
DECLARE @BEGINEND int

SELECT @NOTFOUND = -2
SELECT @BEGINEND = -1

I can then use these in the conditional of the WHILE statement of my cursor loop instead of just the raw integer. This can make the code quite a bit more readable. (A complete loop example appears at the end of this section.)

CURSOR_STATUS

The CURSOR_STATUS function allows the caller of a stored procedure to determine if that procedure has returned a cursor and result set. The syntax is as follows:

CURSOR_STATUS(
    {'local', '<cursor_name>'}
  | {'global', '<cursor_name>'}
  | {'variable', '<cursor_variable>'}
)

local, global, and variable all specify constants that indicate the source of the cursor. local equates to a local cursor name, global to a global cursor name, and variable to a local variable.

If you are using the cursor name form, then there are four possible return values:

 * 1—The cursor is open. If the cursor is dynamic, its result set has zero or more rows. If the cursor is not dynamic, it has one or more rows.
 * 0—The result set of the cursor is empty.
 * -1—The cursor is closed.
 * -3—A cursor of cursor name does not exist.

If you are using the cursor variable form, there are five possible return values:

 * 1—The cursor is open. If the cursor is dynamic, its result set has zero or more rows. If the cursor is not dynamic, it has one or more rows.
 * 0—The result set is empty.
 * -1—The cursor is closed.
 * -2—There is no cursor assigned to the cursor variable.
 * -3—The variable with name cursor variable does not exist, or, if it does exist, has not had a cursor allocated to it yet.
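Tying the cursor functions together, here is a minimal sketch of a typical @@FETCH_STATUS-driven loop (the cursor simply walks the names in sys.objects, purely for illustration):

DECLARE @Name sysname;

DECLARE ObjCursor CURSOR FOR
    SELECT name FROM sys.objects;

OPEN ObjCursor;
FETCH NEXT FROM ObjCursor INTO @Name;

WHILE @@FETCH_STATUS = 0  -- 0 = the last FETCH succeeded
BEGIN
    PRINT @Name;
    FETCH NEXT FROM ObjCursor INTO @Name;
END

CLOSE ObjCursor;
DEALLOCATE ObjCursor;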
Date and Time Functions

This is an area with several new items in SQL Server 2008. In addition to working with timestamp data (which is actually more oriented toward versioning than anything to do with a clock or calendar), date and time functions perform operations on values that have any of the various date and time data types supported by SQL Server.

When working with many of these functions, SQL Server recognizes eleven "dateparts" and their abbreviations, as shown in the following table:

Datepart | Abbreviations
---|---
year | yy, yyyy
quarter | qq, q
month | mm, m
dayofyear | dy, y
day | dd, d
week | wk, ww
weekday | dw
hour | hh
minute | mi, n
second | ss, s
millisecond | ms

CURRENT_TIMESTAMP

The CURRENT_TIMESTAMP function simply returns the current date and time as a datetime type. It is equivalent to GETDATE(). The syntax is as follows:

CURRENT_TIMESTAMP

DATEADD

The DATEADD function adds an interval to a date and returns a new date. The syntax is as follows:

DATEADD(<datepart>, <number>, <date>)

The datepart argument specifies the time scale of the interval (day, week, month, and so on) and may be any of the dateparts recognized by SQL Server. The number argument is the number of dateparts that should be added to the date.

DATEDIFF

The DATEDIFF function returns the difference between two specified dates in a specified unit of time (for example: hours, days, weeks). The syntax is as follows:

DATEDIFF(<datepart>, <startdate>, <enddate>)

The datepart argument may be any of the dateparts recognized by SQL Server and specifies the unit of time to be used.

DATENAME

The DATENAME function returns a string representing the name of the specified datepart (for example: 1999, Thursday, July) of the specified date. The syntax is as follows:

DATENAME(<datepart>, <date>)

DATEPART

The DATEPART function returns an integer that represents the specified datepart of the specified date. The syntax is as follows:

DATEPART(<datepart>, <date>)

The DAY function is equivalent to DATEPART(dd, <date>); MONTH is equivalent to DATEPART(mm, <date>); YEAR is equivalent to DATEPART(yy, <date>).

DAY

The DAY function returns an integer representing the day part of the specified date. The syntax is as follows:

DAY(<date>)

The DAY function is equivalent to DATEPART(dd, <date>).

GETDATE

The GETDATE function returns the current system date and time. The syntax is as follows:

GETDATE()

GETUTCDATE

The GETUTCDATE function returns the current UTC (Coordinated Universal Time) time. In other words, this returns Greenwich Mean Time. The value is derived by taking the local time from the server, and the local time zone, and calculating GMT from this. Daylight saving time is taken into account. GETUTCDATE cannot be called from a user-defined function. The syntax is as follows:

GETUTCDATE()

ISDATE

The ISDATE function determines whether an input expression is a valid date. The syntax is as follows:

ISDATE(<expression>)

MONTH

The MONTH function returns an integer that represents the month part of the specified date. The syntax is as follows:

MONTH(<date>)

The MONTH function is equivalent to DATEPART(mm, <date>).

SYSDATETIME

Much like the more venerable GETDATE function, SYSDATETIME returns the current system date and time. The differences are twofold: First, SYSDATETIME returns a higher level of precision. Second, the newer function returns the newer datetime2 data type (to support the higher precision—a precision of 7 in this case). The syntax is as follows:

SYSDATETIME()

SYSDATETIMEOFFSET

Similar to SYSDATETIME, this returns the current system date and time.
SYSDATETIME

Much like the more venerable GETDATE function, SYSDATETIME returns the current system date and time. The differences are twofold: First, SYSDATETIME returns a higher level of precision. Second, the newer function returns the newer datetime2 data type (to support the higher precision—a precision of 7 in this case). The syntax is as follows:

SYSDATETIME()

SYSDATETIMEOFFSET

Similar to SYSDATETIME, this returns the current system date and time. Instead of the simple datetime2 data type, however, SYSDATETIMEOFFSET returns the time in the new datetimeoffset data type (with a precision of 7), thus providing offset information versus universal time. The syntax is as follows:

SYSDATETIMEOFFSET()

SYSUTCDATETIME

Much like the more venerable GETUTCDATE function, SYSUTCDATETIME returns the current UTC date and time. SYSUTCDATETIME, however, returns the newer datetime2 data type (to a precision of 7). The syntax is as follows:

SYSUTCDATETIME()

SWITCHOFFSET

This one accepts two arguments—an input value of type datetimeoffset and a new offset to represent the time as. The syntax looks like this:

SWITCHOFFSET(<datetimeoffset value>, <time zone offset>)

So, if we run a quick test:

CREATE TABLE TimeTest
(
MyTime datetimeoffset
);

INSERT TimeTest
VALUES ('2008-12-31 6:00:00 -5:00');

SELECT SWITCHOFFSET(MyTime, '-08:00') AS Pacific
FROM TimeTest;

DROP TABLE TimeTest;

we would get back:

(1 row(s) affected)

Pacific
----------------------------------
2008-12-31 03:00:00.0000000 -08:00

(1 row(s) affected)

TODATETIMEOFFSET

Accepts a given piece of date/time information and adds a provided time offset to produce a datetimeoffset data type. The syntax is:

TODATETIMEOFFSET(<date/time value>, <time zone offset>)
Metadata Functions

COL_LENGTH

The COL_LENGTH function returns the defined length of a column, in bytes. The syntax is as follows:

COL_LENGTH('<table>', '<column>')

The column parameter specifies the name of the column for which the length is to be determined. The table parameter specifies the name of the table that contains that column.

COL_NAME

The COL_NAME function takes a table ID number and a column ID number and returns the name of the database column. The syntax is as follows:

COL_NAME(<table_id>, <column_id>)

The column_id parameter specifies the ID number of the column. The table_id parameter specifies the ID number of the table that contains that column.

COLUMNPROPERTY

The COLUMNPROPERTY function returns data about a column or procedure parameter. The syntax is as follows:

COLUMNPROPERTY(<id>, <column>, <property>)

The id parameter specifies the ID of the table/procedure. The column parameter specifies the name of the column/parameter. The property parameter specifies the data that should be returned for the column or procedure parameter. The property parameter can be one of the following values:

 * AllowsNull—Allows NULL values.
 * IsComputed—The column is a computed column.
 * IsCursorType—The procedure parameter is of type CURSOR.
 * IsFullTextIndexed—The column has been full-text indexed.
 * IsIdentity—The column is an IDENTITY column.
 * IsIdNotForRepl—The IDENTITY column was defined with NOT FOR REPLICATION.
 * IsOutParam—The procedure parameter is an output parameter.
 * IsRowGuidCol—The column is a ROWGUIDCOL column.
 * Precision—The precision for the data type of the column or parameter.
 * Scale—The scale for the data type of the column or parameter.
 * UseAnsiTrim—The ANSI padding setting was ON when the table was created.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid—except for Precision (where the precision for the data type will be returned) and Scale (where the scale will be returned).
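For instance, checking whether a column is nullable takes one call (the Person table and LastName column here are hypothetical):

-- OBJECT_ID resolves the table name to the ID that COLUMNPROPERTY expects
SELECT COLUMNPROPERTY(OBJECT_ID('Person'), 'LastName', 'AllowsNull') AS AllowsNull

A result of 1 means NULLs are allowed, 0 means they are not, and NULL means the table or column name didn't resolve.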
DATABASEPROPERTY

The DATABASEPROPERTY function returns the setting for the specified database and property name. The syntax is as follows:

DATABASEPROPERTY('<database>', '<property>')

The database parameter specifies the name of the database for which data on the named property will be returned. The property parameter contains the name of a database property and can be one of the following values:

 * IsAnsiNullDefault—The database follows the ANSI-92 standard for NULL values.
 * IsAnsiNullsEnabled—All comparisons made against a NULL evaluate to unknown.
 * IsAnsiWarningsEnabled—Warning messages are issued when standard error conditions occur.
 * IsAutoClose—The database frees resources after the last user has exited.
 * IsAutoShrink—Database files can be shrunk automatically and periodically.
 * IsAutoUpdateStatistics—The autoupdate statistics option has been enabled.
 * IsBulkCopy—The database allows nonlogged operations (such as those performed with the Bulk Copy Program).
 * IsCloseCursorsOnCommitEnabled—Any cursors that are open when a transaction is committed will be closed.
 * IsDboOnly—The database is only accessible to the dbo.
 * IsDetached—The database was detached by a detach operation.
 * IsEmergencyMode—The database is in emergency mode.
 * IsFulltextEnabled—The database has been full-text enabled.
 * IsInLoad—The database is loading.
 * IsInRecovery—The database is recovering.
 * IsInStandby—The database is read-only and restore log is allowed.
 * IsLocalCursorsDefault—Cursor declarations default to LOCAL.
 * IsNotRecovered—The database failed to recover.
 * IsNullConcat—Concatenating to a NULL results in a NULL.
 * IsOffline—The database is offline.
 * IsQuotedIdentifiersEnabled—Identifiers can be delimited by double quotation marks.
 * IsReadOnly—The database is in a read-only mode.
 * IsRecursiveTriggersEnabled—The recursive firing of triggers is enabled.
 * IsShutDown—The database encountered a problem during startup.
 * IsSingleUser—The database is in single-user mode.
 * IsSuspect—The database is suspect.
 * IsTruncLog—The database truncates its log on checkpoints.
 * Version—The internal version number of the SQL Server code with which the database was created.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for Version (where the function will return the version number if the database is open and NULL if the database is closed).

DATABASEPROPERTYEX

The DATABASEPROPERTYEX function is basically a superset of DATABASEPROPERTY, and also returns the setting for the specified database and property name. The syntax is pretty much the same as DATABASEPROPERTY and is as follows:

DATABASEPROPERTYEX('<database>', '<property>')

DATABASEPROPERTYEX just has a few more properties available, including:

 * Collation—Returns the default collation for the database (remember, collations can also be overridden at the column level).
 * ComparisonStyle—Indicates the Windows comparison style (for example, case sensitivity) of the particular collation.
 * IsAnsiPaddingEnabled—Whether strings are padded to the same length before comparison or insert.
 * IsArithmeticAbortEnabled—Whether queries are terminated when a major arithmetic error (such as a data overflow) occurs.

DB_ID

The DB_ID function returns the database ID number. The syntax is as follows:

DB_ID(['<database_name>'])

The optional database_name parameter specifies which database's ID number is required. If the database_name is not given, the current database will be used instead.

DB_NAME

The DB_NAME function returns the name of the database that has the specified ID number. The syntax is as follows:

DB_NAME([<database_id>])

The optional database_id parameter specifies which database's name is to be returned. If no database_id is given, the name of the current database will be returned.

FILE_ID

The FILE_ID function returns the file ID number for the specified file name in the current database. The syntax is as follows:

FILE_ID('<file_name>')

The file_name parameter specifies the name of the file for which the ID is required.

FILE_NAME

The FILE_NAME function returns the file name for the file with the specified file ID number. The syntax is as follows:

FILE_NAME(<file_id>)

The file_id parameter specifies the ID number of the file for which the name is required.

FILEGROUP_ID

The FILEGROUP_ID function returns the filegroup ID number for the specified filegroup name. The syntax is as follows:

FILEGROUP_ID('<filegroup_name>')

The filegroup_name parameter specifies the filegroup name of the required filegroup ID.

FILEGROUP_NAME

The FILEGROUP_NAME function returns the filegroup name for the specified filegroup ID number. The syntax is as follows:

FILEGROUP_NAME(<filegroup_id>)

The filegroup_id parameter specifies the filegroup ID of the required filegroup name.
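A quick round-trip through a few of these against whatever database you happen to be connected to:

SELECT DB_ID() AS CurrentDbId,
   DB_NAME() AS CurrentDbName,
   FILE_NAME(1) AS PrimaryDataFile -- file ID 1 is always the primary data file

Called with no argument, DB_ID and DB_NAME simply describe the current database.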
The syntax is as follows:

FILEGROUPPROPERTY(<filegroup_name>, <property>)

The filegroup_name parameter specifies the name of the filegroup that contains the property being queried. The property parameter specifies the property being queried and can be one of the following values:

 * IsReadOnly—The filegroup name is read-only.
 * IsUserDefinedFG—The filegroup name is a user-defined filegroup.
 * IsDefault—The filegroup name is the default filegroup.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid.

FILEPROPERTY

The FILEPROPERTY function returns the setting of a specified file name property, given the file name and property name. The syntax is as follows:

FILEPROPERTY(<file_name>, <property>)

The file_name parameter specifies the name of the file that contains the property being queried. The property parameter specifies the property being queried and can be one of the following values:

 * IsReadOnly—The file is read-only.
 * IsPrimaryFile—The file is the primary file.
 * IsLogFile—The file is a log file.
 * SpaceUsed—The amount of space used by the specified file.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for SpaceUsed (which will return the number of pages allocated in the file).

FULLTEXTCATALOGPROPERTY

The FULLTEXTCATALOGPROPERTY function returns data about the full-text catalog properties. The syntax is as follows:

FULLTEXTCATALOGPROPERTY(<catalog_name>, <property>)

The catalog_name parameter specifies the name of the full-text catalog. The property parameter specifies the property that is being queried. The properties that can be queried are:

 * PopulateStatus—The possible return values are: 0 (idle), 1 (population in progress), 2 (paused), 3 (throttled), 4 (recovering), 5 (shutdown), 6 (incremental population in progress), 7 (updating index).
 * ItemCount—Returns the number of full-text indexed items currently in the full-text catalog.
 * IndexSize—Returns the size of the full-text index in megabytes.
 * UniqueKeyCount—Returns the number of unique words that make up the full-text index in this catalog.
 * LogSize—Returns the size (in bytes) of the combined set of error logs associated with a full-text catalog.
 * PopulateCompletionAge—Returns the difference (in seconds) between the completion of the last full-text index population and 01/01/1990 00:00:00.

FULLTEXTSERVICEPROPERTY

The FULLTEXTSERVICEPROPERTY function returns data about the full-text service-level properties. The syntax is as follows:

FULLTEXTSERVICEPROPERTY(<property>)

The property parameter specifies the name of the service-level property that is to be queried. The property parameter may be one of the following values:

 * ResourceUsage—Returns a value from 1 (background) to 5 (dedicated).
 * ConnectTimeOut—Returns the number of seconds that the Search Service will wait for all connections to SQL Server for full-text index population before timing out.
 * IsFulltextInstalled—Returns 1 if Full-Text Service is installed on the computer and a 0 otherwise.
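Checking whether full text is even available, for example, is a one-liner:

SELECT FULLTEXTSERVICEPROPERTY('IsFulltextInstalled') AS FullTextInstalled -- 1 = installed, 0 = not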
INDEX_COL

The INDEX_COL function returns the indexed column name. The syntax is as follows:

INDEX_COL('<table>', <index_id>, <key_id>)

The table parameter specifies the name of the table, index_id specifies the ID of the index, and key_id specifies the ID of the key.

INDEXKEY_PROPERTY

This function returns information about the index key. The syntax is as follows:

INDEXKEY_PROPERTY(<table_id>, <index_id>, <key_id>, <property>)

The table_id parameter is the numerical ID, of data type int, that identifies the table you wish to inspect; use OBJECT_ID to find the numerical table_id. index_id specifies the ID of the index and is also of data type int. key_id specifies the index column position of the key; for example, with a key of three columns, setting this value to 2 indicates that you wish to inspect the middle column. Finally, property is the character string identifier of one of two properties you wish to find the setting of. The two possible values are ColumnId, which returns the physical column ID, and IsDescending, which returns the order in which the column is sorted (1 for descending, 0 for ascending).

INDEXPROPERTY

The INDEXPROPERTY function returns the setting of a specified index property, given the table ID, index name, and property name. The syntax is as follows:

INDEXPROPERTY(<table_id>, <index>, <property>)

The property parameter specifies the property of the index that is to be queried. The property parameter can be one of these possible values:

 * IndexDepth—The depth of the index.
 * IsAutoStatistic—The index was created by the autocreate statistics option of sp_dboption.
 * IsClustered—The index is clustered.
 * IsStatistics—The index was created by the CREATE STATISTICS statement or by the autocreate statistics option of sp_dboption.
 * IsUnique—The index is unique.
 * IndexFillFactor—The index specifies its own fill factor.
 * IsPadIndex—The index specifies space to leave open on each interior node.
 * IsFulltextKey—The index is the full-text key for a table.
 * IsHypothetical—The index is hypothetical and cannot be used directly as a data access path.

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for IndexDepth (which will return the number of levels the index has) and IndexFillFactor (which will return the fill factor used when the index was created or last rebuilt).

OBJECT_ID

The OBJECT_ID function returns the specified database object's ID number. The syntax is as follows:

OBJECT_ID('<object>')

OBJECT_NAME

The OBJECT_NAME function returns the name of the specified database object. The syntax is as follows:

OBJECT_NAME(<object_id>)
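Putting a couple of these together: asking whether a particular index is clustered (the Customers table and PK_Customers index are hypothetical):

SELECT INDEXPROPERTY(OBJECT_ID('Customers'), 'PK_Customers', 'IsClustered') AS IsClustered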
OBJECTPROPERTY

The OBJECTPROPERTY function returns data about objects in the current database. The syntax is as follows:

OBJECTPROPERTY(<id>, <property>)

The id parameter specifies the ID of the object required. The property parameter specifies the information required on the object. The following property values are allowed:

CnstIsClustKey | ExecIsTriggerDisabled
---|---
CnstIsColumn | ExecIsTriggerNotForRepl
CnstIsDeleteCascade | ExecIsUpdateTrigger
CnstIsDisabled | HasAfterTrigger
CnstIsNonclustKey | HasDeleteTrigger
CnstIsNotRepl | HasInsertTrigger
CnstIsNotTrusted | HasInsteadOfTrigger
CnstIsUpdateCascade | HasUpdateTrigger
ExecIsAfterTrigger | IsAnsiNullsOn
ExecIsAnsiNullsOn | IsCheckCnst
ExecIsDeleteTrigger | IsConstraint
ExecIsFirstDeleteTrigger | IsDefault
ExecIsFirstInsertTrigger | IsDefaultCnst
ExecIsFirstUpdateTrigger | IsDeterministic
ExecIsInsertTrigger | IsExecuted
ExecIsInsteadOfTrigger | IsExtendedProc
ExecIsLastDeleteTrigger | IsForeignKey
ExecIsLastInsertTrigger | IsIndexed
ExecIsLastUpdateTrigger | IsIndexable
ExecIsQuotedIdentOn | IsInlineFunction
ExecIsStartup | IsMSShipped
IsPrimaryKey | TableFulltextPopulateStatus
IsProcedure | TableHasActiveFulltextIndex
IsQuotedIdentOn | TableHasCheckCnst
IsQueue | TableHasClustIndex
IsReplProc | TableHasDefaultCnst
IsRule | TableHasDeleteTrigger
IsScalarFunction | TableHasForeignKey
IsSchemaBound | TableHasForeignRef
IsSystemTable | TableHasIdentity
IsTable | TableHasIndex
IsTableFunction | TableHasInsertTrigger
IsTrigger | TableHasNonclustIndex
IsUniqueCnst | TableHasPrimaryKey
IsUserTable | TableHasRowGuidCol
IsView | TableHasTextImage
OwnerId | TableHasTimestamp
TableDeleteTrigger | TableHasUniqueCnst
TableDeleteTriggerCount | TableHasUpdateTrigger
TableFullTextBackgroundUpdateIndexOn | TableInsertTrigger
TableFulltextCatalogId | TableInsertTriggerCount
TableFullTextChangeTrackingOn | TableIsFake
TableFulltextDocsProcessed | TableIsLockedOnBulkLoad
TableFulltextFailCount | TableIsPinned
TableFulltextItemCount | TableTextInRowLimit
TableFulltextKeyColumn | TableUpdateTrigger
TableFulltextPendingChanges | TableUpdateTriggerCount

The return value from this function will be 1 for True, 0 for False, and NULL if the input was not valid, except for:

 * OwnerId—Returns the database user ID of the owner of that object—note that this is different from the SchemaID of the object and will likely not be that useful in SQL Server 2005 and beyond.
 * TableDeleteTrigger, TableInsertTrigger, TableUpdateTrigger—Return the ID of the first trigger with the specified type. Zero is returned if no trigger of that type exists.
 * TableDeleteTriggerCount, TableInsertTriggerCount, TableUpdateTriggerCount—Return the number of the specified type of trigger that exists for the table in question.
 * TableFulltextCatalogId—Returns the ID of the full-text catalog if there is one, and zero if no full-text catalog exists for that table.
 * TableFulltextKeyColumn—Returns the ColumnID of the column being utilized as the unique index for that full-text index.
 * TableFulltextPendingChanges—The number of entries that have changed since the last full-text analysis was run for this table. Change tracking must be enabled for this function to return useful results.
 * TableFulltextPopulateStatus—This one has multiple possible return values:
   * 0—Indicates that the full-text process is currently idle.
   * 1—A full population run is currently in progress.
   * 2—An incremental population is currently running.
   * 3—Changes are currently being analyzed and added to the full-text catalog.
   * 4—Some form of background update (such as that done by the automatic change tracking mechanism) is currently running.
   * 5—A full-text operation is in progress, but has either been throttled (to allow other system requests to perform as needed) or has been paused.

   You can use the feedback from this option to make decisions about what other full-text-related options are appropriate (for example, to check whether a population is in progress so you know whether other functions, such as TableFulltextDocsProcessed, are valid).

 * TableFulltextDocsProcessed—Valid only while full-text indexing is actually running, this returns the number of rows processed since the full-text index processing task started. A zero result indicates that full-text indexing is not currently running (a null result means full-text indexing is not configured for this table).
 * TableFulltextFailCount—Valid only while full-text indexing is actually running, this returns the number of rows that full-text indexing has, for some reason, skipped (with no indication of the reason). As with TableFulltextDocsProcessed, a zero result indicates the table is not currently being analyzed for full text, and a null indicates that full text is not configured for this table.
 * TableIsPinned—This is left in for backward compatibility only and will always return 0 in SQL Server 2005 and beyond.

OBJECTPROPERTYEX

OBJECTPROPERTYEX is an extended version of the OBJECTPROPERTY function. The syntax is as follows:

OBJECTPROPERTYEX(<id>, <property>)

Like OBJECTPROPERTY, the id parameter specifies the ID of the object required. The property parameter specifies the information required on the object. OBJECTPROPERTYEX supports all the same property values as OBJECTPROPERTY but adds the following property values as additional options:

 * BaseType—Returns the base data type of an object.
 * IsPrecise—Indicates that your object does not contain any imprecise computations. For example, an int or decimal is precise, but a float is not. Computations that utilize imprecise data types must be assumed to return imprecise results. Note that you can specifically mark any .NET assemblies you produce as being precise or not.
 * IsSystemVerified—Indicates whether the IsPrecise and IsDeterministic properties can be verified by SQL Server itself (as opposed to just having been set by the user).
 * SchemaId—Just what it sounds like. Returns the internal system ID for a given object. You can then use SCHEMA_NAME to put a more user-friendly name on the schema ID.
 * SystemDataAccess—Indicates whether the object in question relies on any system table data.
 * UserDataAccess—Indicates whether the object in question utilizes any of the user tables or system user data.

@@PROCID

Returns the stored procedure ID of the currently running procedure. This is primarily a troubleshooting tool when a process is running and using up a large amount of resources, and it is used mainly as a DBA function.

SCHEMA_ID

Given a schema name, returns the internal system ID for that schema. It utilizes the syntax:

SCHEMA_ID(<schema name>)

SCHEMA_NAME

Given an internal schema system ID, returns the user-friendly name for that schema. The syntax is:

SCHEMA_NAME(<schema id>)
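A round-trip between the two makes for an easy sanity check (dbo exists in every database):

SELECT SCHEMA_ID('dbo') AS DboSchemaId,
   SCHEMA_NAME(SCHEMA_ID('dbo')) AS RoundTripped -- comes back as 'dbo'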
SQL_VARIANT_PROPERTY

SQL_VARIANT_PROPERTY is a powerful function that returns information about a sql_variant value. This information can be the BaseType, Precision, Scale, TotalBytes, Collation, or MaxLength. The syntax is:

SQL_VARIANT_PROPERTY(<expression>, <property>)

Expression is an expression of type sql_variant. Property can be any one of the following values:

Value | Description | Base Type of sql_variant Returned
---|---|---
BaseType | Data types include: char, int, money, nchar, ntext, numeric, nvarchar, real, smalldatetime, smallint, smallmoney, text, timestamp, tinyint, uniqueidentifier, varbinary, varchar | sysname
Precision | The precision of the numeric base data type: datetime = 23; smalldatetime = 16; float = 53; real = 24; decimal (p,s) and numeric (p,s) = p; money = 19; smallmoney = 10; int = 10; smallint = 5; tinyint = 3; bit = 1; all other types = 0 | int
Scale | The number of digits to the right of the decimal point of the numeric base data type: decimal (p,s) and numeric (p,s) = s; money and smallmoney = 4; datetime = 3; all other types = 0 | int
TotalBytes | The number of bytes required to hold both the metadata and data of the value. If the value is greater than 900, index creation will fail. | int
Collation | The collation of the particular sql_variant value. | sysname
MaxLength | The maximum data type length, in bytes. | int

TYPEPROPERTY

The TYPEPROPERTY function returns information about a data type. The syntax is as follows:

TYPEPROPERTY(<type>, <property>)

The type parameter specifies the name of the data type. The property parameter specifies the property of the data type that is to be queried; it can be one of the following values:

 * Precision—Returns the number of digits/characters.
 * Scale—Returns the number of decimal places.
 * AllowsNull—Returns 1 for True and 0 for False.
 * UsesAnsiTrim—Returns 1 for True and 0 for False.
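A quick look at both, using a throwaway variable:

DECLARE @v sql_variant
SET @v = CAST(123.45 AS decimal(5,2))

SELECT SQL_VARIANT_PROPERTY(@v, 'BaseType') AS BaseType, -- decimal
   SQL_VARIANT_PROPERTY(@v, 'Precision') AS VariantPrecision, -- 5
   SQL_VARIANT_PROPERTY(@v, 'Scale') AS VariantScale, -- 2
   TYPEPROPERTY('decimal', 'Precision') AS MaxDecimalPrecision -- the type's maximum, 38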
Rowset Functions

The rowset functions return an object that can be used in place of a table reference in a T-SQL statement. The rowset functions are:

 * CHANGETABLE
 * CONTAINSTABLE
 * FREETEXTTABLE
 * OPENDATASOURCE
 * OPENQUERY
 * OPENROWSET
 * OPENXML
CHANGETABLE

The CHANGETABLE function returns change tracking information for a table. The syntax is as follows:

CHANGETABLE (
{ CHANGES <table>, <last sync version>
| VERSION <table>, <primary key values> } )
[AS] <table alias> [ ( <column alias> [ ,...n ] ) ]

In its CHANGES form, it returns all rows that have changed in the specified table since the point specified in the "last sync version" argument.
CONTAINSTABLE

The CONTAINSTABLE function is used in full-text queries. Please refer to Chapter 18 for an example of its usage. The syntax is as follows:

CONTAINSTABLE (<table>, {<column> | *}, '<search condition>')
FREETEXTTABLE

The FREETEXTTABLE function is used in full-text queries. Please refer to Chapter 18 for an example of its usage. The syntax is as follows:

FREETEXTTABLE (<table>, {<column> | *}, '<freetext string>')
OPENDATASOURCE

The OPENDATASOURCE function provides ad hoc connection information. The syntax is as follows:

OPENDATASOURCE (<provider_name>, <init_string>)

The provider_name is the name registered as the ProgID of the OLE DB provider used to access the data source. The init_string should be familiar to VB programmers, as this is the initialization string to the OLE DB provider. For example, the init_string could look like:

"User Id=wonderison;Password=JuniorBlues;DataSource=MyServerName"

OPENQUERY

The OPENQUERY function executes the specified pass-through query on the specified linked_server. The syntax is as follows:

OPENQUERY(<linked_server>, '<query>')

OPENROWSET

The OPENROWSET function accesses remote data from an OLE DB data source. The syntax is as follows:

OPENROWSET('<provider_name>',
{
'<datasource>';'<user_id>';'<password>'
| '<provider_string>'
},
{
[<catalog>.][<schema>.]<object>
| '<query>'
})

The provider_name parameter is a string representing the friendly name of the OLE DB provider as specified in the registry. The data_source parameter is a string corresponding to the required OLE DB data source. The user_id parameter is a relevant username to be passed to the OLE DB provider. The password parameter is the password associated with the user_id.

The provider_string parameter is a provider-specific connection string and is used in place of the datasource, user_id, and password combination.

The catalog parameter is the name of the catalog/database that contains the required object. The schema parameter is the name of the schema or object owner of the required object. The object parameter is the object name.

The query parameter is a string that is executed by the provider and is used instead of a combination of catalog, schema, and object.

OPENXML

By passing in an XML document as a parameter, or by retrieving an XML document and defining the document within a variable, OPENXML allows you to inspect the structure and return data as if the XML document were a table. The syntax is as follows:

OPENXML(<idoc> int [in], <rowpattern> nvarchar [in], [<flags> byte [in]])
[WITH (<SchemaDeclaration> | <TableName>)]

The idoc parameter is the document handle created using the sp_xml_preparedocument system sproc. Rowpattern is the node definition. The flags parameter specifies the mapping between the XML document and the rowset to return within the SELECT statement. SchemaDeclaration defines the XML schema for the XML document; if there is a table defined within the database that follows the XML schema, then TableName can be used instead.

Before being able to use the XML document, it must be prepared by using the sp_xml_preparedocument system procedure.
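An end-to-end sketch of that sequence (the XML document here is made up):

DECLARE @idoc int
DECLARE @doc nvarchar(1000)
SET @doc = N'<ROOT><Customer CustomerID="1" Name="Ann Jones"/></ROOT>'

-- Parse the document and get a handle to it
EXEC sp_xml_preparedocument @idoc OUTPUT, @doc

-- Read the parsed XML as if it were a table (flags = 1 means attribute-centric mapping)
SELECT CustomerID, Name
FROM OPENXML(@idoc, '/ROOT/Customer', 1)
WITH (CustomerID int, Name nvarchar(50))

-- Release the memory held by the parsed document
EXEC sp_xml_removedocument @idoc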
Security Functions

The security functions return information about users and roles. They are:

 * HAS_DBACCESS
 * IS_MEMBER
 * IS_SRVROLEMEMBER
 * SUSER_ID
 * SUSER_NAME
 * SUSER_SID
 * SUSER_SNAME
 * USER
 * USER_ID
 * USER_NAME

HAS_DBACCESS

The HAS_DBACCESS function is used to determine whether the user that is logged in has access to the database being used. A return value of 1 means the user does have access, and a return value of 0 means that he or she does not. A NULL return value means the database_name supplied was invalid. The syntax is as follows:

HAS_DBACCESS('<database_name>')

IS_MEMBER

The IS_MEMBER function returns whether the current user is a member of the specified Windows NT group/SQL Server role. The syntax is as follows:

IS_MEMBER({'<group>' | '<role>'})

The group parameter specifies the name of the NT group and must be in the form domain\group. The role parameter specifies the name of the SQL Server role. The role can be a database fixed role or a user-defined role but cannot be a server role.

This function will return a 1 if the current user is a member of the specified group or role, a 0 if the current user is not a member of the specified group or role, and NULL if the specified group or role is invalid.

IS_SRVROLEMEMBER

The IS_SRVROLEMEMBER function returns whether a user is a member of the specified server role. The syntax is as follows:

IS_SRVROLEMEMBER('<role>' [,'<login>'])

The optional login parameter is the name of the login account to check; the default is the current user. The role parameter specifies the server role and must be one of the following possible values:

 * sysadmin
 * dbcreator
 * diskadmin
 * processadmin
 * serveradmin
 * setupadmin
 * securityadmin

This function returns a 1 if the specified login account is a member of the specified role, a 0 if the login is not a member of the role, and a NULL if the role or login is invalid.

SUSER_ID

The SUSER_ID function returns the specified user's login ID number. The syntax is as follows:

SUSER_ID(['<login>'])

The login parameter is the specified user's login ID name. If no value for login is provided, the default of the current user will be used instead.

The SUSER_ID system function is included for backward compatibility only, so if possible you should use SUSER_SID, which is inherently more secure, instead.

SUSER_NAME

The SUSER_NAME function returns the specified user's login ID name. The syntax is as follows:

SUSER_NAME([<server_user_id>])

The server_user_id parameter is the specified user's login ID number. If no value for server_user_id is provided, the default of the current user will be used instead.

The SUSER_NAME system function is included for backward compatibility only, so if possible you should use SUSER_SNAME instead.

SUSER_SID

The SUSER_SID function returns the security identification number (SID) for the specified user. The syntax is as follows:

SUSER_SID(['<login>'])

The login parameter is the user's login name. If no value for login is provided, the current user will be used instead.

SUSER_SNAME

The SUSER_SNAME function returns the login ID name for the specified security identification number (SID). The syntax is as follows:

SUSER_SNAME([<server_user_sid>])

The server_user_sid parameter is the user's SID. If no value for the server_user_sid is provided, the current user's will be used instead.

USER

The USER function allows a system-supplied value for the current user's database username to be inserted into a table if no default has been supplied. The syntax is as follows:

USER

USER_ID

The USER_ID function returns the specified user's database ID number. The syntax is as follows:

USER_ID(['<user>'])

The user parameter is the username to be used. If no value for user is provided, the current user is used.

USER_NAME

The USER_NAME function is the functional reverse of USER_ID and returns the specified user's username in the database, given a database ID number. The syntax is as follows:

USER_NAME([<user_id>])

The user_id parameter is the ID of the user you want the name for. If no value for user_id is provided, the current user is assumed.
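Several of these together, run as whatever login you happen to be connected with:

SELECT SUSER_SNAME() AS LoginName,
   USER_NAME() AS DatabaseUserName,
   IS_SRVROLEMEMBER('sysadmin') AS IsSysAdmin, -- 1, 0, or NULL
   HAS_DBACCESS('master') AS CanReachMaster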
String Functions

The string functions perform actions on string values and return strings or numeric values. The string functions are:

 * ASCII
 * CHAR
 * CHARINDEX
 * DIFFERENCE
 * LEFT
 * LEN
 * LOWER
 * LTRIM
 * NCHAR
 * PATINDEX
 * QUOTENAME
 * REPLACE
 * REPLICATE
 * REVERSE
 * RIGHT
 * RTRIM
 * SOUNDEX
 * SPACE
 * STR
 * STUFF
 * SUBSTRING
 * UNICODE
 * UPPER

ASCII

The ASCII function returns the ASCII code value of the leftmost character in character_expression. The syntax is as follows:

ASCII(<character_expression>)

CHAR

The CHAR function converts an ASCII code (specified in expression) into a string. The syntax is as follows:

CHAR(<expression>)

The expression can be any integer between 0 and 255.

CHARINDEX

The CHARINDEX function returns the starting position of an expression in a character_string. The syntax is as follows:

CHARINDEX(<expression>, <character_string>[, <start_location>])

The expression parameter is the string to be found. The character_string is the string to be searched, usually a column. The start_location is the character position at which to begin the search; if it is anything other than a positive number, the search will begin at the start of character_string.

DIFFERENCE

The DIFFERENCE function returns the difference between the SOUNDEX values of two expressions as an integer. The syntax is as follows:

DIFFERENCE(<character_expression1>, <character_expression2>)

This function returns an integer value between 0 and 4. If the two expressions sound identical (for example, blue and blew), a value of 4 will be returned. If there is no similarity, a value of 0 is returned.

LEFT

The LEFT function returns the leftmost part of an expression, up to a specified number of characters. The syntax is as follows:

LEFT(<expression>, <integer>)

The expression parameter contains the character data from which the leftmost section will be extracted. The integer parameter specifies the number of characters to return from the left; it must be a positive integer.

LEN

The LEN function returns the number of characters in the specified expression. The syntax is as follows:

LEN(<expression>)

LOWER

The LOWER function converts any uppercase characters in the expression into lowercase characters. The syntax is as follows:

LOWER(<expression>)

LTRIM

The LTRIM function removes any leading blanks from a character_expression. The syntax is as follows:

LTRIM(<character_expression>)

NCHAR

The NCHAR function returns the Unicode character that has the specified integer_code. The syntax is as follows:

NCHAR(<integer_code>)

The integer_code parameter must be a positive whole number from 0 to 65,535.

PATINDEX

The PATINDEX function returns the starting position of the first occurrence of a pattern in a specified expression, or zero if the pattern was not found. The syntax is as follows:

PATINDEX('<%pattern%>', <expression>)

The pattern parameter is a string that will be searched for. Wildcard characters can be used, but the % characters must surround the pattern. The expression parameter is character data in which the pattern is being searched for—usually a column.

QUOTENAME

The QUOTENAME function returns a Unicode string with delimiters added to make the specified string a valid SQL Server delimited identifier. The syntax is as follows:

QUOTENAME('<character_string>'[, '<quote_character>'])

The character_string parameter is a Unicode string. The quote_character parameter is a one-character string that will be used as a delimiter. The quote_character parameter can be a single quotation mark ('), a left or right bracket ([ ]), or a double quotation mark ("). The default is for brackets to be used.
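A few of these side by side (the string is arbitrary):

SELECT CHARINDEX('Server', 'SQL Server 2008') AS StartsAt, -- 5
   LEFT('SQL Server 2008', 3) AS Prefix, -- SQL
   LEN('SQL Server 2008') AS TotalLength, -- 15
   QUOTENAME('My Table') AS Delimited -- [My Table]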
REPLACE

The REPLACE function replaces all instances of the second specified string in the first specified string with a third specified string. The syntax is as follows:

REPLACE('<string_expression1>', '<string_expression2>', '<string_expression3>')

The string_expression1 parameter is the expression in which to search. The string_expression2 parameter is the expression to search for in string_expression1. The string_expression3 parameter is the expression with which to replace all instances of string_expression2.

REPLICATE

The REPLICATE function repeats a character_expression a specified number of times. The syntax is as follows:

REPLICATE(<character_expression>, <integer_expression>)

REVERSE

The REVERSE function returns the reverse of the specified character_expression. The syntax is as follows:

REVERSE(<character_expression>)

RIGHT

The RIGHT function returns the rightmost part of the specified character_expression, up to the number of characters given by integer. The syntax is as follows:

RIGHT(<character_expression>, <integer>)

The integer parameter must be a positive whole number.

RTRIM

The RTRIM function removes all the trailing blanks from a specified character_expression. The syntax is as follows:

RTRIM(<character_expression>)

SOUNDEX

The SOUNDEX function returns a four-character (SOUNDEX) code, which can be used to evaluate the similarity of two strings. The syntax is as follows:

SOUNDEX(<character_expression>)

SPACE

The SPACE function returns a string of repeated spaces, the length of which is indicated by integer. The syntax is as follows:

SPACE(<integer>)

STR

The STR function converts numeric data into character data. The syntax is as follows:

STR(<numeric_expression>[, <length>[, <decimal>]])

The numeric_expression parameter is a numeric expression with a decimal point. The length parameter is the total length, including decimal point, digits, and spaces. The decimal parameter is the number of places to the right of the decimal point.

STUFF

The STUFF function deletes a specified length of characters and inserts another set of characters in their place. The syntax is as follows:

STUFF(<expression>, <start>, <length>, <characters>)

The expression parameter is the string of characters in which some will be deleted and new ones added. The start parameter specifies where to begin the deletion and insertion of characters. The length parameter specifies the number of characters to delete. The characters parameter specifies the new set of characters to be inserted into the expression.

SUBSTRING

The SUBSTRING function returns part of an expression. The syntax is as follows:

SUBSTRING(<expression>, <start>, <length>)

The expression parameter specifies the data from which the substring will be taken, and can be a character string, binary string, text, or an expression that includes a table column. The start parameter is an integer that specifies where to begin the substring. The length parameter specifies how long the substring is.

UNICODE

The UNICODE function returns the Unicode number that represents the first character in character_expression. The syntax is as follows:

UNICODE('<character_expression>')

UPPER

The UPPER function converts all the lowercase characters in character_expression into uppercase characters. The syntax is as follows:

UPPER(<character_expression>)
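REPLACE and STUFF are easy to mix up; this contrasts them:

SELECT REPLACE('2008-01-15', '-', '/') AS Replaced, -- 2008/01/15
   STUFF('SQL 2008', 5, 0, 'Server ') AS Stuffed -- SQL Server 2008 (inserts at position 5, deletes nothing)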
System Functions

The system functions can be used to return information about values, objects, and settings within SQL Server. The functions are as follows:

 * APP_NAME
 * CASE
 * CAST and CONVERT
 * COALESCE
 * COLLATIONPROPERTY
 * CURRENT_TIMESTAMP
 * CURRENT_USER
 * DATALENGTH
 * FORMATMESSAGE
 * GETANSINULL
 * HOST_ID
 * HOST_NAME
 * IDENT_CURRENT
 * IDENT_INCR
 * IDENT_SEED
 * IDENTITY
 * ISDATE
 * ISNULL
 * ISNUMERIC
 * NEWID
 * NULLIF
 * PARSENAME
 * PERMISSIONS
 * ROWCOUNT_BIG
 * SCOPE_IDENTITY
 * SERVERPROPERTY
 * SESSION_USER
 * SESSIONPROPERTY
 * STATS_DATE
 * SYSTEM_USER
 * USER_NAME

APP_NAME

The APP_NAME function returns, as an nvarchar type, the application name for the current session if one has been set by the application. It has the following syntax:

APP_NAME()

CASE

The CASE function evaluates a list of conditions and returns one of multiple possible results. It has two formats:

 * The simple CASE function compares an expression to a set of simple expressions to determine the result.
 * The searched CASE function evaluates a set of Boolean expressions to determine the result.

Both formats support an optional ELSE argument.

Simple CASE function:

CASE <input_expression>
WHEN <when_expression> THEN <result_expression>
[ELSE <else_result_expression>]
END

Searched CASE function:

CASE
WHEN <Boolean_expression> THEN <result_expression>
[ELSE <else_result_expression>]
END

CAST and CONVERT

These two functions provide similar functionality in that they both convert one data type into another type.

Using CAST:

CAST(<expression> AS <data_type>)

Using CONVERT:

CONVERT(<data_type>[(<length>)], <expression>[,