From b6b1230d82c66b1cec45c6cce1669c79bc38c4a1 Mon Sep 17 00:00:00 2001 From: Maggsl Date: Fri, 14 Apr 2017 13:11:16 -0700 Subject: [PATCH] Updated files from GitHub wiki --- .vscode/settings.json | 3 + .../test/Associate-an-id-with-a-prediction.md | 2 + articles/test/BatchNormalization.md | 6 +- ...al.md => BrainScript-Train,-Test,-Eval.md} | 0 articles/test/Build-a-constant-3D-tensor.md | 25 -- articles/test/CNTK-2.0-Beta-Highlights.md | 87 +++++ articles/test/CNTK-2.0-Python-API.md | 1 + articles/test/CNTK-2.0-Setup-from-Sources.md | 1 + articles/test/CNTK-2.0-Setup.md | 1 + .../CNTK-Binary-Download-and-Configuration.md | 3 + ...inary-Download-and-Manual-Configuration.md | 3 + .../CNTK-Library-Evaluation-on-Windows.md | 1 + ... => CNTK-Library-Native-Eval-Interface.md} | 144 +++++++- ...NTK-Python-known-issues-and-limitations.md | 1 + .../CNTK-Shared-Libraries-Naming-Format.md | 92 +++++ articles/test/Debug-a-Python-notebook.md | 14 - articles/test/Examples.md | 15 - articles/test/Express-a-gating-mechanism.md | 5 - .../Express-a-softmax-over-a-dynamic-axis.md | 14 - ...-a-softmax-with-a-temperature-parameter.md | 4 - ...-the-error-rate-of-my-binary-classifier.md | 7 - ...ghlighting-for-BrainScript-config-files.md | 7 - ...arted-in-sequence-to-sequence-modelling.md | 17 - ...n-I-take-the-last-element-of-a-sequence.md | 11 - .../test/Hands-On-Labs-Image-Recognition.md | 8 +- .../Hands-On-Labs-Language-Understanding.md | 4 +- .../How-do-I-Adapt-Models-in-BrainScript.md | 89 +++++ ....md => How-do-I-Adapt-models-in-Python.md} | 4 + ...ow-do-I-Deal-with-Errors-in-BrainScript.md | 78 +++++ .../How-do-I-Deal-with-Errors-in-Python.md | 33 ++ ...How-do-I-Evaluate-Models-in-BrainScript.md | 102 ++++++ ... => How-do-I-Evaluate-models-in-Python.md} | 9 + ...d => How-do-I-Express-Things-In-Python.md} | 175 +++++++++- .../How-do-I-Express-Things-in-BrainScript.md | 107 ++++++ .../How-do-I-Read-Things-in-BrainScript.md | 51 +++ ...).md => How-do-I-Read-Things-in-Python.md} | 4 + .../How-do-I-Train-Models-in-BrainScript.md | 260 +++++++++++++++ .../test/How-do-I-Train-models-in-Python.md | 116 +++++++ articles/test/How-do-I-in-BrainScript.md | 38 +-- articles/test/How-do-I-in-Python.md | 8 + articles/test/How-do-I.md | 7 + articles/test/Implement-Zoneout.md | 8 - .../test/Implement-an-attention-mechanism.md | 39 --- articles/test/Interpret-the-training-loss.md | 6 - ...e-use-of-MinibatchSource.next_minibatch.md | 23 -- ...of-a-network-from-within-the-Python-API.md | 20 -- ...r-inspect-or-list-model-input-variables.md | 24 -- articles/test/Layer-wise-training.md | 15 - ...ckpointed-model-and-continue-retraining.md | 11 - articles/test/Managed-EvalDLL-API.md | 110 ------ articles/test/Native-EvalDLL-API.md | 72 ---- .../Port-LSTM-NDL-primitives-to-Python.md | 27 -- ...to-1D-output-from-Python-API-to-C---API.md | 16 - ...-learning-rate-and-momentum-in-adam_sgd.md | 3 - .../CNTK_2_0_RC_1_Release_Notes.md | 152 +++++++++ ...rict-a-prediction-to-a-bounded-interval.md | 7 - ...art-2-\342\200\223-Machine-Translation.md" | 96 ++++++ ...the-verbosity-or-traceLevel-from-Python.md | 5 - articles/test/Setup-CNTK-on-your-machine.md | 6 +- ...-a-DSSM-(or-a-convolutional-DSSM)-model.md | 97 ------ ...Train-a-multilabel-classifier-in-Python.md | 10 - .../test/Train-a-multilabel-classifier.md | 17 - .../Train-a-regression-model-on-images.md | 71 ---- .../test/Train-two-or-more-models-jointly.md | 25 -- .../test/Train-with-a-multitask-objective.md | 8 - articles/test/Train-with-a-weighted-loss.md | 6 - 
articles/test/Tutorial2/Tutorial2.md | 53 ++- articles/test/Tutorials,-Examples,-etc...md | 4 + articles/test/Tutorials-examples.md | 6 - articles/test/Tutorials.md | 8 +- ...CNTK-with-multiple-GPUs-and-or-machines.md | 1 + articles/test/out.txt | 314 ------------------ 72 files changed, 1674 insertions(+), 1143 deletions(-) create mode 100644 .vscode/settings.json rename articles/test/{BrainScript-Train-Test-Eval.md => BrainScript-Train,-Test,-Eval.md} (100%) delete mode 100644 articles/test/Build-a-constant-3D-tensor.md create mode 100644 articles/test/CNTK-2.0-Beta-Highlights.md create mode 100644 articles/test/CNTK-2.0-Python-API.md create mode 100644 articles/test/CNTK-2.0-Setup-from-Sources.md create mode 100644 articles/test/CNTK-2.0-Setup.md create mode 100644 articles/test/CNTK-Binary-Download-and-Configuration.md create mode 100644 articles/test/CNTK-Binary-Download-and-Manual-Configuration.md rename articles/test/{Native-CNTK-Library-Eval-Interface.md => CNTK-Library-Native-Eval-Interface.md} (58%) create mode 100644 articles/test/CNTK-Python-known-issues-and-limitations.md create mode 100644 articles/test/CNTK-Shared-Libraries-Naming-Format.md delete mode 100644 articles/test/Debug-a-Python-notebook.md delete mode 100644 articles/test/Express-a-gating-mechanism.md delete mode 100644 articles/test/Express-a-softmax-over-a-dynamic-axis.md delete mode 100644 articles/test/Express-a-softmax-with-a-temperature-parameter.md delete mode 100644 articles/test/Express-the-error-rate-of-my-binary-classifier.md delete mode 100644 articles/test/Get-nice-syntax-highlighting-for-BrainScript-config-files.md delete mode 100644 articles/test/Get-started-in-sequence-to-sequence-modelling.md delete mode 100644 articles/test/Get-things-to-work-correctly-when-I-take-the-last-element-of-a-sequence.md create mode 100644 articles/test/How-do-I-Adapt-Models-in-BrainScript.md rename articles/test/{Read-and-modify-the-training-weights-from-Python.md => How-do-I-Adapt-models-in-Python.md} (51%) create mode 100644 articles/test/How-do-I-Deal-with-Errors-in-BrainScript.md create mode 100644 articles/test/How-do-I-Deal-with-Errors-in-Python.md create mode 100644 articles/test/How-do-I-Evaluate-Models-in-BrainScript.md rename articles/test/{Evaluate-a-saved-convolutional-network.md => How-do-I-Evaluate-models-in-Python.md} (77%) rename articles/test/{Expose-new-operands-in-V2-Python-from-previous-V1-implementations.md => How-do-I-Express-Things-In-Python.md} (60%) create mode 100644 articles/test/How-do-I-Express-Things-in-BrainScript.md create mode 100644 articles/test/How-do-I-Read-Things-in-BrainScript.md rename articles/test/{Load-model-and-access-network-weights-(parameters).md => How-do-I-Read-Things-in-Python.md} (69%) create mode 100644 articles/test/How-do-I-Train-Models-in-BrainScript.md create mode 100644 articles/test/How-do-I-Train-models-in-Python.md create mode 100644 articles/test/How-do-I-in-Python.md create mode 100644 articles/test/How-do-I.md delete mode 100644 articles/test/Implement-Zoneout.md delete mode 100644 articles/test/Implement-an-attention-mechanism.md delete mode 100644 articles/test/Interpret-the-training-loss.md delete mode 100644 articles/test/Interpret-the-use-of-MinibatchSource.next_minibatch.md delete mode 100644 articles/test/Interrogate-the-dimensions-of-internal-layers-of-a-network-from-within-the-Python-API.md delete mode 100644 articles/test/Introspect-or-inspect-or-list-model-input-variables.md delete mode 100644 articles/test/Layer-wise-training.md delete mode 
100644 articles/test/Load-pre-trained-checkpointed-model-and-continue-retraining.md delete mode 100644 articles/test/Managed-EvalDLL-API.md delete mode 100644 articles/test/Native-EvalDLL-API.md delete mode 100644 articles/test/Port-LSTM-NDL-primitives-to-Python.md delete mode 100644 articles/test/Port-projection-of-1D-input-to-1D-output-from-Python-API-to-C---API.md delete mode 100644 articles/test/Relate-alpha,-beta1,-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd.md create mode 100644 articles/test/ReleaseNotes/CNTK_2_0_RC_1_Release_Notes.md delete mode 100644 articles/test/Restrict-a-prediction-to-a-bounded-interval.md create mode 100644 "articles/test/Sequence-to-Sequence-\342\200\223-Deep-Recurrent-Neural-Networks-in-CNTK-\342\200\223-Part-2-\342\200\223-Machine-Translation.md" delete mode 100644 articles/test/Set-the-verbosity-or-traceLevel-from-Python.md delete mode 100644 articles/test/Train-a-DSSM-(or-a-convolutional-DSSM)-model.md delete mode 100644 articles/test/Train-a-multilabel-classifier-in-Python.md delete mode 100644 articles/test/Train-a-multilabel-classifier.md delete mode 100644 articles/test/Train-a-regression-model-on-images.md delete mode 100644 articles/test/Train-two-or-more-models-jointly.md delete mode 100644 articles/test/Train-with-a-multitask-objective.md delete mode 100644 articles/test/Train-with-a-weighted-loss.md create mode 100644 articles/test/Tutorials,-Examples,-etc...md delete mode 100644 articles/test/Tutorials-examples.md create mode 100644 articles/test/Using-CNTK-with-multiple-GPUs-and-or-machines.md delete mode 100644 articles/test/out.txt diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..20af2f68 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +// Place your settings in this file to overwrite default and user settings. +{ +} \ No newline at end of file diff --git a/articles/test/Associate-an-id-with-a-prediction.md b/articles/test/Associate-an-id-with-a-prediction.md index 7d06cdfc..1af05ed8 100644 --- a/articles/test/Associate-an-id-with-a-prediction.md +++ b/articles/test/Associate-an-id-with-a-prediction.md @@ -1,3 +1,5 @@ +# Associate an ID with a prediction + Assuming you have an input called Id (which contains a unique numeric id for each example) in your file you can do ``` predictionWithId = Splice(prediction:Id) diff --git a/articles/test/BatchNormalization.md b/articles/test/BatchNormalization.md index d0890a7d..aa32e7f6 100644 --- a/articles/test/BatchNormalization.md +++ b/articles/test/BatchNormalization.md @@ -6,10 +6,10 @@ ### Parameters * `input` is the input of the batch normalization node -* `scale` is a [ParameterTensor{}](Parameters-And-Constants.md) that holds the learned componentwise-scaling factors (`gamma` term in the equation below). -* `bias` is a [ParameterTensor{}](Parameters-And-Constants.md) that holds the learned bias (`beta` term). `scale` and `bias` must have the same dimensions which must be equal to the `input` dimensions in case of `spatial = false` or number of output convolution feature maps in case of `spatial = true`. +* `scale` is a [ParameterTensor{}](./Parameters-And-Constants#parametertensor) that holds the learned componentwise-scaling factors (`gamma` term in the equation below). +* `bias` is a [ParameterTensor{}](./Parameters-And-Constants#parametertensor) that holds the learned bias (`beta` term). 
`scale` and `bias` must have the same dimensions which must be equal to the `input` dimensions in case of `spatial = false` or number of output convolution feature maps in case of `spatial = true`. * `runMean` is the running mean which is used during evaluation phase and might be used during training as well. -You must pass a [ParameterTensor{}](Parameters-And-Constants.md) with +You must pass a [ParameterTensor{}](./Parameters-And-Constants#parametertensor) with the same dimensions as `scale` and `bias`, initial value 0, and `learningRateMultiplier=0` set. * `runVariance` is the running variance. It is represented the same way as `runMean`. diff --git a/articles/test/BrainScript-Train-Test-Eval.md b/articles/test/BrainScript-Train,-Test,-Eval.md similarity index 100% rename from articles/test/BrainScript-Train-Test-Eval.md rename to articles/test/BrainScript-Train,-Test,-Eval.md diff --git a/articles/test/Build-a-constant-3D-tensor.md b/articles/test/Build-a-constant-3D-tensor.md deleted file mode 100644 index e0252c93..00000000 --- a/articles/test/Build-a-constant-3D-tensor.md +++ /dev/null @@ -1,25 +0,0 @@ -I want to build a constant 3D tensor in CNTK. I learned how to produce 1D and 2D constant arrays, I can stack (conacenate or combine) them and repeat them. Now I need to stack the 2D Tensors to make a 3D tensor? - -Say you have three tensors, e.g. - -``` -A = ParameterTensor {100:200} -B = ParameterTensor {100:200} -C = ParameterTensor {100:200} -``` - -You can now say - -``` - ABC = Splice (A:B:C, axis=3) -``` - -which will give you a [100 x 200 x 3] tensor. - -(If, on the other hand, you had says Splice (A:B:C, axis=1), you would get a [300 x 200] tensor, and Splice (A:B:C, axis=2) would get you a [100 x 600] tensor.) - -Note that to splice in a new dimension, dimensions of all inputs must match. E.g. you cannot stack a ParameterTensor {100:200} and a ParameterTensor {100:50} along axis 3 (but you could stack it along axis 2, which wold give you a [100 x 250] tensor). - -Also note that axis indices are 1-based, like in math. E.g. the row dimension of a matrix is the first axis, axis=1, and the column dimension is the second axis, axis=2. - - diff --git a/articles/test/CNTK-2.0-Beta-Highlights.md b/articles/test/CNTK-2.0-Beta-Highlights.md new file mode 100644 index 00000000..5ac1f378 --- /dev/null +++ b/articles/test/CNTK-2.0-Beta-Highlights.md @@ -0,0 +1,87 @@ +In this article we have summarized new features, changes, and improvements that were introduced during Beta releases of CNTK v.2.0 (October 2016 - March 2017). + +For the detailed information on all Beta releases, please see the correspondent Release Notes - you will find links at the [CNTK Releases Page](https://github.com/Microsoft/CNTK/releases). + +## What's new in CNTK 2.0 +### CNTK as a Library. Python, C++, and C# / .NET Managed API + +The Microsoft Cognitive Toolkit 2.0 enables CNTK as a software library supporting multiple programming languages in addition to running the CNTK executable and programming the Networks with BrainScript. + +The following programming languages are supported: + +* Python + * Python versions supported are 2.7, 3.4, and 3.5 +* C++ +* C#/.NET Managed + +We have introduced features for each of these API and provided examples illustrating their usage. 
Use the following resources:
+
+* [CNTK API Summary page](https://github.com/microsoft/cntk/wiki/CNTK-Library-API)
+* [CNTK Python API Documentation](https://cntk.ai/pythondocs/)
+  * The documentation includes links to all existing Tutorials and Examples. Also see the next section.
+* [CNTK Managed API](https://github.com/microsoft/cntk/wiki/CNTK-Library-Managed-API)
+  * [CNTK Eval Examples](https://github.com/microsoft/cntk/wiki/CNTK-Eval-Examples)
+
+
+### Python Examples and Tutorials (Jupyter Notebooks)
+
+Recognizing the importance of Python in deep learning, we have published a (constantly growing) set of Python Examples and Tutorials (the latter are implemented as Jupyter Notebooks). You will find all the information at these locations:
+
+* [Python Examples](https://cntk.ai/pythondocs/examples.html)
+* [Python Tutorials (Jupyter Notebooks)](https://cntk.ai/pythondocs/tutorials.html)
+
+On Linux you can easily use our preconfigured [CNTK Docker Containers](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers#using-docker-container-to-run-cntk-jupyter-notebook-tutorials) to run CNTK Jupyter Notebooks.
+
+### CNTK Evaluation library
+
+The CNTK Library API allows evaluating both the CNTK model-v1 and model-v2 formats.
+
+In CNTK 2.0 the Evaluation Library can be used on Windows and Linux, in CPU and GPU configurations. C++, Python, as well as C# and other .NET languages are supported.
+
+See more on the [CNTK Evaluation Library here](https://github.com/Microsoft/CNTK/wiki/CNTK-Library-Evaluation-Overview).
+
+### CNTK new features
+
+We have introduced many new features during the Beta period. The list below highlights some of them:
+
+* Support of [object recognition using Fast R-CNN](https://github.com/Microsoft/CNTK/wiki/Object-Detection-using-Fast-R-CNN) algorithm.
+* Integration with [NVIDIA NCCL](https://github.com/NVIDIA/nccl), a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. See how to enable NCCL in the [CNTK Wiki](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#optional-nccl).
+* Support of Asynchronous Stochastic Gradient Descent (ASGD)/Hogwild! training parallelization using Microsoft’s Parameter Server ([Project Multiverso](https://github.com/Microsoft/multiverso)).
+* Support of distributed scenarios in the Python API. See more in the sections on distributed scenarios in the [ConvNet](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/Python/README.md) and [ResNet](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ResNet/Python/README.md) examples.
+* Support for training on one-hot and sparse arrays via NumPy.
+* Lambda rank and NDCG at 1 are now accessible from Python.
+* New Python and BrainScript examples for VGG16 and VGG19.
+* [Performance Profiler for BrainScript and Python](https://github.com/Microsoft/CNTK/wiki/BrainScript-and-Python-Performance-Profiler).
+* Support in the training session for cross-validation and preservation of all checkpoints.
+* Ability to write your own optimizer in Python by inheriting from `UserLearner` and overriding the update method.
+* Ability to implement Learners in Python using `UserLearner`. [Read more here](https://www.cntk.ai/pythondocs/extend.html#user-learners).
+* All deserializers are exposed in C++.
+  * HTK deserializers are also exposed in Python.
+* Support for TensorBoard output in both Python and BrainScript. [Read more here](https://github.com/Microsoft/CNTK/wiki/Using-TensorBoard-for-Visualization).
+* Support for model debugging in Python, which can be done conveniently, similar to gdb/pdb, by wrapping the model with `debug_model()` and training/evaluating it. [Read more here](https://www.cntk.ai/pythondocs/cntk.debugging.html#module-cntk.debugging.debug).
+
+We have also implemented significant performance improvements.
+
+## CNTK Installation and Usage
+### New automated installation procedures
+
+We have enabled different ways to install CNTK in an automated manner:
+
+* Installation scripts for the binary distribution.
+* Installation scripts for users who would like to work with the CNTK source code.
+* Installation via the `pip install` procedure.
+
+Read more about [different ways to install CNTK here](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine).
+
+### CNTK at Docker Hub and as self-built Docker Images
+
+CNTK is now available as [Docker Images at Docker Hub](https://hub.docker.com/r/microsoft/cntk/).
+
+You may also build your own Docker Images using pre-configured Docker files from the CNTK code base. See this [Wiki page on using CNTK as Docker Images and Containers](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers).
+
+### CNTK as NuGet packages
+
+Windows developers using C++ and C# / .NET are welcome to use multiple CNTK NuGet packages supporting both v.1 and v.2 model formats.
+
+See more on [CNTK NuGet packages here](https://github.com/Microsoft/CNTK/wiki/NuGet-Package).
+
diff --git a/articles/test/CNTK-2.0-Python-API.md b/articles/test/CNTK-2.0-Python-API.md
new file mode 100644
index 00000000..9412a645
--- /dev/null
+++ b/articles/test/CNTK-2.0-Python-API.md
@@ -0,0 +1 @@
+See [here](https://www.cntk.ai/pythondocs/).
diff --git a/articles/test/CNTK-2.0-Setup-from-Sources.md b/articles/test/CNTK-2.0-Setup-from-Sources.md
new file mode 100644
index 00000000..ec6d355e
--- /dev/null
+++ b/articles/test/CNTK-2.0-Setup-from-Sources.md
@@ -0,0 +1 @@
+Please see [here](./Setup-CNTK-on-your-machine).
diff --git a/articles/test/CNTK-2.0-Setup.md b/articles/test/CNTK-2.0-Setup.md new file mode 100644 index 00000000..9a1459c3 --- /dev/null +++ b/articles/test/CNTK-2.0-Setup.md @@ -0,0 +1 @@ +See [here](./Setup-CNTK-on-your-machine) diff --git a/articles/test/CNTK-Binary-Download-and-Configuration.md b/articles/test/CNTK-Binary-Download-and-Configuration.md new file mode 100644 index 00000000..d01aa04b --- /dev/null +++ b/articles/test/CNTK-Binary-Download-and-Configuration.md @@ -0,0 +1,3 @@ +# CNTK V2 Setup and Installation + +For installation of CNTK (from binary or from source code) see [here](./Setup-CNTK-on-your-machine) diff --git a/articles/test/CNTK-Binary-Download-and-Manual-Configuration.md b/articles/test/CNTK-Binary-Download-and-Manual-Configuration.md new file mode 100644 index 00000000..d01aa04b --- /dev/null +++ b/articles/test/CNTK-Binary-Download-and-Manual-Configuration.md @@ -0,0 +1,3 @@ +# CNTK V2 Setup and Installation + +For installation of CNTK (from binary or from source code) see [here](./Setup-CNTK-on-your-machine) diff --git a/articles/test/CNTK-Library-Evaluation-on-Windows.md b/articles/test/CNTK-Library-Evaluation-on-Windows.md index 554a759d..f230785b 100644 --- a/articles/test/CNTK-Library-Evaluation-on-Windows.md +++ b/articles/test/CNTK-Library-Evaluation-on-Windows.md @@ -69,6 +69,7 @@ For using GPU, you need in addition to include the following NVIDIA CUDA related * `cudnn64_5.dll` * `curand64_80.dll` * `cusparse64_80.dll` +* `nvml.dll` All these dlls can be found in the CNTK binary release version, see the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases). diff --git a/articles/test/Native-CNTK-Library-Eval-Interface.md b/articles/test/CNTK-Library-Native-Eval-Interface.md similarity index 58% rename from articles/test/Native-CNTK-Library-Eval-Interface.md rename to articles/test/CNTK-Library-Native-Eval-Interface.md index 935d2398..0261b1b1 100644 --- a/articles/test/Native-CNTK-Library-Eval-Interface.md +++ b/articles/test/CNTK-Library-Native-Eval-Interface.md @@ -1,5 +1,8 @@ The C++ CNTK Library for Evaluation is based on [CNTK Library API](https://github.com/microsoft/cntk/wiki/CNTK-Library-API). The following methods are used to evaluate a model. +*** +### The following methods load a model into Function. + *** `static FunctionPtr CNTK::Function::LoadModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice())` @@ -10,15 +13,49 @@ This method loads a model file and returns the pointer to the Function that repr This method loads a model from a memory buffer and returns the pointer to the Function that represents the loaded model. The `modelBuffer` points to the buffer containing the serialized model content, and the `modelBufferLength` is the buffer length. The `computeDevice` specifies the device to run evaluation. +*** +### The following method evaluates the Function using the specified input. + *** `void CNTK::Function::Evaluate(const std::unordered_map& arguments, std::unordered_map& outputs, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice())` This method starts evaluation of the `this` Function representing the model with specificed parameters. The `arguments` contains values of all input variables required for evaluation, and the `outputs` store the values of output variables. The storage of output values can either be pre-allocated by the caller, or by the system if the ValuePtr mapping is passed as null by the caller. 
The `computeDevice` specifies the device to run evaluation. *** -`FunctionPtr CNTK::Function::Clone(ParameterCloningMethod parameterCloneMethod, const std::unordered_map& replacements) const` +### The following methods are helper functions to manipulate the Function to be evaluated. + +*** +`FunctionPtr CNTK::Function::Clone(ParameterCloningMethod parameterCloneMethod = ParameterCloningMethod::Clone, const std::unordered_map& replacements = {}) const;` + +For evaluation, this method is called to create a cloned Function which can then be used by another thread to evaluate the same model. For this purpose, the `parameterCloneMethod` should be set to its default value `ParameterCloningMethod::Share`. The parameter `replacements` specifies any variable replacements that are applied in the cloned Function instance, and is usually not needed for evaluation. + +*** +`FunctionPtr CNTK::Function::FindByName(const std::wstring& name, bool nestedSearchInsideBlockFunction = false)` + +Find a function with the given _name_ in the Function graph underlying 'this' Function. If more than one function with the same name exists, an exception is thrown. If _nestedSearchInsideBlockFunction_ is true, all functions inside block functions are also searched. + +*** +`std::vector CNTK::Function::FindAllWithName(const std::wstring& name, bool nestedSearchInsideBlockFunction = false)` + +Find a list of functions with the given _name_ in the Function graph underlying 'this' Function. If _nestedSearchInsideBlockFunction_ is true, all functions inside block functions are also searched. + +*** +`FunctionPtr CNTK::Combine(const std::vector& operands, const std::wstring& name = L"")` + + Create a new Function instance which combines the outputs of the specified list of 'operands' of Functions. The 'Outputs' of the new 'Function' are union of the 'Outputs' of each of the specified 'operands' Functions. As an example, when creating a classification model, typically the CrossEntropy loss Function and the ClassificationError Function comprise the roots of the computation graph which can be "Combine"d to create a single Function with 2 outputs; viz. CrossEntropy loss and ClassificationError output. + +*** +`FunctionPtr CNTK::AsComposite(const FunctionPtr& rootFunction, const std::wstring& name = L"")` + +Creates a composite Function that has the specified `_rootFunction_` as its root. The composite denotes a higher-level Function encapsulating the entire graph of Functions underlying the specified `rootFunction`. + +*** +`FunctionPtr CNTK::Alias(const Variable& operand, const std::wstring& name = L"")` -This method clones `this` Function. For evaluation, this method is called to create a cloned Function which can then be used by another thread to evaluate the same model. For this purpose, the `parameterCloneMethod` is set to its default value `cloned`. The parameter `replacements` specifies any variable replacements that are applied in the cloned Function instance, and is usually not needed for evaluation. +Creates a new Function instance which is just an alias of the specified `_operand_`. + +*** +### The following methods create a Value object from input data in dense format. *** `template static ValuePtr CNTK::Value::CreateBatch(const NDShape& sampleShape, const std::vector& batchData, const DeviceDescriptor& device, bool readOnly = false)` @@ -77,6 +114,9 @@ Parameters: * _device_: on which device the Value should be created. * _readOnly_: the Value is read-only if this flag is `true`. 
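To make the flow above concrete, here is a minimal sketch that combines `LoadModel`, dense-format `CreateBatch`, and `Evaluate` in a single evaluation call, using only the methods documented on this page. The model file name `model.dnn`, the assumption of a single input variable, and the zero-filled two-sample batch are illustrative assumptions, not part of the API reference:

```cpp
// Minimal sketch (assumptions: "model.dnn" exists, the model has exactly one
// input, and each sample is a dense vector matching the input variable's shape).
#include "CNTKLibrary.h"
#include <unordered_map>
#include <vector>

void EvaluateDenseBatch()
{
    using namespace CNTK;

    auto device = DeviceDescriptor::UseDefaultDevice();

    // Load the model (see LoadModel above).
    FunctionPtr modelFunc = Function::LoadModel(L"model.dnn", device);

    Variable inputVar  = modelFunc->Arguments()[0]; // assumes a single input
    Variable outputVar = modelFunc->Output();

    // Two samples laid out back to back; the buffer size must be a multiple
    // of the input sample size (inputVar.Shape().TotalSize()).
    size_t sampleSize = inputVar.Shape().TotalSize();
    std::vector<float> batchData(2 * sampleSize, 0.0f); // fill with real feature values

    // Dense batch Value (see CreateBatch above).
    ValuePtr inputValue = Value::CreateBatch<float>(inputVar.Shape(), batchData, device);

    std::unordered_map<Variable, ValuePtr> arguments = { { inputVar, inputValue } };
    // Passing nullptr lets the library allocate the output storage, as noted for Evaluate above.
    std::unordered_map<Variable, ValuePtr> outputs   = { { outputVar, nullptr } };

    modelFunc->Evaluate(arguments, outputs, device);

    // Copy the results out (CopyVariableValueTo is described further below).
    std::vector<std::vector<float>> outputData;
    outputs[outputVar]->CopyVariableValueTo(outputVar, outputData);
}
```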
+*** +### The following methods create a Value object from input data in one-hot vector format. + *** `template static ValuePtr CNTK::Value::CreateBatch(size_t dimension, const std::vector& batchData, const DeviceDescriptor& device, bool readOnly = false);` @@ -134,6 +174,106 @@ Parameters: * _device_: on which device the Value object should be created. * _readOnly_: the Value is read-only if this flag is `true`. +*** +### The following methods create a Value object from input data in sparse CSC format. +Currently the Compressed Sparse Column Format (CSC) is supported. The CSC format stores the matrix in column-major format, and the array containing the column indices is compressed. A matrix in CSC format is represented by the following parameters: +* _nonZeroValues_: the data array that holds all nonzero values of the matrix in column-major format. +* _rowIndices_: the array that contains the row indices of the corresponding elements in array _nonZeroValues_. +* _colStarts_: the array that holds indices into the arrays _rowIndices_ and _nonZeroValues_. + +A detailed description of the CSC format can be found [here](http://docs.nvidia.com/cuda/cusparse/index.html#compressed-sparse-column-format-csc). + +*** +`template static ValuePtr CNTK::Value::CreateSequence(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false)` + +Creates a new Value object containing a sequence of samples using CSC sparse input format. The sequence length is the number of rows of the sparse matrix. The created sequence is a new sequence. _ElementType_ is the data type of the created Value object. Currently, `float` and `double` are supported. + +Parameters: +* _sampleShape_: the tensor shape of the Value. For sparse input, the tensor shape leading dimensionality must be the same as the total size of the tensor shape. +* _sequenceLength_: the sequence length, which is also the number of rows in the sparse matrix. +* _colStarts_: the array holds indices for each column into the arrays _rowIndices_ and _nonZeroValues_. +* _rowIndices_: the array that contains the row indices of the corresponding elements in array _nonZeroValues_. +* _nonZeroValues_: the array that holds all nonzero values in the sparse matrix. +* _numNonZeroValues_: the number of nonzero values in the sparse matrix. +* _device_: on which device the Value object should be created. +* _readOnly_: the Value is read-only if this flag is `true`. + +*** +`template static ValuePtr CNTK::Value::CreateSequence(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly = false)` + +Creates a new Value object containing a sequence of samples using CSC sparse input format. The sequence length is the number of rows of the sparse matrix. The sequenceStartFlag specifies whether this sequence is a new sequence or continuation of a previous sequence from a previous call to this method. _ElementType_ is the data type of the created Value object. Currently, `float` and `double` are supported. + +Parameters: +* _sampleShape_: the tensor shape of the Value. For sparse input, the tensor shape leading dimensionality must be the same as the total size of the tensor shape. 
+* _sequenceLength_: the sequence length, which is also the number of rows in the sparse matrix. +* _colStarts_: the array holds indices for each column into the arrays _rowIndices_ and _nonZeroValues_. +* _rowIndices_: the array that contains the row indices of the corresponding elements in array _nonZeroValues_. +* _nonZeroValues_: the array that holds all nonzero values in the sparse matrix. +* _numNonZeroValues_: the number of nonzero values in the sparse matrix. +* _sequenceStartFlag_: `true` indicates that it is a new sequence. `false` means a continuation of a previous sequence. +* _device_: on which device the Value object should be created. +* _readOnly_: the Value is read-only if this flag is `true`. + +*** +`template static ValuePtr CNTK::Value::CreateSequence(size_t dimension, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false)` + +Creates a new Value object containing a sequence of samples using CSC sparse input format. The sequence length is the number of rows of the sparse matrix. The created sequence is a new sequence. _ElementType_ is the data type of the created Value object. Currently, `float` and `double` are supported. + +Parameters: +* _dimension_: the size of dimension of the one-hot vector. +* _sequenceLength_: the sequence length, which is also the number of rows in the sparse matrix. +* _colStarts_: the array holds indices for each column into the arrays _rowIndices_ and _nonZeroValues_. +* _rowIndices_: the array that contains the row indices of the corresponding elements in array _nonZeroValues_. +* _nonZeroValues_: the array that holds all nonzero values in the sparse matrix. +* _numNonZeroValues_: the number of nonzero values in the sparse matrix. +* _device_: on which device the Value object should be created. +* _readOnly_: the Value is read-only if this flag is `true`. + +*** +`template static ValuePtr CNTK::Value::CreateSequence(size_t dimension, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly = false)` + +Creates a new Value object containing a sequence of samples using CSC sparse input format. The sequence length is the number of rows of the sparse matrix. The sequenceStartFlag specifies whether this sequence is a new sequence or continuation of a previous sequence. _ElementType_ is the data type of the created Value object. Currently, `float` and `double` are supported. + +Parameters: +* _dimension_: the size of dimension of the one-hot vector. +* _sequenceLength_: the sequence length, which is also the number of rows in the sparse matrix. +* _colStarts_: the array holds indices for each column into the arrays _rowIndices_ and _nonZeroValues_. +* _rowIndices_: the array that contains the row indices of the corresponding elements in array _nonZeroValues_. +* _nonZeroValues_: the array that holds all nonzero values in the sparse matrix. +* _numNonZeroValues_: the number of nonzero values in the sparse matrix. +* _sequenceStartFlag_: `true` indicates that it is a new sequence. `false` means a continuation of a previous sequence. +* _device_: on which device the Value object should be created. +* _readOnly_: the Value is read-only if this flag is `true`. + +*** +### The following methods create a Value object from NDArrayView. 
+ +*** +`static ValuePtr CNTK::Value::Create(const NDShape& sampleShape, const std::vector& sequences, const DeviceDescriptor& device, bool readOnly = false)` + +Creates a new Value object based on a collection of NDArrayViews. Each sequence in _sequences_ is a new sequence. + +Parameters: +* _sampleShape_: the tensor shape of the Value being created. +* _sequences_: a collection of sequences represented by NDArrayView. Each NDArrayView represents a sequence. +* _device_: on which device the Value object should be created. +* _readOnly_: the Value is read-only if this flag is `true`. + +*** +`static ValuePtr CNTK::Value::Create(const NDShape& sampleShape, const std::vector& sequences, const std::vector& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly, bool createNewCopy)` + +Creates a new Value object based on a collection of NDArrayViews. The sequenceStartFlags specifies whether a sequence is a new sequence or continuation of a previous sequence. + +Parameters: +* _sampleShape_: the tensor shape of the Value being created. +* _sequences_: a collection of sequences represented by NDArrayView. Each NDArrayView represents a sequence. +* _sequenceStartFlags_: A collection of boolean values. Each element represents whether the corresponding sequence in _sequences_ is a new sequence (in case of `true`) or a continuation of a previous sequence (in case of `false`). +* _device_: on which device the Value object should be created. +* _readOnly_: the Value is read-only if this flag is `true`. + +*** +### The following methods copy data stored in a Value object as dense or one-hot vector output. + *** `template void CNTK::Value::CopyVariableValueTo(const Variable& outputVariable, std::vector>& sequences)` diff --git a/articles/test/CNTK-Python-known-issues-and-limitations.md b/articles/test/CNTK-Python-known-issues-and-limitations.md new file mode 100644 index 00000000..f330654a --- /dev/null +++ b/articles/test/CNTK-Python-known-issues-and-limitations.md @@ -0,0 +1 @@ +- The core API itself is implemented in C++ for speed and efficiency and python bindings are created through SWIG. We are increasingly creating thin python wrappers for the APIs to attach docstrings to, but this is a work in progress and for some of the APIs, you may directly encounter SWIG generated API definitions (which are not the prettiest to read). diff --git a/articles/test/CNTK-Shared-Libraries-Naming-Format.md b/articles/test/CNTK-Shared-Libraries-Naming-Format.md new file mode 100644 index 00000000..09ea885a --- /dev/null +++ b/articles/test/CNTK-Shared-Libraries-Naming-Format.md @@ -0,0 +1,92 @@ +Starting from CNTK **v.2.0 RC 1** we introduce new naming format for CNTK Shared Libraries for both Windows (`*.dll`, `*.lib`) and Linux (`*.so`) versions. + +We also changed the names of the existing CNTK libraries to make them more meaningful and clearly distinguish between debug and release versions. Debug versions are now marked by letter `d` at the end of the file name (right before file extension). See complete name match tables below. + +Finally we introduce the concept of **CNTK Component version** which is now an integral part of every name of CNTK Shared Libraries. + +The new format of a CNTK Shared Library file name is: +``` +-[d]. +``` +See complete name match tables below. + +#### Match between old and new CNTK Shared Libraries names. Windows. +In the table below we use `2.0rc1` as CNTK Component Version. 
**Note, that in your actual installation Component Version may, and very likely will be different.** + +| Old name | New name Release | New name Debug | +|:--------:|:----------------:|:--------------:| +|`ActionsLib.lib`|`Cntk.Actions-2.0rc1.lib`|`Cntk.Actions-2.0rc1d.lib`| +|`BinaryReader.dll`|` Cntk.Reader.Binary.Deprecated-2.0rc1.dll`| `Cntk.Reader.Binary.Deprecated-2.0rc1d.dll`| +|`BinaryReader.lib`|` Cntk.Reader.Binary.Deprecated-2.0rc1.lib`|` Cntk.Reader.Binary.Deprecated-2.0rc1d.lib`| +|`CNTK.lib`|`CNTK.lib` (no change)|`CNTK.lib` (no change)| +|`CNTKBinaryReader.dll`|`Cntk.Deserializers.Binary-2.0rc1.dll`|`Cntk.Deserializers.Binary-2.0rc1d.dll`| +|`CNTKBinaryReader.lib`|`Cntk.Deserializers.Binary-2.0rc1.lib`|`Cntk.Deserializers.Binary-2.0rc1d.lib`| +|`CNTKLibrary-2.0.dll`|`Cntk.Core-2.0rc1.dll`|`Cntk.Core-2.0rc1d.dll`| +|`CNTKLibrary-2.0.lib`|`Cntk.Core-2.0rc1.lib`|`Cntk.Core-2.0rc1d.lib`| +|`CNTKLibraryCSBinding.dll`|`Cntk.Core.CSBinding-2.0rc1.dll`|`Cntk.Core.CSBinding-2.0rc1d.dll`| +|`CNTKLibraryCSBinding.lib`|`Cntk.Core.CSBinding-2.0rc1.lib`|`Cntk.Core.CSBinding-2.0rc1d.lib`| +|`CNTKLibraryManaged-2.0.dll`|`Cntk.Core.Managed-2.0rc1.dll`|`Cntk.Core.Managed-2.0rc1d.dll`| +|`CNTKTextFormatReader.dll`|`Cntk.Deserializers.TextFormat-2.0rc1.dll`|`Cntk.Deserializers.TextFormat-2.0rc1d.dll`| +|`CNTKTextFormatReader.lib`|`Cntk.Deserializers.TextFormat-2.0rc1.lib`|`Cntk.Deserializers.TextFormat-2.0rc1d.lib`| +|`Common.lib`|`Cntk.Common-2.0rc1.lib`|`Cntk.Common-2.0rc1d.lib`| +|`CompositeDataReader.dll`|`Cntk.Composite-2.0rc1.dll`|`Cntk.Composite-2.0rc1d.dll`| +|`CompositeDataReader.lib`|`Cntk.Composite-2.0rc1.lib`|`Cntk.Composite-2.0rc1d.lib`| +|`DSSMReader.dll`|`Cntk.Reader.DSSM-2.0rc1.dll`|`Cntk.Reader.DSSM-2.0rc1d.dll`| +|`DSSMReader.lib`|`Cntk.Reader.DSSM-2.0rc1.lib`|`Cntk.Reader.DSSM-2.0rc1d.lib`| +|`EvalDll.dll`|`Cntk.Eval-2.0rc1.dll`|`Cntk.Eval-2.0rc1d.dll`| +|`EvalDll.lib`|`Cntk.Eval-2.0rc1.lib`|`Cntk.Eval-2.0rc1d.lib`| +|`EvalWrapper.dll`|`Cntk.Eval.Wrapper-2.0rc1.dll`|`Cntk.Eval.Wrapper-2.0rc1d.dll`| +|`HTKDeserializers.dll`|`Cntk.Deserializers.HTK-2.0rc1.dll`|`Cntk.Deserializers.HTK-2.0rc1d.dll`| +|`HTKDeserializers.lib`|`Cntk.Deserializers.HTK-2.0rc1.lib`|`Cntk.Deserializers.HTK-2.0rc1d.lib`| +|`HTKMLFReader.dll`|`Cntk.Reader.HTKMLF-2.0rc1.dll`|`Cntk.Reader.HTKMLF-2.0rc1d.dll`| +|`HTKMLFReader.lib`|`Cntk.Reader.HTKMLF-2.0rc1.lib`|`Cntk.Reader.HTKMLF-2.0rc1d.lib`| +|`ImageReader.dll`|`Cntk.Deserializers.Image-2.0rc1.dll`|`Cntk.Deserializers.Image-2.0rc1d.dll`| +|`ImageReader.lib`|`Cntk.Deserializers.Image-2.0rc1.lib`|`Cntk.Deserializers.Image-2.0rc1d.lib`| +|`LibSVMBinaryReader.dll`|`Cntk.Reader.SVMBinary-2.0rc1.dll`|`Cntk.Reader.SVMBinary-2.0rc1d.dll`| +|`LibSVMBinaryReader.lib`|`Cntk.Reader.SVMBinary-2.0rc1.lib`|`Cntk.Reader.SVMBinary-2.0rc1d.lib`| +|`LMSequenceReader.dll`|`Cntk.Reader.LMSequence-2.0rc1.dll`|`Cntk.Reader.LMSequence-2.0rc1d.dll`| +|`LMSequenceReader.lib`|`Cntk.Reader.LMSequence-2.0rc1.lib`|`Cntk.Reader.LMSequence-2.0rc1d.lib`| +|`LUSequenceReader.dll`|`Cntk.Reader.LUSequence-2.0rc1.dll`|`Cntk.Reader.LUSequence-2.0rc1d.dll`| +|`LUSequenceReader.lib`|`Cntk.Reader.LUSequence-2.0rc1.lib`|`Cntk.Reader.LUSequence-2.0rc1d.lib`| +|`Math.dll`|`Cntk.Math-2.0rc1.dll`|`Cntk.Math-2.0rc1d.dll`| +|`Math.lib`|`Cntk.Math-2.0rc1.lib`|`Cntk.Math-2.0rc1d.lib`| +|`MathCUDA.lib`|`Cntk.Math.Cuda-2.0rc1.lib`|`Cntk.Math.Cuda-2.0rc1d.lib`| +|`PerformanceProfilerDll.dll`|`Cntk.PerformanceProfiler-2.0rc1.dll`|`Cntk.PerformanceProfiler-2.0rc1d.dll`| 
+|`PerformanceProfilerDll.lib`|`Cntk.PerformanceProfiler-2.0rc1.lib`|`Cntk.PerformanceProfiler-2.0rc1d.lib`| +|`ReaderLib.lib`|`Cntk.Reader-2.0rc1.lib`|`Cntk.Reader-2.0rc1d.lib`| +|`SequenceTrainingLib.lib`|`Cntk.SequenceTrainingLib-2.0rc1.lib`|`Cntk.SequenceTrainingLib-2.0rc1d.lib`| +|`SGDLib.lib`|`Cntk.SGD-2.0rc1.lib`|`Cntk.SGD-2.0rc1d.lib`| +|`SparsePCReader.dll`|`Cntk.Reader.SparsePC-2.0rc1.dll`|`Cntk.Reader.SparsePC-2.0rc1d.dll`| +|`SparsePCReader.lib`|`Cntk.Reader.SparsePC-2.0rc1.lib`|`Cntk.Reader.SparsePC-2.0rc1d.lib`| +|`UCIFastReader.dll`|`Cntk.Reader.UCIFast-2.0rc1.dll`|`Cntk.Reader.UCIFast-2.0rc1d.dll`| +|`UCIFastReader.lib`|`Cntk.Reader.UCIFast-2.0rc1.lib`|`Cntk.Reader.UCIFast-2.0rc1d.lib`| + +---------- + +#### Match between old and new CNTK Shared Libraries names. Linux. +In the table below we use `2.0rc1` as CNTK Component Version. **Note, that in your actual installation Component Version may, and very likely will be different.** + +| Old name | New name Release | New name Debug | +|:--------:|:----------------:|:--------------:| +|`CNTKBinaryReader.so`|`Cntk.Deserializers.Binary-2.0rc1.so`|`Cntk.Deserializers.Binary-2.0rc1d.so`| +|`CNTKTextFormatReader.so`|`Cntk.Deserializers.TextFormat-2.0rc1.so`|`Cntk.Deserializers.TextFormat-2.0rc1d.so`| +|`CompositeDataReader.so`|`Cntk.Composite-2.0rc1.so`|`Cntk.Composite-2.0rc1d.so`| +|`HTKDeserializers.so`|`Cntk.Deserializers.HTK-2.0rc1.so`|`Cntk.Deserializers.HTK-2.0rc1d.so`| +|`HTKMLFReader.so`|`Cntk.Reader.HTKMLF-2.0rc1.so`|`Cntk.Reader.HTKMLF-2.0rc1d.so`| +|`ImageReader.so`|`Cntk.Deserializers.Image-2.0rc1.so`|`Cntk.Deserializers.Image-2.0rc1d.so`| +|`Kaldi2Reader.so`|`Cntk.Reader.Kaldi2-2.0rc1.so`|`Cntk.Reader.Kaldi2-2.0rc1d.so`| +|`libcntklibrary-2.0.so`|`libCntk.Core-2.0rc1.so`|`libCntk.Core-2.0rc1d.so`| +|`libcntkmath.so`|`libCntk.Math-2.0rc1.so`|`libCntk.Math-2.0rc1d.so`| +|`libeval.so`|`libCntk.Eval-2.0rc1.so`|`libCntk.Eval-2.0rc1d.so`| +|`libperfprofiler.so`|`libCntk.PerformanceProfiler-2.0rc1.so`|`libCntk.PerformanceProfiler-2.0rc1d.so`| +|`LibSVMBinaryReader.so`|`Cntk.Reader.SVMBinary-2.0rc1.so`|`Cntk.Reader.SVMBinary-2.0rc1d.so`| +|`LMSequenceReader.so`|`Cntk.Reader.LMSequence-2.0rc1.so`|`Cntk.Reader.LMSequence-2.0rc1d.so`| +|`LUSequenceReader.so`|`Cntk.Reader.LUSequence-2.0rc1.so`|`Cntk.Reader.LUSequence-2.0rc1d.so`| +|`SparsePCReader.so`|`Cntk.Reader.SparsePC-2.0rc1.so`|`Cntk.Reader.SparsePC-2.0rc1d.so`| +|`UCIFastReader.so`|`Cntk.Reader.UCIFast-2.0rc1.so`|`Cntk.Reader.UCIFast-2.0rc1d.so`| + + + + + + diff --git a/articles/test/Debug-a-Python-notebook.md b/articles/test/Debug-a-Python-notebook.md deleted file mode 100644 index 559abf4f..00000000 --- a/articles/test/Debug-a-Python-notebook.md +++ /dev/null @@ -1,14 +0,0 @@ -You will need to install ipdb via -``` -pip install ipdb -``` -Make sure the above installation happens for the correct python environment (the one used to run the notebook). -Afterwards, in your notebook you should add this line to import the debugger. -```python -from IPython.core.debugger import Tracer -``` -Finally, at the point where you want to debug add this line -```python -Tracer()() -``` -When you run the cell with the `Tracer()()` the execution will drop into ipdb at the next line. Then you can start inspecting things with the usual [pdb](https://docs.python.org/2/library/pdb.html) commands. Don't forget to quit the debugger when you are done! 
diff --git a/articles/test/Examples.md b/articles/test/Examples.md index aa954978..e3931e03 100644 --- a/articles/test/Examples.md +++ b/articles/test/Examples.md @@ -8,20 +8,5 @@ The examples are structured by topic into Image, Language Understanding, Speech, and so forth. To get started with CNTK we recommend the tutorials in the `Tutorials` folder. -## Python Examples - -The best way to learn about the APIs is to look at the following examples in the [CNTK clone root]/Examples directory: - -* [MNIST](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Examples/Image/Classification/MLP/Python/SimpleMNIST.py): A fully connected feed-forward model for classification of MNIST images. (follow the instructions in Examples/Image/DataSets/MNIST/README.md) -* [TrainResNet_CIFAR10](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Examples/Image/Classification/ResNet/Python/TrainResNet_CIFAR10.py): An image classification ResNet model for training on the CIFAR image dataset. (follow the instructions in Examples/Image/DataSets/CIFAR-10/README.md to get the CIFAR dataset and convert it to the CNTK supported format) -* [SequenceClassification](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Examples/SequenceClassification/SimpleExample/Python/SequenceClassification.py): An LSTM sequence classification model for text data. -* [Sequence2Sequence](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Examples/SequenceToSequence/CMUDict/Python/Sequence2Sequence.py): A sequence to sequence grapheme to phoneme translation model that trains on the CMUDict corpus. -* [NumpyInterop](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Tutorials/NumpyInterop/FeedForwardNet.py) - NumPy interoperability example showing how to train a simple feed-forward network with training data fed using NumPy arrays. -* [LanguageUnderstanding](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Examples/LanguageUnderstanding/ATIS/Python/LanguageUnderstanding.py) - Language Understanding. -* [Video](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Examples/Video/GettingStarted/Python/Conv3D_UCF11.py[) - Basic 3D convolution networks for deep learning on video tasks. - - An overview on all examples and tutorials is also provided by the [Cognitive Toolkit Model Gallery](https://www.microsoft.com/en-us/research/product/cognitive-toolkit/model-gallery/) page. - - diff --git a/articles/test/Express-a-gating-mechanism.md b/articles/test/Express-a-gating-mechanism.md deleted file mode 100644 index d5e430c4..00000000 --- a/articles/test/Express-a-gating-mechanism.md +++ /dev/null @@ -1,5 +0,0 @@ -The simplest way is to use - -[`BS.Boolean.If`](./If-Operation)` (condition, thenExpression, elseExpression)` - -which works with scalars as well as tensors (all arguments can [broadcast](./Binary-Operations#broadcasting-semantics)). 
\ No newline at end of file diff --git a/articles/test/Express-a-softmax-over-a-dynamic-axis.md b/articles/test/Express-a-softmax-over-a-dynamic-axis.md deleted file mode 100644 index 21801e12..00000000 --- a/articles/test/Express-a-softmax-over-a-dynamic-axis.md +++ /dev/null @@ -1,14 +0,0 @@ -A softmax over a dynamic axis can be done via a recurrence: - -BrainScript: -``` -SoftMaxOverSequence (z) = { - # define a recursive expression for \log(\sum_{i=1}^t \exp(z_i)) - runningLogSumExp = LogPlus (z, - PastValue (0, runningLogSumExp, defaultHiddenActivation=-1e30)) - logSumExp = If (BS.Loop.IsLast (runningLogSumExp), # if last entry - /*then*/ runningLogSumExp, # then copy that - /*else*/ FutureValue (0, logSumExp)) # else just propagate to the front - result = Exp (z - logSumExp) -}.result -``` \ No newline at end of file diff --git a/articles/test/Express-a-softmax-with-a-temperature-parameter.md b/articles/test/Express-a-softmax-with-a-temperature-parameter.md deleted file mode 100644 index 3addc168..00000000 --- a/articles/test/Express-a-softmax-with-a-temperature-parameter.md +++ /dev/null @@ -1,4 +0,0 @@ -A temperature softmax is very easy in BrainScript -``` -TemperatureSoftmax (z, T) = Softmax (z / Constant (T)) -``` \ No newline at end of file diff --git a/articles/test/Express-the-error-rate-of-my-binary-classifier.md b/articles/test/Express-the-error-rate-of-my-binary-classifier.md deleted file mode 100644 index d5d15814..00000000 --- a/articles/test/Express-the-error-rate-of-my-binary-classifier.md +++ /dev/null @@ -1,7 +0,0 @@ -Do not use ClassificationError as it only works for multiclass labels. Using the standard arithmetic and logical operators you can express this as -``` -err = (label != (prediction > 0.5)) -... -evaluationNodes = (err) -``` -where we assume here that prediction is an estimate of the probability of the positive class given the input. diff --git a/articles/test/Get-nice-syntax-highlighting-for-BrainScript-config-files.md b/articles/test/Get-nice-syntax-highlighting-for-BrainScript-config-files.md deleted file mode 100644 index 2d2fd6fe..00000000 --- a/articles/test/Get-nice-syntax-highlighting-for-BrainScript-config-files.md +++ /dev/null @@ -1,7 +0,0 @@ -Here's how to do it if you use `vim`; other editors should be similar -``` -:set syntax=conf -``` -This is not perfect but it's much better than no syntax highlighting. If you know how to write a `.vim` syntax file please send us a pull request. - -In Emacs, `conf-mode` is best suited. To switch, press `M-x` to start entering commands, then enter `conf-mode`. On Windows Emacs, there is also `conf-windows-mode`, however that mode appears to sometimes get confused by special characters in comments. diff --git a/articles/test/Get-started-in-sequence-to-sequence-modelling.md b/articles/test/Get-started-in-sequence-to-sequence-modelling.md deleted file mode 100644 index 7e6a7b3c..00000000 --- a/articles/test/Get-started-in-sequence-to-sequence-modelling.md +++ /dev/null @@ -1,17 +0,0 @@ -This [hands-on lab](Hands-On-Labs-Language-Understanding) describes the main ingredients for getting started on sequence processing such as the CNTK text format and how to configure the reader to use short aliases for the various input sequences. -The [grapheme-to-phoneme (G2P) example](https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/BrainScript/G2P.cntk) -demonstrates an actual sequence-to-sequence task. 
- -An important issue for sequence-to-sequence modeling is how to decode test data with beam search. -This can be done with in a section of your config where the top level action is "write". Decoding requires a search for the most probable sequence of outputs. CNTK has a [beam search](https://en.wikipedia.org/wiki/Beam_search) decoder while you can call like this -``` -BrainScriptNetworkBuilder = (BS.Seq2Seq.BeamSearchSequenceDecoderFrom ( - BS.Network.Load (decodeModelPath), beamSize)) -``` -and will execute beam search with the specified beam size. For a beam size of 1 there is a specialized greedy decoder -``` -BrainScriptNetworkBuilder = (BS.Seq2Seq.GreedySequenceDecoderFrom ( - BS.Network.Load (decodeModelPath))) -``` -Both decoders have specific requirements to the network, as shown -in the [G2P example](https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/BrainScript/G2P.cntk) diff --git a/articles/test/Get-things-to-work-correctly-when-I-take-the-last-element-of-a-sequence.md b/articles/test/Get-things-to-work-correctly-when-I-take-the-last-element-of-a-sequence.md deleted file mode 100644 index f8c1def8..00000000 --- a/articles/test/Get-things-to-work-correctly-when-I-take-the-last-element-of-a-sequence.md +++ /dev/null @@ -1,11 +0,0 @@ -There are two issues here. Support that you have a sequence `seq`. You want to take the last element, further process it with a few layers. -```python -last = C.sequences.last(seq) -z = a_few_layers(last) -``` -Now you want to plug `z` in your loss but you get an error about dynamic axes. Input variables are created with some default dynamic axes but `last` (and `z`) had its dynamic axes determined by `sequences.last`. So one possibility is to define the label variable at this point and have it copy its dynamic axes from `z`. Typically: -```python -y = C.input_variable(z.shape, dynamic_axes=z.dynamic_axes) -loss = C.squared_error(y, z) -``` -Finally, when training, the data labels must have a dynamic axis of size one i.e. each element in the batch should have a shape of (1,)+y.shape. \ No newline at end of file diff --git a/articles/test/Hands-On-Labs-Image-Recognition.md b/articles/test/Hands-On-Labs-Image-Recognition.md index a6a40089..38ab2aad 100644 --- a/articles/test/Hands-On-Labs-Image-Recognition.md +++ b/articles/test/Hands-On-Labs-Image-Recognition.md @@ -1,9 +1,9 @@ -# Hands-On Lab: Image recognition with Convolutional Networks, Batch Normalization, and Residual Nets - Note that this tutorial requires the latest master version, or the upcoming CNTK 1.7 which will be released soon. -An intermediate binary download can be found in the [instructions for the KDD CNTK Hands-On Tutorial](KDD-2016-Tutorial) that +An intermediate binary download can be found in the [instructions for the KDD CNTK Hands-On Tutorial](https://github.com/Microsoft/CNTK/wiki/KDD-2016-Tutorial) that this tutorial was originally designed for. +# Hands-On Lab: Image recognition with Convolutional Networks, Batch Normalization, and Residual Nets + This hands-on lab shows how to implement convolution-based image recognition with CNTK. We will start with a common convolutional image-recognition architecture, add Batch Normalization, and then extend it into a Residual Network (ResNet-20). 
@@ -17,7 +17,7 @@ The techniques you will practice include: * parallel training We assume that you are familiar with basics of deep learning, and these specific concepts (if not, -you can catch up with [this two-page introduction](Tutorial2/Tutorial2#going-deep-convolutional-neural-networks-cnns)): +you can catch up with [this two-page introduction](./Tutorial2#going-deep-convolutional-neural-networks-cnns)): * convolutional networks * batch normalization diff --git a/articles/test/Hands-On-Labs-Language-Understanding.md b/articles/test/Hands-On-Labs-Language-Understanding.md index bff32b84..654eb96b 100644 --- a/articles/test/Hands-On-Labs-Language-Understanding.md +++ b/articles/test/Hands-On-Labs-Language-Understanding.md @@ -1,7 +1,7 @@ -# Hands-On Lab: Language Understanding with Recurrent Networks - Note that this tutorial requires the latest master version, or the upcoming CNTK 1.7.1 which will be released soon. +# Hands-On Lab: Language Understanding with Recurrent Networks + This hands-on lab shows how to implement a recurrent network to process text, for the Air Travel Information Services (ATIS) tasks of slot tagging and intent classification. We will start with a straight-forward embedding followed by a recurrent LSTM. diff --git a/articles/test/How-do-I-Adapt-Models-in-BrainScript.md b/articles/test/How-do-I-Adapt-Models-in-BrainScript.md new file mode 100644 index 00000000..96a3acf8 --- /dev/null +++ b/articles/test/How-do-I-Adapt-Models-in-BrainScript.md @@ -0,0 +1,89 @@ +* [Use a trained model as a feature extractor](How-do-I-Adapt-Models-in-BrainScript#use-a-trained-model-as-a-feature-extractor)? +* [Use an already trained network multiple times inside a larger network](./How-do-I-Adapt-Models-in-BrainScript#use-an-already-trained-network-multiple-times-inside-a-larger-network) +* [Adapt a model I trained on one task to another](./How-do-I-Adapt-Models-in-BrainScript#adapt-a-model-i-trained-on-one-task-to-another)? +* [Save and reload weights from one model to another](./How-do-I-Adapt-Models-in-BrainScript#save-and-reload-weights-from-one-model-to-another)? +* [Continue training from a previously saved model](How-do-I-Adapt-Models-in-BrainScript#continue-training-from-a-previously-saved-model)? + +## Use a trained model as a feature extractor + +(thanks to @zpbappi, [Issue #672](https://github.com/Microsoft/CNTK/issues/672)) +
(for a Python API example go [here](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/FeatureExtraction)) + +Consider you have trained, say, a deep auto-encoder, which is symmetric around an encoding layer ("input --> encoding --> input" model). Now you would like to use the first half of the network as a constant feature extractor in another network. + +To achieve this, define the new network that should use the feature extraction, and use the `BS.Network.Load()` and [`BS.Network.CloneFunction()`](./BS-CloneFunction) operations to incorporate a read-only clone of the pre-trained network as part of the new network. The following shows an example: + + # extract a clone of the pre-trained model into a BrainScript + # function that can then be used as a feature extractor. + # Your pre-trained model is expected to have two nodes named 'input' and 'feat'. + featExtNetwork = BS.Network.Load ("YOUR_TRAINED_FE_MODEL") + featExt = BS.Network.CloneFunction ( + featExtNetwork.input, # input node that FE model read data from + featExtNetwork.feat, # output node in FE model that holds the desired features + parameters="constant") # says to freeze that part of the network + + # define your actual model that is meant to consume the features + # This example maps features to logits with one hidden layer. + model (features) = [ + W_hid = ... ; b_hid = ... ; W_out = ... ; b_out = ... + h = Sigmoid (W_hid * features + b_hid) # whatever your hidden layer looks like + z = W_out * h + b_out + ].z + + # define your new network, using featExt() like any old BrainScript function. E.g. + input = Input (...) + z = model (featExt (input)) + ce = CrossEntropyWithSoftmax (labels, z) + featureNodes = (input) + labelNodes = (labels) + criterionNodes = (ce) + +A key parameter is `parameters="constant"`, which will lock all learnable parameters (setting `learningRateMultiplier=0`) inside so that they won't get updated during further training. It also locks potentially used `BatchNormalization()` operations by placing them into inference mode. + +#### How to know the precise node names + +CNTK automatically generates node names, which are attempted to be as close as possible to the original BrainScript expressions that generated them (it is not always possible). If the above fails because it cannot find the node name you used, please consult the log output generated from loading the model. You will see lines like this: + + Validating --> input = InputValue() : -> [1640 x 1 x *1] + Validating --> L4.y = RectifiedLinear (L4.o) : [2048 x 1 x *1] -> [2048 x 1 x *1] + +The precise node names in these two examples are `features` and `L4.y`. E.g. to refer to the former, say `featExtNetwork.input`. + +#### Workaround: Referencing nodes with names containing `.` or `[` + +The above approach currently has a known bug if the `input` or `feat` node you try to use contains a `.` or a `[` in the node name. This will lead to an "unknown identifier" error. To work around this problem, please replace all `.`, `[`, and `]` into underscores `_`. + +In the above example, the desired feature output is named `L4.y`. Saying `featExtNetwork.L4.y` will fail with a message like `unknown identifier 'L4'`. This is because nodes inside a network loaded from file are no longer true BrainScript records. + +The workaround is to use `featExtNetwork.L4_y` instead. + +## Use an already trained network multiple times inside a larger network + +Use the [`CloneFunction`](./CloneFunction)`()` operation. 
E.g.: + + BrainScriptNetworkBuilder = { + smallnet = BS.Network.Load ("pathToYourModel") + subnet1 = CloneFunction (smallnet.input, smallnet.output, parameters="learnable" ) + subnet2 = CloneFunction (smallnet.input, smallnet.output, parameters="constant" ) + subnet3 = CloneFunction (smallnet.input, smallnet.output, parameters="shared" ) + subnet4 = CloneFunction (smallnet.input, smallnet.output, parameters="shared" ) + # now use subnet1, subnet2, subnet3, and subnet4 as regular Brainscript functions + } + +`subnet1()` will get a copy of its learnable parameters, which will continue to be updated in training. +`subnet2()` will get a read-only copy of learnable parameters. +`subnet3()` and `subnet4()` will share the same set of learnable parameters, which will get updated jointly during training. + +## Adapt a model I trained on one task to another + +Use the [[CloneFunction]] operation. The [examples section](https://github.com/Microsoft/CNTK/wiki/CloneFunction#examples) in the [[CloneFunction]] page provides an adaptation example. + +## Save and reload weights from one model to another + +Use the [[CloneFunction]] operation. The [examples section](https://github.com/Microsoft/CNTK/wiki/CloneFunction#examples) in the [[CloneFunction]] page provides an adaptation example. + +## Continue training from a previously saved model + +To continue training from a previously saved snapshot of the model you need to do two things: +* Set keepCheckPointFiles to true in your config file so that check point files will not be deleted. +* Copy the model file and the corresponding check point files to a new folder and point your continued training to that folder. diff --git a/articles/test/Read-and-modify-the-training-weights-from-Python.md b/articles/test/How-do-I-Adapt-models-in-Python.md similarity index 51% rename from articles/test/Read-and-modify-the-training-weights-from-Python.md rename to articles/test/How-do-I-Adapt-models-in-Python.md index 16031b9c..8f8b4f81 100644 --- a/articles/test/Read-and-modify-the-training-weights-from-Python.md +++ b/articles/test/How-do-I-Adapt-models-in-Python.md @@ -1,3 +1,7 @@ +[Read and modify the training weights from Python](./How-do-I-Adapt-models-in-Python#read-and-modify-the-training-weights-from-python) + +## Read and modify the training weights from Python + ```python from cntk import * p=parameter(5, init=glorot_uniform()) diff --git a/articles/test/How-do-I-Deal-with-Errors-in-BrainScript.md b/articles/test/How-do-I-Deal-with-Errors-in-BrainScript.md new file mode 100644 index 00000000..4f0a2119 --- /dev/null +++ b/articles/test/How-do-I-Deal-with-Errors-in-BrainScript.md @@ -0,0 +1,78 @@ +* [Deal with the error "No node named 'x'; skipping"](./How-do-I-Deal-with-Errors-in-BrainScript#deal-with-the-error-no-node-named-x-skipping)? +* [Avoid the "AddSequence: Sequences must be a least one frame long." exception in sequence to sequence](How-do-I-Deal-with-Errors-in-BrainScript#avoid-addsequence-exception)? +* [Deal with the "No Output nodes found" error](./How-do-I-Deal-with-Errors-in-BrainScript#deal-with-the-no-output-nodes-found-error)? +* [Deal with the error "Reached the maximum number of allowed errors"](./How-do-I-Deal-with-Errors-in-BrainScript#deal-with-the-error-reached-the-maximum-number-of-allowed-errors)? 
+* [Deal with "InputValue operation had its row dimension x changed by the reader to y"](./How-do-I-Deal-with-Errors-in-BrainScript#deal-with-inputvalue-operation-had-its-row-dimension-x-changed-by-the-reader-to-y-compatible-dimensions-in-reader-and-config)?
+* [Avoid the error CURAND failure 201](./How-do-I-Deal-with-Errors-in-BrainScript#avoid-the-error-curand-failure-201)?
+
+## Deal with the error 'No node named 'x'; skipping'
+
+This can happen, for example, when you use the `"write"` action with certain values of `outputNodeNames`. The reason is that `outputNodeNames` expects the name of a node in the computational network, which is not necessarily the same as the name of the expression in BrainScript. Check your log to see all the node names and use the appropriate node name as the right-hand side of `outputNodeNames`.
+
+## Avoid AddSequence Exception
+
+Suppose you are training a sequence-to-sequence model for translating from English to Hungarian with inputs like
+```
+ I will not buy this record, it is scratched Nem fogok vásárolni ezt a lemezt, azt karcos
+ My hovercraft is full of eels A légpárnás hajóm tele van angolnákkal
+...
+```
+using the LUSequenceReader. There's a chance you'll encounter the cryptic exception
+```
+EXCEPTION occurred: AddSequence: Sequences must be a least one frame long.
+```
+The way to avoid it is to ensure that the sequence-separator token is in your vocabulary and that its position corresponds to the variable `separatorRow` (0-based) in your config file.
+
+We also recommend not using the LUSequenceReader; instead, convert your text using the
+[`txt2ctf.py`](https://github.com/Microsoft/CNTK/blob/master/Scripts/txt2ctf.py) tool
+and then read it with the [`CNTKTextFormatReader`](CNTKTextFormat-Reader).
+
+## Deal with the 'No Output nodes found' error
+
+Remember that BrainScript is case-sensitive. So code like
+```
+# Special Nodes
+OutputNodes = (ol)
+```
+will not work as expected. The names of all special nodes are *lower* camelCase:
+```
+# Special Nodes
+featureNodes = (features)
+labelNodes = (labels)
+criterionNodes = (ce)
+evaluationNodes = (errs)
+outputNodes = (ol)
+```
+
+## Deal with the error 'Reached the maximum number of allowed errors'
+
+The CNTKTextFormatReader can tolerate a `maxErrors` number of malformed lines (default is 0). The error message
+> Reached the maximum number of allowed errors while reading the input file
+
+means that the reader has encountered more than `maxErrors` malformed lines. Make sure you have generated the input file correctly (e.g. there are no stray vertical bars in comment streams (`|#`)) or increase `maxErrors` if recreating the input file is close to impossible.
+
+## Deal with "InputValue operation had its row dimension x changed by the reader to y" (Compatible dimensions in reader and config)
+
+There are two typical cases where you might see this error message. The underlying reason is that the reader config and the training config do not agree.
+
+The first typical case occurs when you modify a working config file to change the dimensionality of the output (e.g. take a config that works for a 7-class problem and make a new one that should work for a 5-class problem); you may then encounter the error message
+```
+NotifyFunctionValuesMBSizeModified: labels InputValue operation had its row dimension 7 changed by the reader to 5
+```
+This is most likely due to updating only the network definition part of your config file but not your reader definition.
Please double-check:
+
+* all [`Input{}`](./Inputs#input) nodes have corresponding streams in the reader section with the same name
+* the dimensions match
+
+A second case is when you provide a wrong hint to the reader by saying
+```
+featureNodes = (z)
+```
+and `z` is not an actual input but an expression.
+
+## Avoid the error CURAND failure 201
+
+You are probably running on a Pascal GPU (for example a GTX 1080) and are using a version of the Microsoft Cognitive Toolkit compiled with CUDA 7.5.
+
+Starting with CNTK 2 Beta 5 the Windows binary packages are generated using NVIDIA CUDA 8 and will not show this problem anymore. Please update your local CNTK version.
diff --git a/articles/test/How-do-I-Deal-with-Errors-in-Python.md b/articles/test/How-do-I-Deal-with-Errors-in-Python.md
new file mode 100644
index 00000000..5630feac
--- /dev/null
+++ b/articles/test/How-do-I-Deal-with-Errors-in-Python.md
@@ -0,0 +1,33 @@
+* [Debug a Python notebook](./How-do-I-Deal-with-Errors-in-Python#debug-a-python-notebook)?
+* [Get things to work correctly when I take the last element of a sequence](./How-do-I-Deal-with-Errors-in-Python#get-things-to-work-correctly-when-i-take-the-last-element-of-a-sequence)?
+
+## Debug a Python notebook
+
+You will need to install ipdb via
+```
+pip install ipdb
+```
+Make sure the above installation happens in the correct Python environment (the one used to run the notebook).
+Afterwards, in your notebook add this line to import the debugger:
+```python
+from IPython.core.debugger import Tracer
+```
+Finally, at the point where you want to debug, add this line:
+```python
+Tracer()()
+```
+When you run the cell with `Tracer()()`, execution will drop into ipdb at the next line. Then you can start inspecting things with the usual [pdb](https://docs.python.org/2/library/pdb.html) commands. Don't forget to quit the debugger when you are done!
+
+## Get things to work correctly when I take the last element of a sequence
+
+There are two issues here. Suppose you have a sequence `seq`. You want to take the last element and further process it with a few layers.
+```python
+last = C.sequence.last(seq)
+z = a_few_layers(last)
+```
+Now you want to plug `z` into your loss, but you get an error about dynamic axes. Input variables are created with some default dynamic axes, but `last` (and `z`) have their dynamic axes determined by `sequence.last`. So one possibility is to define the label variable at this point and have it copy its dynamic axes from `z`. Typically:
+```python
+y = C.input_variable(z.shape, dynamic_axes=z.dynamic_axes)
+loss = C.squared_error(y, z)
+```
+Finally, when training, the data labels must have a dynamic axis of size one, i.e. each element in the batch should have a shape of `(1,) + y.shape`.
\ No newline at end of file
diff --git a/articles/test/How-do-I-Evaluate-Models-in-BrainScript.md b/articles/test/How-do-I-Evaluate-Models-in-BrainScript.md
new file mode 100644
index 00000000..be8b6f69
--- /dev/null
+++ b/articles/test/How-do-I-Evaluate-Models-in-BrainScript.md
@@ -0,0 +1,102 @@
+* [Do early stopping](./How-do-I-Evaluate-Models-in-BrainScript#do-early-stopping)?
+* [Monitor the error on a held out set during training](./How-do-I-Evaluate-Models-in-BrainScript#monitor-the-error-on-a-held-out-set-during-training)?
+* [Set the dropout rate to 0 during evaluation/testing](./How-do-I-Evaluate-Models-in-BrainScript#Dropout-during-evaluation)?
+* [Evaluate my newly trained model but output the activations at an intermediate layer](./How-do-I-Evaluate-Models-in-BrainScript#evaluate-my-newly-trained-model-but-output-the-activations-at-an-intermediate-layer)? +* [Associate an id with a prediction](./How-do-I-Evaluate-Models-in-BrainScript#associate-an-id-with-a-prediction)? +* [Deploy model evaluation on Windows](https://github.com/Microsoft/CNTK/wiki/CNTK-Evaluation-Overview#eval-samples-in-cntk-binary-download-package-for-windows)? +* [[Evaluate a model in an Azure WebApi]] + +## Do early stopping + +To do early stopping you have to train until the end and then pick the checkpoint that is performing the best on a validation set. To do this include the cv action in your config +``` +command=TrainModel:EvaluateCheckpoints +... +EvaluateCheckpoints = { + action = "cv" + reader = { + file = $myValidationSet$ + #rest of the options same as the "test" reader + ... + } + crossValidationInterval = 3:2:9 # evaluate epochs 3 to 9 with a step of 2 i.e. 3,5,7,9 + sleepTimeBetweenRuns = 0 # let the GPU cool off for this many seconds + #rest of the options same as the "test" action + ... +} +``` +The cv command will print the best model on the validation set. Then you can evaluate that model in your test action on the final test set. + +## Monitor the error on a held out set during training + +To monitor the error on a held out set during training specify a "cvReader" section inside your training block. This typically looks the same as the reader section but points to the held out data. + +For example: +``` +trainDNN = { + action = "train" + ... + reader = { + readerType = "CNTKTextFormatReader" + file = "train.txt" + input = { + features = { dim = 100000 ; format = "sparse" } + labels = { dim = 10; format = "sparse" } + } + } + + cvReader = { + readerType = "CNTKTextFormatReader" + file = "validation.txt" + input = { + features = { dim = 100000 ; format = "sparse" } + labels = { dim = 10; format = "sparse" } + } + } + ... +} +``` + +## Dropout during evaluation + +During evaluation, the dropout rate is automatically set 0, so you don't have to do anything. + +## Evaluate my newly trained model but output the activations at an intermediate layer + +To evaluate an output of a model that was not originally declared as a output, +create a "new" network in the write or test action instead of specifying a `modelPath`, +which is constructed by loading an old model and defining a new set of output nodes: + + myWriteAction = + { + action = "write" + BrainScriptNetworkBuilder = { + network = BS.Network.Load ("$modelPath$") + outputNodes = (network.yourIntermediateNode) + } + ... + } + +where `yourIntermediateNode` denotes the node you want to change into an output. +Sometimes, node names are not identical to the original variable name +used to declare them in BrainScript. +In this case, consult the validation output in the log and use the full name shown there. + +If the name of `yourIntermediateNode` contains periods (`.`) or brackets (`[ ]`), please replace them with underscores (`_`). If your network contained dynamic axes, you might have to redefine them. 
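+
+If you are working from the Python API instead of a `write` action, the same idea takes only a few lines. This is a hedged sketch, not part of the original page: `yourModel.dnn` and the node name `"h1"` are placeholders, and it assumes a plain dense, non-sequence input.
+```python
+import numpy as np
+import cntk as C
+
+model = C.load_model("yourModel.dnn")   # placeholder path to a trained model
+node = model.find_by_name("h1")         # hypothetical node name; check the validation log for real names
+feature_fn = C.combine([node])          # new function whose output is that intermediate node
+
+# evaluate on one random input just to show the call pattern
+x = np.random.rand(1, *feature_fn.arguments[0].shape).astype(np.float32)
+activations = feature_fn.eval({feature_fn.arguments[0]: x})
+```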
+ +## Associate an id with a prediction + +Assuming you have an input called Id (which contains a unique numeric id for each example) in your file you can do +``` +predictionWithId = Splice(prediction:Id) +``` + +## Deploy model evaluation on Windows + +Deploying a trained model from CNTK on Windows is discussed [here](https://github.com/Microsoft/CNTK/wiki/CNTK-Evaluation-Overview#eval-samples-in-cntk-binary-download-package-for-windows) + +## Evaluate a model in an Azure WebApi + +Evaluating a trained model from CNTK in Azure via Web Api is discussed [here](./Evaluate-a-model-in-an-Azure-WebApi) + + diff --git a/articles/test/Evaluate-a-saved-convolutional-network.md b/articles/test/How-do-I-Evaluate-models-in-Python.md similarity index 77% rename from articles/test/Evaluate-a-saved-convolutional-network.md rename to articles/test/How-do-I-Evaluate-models-in-Python.md index 8730d5ec..a3727b8e 100644 --- a/articles/test/Evaluate-a-saved-convolutional-network.md +++ b/articles/test/How-do-I-Evaluate-models-in-Python.md @@ -1,3 +1,8 @@ +* [Evaluate a saved convolutional network](./How-do-I-Evaluate-models-in-Python#evaluate-a-saved-convolutional-network)? +* [Extract features from a specific layer using a trained model](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/FeatureExtraction)? + +## Evaluate a saved convolutional network + There are a few "gotchas" with models trained on images. At this point the transformations are not part of the model so subtracting the mean has to be done manually. Another issue is that PIL loads images in a different order than what was used during training and a transposition is required. Assuming that: @@ -40,3 +45,7 @@ predictions = np.squeeze(z_out.eval({z_out.arguments[0]:[pic]})) top_class = np.argmax(predictions) ``` The reason for the above, is that in old model we save the training information in addition to the actual model parameters. + +## Extract features from a specific layer using a trained model? + +There is an example [here](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/FeatureExtraction). \ No newline at end of file diff --git a/articles/test/Expose-new-operands-in-V2-Python-from-previous-V1-implementations.md b/articles/test/How-do-I-Express-Things-In-Python.md similarity index 60% rename from articles/test/Expose-new-operands-in-V2-Python-from-previous-V1-implementations.md rename to articles/test/How-do-I-Express-Things-In-Python.md index 997a17ed..817cd576 100644 --- a/articles/test/Expose-new-operands-in-V2-Python-from-previous-V1-implementations.md +++ b/articles/test/How-do-I-Express-Things-In-Python.md @@ -1,3 +1,170 @@ +* [Implement an attention mechanism](./How-do-I-Express-Things-In-Python#implement-an-attention-mechanism)? +* [Interrogate the dimensions of internal layers of a network from within the Python API](./How-do-I-Express-Things-In-Python#introspect-or-inspect-or-list-model-input-variables)? +* [Introspect or inspect or list model input variables](./How-do-I-Express-Things-In-Python#introspect-or-inspect-or-list-model-input-variables)? +* [Port projection of 1D input to 1D output from Python API to C++ API](./How-do-I-Express-Things-In-Python#port-projection-of-1d-input-to-1d-output-from-python-api-to-c-api)? +* [Port LSTM NDL primitives to Python](./How-do-I-Express-Things-In-Python#port-lstm-ndl-primitives-to-python)? +* [Restrict a prediction to a bounded interval](./How-do-I-Express-Things-In-Python#restrict-a-prediction-to-a-bounded-interval)? 
+* [Set the verbosity or traceLevel from Python](./How-do-I-Express-Things-In-Python#set-the-verbosity-or-tracelevel-from-python)? +* [Expose new operands in V2 Python from previous V1 implementations](./How-do-I-Express-Things-In-Python#expose-new-operands-in-v2-python-from-previous-v1-implementations)? + +## Implement an attention mechanism + +Implementing an attention mechanism requires computing a softmax over a dynamic axis. One way to do this is with a recurrence. Symbolic recurrences in Python take a little while to get used to. To make things concrete let's see how one might go about implementing a model that takes a query and a candidate answer and computes the cosine similarity of their representations. First we assume that the query and the answer have been processed by pipelines like this +```python +q_lstm = Sequential([ Embedding(500), BiRecurrence(LSTM(300), LSTM(300)), Dense(200)]) +a_lstm = Sequential([ Embedding(500), BiRecurrence(LSTM(300), LSTM(300)), Dense(200)]) +q_embed = q_lstm(question) +a_embed = a_lstm(answer) +``` +where `BiRecurrence` is a convenience function that you can find in the solution of the third task of [this tutorial](https://github.com/Microsoft/CNTK/blob/v2.0.rc1/Tutorials/CNTK_202_Language_Understanding.ipynb). It runs one LSTM forward, another LSTM backward and concatenates the results. After this preprocessing we have a variable-length sequence of 200 dimensional vectors for the query and another variable length sequence of 200 dimensional vectors for the answer. + +To implement an attention mechanism we need to compute scalar values for each position and exponentiate them with an appropriate correction so that the sum of their exponentials equals 1. +```python +w_q = C.parameter((200,1), init=C.glorot_normal()) +w_a = C.parameter((200,1), init=C.glorot_normal()) +zq = C.times(q_embed, w_q) +za = C.times(a_embed, w_a) +``` +Now we need to compute the appropriate correction which is the log of the sum of the exponentials. This can be done with another recurrence. +```python +p = C.placeholder_variable((1)) +prev_zq_or_tiny = C.element_select(C.sequence.is_first(zq), -1e+30, C.past_value(p)) +log_cumsum_exp = C.log_add_exp(zq, prev_zq_or_tiny) +actual_log_cumsum_exp = log_cumsum_exp.replace_placeholders({p:log_cumsum_exp.output}) +log_sum_exp = C.sequence.last(actual_log_cumsum_exp) +attn_q = C.exp(zq - C.sequence.broadcast_as(log_sum_exp , zq)) +``` +The hardest part to understand is the call to `replace_placeholders`. +Before this call this part of the computation graph did not contain a loop: we were +either looking at `zq` or the past value of `p`. Once we call replace_placeholders we close the loop and make +`p` point to the output of the expression it was used to define! + +The attention weights `attn_a` can be obtained in the same fashion. Finally, we can compute the cosine distance between +the attended embeddings as: +```python +attended_q = C.sequence.reduce_sum(attn_q * q_embed) +attended_a = C.sequence.reduce_sum(attn_a * a_embed) +cosine_dst = C.cosine_distance(attended_q, attended_a) +``` + +## Interrogate the dimensions of internal layers of a network from within the Python API + +It depends on how you use the API. If you use the layers API then a model like this: + +```python +model = Sequential([ + Embedding(emb_dim), + Recurrence(LSTM(hidden_dim), + Dense(num_labels) + ]) +``` + +Can be interrogated like this: + +```python +print(len(model.layers)) +print(model.layers[0].E.shape) +print(model.layers[2].b.value) +``` + +i.e. 
you need to know the names of the tensors (E for embedding, b for bias, W for weights). You could recover these with some reflection though.
+
+## Introspect or inspect or list model input variables
+
+If I create a model with some input_variables and then later, from the trainer, I need those input_variables, but I only have the model around, how can I introspect it to get at the input_variables?
+
+Say, for example, you set up your trainer like this:
+```python
+def SetupTrainer():
+    input = cntk.input_variable((input_dim), np.float32)
+    label = cntk.input_variable((num_output_classes), np.float32)
+
+    z = model(input) # (features) -> (prediction as unnormalized log prob)
+    ce = cross_entropy_with_softmax(z, label)
+    errs = classification_error(z, label)
+    criterion = combine ([ce, errs]) # (features, labels) -> (loss, metric)
+
+    trainer = Trainer(model, criterion.outputs[0], criterion.outputs[1], learner)
+
+    return trainer, criterion
+```
+
+Then later, you can introspect to get back `input` and `label` by using “[arguments](https://www.cntk.ai/pythondocs/graph.html#cntk.ops.functions.Function)”:
+```python
+# train the model
+trainer, criterion = SetupTrainer()
+trainer.train_minibatch({criterion.arguments[0]: features, criterion.arguments[1]: labels})
+```
+
+## Port projection of 1D input to 1D output from Python API to C++ API
+
+In the C++ API, a rank-1 tensor denotes a column and tensors are stored in column-major format (i.e. axis 0 is the fastest-changing dimension, followed by axis 1 and so on).
+
+```python
+input = input_variable((input_dim), np.float32)
+times_param = parameter(shape=(input.shape[0], output_dim))
+t = times(input, times_param)
+```
+So to project a 1D input of dim “inputDim” to a 1D output of dim “outputDim”, you need to set things up as follows in C++:
+
+```cpp
+input = InputVariable({ inputDim }, DataType::Float);
+timesParam = CNTK::Parameter({ outputDim, input.Shape()[0] });
+t = Times(timesParam, input);
+```
+
+Note how both the tensor shapes and the order of the operands to the Times operation are reversed compared to the Python code. In Python, to conform to the generally accepted norm established by numpy, a rank-1 tensor denotes a row vector and the data layout is row major (i.e. axis 0 is the slowest-changing dimension, followed by axis 1 and so on). We internally do the required shape transformations at the SWIG layer to map the Python shapes and ops to the C++ implementation correctly.
+
+## Port LSTM NDL primitives to Python
+
+How do I find support for the following NDL LSTM primitives in Python?
+
+**Delay**
+
+* How do I pass an argument to Delay for a variable that is defined later in the network? E.g. for a peephole LSTM, the cell state variable is defined later, but a delay is needed to get the t-1 cell state. Python doesn’t allow variables to be used first and defined later.
+
+* Ans: One needs to use a `placeholder_variable` and later a call to `replace_placeholders`. [Here](https://github.com/Microsoft/CNTK/wiki/Implement-an-attention-mechanism) is a simple example.
+
+
+**RowStack**, **RowSlice**
+
+* Are there any substitutes for these primitives? If not, how do I implement them in Python? Can we operate on variables as if they were numpy arrays?
+
+* Ans: Use [splice](https://cntk.ai/pythondocs/cntk.ops.html?highlight=splice#cntk.ops.splice)
+
+
+**DiagTimes** vs **ElementTimes**
+
+* Is there any difference between them for vector element-wise multiplication? Also, is DiagTimes supported in Python?
+ +* Use [element wise multiplication](https://cntk.ai/pythondocs/cntk.ops.html?highlight=element#cntk.ops.element_times) + +**Parameter initialization** + +* How to Initialize parameters from file in python and set `computeGradient` as false. + +* Use [constants](https://cntk.ai/pythondocs/cntk.ops.html?highlight=splice#cntk.ops.splice). You can specify the initial value via a NumPy array. There are many ways to load a text (or other) file into a NumPy array. + +## Restrict a prediction to a bounded interval + +You can use `clip`. For example if you use the layers API +```python +z = Sequential([ Dense(500, activation=relu), + Dense(4, activation=None), + clip(Placeholder(), 0, 224) ]) +``` +will create a network with one layer with relu's and one layer with linear activations. The latter has four outputs whose predictions are limited in the interval [0,224]. This could be used to predict bounding boxes for images of size 224 x 224. + +## Set the verbosity or traceLevel from Python + +```python +import _cntk_py + +_cntk_py.set_computation_network_trace_level(1) +``` + +## Expose new operands in V2 Python from previous V1 implementations + There are several steps to expose a function that is available in the V1 library to V2 in Python: @@ -7,7 +174,7 @@ to V2 in Python: > Step 3: Establish the glue layers on the Python and C++ API in SWIG layers -## Step 1: Python interface +### Step 1: Python interface In this step you define the interface to the operand you want to expose. In the following example we have and operand that takes 2 inputs that are variables @@ -72,7 +239,7 @@ def test_cosine_distance_with_negative_samples(): # Add tests that assert model shape and the returned values ``` -## Step 2: Expose the operator in V2 C++ API +### Step 2: Expose the operator in V2 C++ API > Update `CNTKLibrary.h` (in `//Source/CNTKv2LibraryDll/API`): Add the signature you would like to expose. This should mirror the signature @@ -274,7 +441,7 @@ for the unique ID assertions ``` -## Step 3: Updates to SWIG: +### Step 3: Updates to SWIG: Some of the compiler warnings are to be ignored. @@ -291,7 +458,7 @@ Some of the compiler warnings are to be ignored. %ignore_function CNTK::Internal::CosineDistanceWithNegativeSamples; ``` -## Optionaly expose functionality in BrainScript V2 +### Optionaly expose functionality in BrainScript V2 > Update `ComputationNetworkBuilder.h` (in //Source/ComputationNetworkLib): diff --git a/articles/test/How-do-I-Express-Things-in-BrainScript.md b/articles/test/How-do-I-Express-Things-in-BrainScript.md new file mode 100644 index 00000000..28ff14ec --- /dev/null +++ b/articles/test/How-do-I-Express-Things-in-BrainScript.md @@ -0,0 +1,107 @@ +* [Get nice syntax highlighting for BrainScript config files](./How-do-I-Express-Things-in-BrainScript#get-nice-syntax-highlighting-for-brainscript-config-files) ? +* [Express the error rate of my binary classifier](./How-do-I-Express-Things-in-BrainScript#express-the-error-rate-of-my-binary-classifier)? +* [Express a softmax with a temperature parameter](./How-do-I-Express-Things-in-BrainScript#express-a-softmax-with-a-temperature-parameter)? +* [Express a gating mechanism](./How-do-I-Express-Things-in-BrainScript#express-a-gating-mechanism)? +* [Express a softmax over a dynamic axis](./How-do-I-Express-Things-in-BrainScript#express-a-softmax-over-a-dynamic-axis)? +* [Zoneout](./How-do-I-Express-Things-in-BrainScript#implement-zoneout)? +* [Build a constant 3D tensor](./How-do-I-Express-Things-in-BrainScript#build-a-constant-3d-tensor)? 
+* [Combine or concatenate vectors in BrainScript](./How-do-I-Express-Things-in-BrainScript#build-a-constant-3d-tensor)?
+* [Interpret the training loss](./How-do-I-Express-Things-in-BrainScript#interpret-the-training-loss)?
+
+## Get nice syntax highlighting for BrainScript config files
+
+Here's how to do it if you use `vim`; other editors should be similar:
+```
+:set syntax=conf
+```
+This is not perfect but it's much better than no syntax highlighting. If you know how to write a `.vim` syntax file, please send us a pull request.
+
+In Emacs, `conf-mode` is best suited. To switch, press `M-x` to start entering commands, then enter `conf-mode`. On Windows Emacs, there is also `conf-windows-mode`; however, that mode sometimes appears to get confused by special characters in comments.
+
+## Express the error rate of my binary classifier
+
+Do not use ClassificationError as it only works for multiclass labels. Using the standard arithmetic and logical operators, you can express this as
+```
+err = (label != (prediction > 0.5))
+...
+evaluationNodes = (err)
+```
+where we assume here that `prediction` is an estimate of the probability of the positive class given the input.
+
+## Express a softmax with a temperature parameter
+
+A temperature softmax is very easy in BrainScript:
+```
+TemperatureSoftmax (z, T) = Softmax (z / Constant (T))
+```
+
+## Express a gating mechanism
+
+The simplest way is to use
+
+[`BS.Boolean.If`](./If-Operation)` (condition, thenExpression, elseExpression)`
+
+which works with scalars as well as tensors (all arguments can [broadcast](./Binary-Operations#broadcasting-semantics)).
+
+## Express a softmax over a dynamic axis
+
+A softmax over a dynamic axis can be done via a recurrence:
+
+BrainScript:
+```
+SoftMaxOverSequence (z) = {
+    # define a recursive expression for \log(\sum_{i=1}^t \exp(z_i))
+    runningLogSumExp = LogPlus (z,
+                       PastValue (0, runningLogSumExp, defaultHiddenActivation=-1e30))
+    logSumExp = If (BS.Loop.IsLast (runningLogSumExp),  # if last entry
+                /*then*/ runningLogSumExp,              # then copy that
+                /*else*/ FutureValue (0, logSumExp))    # else just propagate to the front
+    result = Exp (z - logSumExp)
+}.result
+```
+
+## Implement Zoneout
+
+You might be wondering if Dropout applied to a Constant is dynamically evaluated. It is! Therefore [Zoneout](http://arxiv.org/abs/1606.01305) is as simple as
+```
+Zoneout (x, xprev) =
+{
+    mask = Dropout (BS.Constants.OnesLike (x))
+    res = BS.Boolean.If (mask, xprev, x)
+}.res
+```
+
+## Build a constant 3D tensor
+
+I want to build a constant 3D tensor in CNTK. I have learned how to produce 1D and 2D constant arrays, and I can stack (concatenate or combine) them and repeat them. How do I stack the 2D tensors to make a 3D tensor?
+
+Say you have three tensors, e.g.
+
+```
+A = ParameterTensor {100:200}
+B = ParameterTensor {100:200}
+C = ParameterTensor {100:200}
+```
+
+You can now say
+
+```
+    ABC = Splice (A:B:C, axis=3)
+```
+
+which will give you a [100 x 200 x 3] tensor.
+
+(If, on the other hand, you had said Splice (A:B:C, axis=1), you would get a [300 x 200] tensor, and Splice (A:B:C, axis=2) would get you a [100 x 600] tensor.)
+
+Note that to splice in a new dimension, the dimensions of all inputs must match. E.g. you cannot stack a ParameterTensor {100:200} and a ParameterTensor {100:50} along axis 3 (but you could stack them along axis 2, which would give you a [100 x 250] tensor).
+
+Also note that axis indices are 1-based, like in math. E.g.
the row dimension of a matrix is the first axis, axis=1, and the column dimension is the second axis, axis=2. + +## Interpret the training loss + +How does CNTK calculate the training loss (as bolded) after an epoch is finished? Does the final model go through the whole training set to get the value, or does it simply average the miniBatch training losses produced by the model while it gets updated during the epoch? +
Finished Epoch[ 1 of 100]: [Training] CE = 0.15920662 * 59999999; totalSamplesSeen = 59999999; learningRatePerSample = 9.9999997e-05; epochTime=641.392s
+Final Results: Minibatch[1-3510]: CE = 0.32190538 * 3592929
+Finished Epoch[ 1 of 100]: [Validate] CE = 0.32190538 * 3592929
+ +The answer is the latter (where the averaging is weighted by the samples in the minibatch in case they are variable-length). I.e. you do not get the actual training loss of the model in the state it is in at the end of the epoch. \ No newline at end of file diff --git a/articles/test/How-do-I-Read-Things-in-BrainScript.md b/articles/test/How-do-I-Read-Things-in-BrainScript.md new file mode 100644 index 00000000..3fcb9873 --- /dev/null +++ b/articles/test/How-do-I-Read-Things-in-BrainScript.md @@ -0,0 +1,51 @@ +* [Specify multiple label streams with the HTKMLFReader](./How-do-I-Read-Things-in-BrainScript#specify-multiple-label-streams-with-the-htkmlfreader)? +* [Use the built-in readers to train a network model using multiple input files](./How-do-I-Read-Things-in-BrainScript#use-built-in-readers-with-multiple-inputs)? +* [Put labels and features in separate files with CNTKTextFormatReader](./How-do-I-Read-Things-in-BrainScript#put-labels-and-features-in-separate-files-with-cntktextformatreader)? + +## Specify multiple label streams with the HTKMLFReader + +The HTKMLFReader (the reader to read Master Label Files (MLF) of the Hidden Markov Toolkit (HTK)) +can be configured to read multiple label streams. The example below is taken from +[TIMIT_TrainMultiTask_ndl_deprecated.cntk](https://github.com/Microsoft/CNTK/blob/master/Examples/Speech/Miscellaneous/TIMIT/config/TIMIT_TrainMultiTask_ndl_deprecated.cntk) +in the Examples directory: + + reader = { + readerType = "HTKMLFReader" + ... + labels = { + mlfFile = "$MlfDir$/TIMIT.train.align_cistate.mlf.cntk" + labelMappingFile = "$MlfDir$/TIMIT.statelist" + labelDim = 183 + labelType = "category" + } + regions = { + mlfFile = "$MlfDir$/TIMIT.train.align_dr.mlf.cntk" + labelDim = 8 + labelType = "category" + } + } + +## Use built in readers with multiple inputs + +See the description at [Understanding and Extending Readers](./BrainScript-and-Python---Understanding-and-Extending-Readers) and look for the section describing how to "compose several data deserializers" + +## Put labels and features in separate files with CNTKTextFormatReader + +Use the composite reader to specifiy the two files, one for lables, and one for features. And make sure to match sequence id's in labels file and the features file. + +``` +reader = [ + … + deserializers = ( + [ + type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader" + file = "$RootDir$/features.txt" + input = [ features = [...]] + ]:[ + type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader" + file = "$RootDir$/labels.txt" + input = [ labels = [...]] + ] +] +``` + diff --git a/articles/test/Load-model-and-access-network-weights-(parameters).md b/articles/test/How-do-I-Read-Things-in-Python.md similarity index 69% rename from articles/test/Load-model-and-access-network-weights-(parameters).md rename to articles/test/How-do-I-Read-Things-in-Python.md index 06432e33..91563da4 100644 --- a/articles/test/Load-model-and-access-network-weights-(parameters).md +++ b/articles/test/How-do-I-Read-Things-in-Python.md @@ -1,3 +1,7 @@ +[Load model and access network weights (parameters)](./How-do-I-Read-Things-in-Python#load-model-and-access-network-weights-parameters) + +## Load model and access network weights (parameters) + You have trained and saved a model file. Now you want to load it elsewhere and get the parameters. 
Here are the steps you need to follow: ```python diff --git a/articles/test/How-do-I-Train-Models-in-BrainScript.md b/articles/test/How-do-I-Train-Models-in-BrainScript.md new file mode 100644 index 00000000..d2bff756 --- /dev/null +++ b/articles/test/How-do-I-Train-Models-in-BrainScript.md @@ -0,0 +1,260 @@ +* [Perform layer-wise training](./How-do-I-Train-Models-in-BrainScript#layer-wise-training)? +* [Train with a multitask objective](./How-do-I-Train-Models-in-BrainScript#train-with-a-multitask-objective)? +* [Train a regression model on images](./How-do-I-Train-Models-in-BrainScript#train-a-regression-model-on-images)? +* [Train a multilabel classifier](./How-do-I-Train-Models-in-BrainScript#train-a-multilabel-classifier)? +* [Get started in sequence to sequence modelling](./How-do-I-Train-Models-in-BrainScript#get-started-in-sequence-to-sequence-modelling)? +* [Train a DSSM (or a convolutional-DSSM) model](./How-do-I-Train-Models-in-BrainScript#train-a-dssm-or-a-convolutional-dssm-model)? +* [Train an Image auto encoder using Deconvolution and Unpooling](./Image-Auto-Encoder-Using-Deconvolution-And-Unpooling)? +* [Object Detection using Fast R-CNN](./Object-Detection-using-Fast-R-CNN)? + +## Layer wise training + +To perform layer-wise training simply use multiple "commands" in your config file, where each command is of type action=train. +``` +command = TrainLayer1:TrainLayer2:TrainLayer3:EndToEndTrain:Test:Output + +TrainLayer1= [ + action = train + ... +] + +TrainLayer2= [ + action = train + ... +] +... +``` + +## Train with a multitask objective + +You can just define your combined criterion as a BrainScript expression and you can monitor all individual task losses by specifying them as `evaluationNodes`. +``` +task1loss = CrossEntropyWithSoftMax(prediction,label) +task2loss = SquareError(reconstruction,input) +mtloss = task1loss + Constant(0.3) .* task2loss +criterionNodes = (mtloss) +evaluationNodes = (task1loss:task2loss) +``` + +## Train a regression model on images + +Below we describe how to predict one or more floating point value for an input image using CNTK. An example use case is to predict a bounding box, for example as (x, y, w, h), of an object in a given image. You could also think of predicting the price for a car just by looking at an image of that car (would be interesting actually). Here we use a very simple example in which we train a network to predict the average rgb values of an image (normalized to [0, 1]). However, the same steps apply for other use cases. These steps are: + +1. Define both the image and the ground truth regression labels as inputs to your network +2. Define a network that predicts a matching number of values wrt your regression labels +3. Define a loss function that compares the predicted values with the ground truth +4. Adapt the reader section in your .cntk config file to read both image and regression labels + +Here is how you could do it. The full config file is included in the Examples folder at [Examples/Image/Regression/RegrSimple_CIFAR10.cntk](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Regression/RegrSimple_CIFAR10.cntk). That folder also contains the scripts to download the image data and generate the regression ground truth for training and testing. 
+ +1-3) Defining inputs, network and loss function: + +``` + BrainScriptNetworkBuilder = [ + imageShape = 32:32:3 + featScale = Constant(1/256) + labelDim = 3 + + model (features) = { + featNorm = Scale(features, featScale) + h1 = LinearLayer {100, init="gaussian", initValueScale=1.5} (featNorm) + ol = LinearLayer {labelDim, init="gaussian", initValueScale=1.5} (h1) + }.ol + + # inputs + features = Input {imageShape} + regrLabels = Input {labelDim} + + # apply model to features + ol = model (features) + + # define regression loss + # rmse = sqrt(SquareError(regrLabels, ol) / labelDim) + sqerr = SquareError (regrLabels, ol) + rmse = Sqrt (Constant(1/labelDim).* sqerr) + + featureNodes = (features) + labelNodes = (regrLabels) + criterionNodes = (rmse) + evaluationNodes = (rmse) + OutputNodes = (ol) + ] +``` + +4) Defining a composite reader using both the ImageReader and CNTKTextFormatReader: + +``` + reader = { + verbosity = 0 ; randomize = true + deserializers = ({ + type = "ImageDeserializer" ; module = "ImageReader" + file = "$dataDir$/cifar-10-batches-py/train_map.txt" + input = { + features = { transforms = ( + { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } : + { type = "Transpose" } + )} + ignored = { labelDim = 10 } + } + } : { + type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader" + file = "$dataDir$/cifar-10-batches-py/train_regrLabels.txt" + input = { + regrLabels = { dim = 3 ; format = "dense" } + } + }) + } +``` + +The reader is a composite reader that uses the ImageReader to read images and the CNTKTextFormatReader to read the regression ground truth labels. It does so by defining an array of deserializers (using `{...} : {...}`) and assigning the inputs as defined in the network above (cf. features and regrLabels). + +See [Examples/Image/Miscellaneous/CIFAR-10/06_RegressionSimple.cntk](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Miscellaneous/CIFAR-10/06_RegressionSimple.cntk) for the full config file and the corresponding Readme in that folder for running the example. + +## Train a multilabel classifier + +For multilabel classification you should avoid using CrossEntropy as it can only handle input vectors that sum to 1. A sensible alternative is to use a sum of logistic loss functions, one for each output +``` +... +probabilities = DenseLayer {outputSize, activation=Sigmoid} (hidden) +logisticLoss = Logistic (multiLabels, probabilities) +trainingCriterion = (logisticLoss) +... +``` +Apart from the loss itself you might want to monitor other metrics such as the number of incorrect predictions. There is no builtin expression for this but it can be expressed as +``` +... +hammingLoss (y, p) = ReduceSum (y != (p > 0.5)) +hl = hammingLoss(multiLabels,probabilities) +evaluationNodes = (hl) +... +``` +This counts the number of times y[i] disagrees with p[i]>0.5. + +## Get started in sequence to sequence modelling + +This [hands-on lab](Hands-On-Labs-Language-Understanding) describes the main ingredients for getting started on sequence processing such as the CNTK text format and how to configure the reader to use short aliases for the various input sequences. +The [grapheme-to-phoneme (G2P) example](https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/BrainScript/G2P.cntk) +demonstrates an actual sequence-to-sequence task. + +An important issue for sequence-to-sequence modeling is how to decode test data with beam search. 
+This can be done in a section of your config where the top-level action is "write". Decoding requires a search for the most probable sequence of outputs. CNTK has a [beam search](https://en.wikipedia.org/wiki/Beam_search) decoder which you can call like this:
+```
+BrainScriptNetworkBuilder = (BS.Seq2Seq.BeamSearchSequenceDecoderFrom (
+                                 BS.Network.Load (decodeModelPath), beamSize))
+```
+This will execute beam search with the specified beam size. For a beam size of 1 there is a specialized greedy decoder:
+```
+BrainScriptNetworkBuilder = (BS.Seq2Seq.GreedySequenceDecoderFrom (
+                                 BS.Network.Load (decodeModelPath)))
+```
+Both decoders have specific requirements on the network, as shown
+in the [G2P example](https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/BrainScript/G2P.cntk).
+
+## Train a DSSM (or a convolutional DSSM) model
+
+[DSSM](https://www.microsoft.com/en-us/research/project/dssm/) (or Deep Semantic Similarity Model) is a DNN model trained on pairs of source and target texts, for learning a short-text embedding space where relevant source and target text pairs are closer. The text inputs to the model are represented by their pre-computed trigram hashes (see [Huang et al.](https://www.microsoft.com/en-us/research/publication/learning-deep-structured-semantic-models-for-web-search-using-clickthrough-data/)). For [C-DSSM](https://www.microsoft.com/en-us/research/publication/learning-semantic-representations-using-convolutional-neural-networks-for-web-search/), the trigram hash is computed per word and then concatenated in the order in which the words occur in the text. The inputs to both models are of fixed size. If we consider 50K trigrams, then the DSSM input corresponding to the source and the target text would be a vector of length 50K each. For C-DSSM, the vector would be of length 50K x n, where the first n-1 word vectors are concatenated, and the nth vector contains a sum of the vectors corresponding to all the remaining words in the text. If there are fewer than n words in the text, then the rest of the vector is padded with zeros. To draw an analogy with images, you can think of the text input for C-DSSM as an image with dimensions 10x1 and 50K channels stored in a `[C x H x W]` format.
+
+This example demonstrates how to train a DSSM / C-DSSM model using the CNTKTextFormatReader. The data should contain 2 features (source and target text) and 1 label (which is always set to the value 1 in the training data, as it contains only positive samples – during training the negative target examples are generated by random sampling).
+Here’s the reader configuration, +``` +reader = { + verbosity = 0 + randomize = true + + deserializers = ({ + type = "CNTKTextFormatDeserializer" + module = "CNTKTextFormatReader" + file = "data.txt" + + input = { + Q = { dim = 500000; format = "sparse" } + D = { dim = 500000; format = "sparse" } + L = { dim = 1; format = "dense" } + } + }) +} +``` +A sample of the input data, +``` +|L 1 |Q 482:1 761:1 1832:1 2117:1 12370:1 17131:1 17854:1 24976:1 27676:1 28055:1 28177:1 29507:1|D 482:1 761:1 1832:1 2117:1 12370:1 17131:1 17854:1 24976:1 27676:1 28055:1 28177:1 29507:1 +|L 1 |Q 149:1 153:1 595:1 671:1 675:1 1110:1 1517:1 2077:1 2114:1 5533:1 5662:1 6886:1 6901:1 7294:1 12846:1 13033:1 16614:1 19425:1 22015:1 24839:1 24994:1 26196:1 26358:1 27565:1|D 149:1 153:1 595:1 671:1 675:1 1110:1 1517:1 2077:1 2114:1 5533:1 5662:1 6886:1 6901:1 7294:1 12846:1 13033:1 16614:1 19425:1 22015:1 24839:1 24994:1 26196:1 26358:1 27565:1 +|L 1 |Q 187:1 2294:1 2800:1 6920:1|D 187:1 2294:1 2800:1 6920:1 +``` +And finally the network definition, +``` +BrainScriptNetworkBuilder = { + # Constants scalars + isConvolutional = true + numWords = (if isConvolutional then 10 else 1) + numTrigramsPerWord = 50000 + numHiddenNodes = 300 + wordWindowSize = 3 + numWindows = numWords - wordWindowSize + 1 + numNeg = 50 + + # Constant tensors + CONST_GAMMA = Constant(10) + CONST_SHIFT = Constant(1) + CONST_NEG = Constant(numNeg) + CONST_PAD_NEG = Constant(0, rows=numNeg, cols=1) + CONST_PAD_POS = Constant(1, rows=1, cols=1) + CONST_PAD = Splice(CONST_PAD_POS : CONST_PAD_NEG, axis=1) + + # Inputs + Q = Input(500000) + D = Input(500000) + L = Input(1) + + qr = if isConvolutional + then TransposeDimensions(ReshapeDimension(Q, 1, numTrigramsPerWord:1:numWords), 1, 3) + else Slice(0, numTrigramsPerWord, Q, axis=1) + dr = if isConvolutional + then TransposeDimensions(ReshapeDimension(D, 1, numTrigramsPerWord:1:numWords), 1, 3) + else Slice(0, numTrigramsPerWord, D, axis=1) + + qdssm = Sequential ( + DenseLayer {numHiddenNodes, activation=Tanh} : + DenseLayer {numHiddenNodes, activation=Tanh} : + DenseLayer {numHiddenNodes, activation=Tanh}) + qcdssm = Sequential ( + ConvolutionalLayer {numHiddenNodes, (wordWindowSize:1), pad=false, activation=Tanh} : + MaxPoolingLayer {(numWindows:1), stride=(1:1)} : + DenseLayer {numHiddenNodes, activation=Tanh} : + DenseLayer {numHiddenNodes, activation=Tanh}) + ddssm = Sequential ( + DenseLayer {numHiddenNodes, activation=Tanh} : + DenseLayer {numHiddenNodes, activation=Tanh} : + DenseLayer {numHiddenNodes, activation=Tanh}) + dcdssm = Sequential ( + ConvolutionalLayer {numHiddenNodes, (wordWindowSize:1), pad=false, activation=Tanh} : + MaxPoolingLayer {(numWindows:1), stride=(1:1)} : + DenseLayer {numHiddenNodes, activation=Tanh} : + DenseLayer {numHiddenNodes, activation=Tanh}) + qembed = if isConvolutional + then qcdssm + else qdssm + dembed = if isConvolutional + then dcdssm + else ddssm + + qf = qembed(qr) + df = dembed(dr) + lf = Times(CONST_PAD, L) + c = CosDistanceWithNegativeSamples(qf, df, CONST_SHIFT, CONST_NEG) + s = Slice(0, 1, c, axis=1, tag="output") + ce = CrossEntropyWithSoftmax(lf, Scale(CONST_GAMMA, c), tag="criterion") +} +``` + +Note: +* While C-DSSM has been shown to consistently perform better than DSSM, it also trains slower (sometime up to 5-10x slower). So in some cases you may get better performance from DSSM in the same training time by training over more data (or for more epochs). +* The original DSSM / C-DSSM were trained on query and document title pairs. 
But you can learn other relationships between short texts by training on other kinds of data such as [session query pairs](https://www.microsoft.com/en-us/research/publication/exploring-session-context-using-distributed-representations-of-queries-and-reformulations/) or [query prefix-suffix pairs](https://www.microsoft.com/en-us/research/publication/query-auto-completion-for-rare-prefixes/). + +## Train an Image Auto Encoder Using Deconvolution And Unpooling + +There are instructions [here](./Image-Auto-Encoder-Using-Deconvolution-And-Unpooling). + +## Train Object Detection using Fast R CNN + +There are instructions [here](./Object-Detection-using-Fast-R-CNN). + diff --git a/articles/test/How-do-I-Train-models-in-Python.md b/articles/test/How-do-I-Train-models-in-Python.md new file mode 100644 index 00000000..b817794b --- /dev/null +++ b/articles/test/How-do-I-Train-models-in-Python.md @@ -0,0 +1,116 @@ + * [Interpret the use of MinibatchSource.next_minibatch](./How-do-I-Train-models-in-Python#interpret-the-use-of-minibatchsourcenext_minibatch)? + * [Load pre-trained checkpointed model and continue retraining](./How-do-I-Train-models-in-Python#load-pre-trained-checkpointed-model-and-continue-retraining)? + * [Relate alpha, beta1, beta2 and epsilon to learning rate and momentum in adam_sgd](./How-do-I-Train-models-in-Python#relate-alpha-beta1-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd)? + * [Train two or more models jointly](./How-do-I-Train-models-in-Python#train-two-or-more-models-jointly)? + * [Train with a weighted loss](./How-do-I-Train-models-in-Python#train-with-a-weighted-loss)? + * [Train a multilabel classifier in Python](./How-do-I-Train-models-in-Python#train-a-multilabel-classifier-in-python)? + * [Train an Image auto encoder using Deconvolution and Unpooling](./How-do-I-Train-models-in-Python#train-an-image-auto-encoder-using-deconvolution-and-unpooling)? + * [Object Detection using Fast R-CNN](./How-do-I-Train-models-in-Python#train-object-detection-using-fast-r-cnn)? + * [Build your own image classifier using Transfer Learning](./How-do-I-Train-models-in-Python#train-an-image-classifier-using-transfer-learning)? + +## Interpret the use of MinibatchSource.next_minibatch + +**TODO** This material applies to all of CNTK and should move to a common location + +My current understanding is that when I call MinibatchSource.next_minibatch(minibatch_size, input_map) during training it will pick a random subset of minibatch_size samples from my training data set? + +**Yes** + +Does the implementation ensure that when I call next_minibatch N times (N = number_of_training_samples / minibatch_size) my whole data set gets covered at the end of the N calls of next_minibatch? Also, when I call next_minibatch 2*N times does it means that my whole training set gets covered twice? + +**Yes**. + +**Additional info:** +* Each cycle through the data will have a different random order. +* If you double your minibatch size, one minibatch will now contain exactly the samples that, before, the corresponding two consecutive minibatches would have contained (this may be approximate if you have variable-length sequences). I.e. two runs that only differ in MB size process the data in the same order. +* If you interrupt and restart from checkpoint, you will get the same random order as if you had not interrupted. + +This is implemented by grounding the reading/randomization process on a nominal time axis, with this simple algorithm: + +* Training proceeds on a nominal infinite time axis. 
If you fetch an MB of size 256, the nominal time progresses by 256.
+* The training corpus is replicated an infinite number of times on this time axis. If you have M samples, then the first replica spans nominal time 0..M-1; the second M..2M-1; etc.
+* Each replica is randomly shuffled within itself, but not across replica boundaries. I.e. once you have processed precisely M samples, you have seen each sample exactly once.
+* Calling next_minibatch(K) gives you the next K samples on this reshuffled infinite time line. It’s the same as calling next_minibatch(1) K times.
+* This is all done lazily.
+* Restarting from a checkpoint is as simple as resetting the nominal time to the nominal time when the checkpoint was created.
+
+## Load pre-trained checkpointed model and continue retraining
+
+Suppose you have a CNTK `trainer` object and save a checkpoint file:
+
+    checkpoint_file = "mModel_checkpointed.dnn"
+    trainer.save_checkpoint(checkpoint_file)
+
+At a later time you load the `".dnn"` file as your model:
+
+    mymodel.load_model(checkpoint_file)
+
+If you restart training this way, the model is loaded but the training state is not restored. To resume training from the previously checkpointed model, you need to use [`trainer.restore_from_checkpoint`](https://cntk.ai/pythondocs/cntk.trainer.html?highlight=restore_from_checkpoint#cntk.trainer.Trainer.restore_from_checkpoint) instead, which recreates the trainer and the learners. One important caveat: in your Python script the order of network creation (not only the structure, but the order in which you create your nodes) must stay the same between the time you create a checkpoint and the time you restore from it.
+
+## Relate alpha, beta1, beta2 and epsilon to learning rate and momentum in adam_sgd
+
+* Alpha is the learning_rate
+* Beta1 is the momentum parameter
+* Beta2 is the variance_momentum parameter
+
+## Train two or more models jointly
+
+There are two cases where this can arise. One is multitask learning; the other is when you have one processing pipeline for some part of your input and another pipeline for the rest.
+
+ * Multitask learning is when you have multiple criteria you want your network to be good at. The standard way to deal with this is to construct a single number out of all these criteria. For example
+```python
+cross_entropy = cross_entropy_with_softmax(predictions, labels)
+reconstruction_error = squared_error(reconstruction, features)
+combined_loss = 0.3 * reconstruction_error + cross_entropy
+```
+ * Multiple pipelines such as the one below are also tricky
+```python
+q_embed = qmodel(question)
+a_embed = amodel(answer)
+```
+
+The trainer takes a single function whose parameters should be modified by the training procedure. How can we pass the union of `qmodel` and `amodel`, or the union of `cross_entropy` and `reconstruction_error`? The answer is to find a point where all the parameters are part of the same function. That happens, at the very least, in the final loss,
+i.e. `loss.parameters` contains all the parameters the loss depends on. Typically, however, our model should not
+include the loss, as it only makes sense for training. Therefore a better approach is to use `combine` to create a combined model:
+```python
+final_model = combine([predictions, reconstruction])
+```
+For the separate-pipeline case there is probably a place where everything gets combined:
+```python
+qa_score = score(q_embed, a_embed)
+```
+then `qa_score` can play the role of `final_model` above.
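+
+To make the multitask case concrete, here is a minimal end-to-end sketch. The layer sizes, the `features`/`labels` variables and the random data are made up for illustration, and the exact `Trainer`/learner signatures shifted a little across the 2.0 betas, so treat this as a sketch rather than canonical usage:
+```python
+import numpy as np
+import cntk as C
+from cntk.layers import Dense
+
+feat_dim, num_classes = 20, 5               # made-up dimensions
+features = C.input_variable(feat_dim)
+labels   = C.input_variable(num_classes)
+
+hidden         = Dense(50, activation=C.relu)(features)
+predictions    = Dense(num_classes)(hidden)   # classification head
+reconstruction = Dense(feat_dim)(hidden)      # reconstruction head
+
+cross_entropy        = C.cross_entropy_with_softmax(predictions, labels)
+reconstruction_error = C.squared_error(reconstruction, features)
+combined_loss        = 0.3 * reconstruction_error + cross_entropy
+metric               = C.classification_error(predictions, labels)
+
+# one model object that contains the parameters of both heads
+final_model = C.combine([predictions, reconstruction])
+
+lr      = C.learning_rate_schedule(0.01, C.UnitType.minibatch)
+learner = C.sgd(final_model.parameters, lr)
+trainer = C.Trainer(final_model, (combined_loss, metric), [learner])
+
+# one update on random data, just to show the call pattern
+x = np.random.rand(8, feat_dim).astype(np.float32)
+y = np.eye(num_classes, dtype=np.float32)[np.random.randint(0, num_classes, 8)]
+trainer.train_minibatch({features: x, labels: y})
+```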
+ +## Train with a weighted loss + +The specification of a per-example weight in the loss is as simple as +```python +weight = input_variable((1)) +weighted_loss = weight * loss +``` +where `loss` is any builtin or user-defined loss function. Of course during training each minibatch will need to have a mapping from weight to actual values (one for each example). + +## Train a multilabel classifier in Python + +For multilabel classification you should avoid using CrossEntropy as it can only handle input vectors that sum to 1. A sensible alternative is to use hamming loss. + +```python +def my_hamming_loss: + y = placeholder_variable() + p = placeholder_variable() + hammingLoss = reduce_sum (not_equal (y, (greater (p, 0.5)))) + return hammingLoss +``` +which you then can call as hammingLoss(labels, prob) and pass as the error metric to the Trainer. + +## Train an Image Auto Encoder Using Deconvolution And Unpooling + +There are instructions [here](./Image-Auto-Encoder-Using-Deconvolution-And-Unpooling). + +## Train Object Detection using Fast R CNN + +There are instructions [here](./Object-Detection-using-Fast-R-CNN). + +## Train an image classifier using Transfer Learning + +There are instructions [here](./Build-your-own-image-classifier-using-Transfer-Learning) diff --git a/articles/test/How-do-I-in-BrainScript.md b/articles/test/How-do-I-in-BrainScript.md index f41e6132..704f86ae 100644 --- a/articles/test/How-do-I-in-BrainScript.md +++ b/articles/test/How-do-I-in-BrainScript.md @@ -1,36 +1,6 @@ * [Express things](./How-do-I-Express-Things-in-BrainScript) * [Train models](./How-do-I-Train-Models-in-BrainScript) - -Evaluate models - -* [[Interpret the training loss]]? -* [[Do early stopping]]? -* [[Monitor the error on a held out set during training]]? -* [Set the dropout rate to 0 during evaluation/testing](Dropout-during-evaluation)? -* [[Evaluate my newly trained model but output the activations at an intermediate layer]]? -* [[Associate an id with a prediction]]? -* [Deploy model evaluation on Windows](https://github.com/Microsoft/CNTK/wiki/CNTK-Evaluation-Overview#eval-samples-in-cntk-binary-download-package-for-windows)? -* [[Evaluate a model in an Azure WebApi]] - -Adapt models - -* [Use a trained model as a feature extractor](How-do-I-use-a-trained-model-as-a-feature-extractor)? -* [[Use an already trained network multiple times inside a larger network]]? -* [[Adapt a model I trained on one task to another]] -* [Save and reload weights from one model to another](Adapt-a-model-I-trained-on-one-task-to-another) -* [Continue training from a previously saved model](Continue-training-from-a-previously-saved-model)? - -Read things - -* [[Specify multiple label streams with the HTKMLFReader]]? -* [Use the built-in readers to train a network model using multiple input files](Use-built-in-readers-with-multiple-inputs)? -* [[Put labels and features in separate files with CNTKTextFormatReader]]? - -Deal with errors - -* [Deal with the error "No node named 'x'; skipping"](Deal-with-the-error-'No-node-named-'x';-skipping')? -* [Avoid the "AddSequence: Sequences must be a least one frame long." exception in sequence to sequence](Avoid-AddSequence-Exception)? -* [Deal with the "No Output nodes found" error](Deal-with-the-'No-Output-nodes-found'-error)? -* [Deal with the error "Reached the maximum number of allowed errors"](Deal-with-the-error-'Reached-the-maximum-number-of-allowed-errors')? 
-* [Deal with "InputValue operation had its row dimension x changed by the reader to y"](Compatible-dimensions-in-reader-and-config)? -* [[Avoid the error CURAND failure 201]]? +* [Evaluate models](./How-do-I-Evaluate-Models-in-BrainScript) +* [Adapt models](./How-do-I-Adapt-Models-in-BrainScript) +* [Read things](./How-do-I-Read-Things-in-BrainScript) +* [Deal with errors](./How-do-I-Deal-with-Errors-in-BrainScript) diff --git a/articles/test/How-do-I-in-Python.md b/articles/test/How-do-I-in-Python.md new file mode 100644 index 00000000..f1da14e5 --- /dev/null +++ b/articles/test/How-do-I-in-Python.md @@ -0,0 +1,8 @@ +* [Express things](./How-do-I-Express-Things-In-Python) +* [Train models](./How-do-I-train-models-in-Python) +* [Evaluate models](./How-do-I-Evaluate-models-in-Python) +* [Adapt models](./How-do-I-Adapt-models-in-Python) +* [Read things](./How-do-I-Read-Things-in-Python) +* [Deal with errors](./How-do-I-Deal-with-Errors-in-Python) + + diff --git a/articles/test/How-do-I.md b/articles/test/How-do-I.md new file mode 100644 index 00000000..f54de49f --- /dev/null +++ b/articles/test/How-do-I.md @@ -0,0 +1,7 @@ +On this page, we are collecting specific questions on how to implement or realize a specific kind of model or feature. + +How do I... + +* [in Python](How-do-I-in-Python) +* [in BrainScript](How-do-I-in-BrainScript) + diff --git a/articles/test/Implement-Zoneout.md b/articles/test/Implement-Zoneout.md deleted file mode 100644 index fa95f9b7..00000000 --- a/articles/test/Implement-Zoneout.md +++ /dev/null @@ -1,8 +0,0 @@ -You might be wondering if Dropout applied to a Constant is dynamically evaluated. It is! Therefore [Zoneout](http://arxiv.org/abs/1606.01305) is as simple as -``` -Zoneout (x, xprev) = -{ - mask = Dropout (BS.Constants.OnesLike (x)) - res = BS.Boolean.If (mask, xprev, x) -}.res -``` diff --git a/articles/test/Implement-an-attention-mechanism.md b/articles/test/Implement-an-attention-mechanism.md deleted file mode 100644 index f63c8ad7..00000000 --- a/articles/test/Implement-an-attention-mechanism.md +++ /dev/null @@ -1,39 +0,0 @@ -Implementing an attention mechanism requires computing a softmax over a dynamic axis. One way to do this is with a recurrence. Symbolic recurrences in Python take a little while to get used to. To make things concrete let's see how one might go about implementing a model that takes a query and a candidate answer and computes the cosine similarity of their representations. First we assume that the query and the answer have been processed by pipelines like this -```python -q_lstm = Sequential([ Embedding(500), BiRecurrence(LSTM(300), LSTM(300)), Dense(200)]) -a_lstm = Sequential([ Embedding(500), BiRecurrence(LSTM(300), LSTM(300)), Dense(200)]) -q_embed = q_lstm(question) -a_embed = a_lstm(answer) -``` -where `BiRecurrence` is a convenience function that you can find in the solution of the third task of [this tutorial](https://github.com/Microsoft/CNTK/blob/v2.0.beta15.0/Tutorials/CNTK_202_Language_Understanding.ipynb). It runs one LSTM forward, another LSTM backward and concatenates the results. After this preprocessing we have a variable-length sequence of 200 dimensional vectors for the query and another variable length sequence of 200 dimensional vectors for the answer. - -To implement an attention mechanism we need to compute scalar values for each position and exponentiate them with an appropriate correction so that the sum of their exponentials equals 1. 
-```python -w_q = C.parameter((200,1), init=C.glorot_normal()) -w_a = C.parameter((200,1), init=C.glorot_normal()) -zq = C.times(q_embed, w_q) -za = C.times(a_embed, w_a) -``` -Now we need to compute the appropriate correction which is the log of the sum of the exponentials. This can be done with another recurrence. -```python -p = C.placeholder_variable((1)) -prev_zq_or_tiny = C.element_select(C.sequence.is_first(zq), -1e+30, C.past_value(p)) -log_cumsum_exp = C.log_add_exp(zq, prev_zq_or_tiny) -actual_log_cumsum_exp = log_cumsum_exp.replace_placeholders({p:log_cumsum_exp.output}) -log_sum_exp = C.sequence.last(actual_log_cumsum_exp) -attn_q = C.exp(zq - C.sequence.broadcast_as(log_sum_exp , zq)) -``` -The hardest part to understand is the call to `replace_placeholders`. -Before this call this part of the computation graph did not contain a loop: we were -either looking at `zq` or the past value of `p`. Once we call replace_placeholders we close the loop and make -`p` point to the output of the expression it was used to define! - -The attention weights `attn_a` can be obtained in the same fashion. Finally, we can compute the cosine distance between -the attended embeddings as: -```python -attended_q = C.sequence.reduce_sum(attn_q * q_embed) -attended_a = C.sequence.reduce_sum(attn_a * a_embed) -cosine_dst = C.cosine_distance(attended_q, attended_a) -``` - - diff --git a/articles/test/Interpret-the-training-loss.md b/articles/test/Interpret-the-training-loss.md deleted file mode 100644 index 04002920..00000000 --- a/articles/test/Interpret-the-training-loss.md +++ /dev/null @@ -1,6 +0,0 @@ -How does CNTK calculate the training loss (as bolded) after an epoch is finished? Does the final model go through the whole training set to get the value, or does it simply average the miniBatch training losses produced by the model while it gets updated during the epoch? -
Finished Epoch[ 1 of 100]: [Training] CE = 0.15920662 * 59999999; totalSamplesSeen = 59999999; learningRatePerSample = 9.9999997e-05; epochTime=641.392s
-Final Results: Minibatch[1-3510]: CE = 0.32190538 * 3592929
-Finished Epoch[ 1 of 100]: [Validate] CE = 0.32190538 * 3592929
- -The answer is the latter (where the averaging is weighted by the samples in the minibatch in case they are variable-length). I.e. you do not get the actual training loss of the model in the state it is in at the end of the epoch. \ No newline at end of file diff --git a/articles/test/Interpret-the-use-of-MinibatchSource.next_minibatch.md b/articles/test/Interpret-the-use-of-MinibatchSource.next_minibatch.md deleted file mode 100644 index c8a8735a..00000000 --- a/articles/test/Interpret-the-use-of-MinibatchSource.next_minibatch.md +++ /dev/null @@ -1,23 +0,0 @@ -**TODO** This material applies to all of CNTK and should move to a common location - -My current understanding is that when I call MinibatchSource.next_minibatch(minibatch_size, input_map) during training it will pick a random subset of minibatch_size samples from my training data set? - -**Yes** - -Does the implementation ensure that when I call next_minibatch N times (N = number_of_training_samples / minibatch_size) my whole data set gets covered at the end of the N calls of next_minibatch? Also, when I call next_minibatch 2*N times does it means that my whole training set gets covered twice? - -**Yes**. - -**Additional info:** -* Each cycle through the data will have a different random order. -* If you double your minibatch size, one minibatch will now contain exactly the samples that, before, the corresponding two consecutive minibatches would have contained (this may be approximate if you have variable-length sequences). I.e. two runs that only differ in MB size process the data in the same order. -* If you interrupt and restart from checkpoint, you will get the same random order as if you had not interrupted. - -This is implemented by grounding the reading/randomization process on a nominal time axis, with this simple algorithm: - -* Training proceeds on a nominal infinite time axis. If you fetch a MB of size 256, the nominal time progresses by 256. -* The training corpus is replicated an infinite number of times on this time axis. If you have M samples, then the first replica spans nominal time 0..M-1; the second M..2M-1; etc. -* Each replica is random-shuffled within, but not across replica boundaries. I.e. once you have processed precisely M samples, you have seen each sample exactly once. -* Calling next_minibatch(K) gives you the next K samples on this reshuffled infinite time line. It’s the same as calling next_minibatch(1) for K times. -* This is all done lazily. -* Restarting from checkpoint is as simple as resetting the nominal time to the nominal time when the checkpoint was created. diff --git a/articles/test/Interrogate-the-dimensions-of-internal-layers-of-a-network-from-within-the-Python-API.md b/articles/test/Interrogate-the-dimensions-of-internal-layers-of-a-network-from-within-the-Python-API.md deleted file mode 100644 index ee374ff8..00000000 --- a/articles/test/Interrogate-the-dimensions-of-internal-layers-of-a-network-from-within-the-Python-API.md +++ /dev/null @@ -1,20 +0,0 @@ -It depends on how you use the API. If you use the layers API then a model like this: - -```python -model = Sequential([ - Embedding(emb_dim), - Recurrence(LSTM(hidden_dim), - Dense(num_labels) - ]) -``` - -Can be interrogated like this: - -```python -print(len(model.layers)) -print(model.layers[0].E.shape) -print(model.layers[2].b.value) -``` - -i.e. you need to know the names of the tensors (E for embedding, b for bias, W for weights). You could recover these with some reflection though. 
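A minimal sketch of that kind of reflection, assuming a `model` built with the layers API as above, is to walk the `parameters` collection of the function (or of an individual layer) and print the names and shapes:
```python
# every learnable parameter of the whole model, with its name and shape
for p in model.parameters:
    print(p.name, p.shape)

# the same works per layer, e.g. for the Embedding layer
for p in model.layers[0].parameters:
    print(p.name, p.shape)
```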
- diff --git a/articles/test/Introspect-or-inspect-or-list-model-input-variables.md b/articles/test/Introspect-or-inspect-or-list-model-input-variables.md deleted file mode 100644 index d1eeb2eb..00000000 --- a/articles/test/Introspect-or-inspect-or-list-model-input-variables.md +++ /dev/null @@ -1,24 +0,0 @@ -If I create a model with some input_variable and then later from the trainer, I need the input_variable name, but I only have the model around, how can I introspect to get at the input_variables? - -Say for example, you setup your trainer like this: -```python -SetupTrainer(): - input = cntk.input_variable((input_dim), np.float32) - label = cntk.input_variable((num_output_classes), np.float32) - - z = model(input) # (features) -> (prediction as unnormalized log prob) - ce = cross_entropy_with_softmax(z, label) - errs = classification_error(z, label) - criterion = combine ([ce, errs]) # (features, labels) -> (loss, metric) - - trainer = Trainer(model, criterion.outputs[0], criterion.outputs[1], learner) - - return trainer, criterion -``` - -Then later, you needed to introspect to get back the names input and label by using “[arguments](https://www.cntk.ai/pythondocs/graph.html#cntk.ops.functions.Function)”: -```python -# train the model -trainer, criterion = SetupTrainer() -trainer.train_minibatch({criterion.arguments[0]: features, criterion.arguments[1]: labels}) -``` \ No newline at end of file diff --git a/articles/test/Layer-wise-training.md b/articles/test/Layer-wise-training.md deleted file mode 100644 index f5d38921..00000000 --- a/articles/test/Layer-wise-training.md +++ /dev/null @@ -1,15 +0,0 @@ -To perform layer-wise training simply use multiple "commands" in your config file, where each command is of type action=train. -``` -command = TrainLayer1:TrainLayer2:TrainLayer3:EndToEndTrain:Test:Output - -TrainLayer1= [ - action = train - ... -] - -TrainLayer2= [ - action = train - ... -] -... -``` \ No newline at end of file diff --git a/articles/test/Load-pre-trained-checkpointed-model-and-continue-retraining.md b/articles/test/Load-pre-trained-checkpointed-model-and-continue-retraining.md deleted file mode 100644 index aa02f08e..00000000 --- a/articles/test/Load-pre-trained-checkpointed-model-and-continue-retraining.md +++ /dev/null @@ -1,11 +0,0 @@ -You have a CNTK `trainer` object and save a checkpoint file. - - checkpoint_file = "mModel_checkpointed.dnn" - trainer.save_checkpoint(checkpoint_file) - -At a later time I load the `".dnn"` file as my model - - mymodel.load_model(checkpoint_file) - -When you restart training and it would load the model but not the weights. To be able to restore training from the previously checkpointed model. You need to use [`trainer.restore_from_checkpoint`](https://cntk.ai/pythondocs/cntk.trainer.html?highlight=restore_from_checkpoint#cntk.trainer.Trainer.restore_from_checkpoint) instead, to recreate the trainer and learners. One important thing is that in your python script the order of network creation (not only structure! but the order in which you create your nodes) should stay the same at the point when you create a checkpoint and when you restore from it. - diff --git a/articles/test/Managed-EvalDLL-API.md b/articles/test/Managed-EvalDLL-API.md deleted file mode 100644 index ba662fc9..00000000 --- a/articles/test/Managed-EvalDLL-API.md +++ /dev/null @@ -1,110 +0,0 @@ -There is a managed interface available for evaluating models without requiring a data reader or data file(s). 
This interface is implemented in CLI managed code inside the EvalWrapper.DLL library. This library in turn uses the EvalDLL.dll library to perform the actual network evaluations (in native C++). - -The managed interface name (in C#) is the following: - ` - public interface IEvaluateModelManaged : IDisposable - ` -Where `T` is the element type (`float` or `double`) - -This interface provides the following methods: -*** -`void Init(string config)` -This method initializes the evaluation engine with the specified configuration file. The entries from the configuration file are parsed and assigned to evaluation engine, however, the network is not created inside this call. Additionally, this method will default the `numCPUThreads` property to `1`, unless the configuration file contains a different value for this property. - -*** - -`void CreateNetwork(string networkDescription)` -This method builds the network from either the network description in the configuration file, or if the `modelPath` attribute is specified, it loads the model from disk. -* `networkDescription` : contains the description of the network, either through a NetworkBuilder entry of a modelPath attribute. - -*** - -`void CreateNetwork(string networkDescription, List outputNodeNames)` -This method builds the network from either the network description in the configuration file, or if the `modelPath` attribute is specified, it loads the model from disk. Additionally, it replaces the list of output nodes from the configuration file, with the list passed in as an argument. This enables the caller to retrieve the output values of other nodes, such as those in the hidden layer(s). -* `networkDescription` : contains the description of the network, either through a NetworkBuilder entry of a modelPath attribute. -* `outputNodeNames` : list of nodes to be marked as Output, so they can be evaluated. - -*** - -`void CreateNetwork(string networkDescription, int deviceId)` -This method builds the network from the network description in the configuration file. It is a merely a convenience method that prepends `deviceId=` to the `networkDescription` text. -* `networkDescription` : contains the description of the network, either through a `NetworkBuilder` property or a `modelPath` attribute. -* `deviceId` : specifies the device Id value to prepend to the network description's `deviceId` property. - -*** - -`void CreateNetwork(string networkDescription, int deviceId, List outputNodeNames)` -This method builds the network from the network description in the configuration file. It is a merely a convenience method that prepends `deviceId=` to the `networkDescription` text. -* `networkDescription` : contains the description of the network, either through a NetworkBuilder entry of a modelPath attribute. -* `deviceId` : specifies the device Id value to prepend to the network description's `deviceId` property. -* `outputNodeNames` : list of nodes to be marked as Output, so they can be evaluated. - -*** - -`List Evaluate(string outputKey, int outputSize)` -This method evaluates the network with a single forward pass (no input) and returns the values associated with the specified layer `outputKey`. -* `outputKey` : layer name to return the values from. -* `outputSize` : number of values in the output layer. -**This method is deprecated. 
Instead use the `List Evaluate(string outputKey)` method.** - -*** - -`List Evaluate(string outputKey)` -This method evaluates the network with a single forward pass (no input) and returns the values associated with the specified layer `outputKey`. -* `outputKey` : layer name to return the values from. -Internally the method, determines the required data buffer size for the output and allocates the necessary buffer. - -*** - -`void Evaluate(Dictionary> inputs, Dictionary> outputs)` -This method evaluates the network using the provided input and retrieves multiple output layers. -* `inputs` : the dictionary mapping input layer names to values to use as input to the network. -* `outputs` : the dictionary mapping output layer names to values to retrieve from the network. - -*** - -`List Evaluate(Dictionary> inputs, string outputKey, int outputSize)` -This method evaluates a network with the provided input and retrieves a single output layer -* `inputs` : the dictionary mapping input layer names to values to use as input to the network. -* `outputKey` : the name of the desired output layer. -* `outputSize` : the number of values in the output layer. -**This method is deprecated. Instead use the `List Evaluate(Dictionary> inputs, string outputKey)` method.** - -*** - -`List Evaluate(Dictionary> inputs, string outputKey)` -This method evaluates a network with the provided input and retrieves a single output layer -* `inputs` : the dictionary mapping input layer names to values to use as input to the network. -* `outputKey` : the name of the desired output layer. -Internally the method, determines the required data buffer size for the output and allocates the necessary buffer. - -*** - -`List EvaluateRgbImage(Bitmap image, string outputKey)` -This method evaluates a bitmap image and retrieves a single output layer -* `image` : the bitmap image input in RGB format. It must already be re-sized to match the size expected by the network. -* `outputKey` : the name of the desired output layer. -Internally the method first extracts the feature vector that contains 3 channels, and then feed it as the input to the network. It also allocates the necessary buffer for the output. - -*** - -`Dictionary GetNodeDimensions(NodeGroup nodeGroup)` -This method returns a dictionary of items, with each item mapping the layer name (key) to the dimension size. The node group is defined through the `NodeGroup` enumeration: - - public enum class NodeGroup - { - nodeInput, // an input node - nodeOutput, // an output node - nodeSpecified - }; - -*** - -There are two implementations of this interface available to the managed client: -`IEvaluateModelManagedF : IEvaluateModelManaged` (Assumes data elements of type `float`) -and -`IEvaluateModelManagedD : IEvaluateModelManaged` (Assumes data elements of type `double`) - - -## Example -The CSEvalClient program located [here](https://github.com/Microsoft/CNTK/blob/master/Examples/Evaluation/CSEvalClient) demonstrates the usage of this evaluation interface. \ No newline at end of file diff --git a/articles/test/Native-EvalDLL-API.md b/articles/test/Native-EvalDLL-API.md deleted file mode 100644 index cb3ac6aa..00000000 --- a/articles/test/Native-EvalDLL-API.md +++ /dev/null @@ -1,72 +0,0 @@ -# C++ EvalDLL API - -There is a native interface available for evaluating models without requiring a data reader or data file(s). This interface is implemented in C++ inside the EvalDLL.dll library in Windows and libeval.so in Linux respectively. 
The EvalDll.dll in turn is also used by the EvalWrapper assembly (which provides a managed layer) in Windows. - -The native interface name is the following: - - template - class IEvaluateModel - -Where `ElemType` is the element type (`float` or `double`). - -This interface provides the following methods: -*** -`void Init(const std::string& config)` -This method initializes the evaluation engine with the specified configuration file. The entries from the configuration file are parsed and assigned to evaluation engine, however, the network is not created inside this call. Additionally, this method will default the `numCPUThreads` property to `1`, unless the configuration file contains a different value for this property. - -*** - -`void CreateNetwork(const std::string& networkDescription)` -This method builds the network from either the network description in the configuration file, or if the `modelPath` attribute is specified, it loads the model from disk. -* `networkDescription` : contains the description of the network, either through a NetworkBuilder entry of a modelPath attribute. - -*** - -`void GetNodeDimensions(std::map& dimensions, NodeGroup nodeGroup)` -This method fills a dictionary of dimensions, with each fimension item mapping the layer name (key) to the dimension size. The node group is defined through the `NodeGroup` enumeration: - - enum NodeGroup - { - nodeInput, // an input node - nodeOutput, // an output node - nodeSpecified - }; -*** - -`void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName)` -This methods prepares the network for Evaluate calls. -* `outputNodeName` : the name of the node that will be evaluated - -*** - -`void Evaluate(std::map*>& inputs, std::map*>& outputs)` -This method evaluates the network using the provided input and retrieves multiple output layers. -* `inputs` : the dictionary mapping input layer names to values to use as input to the network. -* `outputs` : the dictionary mapping output layer names to values to retrieve from the network. - -*** - -`void Evaluate(std::map*>& outputs)` -This method evaluates the network retrieving multiple output layers. The evaluation is a single-forward pass evaluating the output nodes. -* `outputs` : the dictionary mapping output layer names to values to retrieve from the network. - -*** - -`void ResetState()` -TBD - -*** - -`void Destroy()` -Releases resources allocated during network creation. - -*** - -There are two implementations of this interface available to the client: -`IEvaluateModelF : IEvaluateModel` (Assumes data elements of type `float`) -and -`IEvaluateModelD : IEvaluateModel` (Assumes data elements of type `double`) - - -## Example -The [CPPEvalClient](https://github.com/Microsoft/CNTK/tree/master/Examples/Evaluation/CPPEvalClient) program located in the folder [Examples/Evaluation/CPPEvalClient](https://github.com/Microsoft/CNTK/blob/master/Examples/Evaluation/CPPEvalClient) demonstrates the usage of this evaluation interface. diff --git a/articles/test/Port-LSTM-NDL-primitives-to-Python.md b/articles/test/Port-LSTM-NDL-primitives-to-Python.md deleted file mode 100644 index c8710e44..00000000 --- a/articles/test/Port-LSTM-NDL-primitives-to-Python.md +++ /dev/null @@ -1,27 +0,0 @@ -How do I find the support for following NDL LSTM primitives to Python: - -**Delay** - -* How to pass argument in delay of a variable defined later in the network? E.g. for peep hole LSTM, cell state variable is defined later, but delay is needed to get t-1 cell state. 
Python doesn’t allow variables to be used first and defined later. - -* Ans: One needs to use a `placeholder_variable` and later a call to `replace_placeholders`. [Here](https://github.com/Microsoft/CNTK/wiki/Implement-an-attention-mechanism) is a simple example. - - -**RowStack**, **RowSlice** - -* Are there any substitutes for these primitives? If not how to implement them in python? Can we operate on variables as if they are numpy arrays? - -* Ans: Use [splice](https://cntk.ai/pythondocs/cntk.ops.html?highlight=splice#cntk.ops.splice) - - -**DiagTime** vs **ElementTimes** - -* Is there any difference between them for vector element wise multiplication? Also is DiagTimes in supported in python? - -* Use [element wise multiplication](https://cntk.ai/pythondocs/cntk.ops.html?highlight=element#cntk.ops.element_times) - -**Parameter initialization** - -* How to Initialize parameters from file in python and set `computeGradient` as false. - -* Use [constants](https://cntk.ai/pythondocs/cntk.ops.html?highlight=splice#cntk.ops.splice). You can specify the initial value via a NumPy array. There are many ways to load a text (or other) file into a NumPy array. diff --git a/articles/test/Port-projection-of-1D-input-to-1D-output-from-Python-API-to-C---API.md b/articles/test/Port-projection-of-1D-input-to-1D-output-from-Python-API-to-C---API.md deleted file mode 100644 index e338c1d6..00000000 --- a/articles/test/Port-projection-of-1D-input-to-1D-output-from-Python-API-to-C---API.md +++ /dev/null @@ -1,16 +0,0 @@ -In C++ API a rank-1 tensor denotes a column and tensors are stored in column major format (i.e. axis 0 is the faster changing dimension, followed by axis 1 and so on). - -```python -input = input_variable((input_dim), np.float32) -times_param = parameter(shape=(input.shape[0], output_dim)) -t = times(input, times_param) -``` -So to project a 1D input of dim “inputDim” to a 1D output of dim “outputDim”, you need to setup things as follows in C++: - -```cpp -input = InputVariable({ inputDim }, DataType::Float); -timesParam = CNTK::Parameter({ outputDim, input.Shape()[0] }); -t = Times(timesParam, input); -``` - -Note how both the tensor shapes and the order of the operands to the Times operation are reversed compared to the python code. In python, to conform to the generally accepted norm established by numpy, a rank-1 tensor denoted a row vector and data layout is row major (i.e. axis 0 is the slowest changing dimension, followed by axis 1 and so on). We internally do the required shape transformations at the SWIG layer to map the python shapes and ops to the C++ implementation correctly. 
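Coming back to the Delay question in the NDL-porting notes above, here is a minimal sketch of the `placeholder_variable` / `replace_placeholders` pattern for reading the t-1 value of a quantity that is only defined further down. All names and dimensions are illustrative:
```python
import cntk as C

x = C.input_variable(10)

# stand in a placeholder for the cell state that has not been defined yet
c = C.placeholder_variable()
c_prev = C.past_value(c)          # t-1 value of the not-yet-defined cell state

# ... c_prev can now be used while building the actual cell state ...
W = C.parameter((10, 10), init=C.glorot_uniform())
c_actual = C.tanh(C.times(x, W) + c_prev)

# close the loop: point the placeholder at the expression's own output
c_actual.replace_placeholders({c: c_actual.output})
```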
diff --git a/articles/test/Relate-alpha,-beta1,-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd.md b/articles/test/Relate-alpha,-beta1,-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd.md deleted file mode 100644 index 91f355b2..00000000 --- a/articles/test/Relate-alpha,-beta1,-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd.md +++ /dev/null @@ -1,3 +0,0 @@ -* Alpha is the learning_rate -* Beta1 is momentum parameter -* Beta2 is variance_momentum parameter \ No newline at end of file diff --git a/articles/test/ReleaseNotes/CNTK_2_0_RC_1_Release_Notes.md b/articles/test/ReleaseNotes/CNTK_2_0_RC_1_Release_Notes.md new file mode 100644 index 00000000..25aee694 --- /dev/null +++ b/articles/test/ReleaseNotes/CNTK_2_0_RC_1_Release_Notes.md @@ -0,0 +1,152 @@ + +## CNTK v.2.0 RC 1 Release Notes + +With Release Candidate 1 the Microsoft Cognitive Toolkit enters the final set of enhancements before release of the production version of CNTK 2.0. We expect no breaking changes before the release of the production version, but we are eager to hear your feedback! + +### Highlights of CNTK 2.0 Release Candidate 1 + +The release candidate contains [all changes and improvements introduced in CNTK 2.0 during beta phase](https://github.com/Microsoft/CNTK/wiki/CNTK-2.0-Beta-Highlights). + +### Breaking changes + +This release contains the following **breaking changes**: + +* New file names of CNTK Shared Libraries. [Read more in this Wiki article](https://github.com/Microsoft/CNTK/wiki/CNTK-Shared-Libraries-Naming-Format). + * This is important for developers and other users who use explicit CNTK Shared Library file names in their solutions. + * If you are using CNTK NuGet packages for C++ or C# no action is required (beside updating to the latest NuGet package) +* There is a set of breaking changes in Python and C# API. See the correspondent sections below. + +### CNTK Core: new and improved features + +* Improved performance and memory footprint in CSC/CSR sparse matrix operations. +* Caffe-converted pretrained models on image classification including AlexNet, ResNet, VGG and BN-Inception. +* Multiple-axis slicing. +* Improvements in the device selection API (specifying excluded devices, querying device properties, locking device exclusively). +* Limit the number of sweeps for training. + +### CNTK Python API + +#### New feature + +We enabled support for model debugging in Python (similar to gdb/pdb) by wrapping the model with `debug_model()` [Read more here](https://www.cntk.ai/pythondocs/cntk.debugging.html#module-cntk.debugging.debug). 
+ +#### Breaking changes + +This release contains the following **breaking changes in CNTK Python API**: + +* Automatic conversion to NumPy array via `np.asarray(cntk_obj)` is being replaced by `asarray()`/`as_sequences()` methods on those objects + * Reasons: + * `np.assarray()` always returns NumPy arrays, so no sequences could be supported + * `np.assarray()` only supports dense arrays + * `np.assarray()` does not allow to throw exceptions from within the array interface + * Solution: + * `asarray()` returns dense or sparse (instance of `scipy.csr_matrix`) versions of the underlying data + * supported by `Constant`, `Parameter`, `NDArrayView`, `Value`, and `MinibatchData` + * `as_sequences(var)` returns a Python list of sequences whose elements are NumPy arrays (according to var's dynamic and static axes) + * supported by `Value` and `MinibatchData` +* `forward()` and `eval()` behave more consistent now + * Before: when the returned result did had sequences all of same length, they would be returned as a single NumPy array. + * Now: independent of the sequences, the result will always be a list if the output has a sequence axis. This will impact every usage that has the sequence axis in the output variable (e.g. if you just used `input_variable()` with standard settings and did not get rid of the sequence axes before the final output node). +* `MinibatchData.value` is deprecated now because of performance issues + * Replacement: use `MinibatchData.data`, which returns a `Value` instance that does not need to be converted before passing it to the Trainer and thus is much faster. +* Deprecated parameters were removed from `training_session`. The progress printer should be now passed directly to the trainer, not to the training session. To configure different aspects of the training session, please use configuration objects. In the new version CNTK supports: + * checkpoint configuration + * cross validation configuration + * test configuration + +### CNTK C#/.NET Managed API + +#### New and improved features + +* Support of CSC Sparse input. +* More flexible interface types in API to provide more freedom for developers. +* Improved CLS compliance to better support other .NET languages. +* Performance optimizations. +* Multiple bug fixes, including memory safety and concurrent evaluations. + +The updated APIs are described in [this Wiki page](https://github.com/Microsoft/CNTK/wiki/CNTK-Library-Managed-API). + +#### Breaking changes + +This release contains the following **breaking changes in CNTK C# API**: + +* The following C# APIs have been moved to new classes: + * `CNTKLib.Alias()` -> `Function.Alias()` + * `CNTKLib.AsComposite()` -> `Function.AsComposite()` + * `CNTKLib.Combine()` -> `Function.Combine()` + * `CNTKLib.SetMaxNumCPUThreads()` -> `Utils.SetMaxNumCPUThreads()` + * `CNTKLib.GetMaxNumCPUThreads()` -> `Utils.GetMaxNumCPUThreads()` +* We have the following two groups of API changes due to signature simplification and certain type changes. These changes may require adaptations in dependent applications. 
+ +##### C# API changes due to type change from `uint` to `int` + +* Class `NDShape` +``` +public NDShape(int numAxes, int dimension) +public NDShape(int numAxes) +public int Rank { get; } +public IList Dimensions { get; } +public int TotalSize { get; } +public int this[int key] { get; } +public NDShape SubShape(int beginAxisId, int endAxisId) +public NDShape SubShape(int beginAxisId) +public static NDShape CreateNDShape(IEnumerable dimensions) +``` +* Class `DeviceDescriptor` +``` +public int Id { get; } +public static DeviceDescriptor GPUDevice(int deviceId) +``` +* Class `NDArrayView` +``` +public NDArrayView(NDShape viewShape, float[] dataBuffer, DeviceDescriptor device, bool readOnly = false) +public NDArrayView(NDShape viewShape, double[] dataBuffer, DeviceDescriptor device, bool readOnly = false) +public NDArrayView(NDShape viewShape, int[] colStarts, int[] rowIndices, float[] nonZeroValues, DeviceDescriptor device, bool readOnly = false) +public NDArrayView(NDShape viewShape, int[] colStarts, int[] rowIndices, double[] nonZeroValues, DeviceDescriptor device, bool readOnly = false) +``` +* Class `NDMask` +``` +public int MaskedCount { get; } +public void InvalidateSection(IEnumerable sectionOffset, NDShape sectionShape) +public void MarkSequenceBegin(IEnumerable offset) +public void MarkSequenceBegin(IEnumerable offset, NDShape sectionShape) +``` +* Class `Value` +``` +public int MaskedCount { get; } +public static Value CreateBatch(int dimension, IEnumerable batch, DeviceDescriptor device, bool readOnly = false) +public static Value CreateSequence(int dimension, IEnumerable sequence, DeviceDescriptor device, bool readOnly = false) +public static Value CreateSequence(int dimension, IEnumerable sequence, bool sequenceStartFlag, DeviceDescriptor device, bool readOnly = false) +public static Value CreateBatchOfSequences(int dimension, IEnumerable> batchOfSequences, DeviceDescriptor device, bool readOnly = false) +public static Value CreateBatchOfSequences(int dimension, IEnumerable> batchOfSequences, IEnumerable sequenceStartFlags, DeviceDescriptor device, bool readOnly = false) +public static Value Create(int dimension, IEnumerable> sequences, IEnumerable sequenceStartFlags, DeviceDescriptor device, bool readOnly = false) +``` +##### C# API changes due to type change from concrete class type to interface type + +* Class `Function` +``` +public IList Arguments { get; } +public IList Outputs { get; } +``` +* Class `Variable` +``` +public IList DynamicAxes { get; } +``` +* Class `Value` +``` +public static Value CreateBatch(NDShape sampleShape, IEnumerable batch, DeviceDescriptor device, bool readOnly = false) +public static Value CreateSequence(NDShape sampleShape, IEnumerable sequence, DeviceDescriptor device, bool readOnly = false) +public static Value CreateSequence(NDShape sampleShape, IEnumerable sequence, bool sequenceStartFlag, DeviceDescriptor device, bool readOnly = false) +public static Value CreateBatchOfSequences(NDShape sampleShape, IEnumerable> batchOfSequences, DeviceDescriptor device, bool readOnly = false) +public static Value CreateBatchOfSequences(NDShape sampleShape, IEnumerable> batchOfSequences, IEnumerable sequenceStartFlags, DeviceDescriptor device, bool readOnly = false) +public static Value Create(NDShape sampleShape, IEnumerable> sequences, IEnumerable sequenceStartFlags, DeviceDescriptor device, bool readOnly = false) +public static Value Create(NDShape sampleShape, IEnumerable> sequences, IEnumerable sequenceStartFlags, DeviceDescriptor device, bool readOnly 
= false) +public static Value Create(NDShape sampleShape, IEnumerable sequences, DeviceDescriptor device, bool readOnly = false) +public static Value Create(NDShape sampleShape, IEnumerable sequences, IEnumerable sequenceStartFlags, DeviceDescriptor device, bool readOnly = false) +``` + +### CNTK NuGet package + +A new set of NuGet Packages is provided with this Release. + +**IMPORTANT!** In Visual Studio *Manage Nuget Packages* Window change the default option *Stable Only* to *Include Prerelease*. Otherwise the packages will not be visible. The Package version should be ```2.0.0-rc1```. diff --git a/articles/test/Restrict-a-prediction-to-a-bounded-interval.md b/articles/test/Restrict-a-prediction-to-a-bounded-interval.md deleted file mode 100644 index 73904861..00000000 --- a/articles/test/Restrict-a-prediction-to-a-bounded-interval.md +++ /dev/null @@ -1,7 +0,0 @@ -You can use `clip`. For example if you use the layers API -```python -z = Sequential([ Dense(500, activation=relu), - Dense(4, activation=None), - clip(Placeholder(), 0, 224) ]) -``` -will create a network with one layer with relu's and one layer with linear activations. The latter has four outputs whose predictions are limited in the interval [0,224]. This could be used to predict bounding boxes for images of size 224 x 224. \ No newline at end of file diff --git "a/articles/test/Sequence-to-Sequence-\342\200\223-Deep-Recurrent-Neural-Networks-in-CNTK-\342\200\223-Part-2-\342\200\223-Machine-Translation.md" "b/articles/test/Sequence-to-Sequence-\342\200\223-Deep-Recurrent-Neural-Networks-in-CNTK-\342\200\223-Part-2-\342\200\223-Machine-Translation.md" new file mode 100644 index 00000000..e5c88330 --- /dev/null +++ "b/articles/test/Sequence-to-Sequence-\342\200\223-Deep-Recurrent-Neural-Networks-in-CNTK-\342\200\223-Part-2-\342\200\223-Machine-Translation.md" @@ -0,0 +1,96 @@ +by William Darling + +[Last time](https://github.com/Microsoft/CNTK/wiki/Sequence-to-Sequence-%E2%80%93-Deep-Recurrent-Neural-Networks-in-CNTK-%E2%80%93-Part-1), I gave an introduction to sequence-to-sequence networks and their implementation using CNTK. In particular, I talked about how this seemingly simple deep recurrent neural network architecture is revolutionizing the approaches that we take to NLP (and other) tasks like text summarization, syntax parsing, and machine translation. To reiterate, a basic sequence-to-sequence network with no special knowledge of the problem domain is in many cases able to reach – and surpass – the results that a specialized approach, built up over several years by very smart people, have been able to accomplish with precise pre-processing, model tuning, and everything else. Today, I will report some of my experiences and results that I achieved using the basic sequence-to-sequence model in CNTK that I discussed last time, applied to machine translation. + +First, I know essentially nothing about machine translation except that, living in Germany, I depend on it probably a little bit too much. Every time I get a piece of mail that looks official (anything non-official-looking typically gets pre-processed out) I feel a great amount of gratitude to every MT researcher out there. So, like the intended reader of this blog, I am an MT newb. But I wanted to see how difficult it would be to come to a new domain and get some interesting results which is more-or-less what I promised s2s networks allow in my last post. 
I wanted to try something a little bit beyond a toy example, but nothing that would require too much domain knowledge or too much training time. In the end, I decided on English to French translation with the transcripts from the Canadian Parliament as training data. + +In the rest of this post, I will explain the steps I had to take to find the data and get it ready for training, the parameters I chose for my network, my results (I had some memories of the [BLEU](https://en.wikipedia.org/wiki/BLEU) metric left over from grad school) and what the next steps might be if I were so inclined to continue to improve this model. Let's start with the data… + +# Data # + +Getting data for learning a MT model is harder than I had anticipated. I first considered the data from the EMNLP Workshop on Machine Translation ([WMT15](http://www.statmt.org/wmt15/index.html)) but it consists of about 20 GB of data so I thought that might be a bit big for a first try. I then considered taking a subset of that data – say the EuroParl V7. The EuroParl data seemed to require all kinds of sentence-alignment pre-processing with tools that never worked and I quickly gave up on that too. Finally, I went back to my (Canadian) roots and turned to the [Aligned Hansards of the 36th Parliament of Canada](http://www.isi.edu/natural-language/download/hansard/). The Hansards are the name for the transcripts of Parliamentary debates in the Canadian government. Because Canada is a bilingual country, the transcripts are released in a professionally-translated dual English-French set of documents. And best of all, they are completely free to download and use. Perfect for translation training data. + +The training data can be downloaded [here](http://www.isi.edu/natural-language/download/hansard/hansard.36.r2001-1a.house.debates.training.tar) and the testing data is [here](http://www.isi.edu/natural-language/download/hansard/hansard.36.r2001-1a.house.debates.testing.tar). Note that this consists of the House debates (the bigger sets) but there are also Senate debates that could be combined into this data (see the website). The data is already sentence-aligned so we know the input sequence and the output sequence (obviously important) and it's enough data to build a pretty good model, but not so much that it would take weeks to train or use up your whole hard disk: 948,000 sequence pairs for the training data (14,614,000 English words and 15,657,000 French words) and 62,000 sequence pairs for test set 1 and another 60,000 sequence pairs for test set 2 (I only looked at test set 1 and didn't do any parameter optimization). Note that this is a TINY amount of data; to get a great translation model you'll need way more data and way more varied data. For example, the types of conversations that are had in the House of Commons are likely very different from the style, vocabulary, etc. of the types of things that you might want to translate from a TV show or a government letter or a magazine. But for this domain, at least, we should be able to get something that is not bad. 
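As a rough idea of what the "very basic NLTK-based tokenizing" mentioned just below can look like, here is a hypothetical sketch (assuming the `nltk` package and its `punkt` data are installed; this is not the exact preprocessing script used here):
```python
import nltk

def tokenize(sentence):
    # lowercase and split into word tokens, then rejoin with single spaces
    return " ".join(nltk.word_tokenize(sentence.lower()))

print(tokenize("That motion also called on the federal government to take action."))
```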
+ +An example sequence pair from this data looks like this (after some very basic NLTK-based tokenizing): + +| Language | Sentence | +| ------------- |:-------------:| +| English| that motion also called on the federal government to take action to tackle energy inefficiency | +| French| cette motion exigeait également que le gouvernement fédéral prenne des mesures pour résoudre les problèmes dans ce domaine | + +# Model # + +I trained a sequence-to-sequence network with attention, 2 LSTM layers of encoder (single direction), 2 LSTM layers of decoder (all LSTM layers with 256 hidden dimensions), an embedding (learned on the data) of 200 dimensions, vocabulary size of 99,771 (no words were cut), and any sequence longer than 50 words (< 1% of the data) was cut to its first 50 words. I implemented this network in CNTK using the [G2P example](https://github.com/Microsoft/CNTK/blob/master/Examples/SequenceToSequence/CMUDict/BrainScript/G2P.cntk) and a miniscule amount of pre-processing and basically random parameter choosing (i.e. why didn't I do one layer or three layers? Great questions with no answers). + +After about a weekend's worth of training (around 10 full passes over the data – which is probably too much but remember this is just for fun), I had what looked like a good enough model. I tried some basic things: + +| Language | Sentence | +| ------------- |:-------------:| +| English| house of commons | +| French | chambre des communes | +| English| i am here to help you| +| French| je suis ici pour vous aider | +| English| translating is not so difficult | +| French| ce n'est pas une question | + +So all in all not so bad except that the translation of the third sentence means "it is not a question" (almost like talking to a bot!). As discussed above, however, having something close enough to "translating is not so difficult" that the model could learn to generalize to that was said in the House of Commons transcripts is pretty unlikely. That's the same reason that when I tried "hello world" the model guessed "le monde entier" which means "the entire world".[1] + + +# Evaluation # + +The first evaluation that I did (examples just above) was subjective. If you know a language, you can tell if what your model is spitting out is useful or garbage right away. And the stuff above looks pretty good indeed (when in-domain). For things that were "parliamentary" (if you can imagine that), the model seems really good. For everyday speech or common sayings that don't really match the style of the training data, however, it does much worse. But, we knew this would be a pretty domain-dependent MT system. Luckily, we have the test set from the Hansard data release which contains 62,000 pairs of aligned English and French sentences from the parliamentary debates, but that didn't form part of the training data. Let's evaluate our model on that. + +To evaluate our models, I used the machine translation metric [BLEU](https://en.wikipedia.org/wiki/BLEU). BLEU stands for Bilingual Evaluation Understudy and is probably the most popular automated MT metric in use (as far as I -- as a newb, remember -- know). It aims to correlate (and has been shown to correlate well) with human judgments of translation quality. BLEU is computed over an entire corpus (if you try to use it per-sentence it doesn't tell you much of anything [2]) and consists of a number between 0 and 1 with values closer to 1 representing more similar texts to the reference translation. 
The differences between BLEU and simple precision come in for a few reasons. First, MT systems tend to generate more words than are in a reference text (with many words you have a better chance at getting a high precision of correct words showing up in your translation). BLEU corrects for this by clipping the appearance of words to the maximum number of times that they show up in the reference translations. + +Second, individual word statistics are not usually very telling in translation quality, for obvious reasons. It would be fairly easy to look up each word in a dictionary table and simply replace the English word with its most likely French equivalent. Of course this wouldn't give perfect unigram precision, but it wouldn't be bad. However, it would be a bad translation that lacks proper grammar, and in many cases be entirely nonsensical. To deal with this issue, the same modified precision statistics discussed above are calculated using *n*-grams. The highest correlation with human judgments was found to be when *n*=4, so 4-grams are used in the calculation of BLEU. + +Finally, BLEU on its own could in some cases favor very short sentences. If a translation cuts down a sentence and only retains say two words but those words are in all references, then again it will get a good precision score. To combat this problem, the score for the whole corpus is calculated by combining the modified precision scores using the geometric mean weighted by a brevity penalty. This penalty is applied only when the total length of the reference corpus *r* (the correct translation) is greater than or equal to the total length of the translated corpus *c*. If so, a penalty of *e(1 - r/c)* is applied. So, let's get to it! + +I used BLEU 1.04 (available [here](ftp://jaguar.ncsl.nist.gov/mt/resources) [3]) which was actually really hard to find, and after spending even more time trying to figure out the XML file format/schema/whatever required for the reference and test sets I finally found a pretty good explanation of them [here](http://www.itl.nist.gov/iad/mig/tests/mt/2009/MT09_EvalPlan.pdf). Here is the output of the BLEU metric: + + > bleu-1.04.pl -t mytst.txt -r myref.txt + [...] + Add -0.0082 and +0.0082 to BLEU below for 95% confidence level + System,wmd + SegsScored,61268 + SysWords,1093690 + Ref2SysLen,1.0215 + 1-gPrec,0.5404 + 2-gPrec,0.3277 + 3-gPrec,0.2182 + 4-gPrec,0.1487 + PrecScore,0.2753 + BrevityPenalty,0.9787 + BLEUr1n4c,0.2695 + +So, I have 61,268 segments scored (each sentence I made a segment -- not sure if that's right), and those consist of a total of 1,093,690 words. That sounds about right given the stats above. My brevity penalty was 0.9787 and that's probably at least somewhat related to the fact that the way the decoder is currently setup it uses the length of the correct translation (but none of its content of course) and generates words until it hits that length. Once the end-sequence tag is generated ("
"), I cut the rest of the sequence, but it can never produce something longer. In the end, we get a BLEU score of 0.2695. Is that any good? Well, subjectively, as we saw, it does do a pretty good job on in-domain stuff. And quantitatively? + +A quick Bing search led me to a paper from 2002, "[A Phrase-Based, Joint Probability Model for Statistical Machine Translation](http://anthology.aclweb.org/W/W02/W02-1018.pdf)" by Daniel Marcu and William Wong, that uses the Hansard dataset to train their "traditional" MT model. They translate French->English instead of the other way around, but the numbers should be roughly on-par. They present numbers for their MT system (with special word-alignments, etc.) and for another baseline: Giza (the IBM Model 4). Remember I'm not claiming this to be rigorous, but I do think it suggests that as a total MT newb, with seq2seq, you can get a pretty good model: + +| Model| BLEU Score| +| ------------- |:-------------:| +| IBM Model 4| 0.2158| +| Phrase-based joint probability model| 0.2325| +| CNTK Sequence-to-Sequence MT Newb model| **0.2695**| + +Not bad! + +# Discussion # + +In this post I've showed that by doing some simple data pre-processing, you can get a very good domain-specific (in terms of content) machine translation model trained and ready to use with a very domain-independent (in terms of model-type) model (sequence-to-sequence) and little knowledge of the problem area. More involved papers (e.g. [here](https://arxiv.org/abs/1409.3215) and [here](https://arxiv.org/abs/1409.0473)) show that with a little bit of domain knowledge or at least some more time to spend on the problem, you can get state-of-the-art results using sequence-to-sequence, and even extend to domain-independent machine translation. + +The model I trained could be improved in many easy (and some more complex) ways. For instance, one could try: + +- Training the model for a longer time +- Optimizing the hyperparameters on one of the development sets +- Using WAY more data +- Adding more layers, more hidden dimensions, using a bigger embedding, etc. +- Doing better pre-processing (I did about the crudest thing imaginable) +- Etc. + +# Notes # + +1. For a great explanation of this problem, see the presentation "[Translating across domains is hard](http://www.clsp.jhu.edu/user_uploads/ws12/daume/Domain%20Adaptation%20in%20Statistical%20Machine%20Translation.pdf)". +1. By the way, for all the information about BLEU I'm just paraphrasing the [Wikipedia page](https://en.wikipedia.org/wiki/BLEU) so if you're interested in this stuff at all it's probably best that you go read directly from the source. +1. Yes, they really do still use Perl. 
diff --git a/articles/test/Set-the-verbosity-or-traceLevel-from-Python.md b/articles/test/Set-the-verbosity-or-traceLevel-from-Python.md deleted file mode 100644 index dbcca847..00000000 --- a/articles/test/Set-the-verbosity-or-traceLevel-from-Python.md +++ /dev/null @@ -1,5 +0,0 @@ -```python -import _cntk_py - -_cntk_py.set_computation_network_trace_level(1) -``` diff --git a/articles/test/Setup-CNTK-on-your-machine.md b/articles/test/Setup-CNTK-on-your-machine.md index 243419a6..b418544d 100644 --- a/articles/test/Setup-CNTK-on-your-machine.md +++ b/articles/test/Setup-CNTK-on-your-machine.md @@ -29,9 +29,9 @@ If you just want to download and install the latest precompiled binaries to your |Windows | Linux | |:------------------------|:------------------------| -|[Python-only installation](./Setup-Windows-Python) | [Python-only installation](./Setup-Linux-Python) | -|[Script-driven installation](./Setup-Windows-Binary-Script) | [Script-driven installation](./Setup-Linux-Binary-Script) -|[Manual installation](./Setup-Windows-Binary-Manual) | [Manual installation](./Setup-Linux-Binary-Manual) +|[Python-only installation](./Setup-Windows-Python)
Simple pip install
of CNTK lib for
use in Python | [Python-only installation](./Setup-Linux-Python)
Simple pip install
of CNTK lib for
use in Python | +|[Script-driven installation](./Setup-Windows-Binary-Script)
Script that installs
CNTK Python lib and
CNTK exe for BrainScript | [Script-driven installation](./Setup-Linux-Binary-Script)
Script that installs
CNTK Python lib and
CNTK exe for BrainScript +|[Manual installation](./Setup-Windows-Binary-Manual)
Manually install
CNTK Python lib,
CNTK exe for BrainScript,
and dependencies | [Manual installation](./Setup-Linux-Binary-Manual)
Manually install
CNTK Python lib,
CNTK exe for BrainScript,
and dependencies | | [Docker installation](./CNTK-Docker-Containers) #### Installation and building of the CNTK codebase diff --git a/articles/test/Train-a-DSSM-(or-a-convolutional-DSSM)-model.md b/articles/test/Train-a-DSSM-(or-a-convolutional-DSSM)-model.md deleted file mode 100644 index b9596674..00000000 --- a/articles/test/Train-a-DSSM-(or-a-convolutional-DSSM)-model.md +++ /dev/null @@ -1,97 +0,0 @@ -[DSSM](https://www.microsoft.com/en-us/research/project/dssm/) (or Deep Semantic Similarity Model) is a DNN model trained on pairs of source-target texts, for learning a short-text embedding space where relevant source and target text pairs are closer. The text input to the model is represented by their pre-computed trigram hash (see, [Huang et al.](https://www.microsoft.com/en-us/research/publication/learning-deep-structured-semantic-models-for-web-search-using-clickthrough-data/)). For [C-DSSM](https://www.microsoft.com/en-us/research/publication/learning-semantic-representations-using-convolutional-neural-networks-for-web-search/), the trigram hash is computed per word and then concatenated in the order in which the words occur in the text. The input to both models are of fixed size. If we consider 50K trigrams, then the DSSM input corresponding to the source and the target text would be a vector of length 50K each. For C-DSSM, the vector would be of length 50K x n, where the first n-1 word vectors are concatenated, and the nth vector contains a sum of the vectors corresponding to all the remaining words in the text. If there are less than n words in the text, then the rest of the vector is padded with zeros. To draw an analogy with image, you can think of the text input for C-DSSM as an image with dimensions 10x1 and 50K channels stored in a `[C x H x W]` format. - -This example demonstrates how to train a DSSM / C-DSSM model using CNTKTextFormatReader. The data should contain 2 features (source and target text) and 1 label (which is always set to the value 1 in the training data as it contains only positive samples – during training the negative target examples are generated by random sampling). 
-Here’s the reader configuration, -``` -reader = { - verbosity = 0 - randomize = true - - deserializers = ({ - type = "CNTKTextFormatDeserializer" - module = "CNTKTextFormatReader" - file = "data.txt" - - input = { - Q = { dim = 500000; format = "sparse" } - D = { dim = 500000; format = "sparse" } - L = { dim = 1; format = "dense" } - } - }) -} -``` -A sample of the input data, -``` -|L 1 |Q 482:1 761:1 1832:1 2117:1 12370:1 17131:1 17854:1 24976:1 27676:1 28055:1 28177:1 29507:1|D 482:1 761:1 1832:1 2117:1 12370:1 17131:1 17854:1 24976:1 27676:1 28055:1 28177:1 29507:1 -|L 1 |Q 149:1 153:1 595:1 671:1 675:1 1110:1 1517:1 2077:1 2114:1 5533:1 5662:1 6886:1 6901:1 7294:1 12846:1 13033:1 16614:1 19425:1 22015:1 24839:1 24994:1 26196:1 26358:1 27565:1|D 149:1 153:1 595:1 671:1 675:1 1110:1 1517:1 2077:1 2114:1 5533:1 5662:1 6886:1 6901:1 7294:1 12846:1 13033:1 16614:1 19425:1 22015:1 24839:1 24994:1 26196:1 26358:1 27565:1 -|L 1 |Q 187:1 2294:1 2800:1 6920:1|D 187:1 2294:1 2800:1 6920:1 -``` -And finally the network definition, -``` -BrainScriptNetworkBuilder = { - # Constants scalars - isConvolutional = true - numWords = (if isConvolutional then 10 else 1) - numTrigramsPerWord = 50000 - numHiddenNodes = 300 - wordWindowSize = 3 - numWindows = numWords - wordWindowSize + 1 - numNeg = 50 - - # Constant tensors - CONST_GAMMA = Constant(10) - CONST_SHIFT = Constant(1) - CONST_NEG = Constant(numNeg) - CONST_PAD_NEG = Constant(0, rows=numNeg, cols=1) - CONST_PAD_POS = Constant(1, rows=1, cols=1) - CONST_PAD = Splice(CONST_PAD_POS : CONST_PAD_NEG, axis=1) - - # Inputs - Q = Input(500000) - D = Input(500000) - L = Input(1) - - qr = if isConvolutional - then TransposeDimensions(ReshapeDimension(Q, 1, numTrigramsPerWord:1:numWords), 1, 3) - else Slice(0, numTrigramsPerWord, Q, axis=1) - dr = if isConvolutional - then TransposeDimensions(ReshapeDimension(D, 1, numTrigramsPerWord:1:numWords), 1, 3) - else Slice(0, numTrigramsPerWord, D, axis=1) - - qdssm = Sequential ( - DenseLayer {numHiddenNodes, activation=Tanh} : - DenseLayer {numHiddenNodes, activation=Tanh} : - DenseLayer {numHiddenNodes, activation=Tanh}) - qcdssm = Sequential ( - ConvolutionalLayer {numHiddenNodes, (wordWindowSize:1), pad=false, activation=Tanh} : - MaxPoolingLayer {(numWindows:1), stride=(1:1)} : - DenseLayer {numHiddenNodes, activation=Tanh} : - DenseLayer {numHiddenNodes, activation=Tanh}) - ddssm = Sequential ( - DenseLayer {numHiddenNodes, activation=Tanh} : - DenseLayer {numHiddenNodes, activation=Tanh} : - DenseLayer {numHiddenNodes, activation=Tanh}) - dcdssm = Sequential ( - ConvolutionalLayer {numHiddenNodes, (wordWindowSize:1), pad=false, activation=Tanh} : - MaxPoolingLayer {(numWindows:1), stride=(1:1)} : - DenseLayer {numHiddenNodes, activation=Tanh} : - DenseLayer {numHiddenNodes, activation=Tanh}) - qembed = if isConvolutional - then qcdssm - else qdssm - dembed = if isConvolutional - then dcdssm - else ddssm - - qf = qembed(qr) - df = dembed(dr) - lf = Times(CONST_PAD, L) - c = CosDistanceWithNegativeSamples(qf, df, CONST_SHIFT, CONST_NEG) - s = Slice(0, 1, c, axis=1, tag="output") - ce = CrossEntropyWithSoftmax(lf, Scale(CONST_GAMMA, c), tag="criterion") -} -``` - -Note: -* While C-DSSM has been shown to consistently perform better than DSSM, it also trains slower (sometime up to 5-10x slower). So in some cases you may get better performance from DSSM in the same training time by training over more data (or for more epochs). -* The original DSSM / C-DSSM were trained on query and document title pairs. 
But you can learn other relationships between short texts by training on other kinds of data such as [session query pairs](https://www.microsoft.com/en-us/research/publication/exploring-session-context-using-distributed-representations-of-queries-and-reformulations/) or [query prefix-suffix pairs](https://www.microsoft.com/en-us/research/publication/query-auto-completion-for-rare-prefixes/). \ No newline at end of file diff --git a/articles/test/Train-a-multilabel-classifier-in-Python.md b/articles/test/Train-a-multilabel-classifier-in-Python.md deleted file mode 100644 index 80d66d3b..00000000 --- a/articles/test/Train-a-multilabel-classifier-in-Python.md +++ /dev/null @@ -1,10 +0,0 @@ -For multilabel classification you should avoid using CrossEntropy as it can only handle input vectors that sum to 1. A sensible alternative is to use the Hamming loss. - -```python -def my_hamming_loss(): - y = placeholder_variable() - p = placeholder_variable() - hammingLoss = reduce_sum(not_equal(y, greater(p, 0.5))) - return hammingLoss -``` -which you can then instantiate as `hamming = my_hamming_loss()`, call as `hamming(labels, prob)`, and pass as the error metric to the Trainer. \ No newline at end of file diff --git a/articles/test/Train-a-multilabel-classifier.md b/articles/test/Train-a-multilabel-classifier.md deleted file mode 100644 index b41fe22b..00000000 --- a/articles/test/Train-a-multilabel-classifier.md +++ /dev/null @@ -1,17 +0,0 @@ -For multilabel classification you should avoid using CrossEntropy as it can only handle input vectors that sum to 1. A sensible alternative is to use a sum of logistic loss functions, one for each output -``` -... -probabilities = DenseLayer {outputSize, activation=Sigmoid} (hidden) -logisticLoss = Logistic (multiLabels, probabilities) -trainingCriterion = (logisticLoss) -... -``` -Apart from the loss itself you might want to monitor other metrics such as the number of incorrect predictions. There is no built-in expression for this, but it can be expressed as -``` -... -hammingLoss (y, p) = ReduceSum (y != (p > 0.5)) -hl = hammingLoss(multiLabels,probabilities) -evaluationNodes = (hl) -... -``` -This counts the number of times y[i] disagrees with p[i]>0.5. \ No newline at end of file diff --git a/articles/test/Train-a-regression-model-on-images.md b/articles/test/Train-a-regression-model-on-images.md deleted file mode 100644 index b287c9c2..00000000 --- a/articles/test/Train-a-regression-model-on-images.md +++ /dev/null @@ -1,71 +0,0 @@ -Below we describe how to predict one or more floating-point values for an input image using CNTK. An example use case is to predict a bounding box, for example as (x, y, w, h), of an object in a given image. You could also think of predicting the price for a car just by looking at an image of that car (which would actually be interesting). Here we use a very simple example in which we train a network to predict the average RGB values of an image (normalized to [0, 1]). However, the same steps apply for other use cases. These steps are: - -1. Define both the image and the ground truth regression labels as inputs to your network -2. Define a network that predicts a matching number of values w.r.t. your regression labels -3. Define a loss function that compares the predicted values with the ground truth -4. Adapt the reader section in your .cntk config file to read both image and regression labels - -Here is how you could do it.
The full config file is included in the Examples folder at [Examples/Image/Regression/RegrSimple_CIFAR10.cntk](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Regression/RegrSimple_CIFAR10.cntk). That folder also contains the scripts to download the image data and generate the regression ground truth for training and testing. - -1-3) Defining inputs, network and loss function: - -``` - BrainScriptNetworkBuilder = [ - imageShape = 32:32:3 - featScale = Constant(1/256) - labelDim = 3 - - model (features) = { - featNorm = Scale(features, featScale) - h1 = LinearLayer {100, init="gaussian", initValueScale=1.5} (featNorm) - ol = LinearLayer {labelDim, init="gaussian", initValueScale=1.5} (h1) - }.ol - - # inputs - features = Input {imageShape} - regrLabels = Input {labelDim} - - # apply model to features - ol = model (features) - - # define regression loss - # rmse = sqrt(SquareError(regrLabels, ol) / labelDim) - sqerr = SquareError (regrLabels, ol) - rmse = Sqrt (Constant(1/labelDim).* sqerr) - - featureNodes = (features) - labelNodes = (regrLabels) - criterionNodes = (rmse) - evaluationNodes = (rmse) - OutputNodes = (ol) - ] -``` - -4) Defining a composite reader using both the ImageReader and CNTKTextFormatReader: - -``` - reader = { - verbosity = 0 ; randomize = true - deserializers = ({ - type = "ImageDeserializer" ; module = "ImageReader" - file = "$dataDir$/cifar-10-batches-py/train_map.txt" - input = { - features = { transforms = ( - { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } : - { type = "Transpose" } - )} - ignored = { labelDim = 10 } - } - } : { - type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader" - file = "$dataDir$/cifar-10-batches-py/train_regrLabels.txt" - input = { - regrLabels = { dim = 3 ; format = "dense" } - } - }) - } -``` - -The reader is a composite reader that uses the ImageReader to read images and the CNTKTextFormatReader to read the regression ground truth labels. It does so by defining an array of deserializers (using `{...} : {...}`) and assigning the inputs as defined in the network above (cf. features and regrLabels). - -See [Examples/Image/Miscellaneous/CIFAR-10/06_RegressionSimple.cntk](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Miscellaneous/CIFAR-10/06_RegressionSimple.cntk) for the full config file and the corresponding Readme in that folder for running the example. diff --git a/articles/test/Train-two-or-more-models-jointly.md b/articles/test/Train-two-or-more-models-jointly.md deleted file mode 100644 index 348c1c05..00000000 --- a/articles/test/Train-two-or-more-models-jointly.md +++ /dev/null @@ -1,25 +0,0 @@ -There are two cases that this can arise. One is multitask learning the other is you have one processing pipeline for some part of your input and another pipeline for the other. - - * Multitask learning is when you have multiple criteria you want your network to be good at. The standard way to deal with this is to construct a single number out of all these criteria. For example -```python -cross_entropy = cross_entropy_with_softmax(predictions, labels) -reconstruction_error = squared_error(reconstruction, features) -combined_loss = 0.3 * reconstruction_error + cross_entropy -``` - * Multiple pipelines such as the one below are also tricky -```python -q_embed = qmodel(question) -a_embed = amodel(answer) -``` - -The trainer takes a single function whose parameters should be modified by the training proceduce. 
How can we pass the union of `qmodel` and `amodel` or the union of `cross_entropy` and `reconstruction_error`? The answer is to find a point where all the parameters are part of the same function. That happens at the very least in the final loss. -i.e. `loss.parameters` contains all parameters the loss depends on. Typically however our model should not -include the loss, as it only makes sense for training. Therefore a better approach is to use combine to create a combined model -```python -final_model = combine(predictions, reconstruction) -``` -For the separate pipeline case there is probably a place where everything gets combined -```python -qa_score = score(q_embed,a_embed) -``` -then `qa_score` can play the role of `final_model` above. \ No newline at end of file diff --git a/articles/test/Train-with-a-multitask-objective.md b/articles/test/Train-with-a-multitask-objective.md deleted file mode 100644 index 48aefae5..00000000 --- a/articles/test/Train-with-a-multitask-objective.md +++ /dev/null @@ -1,8 +0,0 @@ -You can just define your combined criterion as a BrainScript expression and you can monitor all individual task losses by specifying them as `evaluationNodes`. -``` -task1loss = CrossEntropyWithSoftMax(prediction,label) -task2loss = SquareError(reconstruction,input) -mtloss = task1loss + Constant(0.3) .* task2loss -criterionNodes = (mtloss) -evaluationNodes = (task1loss:task2loss) -``` diff --git a/articles/test/Train-with-a-weighted-loss.md b/articles/test/Train-with-a-weighted-loss.md deleted file mode 100644 index b2898249..00000000 --- a/articles/test/Train-with-a-weighted-loss.md +++ /dev/null @@ -1,6 +0,0 @@ -The specification of a per-example weight in the loss is as simple as -```python -weight = input_variable((1)) -weighted_loss = weight * loss -``` -where `loss` is any builtin or user-defined loss function. Of course during training each minibatch will need to have a mapping from weight to actual values (one for each example). \ No newline at end of file diff --git a/articles/test/Tutorial2/Tutorial2.md b/articles/test/Tutorial2/Tutorial2.md index 9669667d..ea0ac924 100644 --- a/articles/test/Tutorial2/Tutorial2.md +++ b/articles/test/Tutorial2/Tutorial2.md @@ -1,6 +1,5 @@ -#Brainscript tutorial - **Table of Contents** + - [Introduction](#introduction) - [The MNIST Data](#the-mnist-data) - [Getting the Data](#getting-the-data) @@ -21,27 +20,27 @@ - [The Network Definition](#the-network-definition-2) - [Putting it all Together](#putting-it-all-together-2) -## Introduction +# Introduction This tutorial is for BrainScript. For Python click [here](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/MLP/Python/SimpleMNIST.py). This is the second part of the CNTK tutorial where we will start using CNTK more to its full potential. We will go deep! This tutorial assumes that you have already gone through the [first part](https://github.com/Microsoft/CNTK/wiki/Tutorial), and thus you are familiar with basic CNTK/ML concepts such as logistic regression and softmax. In the first tutorial we built models to solve simple binary and multi-class classification problems. Though those models achieved good accuracy, they will not perform as well on harder real-world problems. One principal reason is that the decision boundaries between the classes are not typically linear. In this tutorial, we will learn to build more complex models, namely, neural networks and convolutional neural networks. 
We will build an image classification system using the MNIST dataset as our benchmark. The files for this tutorial can be found in the source-code distribution under `Examples/Image/GettingStarted` ([Github link](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/GettingStarted)). -## The MNIST Data +# The MNIST Data MNIST is a popular image dataset of handwritten digits. It is divided into a training set of 60,000 examples, and a test set of 10,000 examples. This dataset is a subset of the original data from NIST, pre-processed and published by LeCun et al. For more details please refer to [this page](http://yann.lecun.com/exdb/mnist/). The MNIST dataset has become a standard benchmark for machine learning methods because it is real-world data, yet it is simple and requires minimal efforts in pre-processing and formatting. Each instance of the data consists of a 28x28 pixel image representing one digit, with a label between 0 and 9. Here are some examples: ![MNIST Examples](./Tutorial2/mnist_examples.png) -### Getting the Data +## Getting the Data CNTK comes with a Python script that fetches and prepares the MNIST data for CNTK consumption. You find it here: [install_mnist.py](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/DataSets/MNIST/install_mnist.py). -## Some Important CNTK Concepts +# Some Important CNTK Concepts Before building the neural networks for written digit image recognition using the MNIST dataset, we will go through two important concepts of CNTK: (1) [Functions](https://github.com/Microsoft/CNTK/wiki/BS-Functions); and (2) model editing. -### CNTK Functions +## CNTK Functions BrainScript allows user-defined functions. User-defined functions are CNTK's mechanism of creating reusable blocks. Using functions is easy, cuts down on the verbosity of defining networks, increases code re-use, and helps to prevent errors. Using BrainScript's `include` directive, they can be shared across multiple configuration files. Functions are parameterized expressions. A function's value can be of any type understood by BrainScript, including scalars, @@ -99,7 +98,7 @@ That final `.y` reads out `y` immediately, effectively turning `W` and `b` into Functions can be declared locally inside other functions as seen in `RecurrentLSTM()` above. Functions can be called recursively, which is one way of creating stacks of networks. -### Model Editing +## Model Editing Model editing refers to modifying the structure or the model parameters of an existing trained network. One form is to load a network and create a new network referencing nodes of the old network. This can be done on the fly in all operations that operate on networks. @@ -120,17 +119,17 @@ The following example shows how add an evaluation node to a network that counts Other model-editing operations include adding new layers for discriminative pre-training and freezing parameters. -## Starting Shallow: One-Hidden-Layer Neural Network +# Starting Shallow: One-Hidden-Layer Neural Network Let's get back to the task at hand: classifying images of hand-written digits. To do so, we will build our first neural network with CNTK. Starting simple, our network will only have one hidden layer. -### Neural Network vs. Softmax Regression +## Neural Network vs. Softmax Regression We saw in the previous tutorial that softmax regression can learn to separate data with more than two classes. However, the separation boundaries are linear. What if those boundaries were trickier? 
In that case, we could distort the feature space in a way to bring the data closer to being linearly separable, and this is what a hidden layer can do for us. So basically, we take our softmax regression solution and insert in a hidden layer connected to the network's inputs. Such a layer will learn to apply a feature mapping that projects the data into a space where it is (hopefully) linearly separable. Then, the next layer will receive an easier problem to deal with using its linear decision boundaries. If we apply softmax classification like in the [previous tutorial](https://github.com/Microsoft/CNTK/wiki/Tutorial) to the MNIST problem, we will get an error rate of *7.5%*. We will show that by adding one hidden layer, the error is reduced to *2.25%*. -### The Network Definition +## The Network Definition First, we define our features and labels. Note that we apply a scaling of (1.0 / 256.0) on the features in order to have their values within the range 0 to 1. This normalization helps SGD to converge faster with better predictive performance. Then, we specify the topology of our network which looks similar to the one we used in [Part 1 of the tutorial](https://github.com/Microsoft/CNTK/wiki/Tutorial), except that it has an additional layer, that is, the hidden layer `DNNSigmoidLayer`. The layers are defined in a separate shared BS file, [Shared.bs](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/Config/Shared.bs). @@ -157,7 +156,7 @@ First, we define our features and labels. Note that we apply a scaling of (1.0 / evaluationNodes = (errs) outputNodes = (z) -### SGD Parameters +## SGD Parameters The SGD (Stochastic Gradient Descent) block tells CNTK how to optimize the network to find the best parameters. This includes information about mini-batch size (so the computation is more efficient), the learning rate, and how many epochs to train. Here is the SGD block for our example: SGD = [ @@ -180,7 +179,7 @@ Below is a list of the most common parameters used with the SGD block: - `maxEpochs`: the maximum number of epochs to run. - `dropoutRate`: the dropout rate per epoch. The default value is 0 (off). -#### Converting Learning-rate and Momentum Parameters From Other Toolkits +### Converting Learning-rate and Momentum Parameters From Other Toolkits CNTK's model-update formulae differ somewhat from some other toolkits and from literature, in that in CNTK, the parameters are specified in a way that is *agnostic of the minibatch size*. This is important in the context of data-parallel training, where CNTK itself may modify the minibatch size. Specifying learning rate and momentum in an agnostic way avoids complexities of adjusting these values upon changes of minibatch size. @@ -223,7 +222,7 @@ You will get close to this by using `learningRatePerMB` and `momentumPerMB`, whi learningRatePerSample = learningRatePerMB / minibatchSize momentumAsTimeConstant = -minibatchSize / ln (momentumPerMB) -### Putting it all Together +## Putting it all Together Here are the configuration files you need to run this example: - [The main configuration file, including the BrainScript](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/Config/01_OneHidden_ndl_deprecated.cntk) @@ -247,15 +246,15 @@ The output folder will be created inside `Image/MNIST/`, and we get the test res This model has an error of *2.25%*. Not too bad for image recognition! But now let's build an even better model by using Convolutional Neural Networks... 
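For a quick numeric check of the learning-rate and momentum conversion formulas given in the SGD section above, here is a small Python sketch. The per-minibatch values below are illustrative assumptions, not the tutorial's actual settings.

```python
from math import log

# Illustrative per-minibatch settings (assumed, not taken from the tutorial's config)
learningRatePerMB = 0.1
momentumPerMB = 0.9
minibatchSize = 25

# CNTK's minibatch-size-agnostic equivalents, per the formulas above
learningRatePerSample = learningRatePerMB / minibatchSize       # 0.004
momentumAsTimeConstant = -minibatchSize / log(momentumPerMB)    # ~237.3

print(learningRatePerSample, momentumAsTimeConstant)
```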
-## Going Deep: Convolutional Neural Networks (CNNs) +# Going Deep: Convolutional Neural Networks (CNNs) As we have seen, a simple neural network can achieve impressive results on this task. However, these results still fall short of the best results in the literature. But we can do much better if we introduce some concepts from CNN theory. So, let's start by introducing the main concept of CNNs, and then building such a network with CNTK. -### CNNs: The Ingredients +## CNNs: The Ingredients A *convolutional neural network* consists of model layers that apply local filters and are stacked in a certain order. Here we describe each of these layers briefly. -#### Convolutional Layer +### Convolutional Layer A convolutional layer applies a learned local linear filter, e.g. of `[3 x 3]` pixels, to every pixel position in an image. Such filters capture local patterns thanks to the local connectivity of their units. Whereas the input image has a single dimension (gray-scale level), hidden layers maintain their spatial layout but store an entire activation vector for each pixel position, where each dimension of the activation vector has its own filter kernel, or *feature map*. A feature map is basically a sliding window over sub-regions of the layer's inputs, where each application of the map results in one dimension of the output activation vector. A feature map is computed by performing a dot product between its filter parameters and the corresponding input rectangle, which is slid across the entire 2D plane. This linear filter operation is called *convolution*. Each feature map is called a depth slice. Here is a simple example of how the feature map is applied. @@ -263,27 +262,27 @@ For example, for a `[28 x 28]` image and a 16-element-deep feature map, the resulting hidden layer would be a tensor of dimension `[28 x 28 x 16]`. It is common to drop the outer pixels for which the filter would fall outside the image bounds; in that case, the resulting tensor dimensions will be reduced accordingly. -#### Activation Function Layer +### Activation Function Layer A non-linear activation function is then applied to each unit output element of the convolutional layer. One of the most commonly used functions is the *Rectified Linear Unit*, or ReLU, which is simply _max_(0,_x_). Its practical advantage over a sigmoid function is that it does not suffer from the vanishing gradient problem, and therefore learning can be more efficient. -#### Max-Pooling Layer +### Max-Pooling Layer "Max Pooling" is an operation placed after the activation function that aims at reducing dimensionality. It divides the input into a set of non-overlapping regions, where for each region it outputs the maximum activation value (independently for each depth slice). By reducing each region into a single point, the image dimension is reduced. What this achieves is two-fold: (1) it reduces the number of parameters and thus helps control overfitting; and (2) it selects the salient activation values regardless of their location in the region, which helps train models that are more resilient to things like rotation / translation.
Here is an example (from [Wikipedia](https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer)) of max pooling with a window size of `[2 x 2]` (which would reduce a hidden layer of `[28 x 28 x 16]` to `[14 x 14 x 16]`): ![Attribution: By Aphex34 (Own work) [CC BY-SA 4.0 (http://creativecommons.org/licenses/by-sa/4.0)], via Wikimedia Commons](./Tutorial2/Max_pooling.png) -#### Dense Layer +### Dense Layer Finally, after cascading several convolutional, activation function, and MaxPooling layers, a CNN will have one or more fully connected, or dense, layers. Every unit in a dense layer has connections to all activations of the previous layer, similar to regular neural networks. -#### Softmax Layer +### Softmax Layer Finally, the softmax layer for classification. We know the softmax from the first part of the tutorial. The output is a probability distribution over the possible classes. Below is a chart of a CNN with two alternating convolution / activation and MaxPooling layers, one dense layer, and one softmax layer. ![CNN](./Tutorial2/cnn.png) -### The Network Definition +## The Network Definition Our CNN will have a bit more of a complex definition than our previous networks. Starting from the features, you will notice that we define each sample as a `28 x 28` matrix rather than a vector. This is because a CNN exploits local correlations in the image. Thus, we need to preserve this information. Second, in addition to the layer we saw in the previous network, we define a cascade of convolutional and max-pooling layers. We have two of each type. The core layer is `ConvReLULayer` which is defined as a function in [Shared.bs](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/Config/Shared.bs). Here is what this macro looks like: @@ -361,7 +360,7 @@ This allows us to put together the final network definition of our CNN that will evaluationNodes = (errs) outputNodes = (ol) -### Putting it all Together +## Putting it all Together Here are the configuration files you will need to run this example: - [The main configuration file incl. network definition](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/Config/02_Convolution_ndl_deprecated.cntk) @@ -381,17 +380,17 @@ With an error of *0.83%*, this model (unsurprisingly) greatly outperforms the pr Now let's check out a technique that helps us to reduce the training time without significantly compromising the predictive performance of the model. -## Improving Training: CNN with Batch Normalization +# Improving Training: CNN with Batch Normalization In this section we will add to our network a widely used technique called Batch Normalization. It helps to make training much more efficient. We will give a brief introduction before demonstrating how to add it to our CNN. -### Batch Normalization in a Nutshell +## Batch Normalization in a Nutshell One problem with training deep neural networks is that the distribution of each layer's inputs changes during training because the parameters of the previous layers change. This causes the training to slow down, as upper layers constantly have to adapt themselves to the changed ranges. One common technique to address the problem is called Batch Normalization (BN). BN consists of normalizing layer inputs w.r.t. their zero-mean/unit-variance, and to do so for each training minibatch. BN makes it possible to use much higher learning rates and allows us to be less careful about initialization. 
For more details, please refer to the paper [here](http://arxiv.org/pdf/1502.03167v3.pdf), and to the description of the [BatchNormalization](https://github.com/Microsoft/CNTK/wiki/BatchNormalization) operation. Disclaimer: Training with Batch Normalization is currently only supported on GPU (but models can be evaluated using the CPU). -### The Network Definition +## The Network Definition We start from the CNN we just built and add a batch normalization node to each layer's macro. Also, we change the sigmoid layer to a ReLU layer as it gives a slight bump to the model's accuracy. @@ -454,7 +453,7 @@ We start from the CNN we just built and add a batch normalization node to each l evaluationNodes = (errs) outputNodes = (ol) -### Putting it all Together +## Putting it all Together Here are the configuration files you need to run this example: - [The main configuration file](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/Config/03_ConvBatchNorm_ndl_deprecated.cntk) diff --git a/articles/test/Tutorials,-Examples,-etc...md b/articles/test/Tutorials,-Examples,-etc...md new file mode 100644 index 00000000..418f483d --- /dev/null +++ b/articles/test/Tutorials,-Examples,-etc...md @@ -0,0 +1,4 @@ +* [[Tutorials]] +* [[Examples]] +* [[Articles]] +* [[Presentations]] \ No newline at end of file diff --git a/articles/test/Tutorials-examples.md b/articles/test/Tutorials-examples.md deleted file mode 100644 index d98bad59..00000000 --- a/articles/test/Tutorials-examples.md +++ /dev/null @@ -1,6 +0,0 @@ -#Tutorials, Examples, etc... - -* [Tutorials](Tutorials) -* [Examples](Examples) -* [Articles](Articles) -* [Presentations](Presentations) \ No newline at end of file diff --git a/articles/test/Tutorials.md b/articles/test/Tutorials.md index 84d540e0..2c847e36 100644 --- a/articles/test/Tutorials.md +++ b/articles/test/Tutorials.md @@ -1,8 +1,6 @@ -#Tutorials - There are tutorials for Python users and for BrainScript users. -## Python Jupyter Notebook Tutorials +### Python Jupyter Notebook Tutorials CNTK Python Jupyter notebook [tutorials](https://www.cntk.ai/pythondocs/tutorials.html) cover a range of different application including image classification, language understanding, reinforcement learning and others. @@ -10,9 +8,9 @@ You can also try out the tutorials live with [Azure Notebooks](https://notebooks Additional Python tutorials: * The folder [Tutorials/NumpyInterop](https://github.com/Microsoft/CNTK/tree/master/Tutorials/NumpyInterop) contains a simple example of how to use numpy arrays as input for CNTK training and evaluation. -* ['Build your own image classifier using Transfer Learning'](./Build-your-own-image-classifier-using-Transfer-Learning) provides two examples for custom image classifiers using transfer learning. +* ['Build your own image classifier using Transfer Learning'](https://github.com/Microsoft/CNTK/wiki/Build-your-own-image-classifier-using-Transfer-Learning) provides two examples for custom image classifiers using transfer learning. 
-## BrainScript Tutorials +### BrainScript Tutorials * [Getting Started](./Tutorial/Tutorial): Simple Logistic Regression and multi-class classification * [Image Recognition](./Hands-On-Labs-Image-Recognition): Image recognition on CIFAR-10 with Convolutional and Residual Networks * [Language Understanding with ATIS](./Hands-On-Labs-Language-Understanding): Slot tagging and intent classification with Recurrent Networks diff --git a/articles/test/Using-CNTK-with-multiple-GPUs-and-or-machines.md b/articles/test/Using-CNTK-with-multiple-GPUs-and-or-machines.md new file mode 100644 index 00000000..08f44962 --- /dev/null +++ b/articles/test/Using-CNTK-with-multiple-GPUs-and-or-machines.md @@ -0,0 +1 @@ +See [[Multiple GPUs and machines]]. \ No newline at end of file diff --git a/articles/test/out.txt b/articles/test/out.txt deleted file mode 100644 index b383ea40..00000000 --- a/articles/test/out.txt +++ /dev/null @@ -1,314 +0,0 @@ -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Adapt-a-model-I-trained-on-one-task-to-another.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2 -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles3 -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles4 -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Associate-an-id-with-a-prediction.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Avoid-AddSequence-Exception.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Avoid-the-error-CURAND-failure-201.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Baseline-Metrics.asciidoc -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BatchNormalization.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Binary-Operations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Activation-Functions.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-and-Python---Understanding-and-Extending-Readers.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-and-Python-Performance-Profiler.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Basic-Concepts.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-CNTKBinary-Reader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-CNTKTextFormat-Reader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Command-line-parsing-rules.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Config-file-overview.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-epochSize-and-Python-epoch_size-in-CNTK.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-expressions.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Full-Function-Reference.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Functions.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-HTKMLF-Reader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Image-reader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Layers-Reference.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-LM-sequence-reader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-LU-sequence-reader.md 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-minibatchSize-and-Python-minibatch_size_in_samples-in-CNTK.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Model-Editing.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Network-Builder.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Reader-block.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-SGD-Block.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Top-level-configurations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-Train,-Test,-Eval.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\BrainScript-UCI-Fast-Reader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Breaking-changes-in-Master-compared-to-beta15.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Build-a-constant-3D-tensor.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Build-your-own-image-classifier-using-Transfer-Learning.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CloneFunction.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-1bit-SGD-License.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-2.0-Python-API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-2.0-Setup-from-Sources.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-2.0-Setup.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Binary-Download-and-Configuration.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Binary-Download-and-Manual-Configuration.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Docker-Containers.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Eval-Examples.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Evaluate-Hidden-Layers.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Evaluate-Image-Transforms.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Evaluate-Multiple-Models.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Evaluation-Overview.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Evaluation-using-cntk.exe.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-FAQ.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Library-API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Library-Evaluation-on-Linux.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Library-Evaluation-on-Windows.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Library-Evaluation-Overview.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Library-Managed-API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-model-format.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-move-to-Cuda8.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-on-Azure.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-Python-known-issues-and-limitations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\CNTK-usage-overview.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Coding-Guidelines.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Compatible-dimensions-in-reader-and-config.md 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Conference-Appearances.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Continue-training-from-a-previously-saved-model.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Contributing-to-CNTK.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ConvertDBN-command.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Convolution.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Deal-with-the-'No-Output-nodes-found'-error.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Deal-with-the-error-'No-node-named-'x';-skipping'.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Deal-with-the-error-'Reached-the-maximum-number-of-allowed-errors'.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Debug-a-Python-notebook.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Debugging-CNTK's-GPU-source-code-in-Visual-Studio.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Debugging-CNTK-source-code-in-Visual-Studio.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Deep-Crossing-on-CNTK.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Developing-and-Testing.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Do-early-stopping.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Dropout-during-evaluation.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Dropout.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Enabling-1bit-SGD.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\EvalDLL-Evaluation-on-Linux.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\EvalDLL-Evaluation-on-Windows.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\EvalDLL-Evaluation-Overview.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Evaluate-a-model-in-an-Azure-WebApi.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Evaluate-a-saved-convolutional-network.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Evaluate-my-newly-trained-model-but-output-the-activations-at-an-intermediate-layer.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Examples.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Expose-new-operands-in-V2-Python-from-previous-V1-implementations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Express-a-gating-mechanism.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Express-a-softmax-over-a-dynamic-axis.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Express-a-softmax-with-a-temperature-parameter.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Express-the-error-rate-of-my-binary-classifier.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Feedback-Channels.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\figures -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Gather-and-Scatter.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Get-nice-syntax-highlighting-for-BrainScript-config-files.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Get-started-in-sequence-to-sequence-modelling.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Get-things-to-work-correctly-when-I-take-the-last-element-of-a-sequence.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\GRUs-on-CNTK-with-BrainScript.md 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Hands-On-Labs-Image-Recognition.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Hands-On-Labs-Language-Understanding.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Home.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\How-do-I-in-BrainScript.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\How-do-I-in-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\How-do-I-run-Eval-in-Azure.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\How-do-I-use-a-trained-model-as-a-feature-extractor.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\How-do-I.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\How-to-Test.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\If-Operation.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Image-Auto-Encoder-Using-Deconvolution-And-Unpooling.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Implement-an-attention-mechanism.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Implement-Zoneout.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Inputs.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Interpret-the-training-loss.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Interpret-the-use-of-MinibatchSource.next_minibatch.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Interrogate-the-dimensions-of-internal-layers-of-a-network-from-within-the-Python-API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Introspect-or-inspect-or-list-model-input-variables.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\KDD-2016-Tutorial.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Layer-wise-training.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Layers-Library-Reference.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Load-model-and-access-network-weights-(parameters).md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Load-pre-trained-checkpointed-model-and-continue-retraining.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Loss-Functions-and-Metrics.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Managed-EvalDLL-API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Monitor-the-error-on-a-held-out-set-during-training-or-do-Cross-Validation-(CV)-during-training.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Monitor-the-error-on-a-held-out-set-during-training.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Multiple-GPUs-and-machines.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Native-CNTK-Library-Eval-Interface.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Native-EvalDLL-API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\News-2016.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\News.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\NuGet-Package.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Object-Detection-using-Fast-R-CNN.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\OptimizedRNNStack.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\out.xt -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Parameters-And-Constants.md 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Plot-command.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Pooling.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Port-LSTM-NDL-primitives-to-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Port-projection-of-1D-input-to-1D-output-from-Python-API-to-C---API.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Post-Batch-Normalization-Statistics.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Presentations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\project-a-1D-input-of-dim-inputDim-to-a-1D-output-of-dim-outputDim.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Put-labels-and-features-in-separate-files-with-CNTKTextFormatReader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Read-and-modify-the-training-weights-from-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Recommended-CNTK-2.0-Setup.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Records.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Recurrent-Neural-Networks-with-CNTK-and-applications-to-the-world-of-ranking.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Reduction-Operations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Relate-alpha,-beta1,-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd-optimizer.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Relate-alpha,-beta1,-beta2-and-epsilon-to-learning-rate-and-momentum-in-adam_sgd.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Restrict-a-prediction-to-a-bounded-interval.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Sequence-to-Sequence---Deep-Recurrent-Neural-Networks-in-CNTK---Part-1.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Sequence-to-Sequence---Deep-Recurrent-Neural-Networks-in-CNTK---Part-2---Machine-Translation.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Sequence-to-Sequence---Deep-Recurrent-Neural-Networks-in-CNTK---Part-2.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Sequential.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Set-the-verbosity-or-traceLevel-from-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-BuildProtobuf-VS15.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Buildzlib-VS15.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-CNTK-on-Linux.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-CNTK-on-Windows.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-CNTK-on-your-machine.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-CNTK-Python-Tools-For-Windows.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-CNTK-with-script-on-Windows.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Linux-Binary-Manual.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Linux-Binary-Script.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Linux-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Migrate-VS13-to-VS15.md 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Test-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Windows-Binary-Manual.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Windows-Binary-Script-Options.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Windows-Binary-Script.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Windows-Devinstall-Script-Option.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Setup-Windows-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Simple-Network-Builder.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Special-Nodes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Specify-multiple-label-streams-with-the-HTKMLFReader.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Test-Configurations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Times-and-TransposeTimes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Top-level-commands.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-a-DSSM-(or-a-convolutional-DSSM)-model.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-a-multilabel-classifier-in-Python.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-a-multilabel-classifier.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-a-regression-model-on-images.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-two-or-more-models-jointly.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-with-a-multitask-objective.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Train-with-a-weighted-loss.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Troubleshoot-CNTK.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2 -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorials.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Unary-Operations.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Update-1bit-SGD-Submodule-Location.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Use-an-already-trained-network-multiple-times-inside-a-larger-network.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Use-built-in-readers-with-multiple-inputs.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Using-CNTK-with-BrainScript.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Using-CNTK-with-multiple-GPUs-and-or-machines.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Using-TensorBoard-for-Visualization.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Variables.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Windows-Environment-Variables.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\WWW-2017-Tutorial.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\_Sidebar.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1312_RecurrentNe1.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1312_RecurrentNe2.png 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1312_RecurrentNe3.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1312_RecurrentNe4.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1315_GRUsBSando1.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1315_GRUsBSando2.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1315_GRUsBSando3.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\071316_1315_GRUsBSando4.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\eq1.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\eq2.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\eq3.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\eq4.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\eq5.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles2\eq6.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles3\DeepCrossing.cntk -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles3\deepCrossing.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles3\residual.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Articles4\SequencetoS2.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\figures\asgd-cifar-compare.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\figures\bm.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\figures\bmcompare.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\ImageAutoEncoder -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\setup -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\TensorBoard -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic1.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic2.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic3.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic4.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic5.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic6.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\EvaluateWebApi\pic7.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\ImageAutoEncoder\imageAutoEncoder_16x.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\ImageAutoEncoder\imageAutoEncoder_cmp.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\setup\UnblockZip70.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\setup\VS2015InstallCustom.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\setup\VS2015InstallCustom70.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\setup\VS2015InstallFeatures.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\setup\VS2015InstallFeatures70.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\TensorBoard\tensorboard_graph.png 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\pictures\TensorBoard\tensorboard_scalars.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt\CNTK-AI-NEXT-Mar-2017-Frank-Seide-print.pdf -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt\CNTK-and-the-CS-behind-it-CGO-HPCA-PPoPP-CC-Keynote-Feb-2017-Frank-Seide-print.pdf -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt\LATAM_Frank_Seide_CNTK, 2016-5-19, Print.pptx -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt\LATAM_Frank_Seide_CNTK, 2016-5-19.pptx -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt\Layers-vs-API.pdf -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ppt\S6843-Deep-Learning-in-Microsoft-with-CNTK.pptx -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_1_5_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_1_6_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_1_7_1_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_1_7_2_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_1_7_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_10_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_11_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_12_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_15_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_1_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_2_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_3_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_4_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_5_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_6_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_7_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_8_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\ReleaseNotes\CNTK_2_0_Beta_9_Release_Notes.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\3class_decision.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\decision_boundary.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\logistic.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\softmax.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\softmaxformula.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\synth_data.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\synth_data_3.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial\Tutorial.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2\cnn.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2\convLayer.PNG 
-D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2\loss.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2\Max_pooling.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2\mnist_examples.png -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial2\Tutorial2.md -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\AP.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\bus_01.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\nn_0WIN_20160803_11_28_42_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\nn_1WIN_20160803_11_42_36_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\nn_2WIN_20160803_11_46_03_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\nn_3WIN_20160803_11_48_26_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\nn_4WIN_20160803_12_37_07_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\nn_noNms4WIN_20160803_12_37_07_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\p_interp.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\rcnnPipeline.JPG -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\svm_4WIN_20160803_12_37_07_Pro.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\WIN_20160803_11_29_07_Pro.noGrid.roi.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\WIN_20160803_11_29_07_Pro.noGridNoFiltering.roi.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_FastRCNN\WIN_20160803_11_29_07_Pro.roi.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\Bird_in_flight_wings_spread.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\Canis_lupus_occidentalis.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\Icelandic_breed_sheep.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\image_08058.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\image_08081.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\image_08084.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\image_08093.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\quetzal-bird.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\Swaledale_sheep.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\The_white_wolf_by_Lunchi.jpg -D:\Documents\GitHub\cognitive-toolkit-docs-pr\articles\test\Tutorial_TL\Weaver_bird.jpg