From f2b79ecbf37629902ccdbad2e1a556977c53d370 Mon Sep 17 00:00:00 2001 From: Emre Safak Date: Wed, 10 Jan 2018 18:06:52 -0800 Subject: [PATCH] Add sparse matrix support for large data sets Closes #15 --- .travis.yml | 45 ++-- HISTORY.rst | 4 +- description | 1 - hooks/applypatch-msg.sample | 15 -- hooks/commit-msg.sample | 24 -- hooks/post-update.sample | 8 - hooks/pre-applypatch.sample | 14 -- hooks/pre-commit.sample | 50 ---- hooks/pre-rebase.sample | 169 ------------- hooks/prepare-commit-msg.sample | 36 --- hooks/update.sample | 128 ---------- info/exclude | 6 - setup.py | 7 +- src/__init__.py | 8 +- src/mca.py | 418 ++++++++++++++++---------------- tests/test_mca.py | 354 ++++++++++++++------------- 16 files changed, 425 insertions(+), 862 deletions(-) delete mode 100644 description delete mode 100644 hooks/applypatch-msg.sample delete mode 100644 hooks/commit-msg.sample delete mode 100644 hooks/post-update.sample delete mode 100644 hooks/pre-applypatch.sample delete mode 100644 hooks/pre-commit.sample delete mode 100644 hooks/pre-rebase.sample delete mode 100644 hooks/prepare-commit-msg.sample delete mode 100644 hooks/update.sample delete mode 100644 info/exclude diff --git a/.travis.yml b/.travis.yml index f505eab..3cd223f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,40 +3,39 @@ language: python cache: - - apt - - pip + - apt + - pip notifications: - email: false + email: false python: - - 3.6 - - 3.5 - # - "2.7_with_system_site_packages" - - "3.2_with_system_site_packages" - - 2.7 - # - "pypy" + - 3.6 + - 3.5 + - 2.7 + # - "pypy" + # - "pypy3" # virtualenv: - # system_site_packages: true +# system_site_packages: true before_install: - - if [ $TRAVIS_PYTHON_VERSION == "3.2_with_system_site_packages" ]; then - wget -O- http://neuro.debian.net/lists/precise.us-ca.libre | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list; sudo apt-key adv --recv-keys --keyserver pool.sks-keyservers.net 2649A5A9; echo | sudo add-apt-repository 
ppa:chris-lea/python-numpy; echo | sudo add-apt-repository ppa:pylab/stable; sudo apt-get update -q; sudo apt-get install -qq python3-numpy python3-scipy python3-pandas python3-pandas-lib; - else - sudo pip install conda; sudo conda init; conda create -p $HOME/py --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas; export PATH=$HOME/py/bin:$PATH; - fi + # - if [ $TRAVIS_PYTHON_VERSION == "3.2_with_system_site_packages" ]; then + # wget -O- http://neuro.debian.net/lists/precise.us-ca.libre | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list; sudo apt-key adv --recv-keys --keyserver pool.sks-keyservers.net 2649A5A9; echo | sudo add-apt-repository ppa:chris-lea/python-numpy; echo | sudo add-apt-repository ppa:pylab/stable; sudo apt-get update -q; sudo apt-get install -qq python3-numpy python3-scipy python3-pandas python3-pandas-lib; + # else + sudo pip install conda; sudo conda init; conda create -p $HOME/py --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas; export PATH=$HOME/py/bin:$PATH; + # fi install: - - pip install -r requirements.txt - - pip install . + - pip install -r requirements.txt + - pip install . script: python setup.py test deploy: - provider: pypi - user: esafak - password: - secure: "ERTeFa4LZh+hiDqnv8Wf6qK6KNkFr+LFFsYx7bohM8iZsEDdciGcNis17r2H21WjIu2khIPUYarRKijVvvRMApVNgt4VLtuT5Z4WPVGS71vpJ5jZvVgZS/zo8MBZ8jbUAlMugdoWIxtSGRvJKZz6M2//eh3BtDhIz/gB4o1wGOg=" - on: - tags: true \ No newline at end of file + provider: pypi + user: esafak + password: + secure: "ERTeFa4LZh+hiDqnv8Wf6qK6KNkFr+LFFsYx7bohM8iZsEDdciGcNis17r2H21WjIu2khIPUYarRKijVvvRMApVNgt4VLtuT5Z4WPVGS71vpJ5jZvVgZS/zo8MBZ8jbUAlMugdoWIxtSGRvJKZz6M2//eh3BtDhIz/gB4o1wGOg=" + on: + tags: true \ No newline at end of file diff --git a/HISTORY.rst b/HISTORY.rst index 4e8b5c3..10b1bfe 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -8,4 +8,6 @@ History * **1.01** (2015-03-23) More documentation, in the form of an ipython notebook. 
Fixed bug #2 affecting python 2.x * **1.02** (2017-07-29) - Fixed division-by-zero bug (issue #14) \ No newline at end of file + Fixed division-by-zero bug (issue #14) +* **1.03** (2018-01-10) + Added sparse matrix support \ No newline at end of file diff --git a/description b/description deleted file mode 100644 index 498b267..0000000 --- a/description +++ /dev/null @@ -1 +0,0 @@ -Unnamed repository; edit this file 'description' to name the repository. diff --git a/hooks/applypatch-msg.sample b/hooks/applypatch-msg.sample deleted file mode 100644 index 8b2a2fe..0000000 --- a/hooks/applypatch-msg.sample +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message taken by -# applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. The hook is -# allowed to edit the commit message file. -# -# To enable this hook, rename this file to "applypatch-msg". - -. git-sh-setup -test -x "$GIT_DIR/hooks/commit-msg" && - exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} -: diff --git a/hooks/commit-msg.sample b/hooks/commit-msg.sample deleted file mode 100644 index b58d118..0000000 --- a/hooks/commit-msg.sample +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message. -# Called by "git commit" with one argument, the name of the file -# that has the commit message. The hook should exit with non-zero -# status after issuing an appropriate message if it wants to stop the -# commit. The hook is allowed to edit the commit message file. -# -# To enable this hook, rename this file to "commit-msg". - -# Uncomment the below to add a Signed-off-by line to the message. -# Doing this in a hook is a bad idea in general, but the prepare-commit-msg -# hook is more suited to it. 
-# -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" - -# This example catches duplicate Signed-off-by lines. - -test "" = "$(grep '^Signed-off-by: ' "$1" | - sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { - echo >&2 Duplicate Signed-off-by lines. - exit 1 -} diff --git a/hooks/post-update.sample b/hooks/post-update.sample deleted file mode 100644 index ec17ec1..0000000 --- a/hooks/post-update.sample +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare a packed repository for use over -# dumb transports. -# -# To enable this hook, rename this file to "post-update". - -exec git update-server-info diff --git a/hooks/pre-applypatch.sample b/hooks/pre-applypatch.sample deleted file mode 100644 index b1f187c..0000000 --- a/hooks/pre-applypatch.sample +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed -# by applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-applypatch". - -. git-sh-setup -test -x "$GIT_DIR/hooks/pre-commit" && - exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"} -: diff --git a/hooks/pre-commit.sample b/hooks/pre-commit.sample deleted file mode 100644 index 18c4829..0000000 --- a/hooks/pre-commit.sample +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed. -# Called by "git commit" with no arguments. The hook should -# exit with non-zero status after issuing an appropriate message if -# it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-commit". 
- -if git rev-parse --verify HEAD >/dev/null 2>&1 -then - against=HEAD -else - # Initial commit: diff against an empty tree object - against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 -fi - -# If you want to allow non-ascii filenames set this variable to true. -allownonascii=$(git config hooks.allownonascii) - -# Redirect output to stderr. -exec 1>&2 - -# Cross platform projects tend to avoid non-ascii filenames; prevent -# them from being added to the repository. We exploit the fact that the -# printable range starts at the space character and ends with tilde. -if [ "$allownonascii" != "true" ] && - # Note that the use of brackets around a tr range is ok here, (it's - # even required, for portability to Solaris 10's /usr/bin/tr), since - # the square bracket bytes happen to fall in the designated range. - test $(git diff --cached --name-only --diff-filter=A -z $against | - LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 -then - echo "Error: Attempt to add a non-ascii file name." - echo - echo "This can cause problems if you want to work" - echo "with people on other platforms." - echo - echo "To be portable it is advisable to rename the file ..." - echo - echo "If you know what you are doing you can disable this" - echo "check using:" - echo - echo " git config hooks.allownonascii true" - echo - exit 1 -fi - -# If there are whitespace errors, print the offending file names and fail. -exec git diff-index --check --cached $against -- diff --git a/hooks/pre-rebase.sample b/hooks/pre-rebase.sample deleted file mode 100644 index 33730ca..0000000 --- a/hooks/pre-rebase.sample +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/sh -# -# Copyright (c) 2006, 2008 Junio C Hamano -# -# The "pre-rebase" hook is run just before "git rebase" starts doing -# its job, and can prevent the command from running by exiting with -# non-zero status. -# -# The hook is called with the following parameters: -# -# $1 -- the upstream the series was forked from. 
-# $2 -- the branch being rebased (or empty when rebasing the current branch). -# -# This sample shows how to prevent topic branches that are already -# merged to 'next' branch from getting rebased, because allowing it -# would result in rebasing already published history. - -publish=next -basebranch="$1" -if test "$#" = 2 -then - topic="refs/heads/$2" -else - topic=`git symbolic-ref HEAD` || - exit 0 ;# we do not interrupt rebasing detached HEAD -fi - -case "$topic" in -refs/heads/??/*) - ;; -*) - exit 0 ;# we do not interrupt others. - ;; -esac - -# Now we are dealing with a topic branch being rebased -# on top of master. Is it OK to rebase it? - -# Does the topic really exist? -git show-ref -q "$topic" || { - echo >&2 "No such branch $topic" - exit 1 -} - -# Is topic fully merged to master? -not_in_master=`git rev-list --pretty=oneline ^master "$topic"` -if test -z "$not_in_master" -then - echo >&2 "$topic is fully merged to master; better remove it." - exit 1 ;# we could allow it, but there is no point. -fi - -# Is topic ever merged to next? If so you should not be rebasing it. -only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` -only_next_2=`git rev-list ^master ${publish} | sort` -if test "$only_next_1" = "$only_next_2" -then - not_in_topic=`git rev-list "^$topic" master` - if test -z "$not_in_topic" - then - echo >&2 "$topic is already up-to-date with master" - exit 1 ;# we could allow it, but there is no point. 
- else - exit 0 - fi -else - not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` - /usr/bin/perl -e ' - my $topic = $ARGV[0]; - my $msg = "* $topic has commits already merged to public branch:\n"; - my (%not_in_next) = map { - /^([0-9a-f]+) /; - ($1 => 1); - } split(/\n/, $ARGV[1]); - for my $elem (map { - /^([0-9a-f]+) (.*)$/; - [$1 => $2]; - } split(/\n/, $ARGV[2])) { - if (!exists $not_in_next{$elem->[0]}) { - if ($msg) { - print STDERR $msg; - undef $msg; - } - print STDERR " $elem->[1]\n"; - } - } - ' "$topic" "$not_in_next" "$not_in_master" - exit 1 -fi - -<<\DOC_END - -This sample hook safeguards topic branches that have been -published from being rewound. - -The workflow assumed here is: - - * Once a topic branch forks from "master", "master" is never - merged into it again (either directly or indirectly). - - * Once a topic branch is fully cooked and merged into "master", - it is deleted. If you need to build on top of it to correct - earlier mistakes, a new topic branch is created by forking at - the tip of the "master". This is not strictly necessary, but - it makes it easier to keep your history simple. - - * Whenever you need to test or publish your changes to topic - branches, merge them into "next" branch. - -The script, being an example, hardcodes the publish branch name -to be "next", but it is trivial to make it configurable via -$GIT_DIR/config mechanism. - -With this workflow, you would want to know: - -(1) ... if a topic branch has ever been merged to "next". Young - topic branches can have stupid mistakes you would rather - clean up before publishing, and things that have not been - merged into other branches can be easily rebased without - affecting other people. But once it is published, you would - not want to rewind it. - -(2) ... if a topic branch has been fully merged to "master". - Then you can delete it. 
More importantly, you should not - build on top of it -- other people may already want to - change things related to the topic as patches against your - "master", so if you need further changes, it is better to - fork the topic (perhaps with the same name) afresh from the - tip of "master". - -Let's look at this example: - - o---o---o---o---o---o---o---o---o---o "next" - / / / / - / a---a---b A / / - / / / / - / / c---c---c---c B / - / / / \ / - / / / b---b C \ / - / / / / \ / - ---o---o---o---o---o---o---o---o---o---o---o "master" - - -A, B and C are topic branches. - - * A has one fix since it was merged up to "next". - - * B has finished. It has been fully merged up to "master" and "next", - and is ready to be deleted. - - * C has not merged to "next" at all. - -We would want to allow C to be rebased, refuse A, and encourage -B to be deleted. - -To compute (1): - - git rev-list ^master ^topic next - git rev-list ^master next - - if these match, topic has not merged in next at all. - -To compute (2): - - git rev-list master..topic - - if this is empty, it is fully merged to "master". - -DOC_END diff --git a/hooks/prepare-commit-msg.sample b/hooks/prepare-commit-msg.sample deleted file mode 100644 index f093a02..0000000 --- a/hooks/prepare-commit-msg.sample +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare the commit log message. -# Called by "git commit" with the name of the file that has the -# commit message, followed by the description of the commit -# message's source. The hook's purpose is to edit the commit -# message file. If the hook fails with a non-zero status, -# the commit is aborted. -# -# To enable this hook, rename this file to "prepare-commit-msg". - -# This hook includes three examples. The first comments out the -# "Conflicts:" part of a merge commit. -# -# The second includes the output of "git diff --name-status -r" -# into the message, just before the "git status" output. 
It is -# commented because it doesn't cope with --amend or with squashed -# commits. -# -# The third example adds a Signed-off-by line to the message, that can -# still be edited. This is rarely a good idea. - -case "$2,$3" in - merge,) - /usr/bin/perl -i.bak -ne 's/^/# /, s/^# #/#/ if /^Conflicts/ .. /#/; print' "$1" ;; - -# ,|template,) -# /usr/bin/perl -i.bak -pe ' -# print "\n" . `git diff --cached --name-status -r` -# if /^#/ && $first++ == 0' "$1" ;; - - *) ;; -esac - -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" diff --git a/hooks/update.sample b/hooks/update.sample deleted file mode 100644 index 71ab04e..0000000 --- a/hooks/update.sample +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/sh -# -# An example hook script to blocks unannotated tags from entering. -# Called by "git receive-pack" with arguments: refname sha1-old sha1-new -# -# To enable this hook, rename this file to "update". -# -# Config -# ------ -# hooks.allowunannotated -# This boolean sets whether unannotated tags will be allowed into the -# repository. By default they won't be. -# hooks.allowdeletetag -# This boolean sets whether deleting tags will be allowed in the -# repository. By default they won't be. -# hooks.allowmodifytag -# This boolean sets whether a tag may be modified after creation. By default -# it won't be. -# hooks.allowdeletebranch -# This boolean sets whether deleting branches will be allowed in the -# repository. By default they won't be. -# hooks.denycreatebranch -# This boolean sets whether remotely creating branches will be denied -# in the repository. By default this is allowed. -# - -# --- Command line -refname="$1" -oldrev="$2" -newrev="$3" - -# --- Safety check -if [ -z "$GIT_DIR" ]; then - echo "Don't run this script from the command line." 
>&2 - echo " (if you want, you could supply GIT_DIR then run" >&2 - echo " $0 )" >&2 - exit 1 -fi - -if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -# --- Config -allowunannotated=$(git config --bool hooks.allowunannotated) -allowdeletebranch=$(git config --bool hooks.allowdeletebranch) -denycreatebranch=$(git config --bool hooks.denycreatebranch) -allowdeletetag=$(git config --bool hooks.allowdeletetag) -allowmodifytag=$(git config --bool hooks.allowmodifytag) - -# check for no description -projectdesc=$(sed -e '1q' "$GIT_DIR/description") -case "$projectdesc" in -"Unnamed repository"* | "") - echo "*** Project description file hasn't been set" >&2 - exit 1 - ;; -esac - -# --- Check types -# if $newrev is 0000...0000, it's a commit to delete a ref. -zero="0000000000000000000000000000000000000000" -if [ "$newrev" = "$zero" ]; then - newrev_type=delete -else - newrev_type=$(git cat-file -t $newrev) -fi - -case "$refname","$newrev_type" in - refs/tags/*,commit) - # un-annotated tag - short_refname=${refname##refs/tags/} - if [ "$allowunannotated" != "true" ]; then - echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2 - echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 - exit 1 - fi - ;; - refs/tags/*,delete) - # delete tag - if [ "$allowdeletetag" != "true" ]; then - echo "*** Deleting a tag is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/tags/*,tag) - # annotated tag - if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 - then - echo "*** Tag '$refname' already exists." >&2 - echo "*** Modifying a tag is not allowed in this repository." 
>&2 - exit 1 - fi - ;; - refs/heads/*,commit) - # branch - if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then - echo "*** Creating a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/heads/*,delete) - # delete branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/remotes/*,commit) - # tracking branch - ;; - refs/remotes/*,delete) - # delete tracking branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a tracking branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - *) - # Anything else (is there anything else?) - echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 - exit 1 - ;; -esac - -# --- Finished -exit 0 diff --git a/info/exclude b/info/exclude deleted file mode 100644 index a5196d1..0000000 --- a/info/exclude +++ /dev/null @@ -1,6 +0,0 @@ -# git ls-files --others --exclude-from=.git/info/exclude -# Lines that start with '#' are comments. -# For a project mostly in C, the following would be a good set of -# exclude patterns (uncomment them if you want to use them): -# *.[oa] -# *~ diff --git a/setup.py b/setup.py index a88d0b6..41691b4 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import os -import sys - +import os, sys try: from setuptools import setup except ImportError: from distutils.core import setup - readme = open('README.rst').read() history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') @@ -24,7 +21,7 @@ setup( name='mca', - version='1.0.2', + version='1.0.3', description='Multiple correspondence analysis with pandas', long_description=readme + '\n\n' + history, author='Emre Safak', diff --git a/src/__init__.py b/src/__init__.py index 520f52c..609d605 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -__author__ = 'Emre Safak' -__email__ = 'misteremre@yahoo.com' -__version__ = '1.0' +from pkg_resources import get_distribution -# __all__ = ['mca'] -# import scipy.linalg, numpy, pandas, functools \ No newline at end of file +__author__ = 'Emre Safak' +__version__ = get_distribution('mca').version \ No newline at end of file diff --git a/src/mca.py b/src/mca.py index 8e5ed85..d01a93b 100644 --- a/src/mca.py +++ b/src/mca.py @@ -1,219 +1,231 @@ # -*- coding: utf-8 -*- from functools import reduce -from pandas import concat, get_dummies -from scipy.linalg import diagsvd + from numpy import apply_along_axis, argmax, array, cumsum, diag, dot, finfo, flatnonzero, int64, outer, sqrt from numpy.linalg import norm, svd +from pandas import concat, get_dummies +from scipy.linalg import diagsvd +from scipy.sparse import diags +from scipy.sparse.linalg import svds def process_df(DF, cols, ncols): - if cols: # if you want us to do the dummy coding - K = len(cols) # the number of categories - X = dummy(DF, cols) - else: # if you want to dummy code it yourself or do all the cols - K = ncols - if ncols is None: # be sure to pass K if you didn't multi-index - K = len(DF.columns) # ... 
it with mca.dummy() - if not K: - raise ValueError("Your DataFrame has no columns.") - elif not isinstance(ncols, int) or ncols <= 0 or \ - ncols > len(DF.columns): # if you dummy coded it yourself - raise ValueError("You must pass a valid number of columns.") - X = DF - J = X.shape[1] - return X, K, J + if cols: # if you want us to do the dummy coding + K = len(cols) # the number of categories + X = dummy(DF, cols) + else: # if you want to dummy code it yourself or do all the cols + K = ncols + if ncols is None: # be sure to pass K if you didn't multi-index + K = len(DF.columns) # ... it with mca.dummy() + if not K: + raise ValueError("Your DataFrame has no columns.") + elif not isinstance(ncols, int) or ncols <= 0 or \ + ncols > len(DF.columns): # if you dummy coded it yourself + raise ValueError("You must pass a valid number of columns.") + X = DF + J = X.shape[1] + return X, K, J def dummy(DF, cols=None): - """Dummy code select columns of a DataFrame.""" - return concat((get_dummies(DF[col]) - for col in (DF.columns if cols is None else cols)), - axis=1, keys=DF.columns) + """Dummy code select columns of a DataFrame.""" + dummies = (get_dummies(DF[col]) for col in + (DF.columns if cols is None else cols)) + return concat(dummies, axis=1, keys=DF.columns) def _mul(*args): - """An internal method to multiply matrices.""" - return reduce(dot, args) + """An internal method to multiply matrices.""" + return reduce(dot, args) class MCA(object): - """Run MCA on selected columns of a pd DataFrame. - - If the column are specified, assume that they hold - categorical variables that need to be replaced with - dummy indicators, otherwise process the DataFrame as is. - - 'cols': The columns of the DataFrame to process. - 'ncols': The number of columns before dummy coding. To be passed if cols isn't. 
- 'benzecri': Perform Benzécri correction (default: True) - 'TOL': value below which to round eigenvalues to zero (default: 1e-4) - """ - - def __init__(self, DF, cols=None, ncols=None, benzecri=True, TOL=1e-4): - - X, self.K, self.J = process_df(DF, cols, ncols) - S = X.sum().sum() - Z = X / S # correspondence matrix - self.r = Z.sum(axis=1) - self.c = Z.sum() - self._numitems = len(DF) - self.cor = benzecri - self.D_r = diag(1/sqrt(self.r)) - Z_c = Z - outer(self.r, self.c) # standardized residuals matrix - eps = finfo(float).eps # avoid division-by-zero - self.D_c = diag(1/(eps + sqrt(self.c))) - - # another option, not pursued here, is sklearn.decomposition.TruncatedSVD - self.P, self.s, self.Q = svd(_mul(self.D_r, Z_c, self.D_c)) - - self.E = None - E = self._benzecri() if self.cor else self.s**2 - self.inertia = sum(E) - self.rank = argmax(E < TOL) - self.L = E[:self.rank] - - def _benzecri(self): - if self.E is None: - self.E = array([(self.K/(self.K-1.)*(_ - 1./self.K))**2 - if _ > 1./self.K else 0 for _ in self.s**2]) - return self.E - - def fs_r(self, percent=0.9, N=None): - """Get the row factor scores (dimensionality-reduced representation), - choosing how many factors to retain, directly or based on the explained - variance. - - 'percent': The minimum variance that the retained factors are required - to explain (default: 90% = 0.9) - 'N': The number of factors to retain. Overrides 'percent'. - If the rank is less than N, N is ignored. - """ - if not 0 <= percent <= 1: - raise ValueError("Percent should be a real number between 0 and 1.") - if N: - if not isinstance(N, (int, int64)) or N <= 0: - raise ValueError("N should be a positive integer.") - N = min(N, self.rank) - # S = zeros((self._numitems, N)) - # else: - self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0] - # S = zeros((self._numitems, self.k)) - # the sign of the square root can be either way; singular value vs. 
eigenvalue - # fill_diagonal(S, -sqrt(self.E) if self.cor else self.s) - num2ret = N if N else self.k - s = -sqrt(self.L) if self.cor else self.s - S = diagsvd(s[:num2ret], self._numitems, num2ret) - self.F = _mul(self.D_r, self.P, S) - return self.F - - def fs_c(self, percent=0.9, N=None): - """Get the column factor scores (dimensionality-reduced representation), - choosing how many factors to retain, directly or based on the explained - variance. - - 'percent': The minimum variance that the retained factors are required - to explain (default: 90% = 0.9) - 'N': The number of factors to retain. Overrides 'percent'. - If the rank is less than N, N is ignored. - """ - if not 0 <= percent <= 1: - raise ValueError("Percent should be a real number between 0 and 1.") - if N: - if not isinstance(N, (int, int64)) or N <= 0: - raise ValueError("N should be a positive integer.") - N = min(N, self.rank) # maybe we should notify the user? - # S = zeros((self._numitems, N)) - # else: - self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0] - # S = zeros((self._numitems, self.k)) - # the sign of the square root can be either way; singular value vs. eigenvalue - # fill_diagonal(S, -sqrt(self.E) if self.cor else self.s) - num2ret = N if N else self.k - s = -sqrt(self.L) if self.cor else self.s - S = diagsvd(s[:num2ret], len(self.Q), num2ret) - self.G = _mul(self.D_c, self.Q.T, S) # important! note the transpose on Q - return self.G - - def cos_r(self, N=None): # percent=0.9 - """Return the squared cosines for each row.""" - - if not hasattr(self, 'F') or self.F.shape[1] < self.rank: - self.fs_r(N=self.rank) # generate F - self.dr = norm(self.F, axis=1)**2 - # cheaper than diag(self.F.dot(self.F.T))? 
- - return apply_along_axis(lambda _: _/self.dr, 0, self.F[:, :N]**2) - - def cos_c(self, N=None): # percent=0.9, - """Return the squared cosines for each column.""" - - if not hasattr(self, 'G') or self.G.shape[1] < self.rank: - self.fs_c(N=self.rank) # generate - self.dc = norm(self.G, axis=1)**2 - # cheaper than diag(self.G.dot(self.G.T))? - - return apply_along_axis(lambda _: _/self.dc, 0, self.G[:, :N]**2) - - def cont_r(self, percent=0.9, N=None): - """Return the contribution of each row.""" - - if not hasattr(self, 'F'): - self.fs_r(N=self.rank) # generate F - return apply_along_axis(lambda _: _/self.L[:N], 1, - apply_along_axis(lambda _: _*self.r, 0, self.F[:, :N]**2)) - - def cont_c(self, percent=0.9, N=None): # bug? check axis number 0 vs 1 here - """Return the contribution of each column.""" - - if not hasattr(self, 'G'): - self.fs_c(N=self.rank) # generate G - return apply_along_axis(lambda _: _/self.L[:N], 1, - apply_along_axis(lambda _: _*self.c, 0, self.G[:, :N]**2)) - - def expl_var(self, greenacre=True, N=None): - """ - Return proportion of explained inertia (variance) for each factor. - - :param greenacre: Perform Greenacre correction (default: True) - """ - if greenacre: - greenacre_inertia = (self.K / (self.K - 1.) * (sum(self.s**4) - - (self.J - self.K) / self.K**2.)) - return (self._benzecri() / greenacre_inertia)[:N] - else: - E = self._benzecri() if self.cor else self.s**2 - return (E / sum(E))[:N] - - def fs_r_sup(self, DF, N=None): - """Find the supplementary row factor scores. - - ncols: The number of singular vectors to retain. - If both are passed, cols is given preference. 
- """ - if not hasattr(self, 'G'): - self.fs_c(N=self.rank) # generate G - - if N and (not isinstance(N, int) or N <= 0): - raise ValueError("ncols should be a positive integer.") - s = -sqrt(self.E) if self.cor else self.s - N = min(N, self.rank) if N else self.rank - S_inv = diagsvd(-1/s[:N], len(self.G.T), N) - # S = scipy.linalg.diagsvd(s[:N], len(self.tau), N) - return _mul(DF.div(DF.sum(axis=1), axis=0), self.G, S_inv)[:, :N] - - def fs_c_sup(self, DF, N=None): - """Find the supplementary column factor scores. - - ncols: The number of singular vectors to retain. - If both are passed, cols is given preference. - """ - if not hasattr(self, 'F'): - self.fs_r(N=self.rank) # generate F - - if N and (not isinstance(N, int) or N <= 0): - raise ValueError("ncols should be a positive integer.") - s = -sqrt(self.E) if self.cor else self.s - N = min(N, self.rank) if N else self.rank - S_inv = diagsvd(-1/s[:N], len(self.F.T), N) - # S = scipy.linalg.diagsvd(s[:N], len(self.tau), N) - return _mul((DF/DF.sum()).T, self.F, S_inv)[:, :N] + """Run MCA on selected columns of a pd DataFrame. + + If the column are specified, assume that they hold + categorical variables that need to be replaced with + dummy indicators, otherwise process the DataFrame as is. + + 'cols': The columns of the DataFrame to process. + 'ncols': The number of columns before dummy coding. To be passed if cols isn't. 
class MCA(object):
    """Run a Multiple Correspondence Analysis (MCA) on an indicator matrix.

    'DF': pandas DataFrame to analyze
    'cols', 'ncols': forwarded to process_df() to build the indicator matrix
    'benzecri': Perform Benzécri correction (default: True)
    'TOL': value below which to round eigenvalues to zero (default: 1e-4)
    'sparse': use sparse linear algebra (scipy diags/svds) for large data sets
    'approximate': accepted but unused in the visible code  # TODO confirm
    """

    def __init__(self, DF, cols=None, ncols=None, benzecri=True, TOL=1e-4,
                 sparse=False, approximate=False):
        # process_df() (defined elsewhere in this module) expands DF into an
        # indicator matrix X with K categories spread over J columns.
        X, self.K, self.J = process_df(DF, cols, ncols)
        S = X.sum().sum()
        Z = X / S  # correspondence matrix
        self.r = Z.sum(axis=1)  # row masses
        self.c = Z.sum()        # column masses
        self.cor = benzecri

        # eps guards the divisions below against zero masses
        eps = finfo(float).eps
        self.D_r = (diags if sparse else diag)(1/(eps + sqrt(self.r)))
        self.D_c = diag(1/(eps + sqrt(self.c)))  # can't use diags here
        Z_c = Z - outer(self.r, self.c)  # standardized residuals matrix

        product = self.D_r.dot(Z_c).dot(self.D_c)
        if sparse:
            # svds cannot return the full spectrum; ask for all but one
            # singular triplet (or ncols of them when specified).
            P, s, Q = svds(product,
                           min(product.shape)-1 if ncols is None else ncols)
            # svds and svd use complementary orders
            self.P = P.T[::-1].T
            self.Q = Q[::-1]
            self.s = s[::-1]
            self._numitems = min(product.shape)-1
        else:
            self._numitems = len(DF)
            self.P, self.s, self.Q = svd(product)

        self.E = None
        E = self._benzecri() if self.cor else self.s**2
        self.inertia = sum(E)
        # rank = index of the first eigenvalue below TOL; argmax returns 0
        # when none is below TOL, in which case we keep the full length.
        self.rank = argmax(E < TOL)
        if not self.rank:
            self.rank = len(E)
        self.L = E[:self.rank]  # retained (possibly corrected) eigenvalues

    def _benzecri(self):
        """Return (caching in self.E) the Benzécri-corrected eigenvalues."""
        if self.E is None:
            self.E = array([(self.K/(self.K-1.)*(_ - 1./self.K))**2
                            if _ > 1./self.K else 0 for _ in self.s**2])
        return self.E

    def fs_r(self, percent=0.9, N=None):
        """Get the row factor scores (dimensionality-reduced representation),
        choosing how many factors to retain, directly or based on the explained
        variance.

        'percent': The minimum variance that the retained factors are required
                   to explain (default: 90% = 0.9)
        'N': The number of factors to retain. Overrides 'percent'.
             If the rank is less than N, N is ignored.
        """
        if not 0 <= percent <= 1:
            raise ValueError("Percent should be a real number between 0 and 1.")
        if N:
            if not isinstance(N, (int, int64)) or N <= 0:
                raise ValueError("N should be a positive integer.")
            N = min(N, self.rank)
        self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0]
        # the sign of the square root can be either way; singular value vs.
        # eigenvalue — the negative root matches the published references
        num2ret = N if N else self.k
        s = -sqrt(self.L) if self.cor else self.s
        S = diagsvd(s[:num2ret], self._numitems, num2ret)
        self.F = self.D_r.dot(self.P).dot(S)
        return self.F

    def fs_c(self, percent=0.9, N=None):
        """Get the column factor scores (dimensionality-reduced representation),
        choosing how many factors to retain, directly or based on the explained
        variance.

        'percent': The minimum variance that the retained factors are required
                   to explain (default: 90% = 0.9)
        'N': The number of factors to retain. Overrides 'percent'.
             If the rank is less than N, N is ignored.
        """
        if not 0 <= percent <= 1:
            raise ValueError("Percent should be a real number between 0 and 1.")
        if N:
            if not isinstance(N, (int, int64)) or N <= 0:
                raise ValueError("N should be a positive integer.")
            N = min(N, self.rank)  # maybe we should notify the user?
        self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0]
        # the sign of the square root can be either way; singular value vs.
        # eigenvalue
        num2ret = N if N else self.k
        s = -sqrt(self.L) if self.cor else self.s
        S = diagsvd(s[:num2ret], len(self.Q), num2ret)
        self.G = _mul(self.D_c, self.Q.T, S)  # important! note the transpose on Q
        return self.G

    def cos_r(self, N=None):
        """Return the squared cosines for each row."""
        if not hasattr(self, 'F') or self.F.shape[1] < self.rank:
            self.fs_r(N=self.rank)  # generate F
        self.dr = norm(self.F, axis=1)**2
        # cheaper than diag(self.F.dot(self.F.T))?
        return apply_along_axis(lambda _: _/self.dr, 0, self.F[:, :N]**2)

    def cos_c(self, N=None):
        """Return the squared cosines for each column."""
        if not hasattr(self, 'G') or self.G.shape[1] < self.rank:
            self.fs_c(N=self.rank)  # generate G
        self.dc = norm(self.G, axis=1)**2
        # cheaper than diag(self.G.dot(self.G.T))?
        return apply_along_axis(lambda _: _/self.dc, 0, self.G[:, :N]**2)

    def cont_r(self, percent=0.9, N=None):
        """Return the contribution of each row."""
        if not hasattr(self, 'F'):
            self.fs_r(N=self.rank)  # generate F
        return apply_along_axis(lambda _: _/self.L[:N], 1,
                                apply_along_axis(lambda _: _*self.r, 0,
                                                 self.F[:, :N]**2))

    def cont_c(self, percent=0.9, N=None):
        """Return the contribution of each column."""
        # NOTE(review): original carried "bug? check axis number 0 vs 1 here";
        # the axes mirror cont_r, but this has not been independently verified.
        if not hasattr(self, 'G'):
            self.fs_c(N=self.rank)  # generate G
        return apply_along_axis(lambda _: _/self.L[:N], 1,
                                apply_along_axis(lambda _: _*self.c, 0,
                                                 self.G[:, :N]**2))

    def expl_var(self, greenacre=True, N=None):
        """Return proportion of explained inertia (variance) for each factor.

        'greenacre': Perform Greenacre correction (default: True)
        'N': number of leading factors to report (default: all)
        """
        if greenacre:
            greenacre_inertia = (self.K / (self.K - 1.) * (sum(self.s**4)
                                 - (self.J - self.K) / self.K**2.))
            return (self._benzecri() / greenacre_inertia)[:N]
        else:
            E = self._benzecri() if self.cor else self.s**2
            return (E / sum(E))[:N]

    def fs_r_sup(self, DF, N=None):
        """Find the supplementary row factor scores.

        'N': The number of factors to retain.
             If the rank is less than N, N is ignored.
        """
        # The original docstring and error message referred to 'ncols'/'cols',
        # copy-pasted from process_df(); the parameter here is N. The original
        # also rejected numpy int64 N, unlike fs_r/fs_c.
        if not hasattr(self, 'G'):
            self.fs_c(N=self.rank)  # generate G

        if N and (not isinstance(N, (int, int64)) or N <= 0):
            raise ValueError("N should be a positive integer.")
        s = -sqrt(self.E) if self.cor else self.s
        N = min(N, self.rank) if N else self.rank
        S_inv = diagsvd(-1/s[:N], len(self.G.T), N)
        # project the row profiles (rows normalized to sum 1) onto the axes
        return _mul(DF.div(DF.sum(axis=1), axis=0), self.G, S_inv)[:, :N]

    def fs_c_sup(self, DF, N=None):
        """Find the supplementary column factor scores.

        'N': The number of factors to retain.
             If the rank is less than N, N is ignored.
        """
        if not hasattr(self, 'F'):
            self.fs_r(N=self.rank)  # generate F

        if N and (not isinstance(N, (int, int64)) or N <= 0):
            raise ValueError("N should be a positive integer.")
        s = -sqrt(self.E) if self.cor else self.s
        N = min(N, self.rank) if N else self.rank
        S_inv = diagsvd(-1/s[:N], len(self.F.T), N)
        # project the column profiles onto the axes
        return _mul((DF/DF.sum()).T, self.F, S_inv)[:, :N]
class TestMca(TestCase):
    """Validate MCA against worked examples from Abdi et al. papers."""

    def test_abdi_valentin(self):
        # Data taken from http://www.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf
        # Multiple Correspondence Analysis
        # (Hervé Abdi & Dominique Valentin, 2007)
        # See in particular Table 2,3,4.

        # first we check the eigenvalues and factor scores with Benzecri
        # correction
        df = read_table('data/burgundies.csv', skiprows=1, sep=',', index_col=0)
        mca_df = MCA(df.drop('oak_type', axis=1), ncols=10)
        assert_allclose([0.7004, 0.0123, 0.0003], mca_df.E[:3], atol=1e-4)
        true_fs_row = [[0.86, 0.08], [-0.71, -0.16], [-0.92, 0.08],
                       [-0.86, 0.08], [0.92, 0.08], [0.71, -0.16]]
        assert_allclose(true_fs_row, mca_df.fs_r(N=2), atol=1e-2)
        true_fs_col = [[.90, -.90, -.97, .00, .97, -.90, .90, .90, -.90, -.9,
                        .90, -.97, .00, .97, -.90, .90, .28, -.28, -.90, .90,
                        -.90, .9, .90, -.90],
                       [.00, .00, .18, -.35, .18, .00, .00, .00, .0, .00, .00,
                        .18, -.35, .18, .00, .00, .0, .00, .00, .00, .00, .00,
                        .00, .00]]
        assert_allclose(array(true_fs_col).T[:-2], mca_df.fs_c(N=2), atol=1e-2)

        true_cont_r = [[177, 121, 202, 177, 202, 121],
                       [83, 333, 83, 83, 83, 333]]
        assert_allclose(true_cont_r, 1000*mca_df.cont_r(N=2).T, atol=1)

        true_cont_c = [[58, 58, 44, 0, 44, 58, 58, 58, 58, 58, 58, 44, 0, 44,
                        58, 58, 6, 6, 58, 58, 58, 58],
                       [0, 0, 83, 333, 83, 0, 0, 0, 0, 0, 0, 83, 333, 83, 0,
                        0, 0, 0, 0, 0, 0, 0]]
        assert_allclose(true_cont_c, 1000*mca_df.cont_c(N=2).T, atol=1)

        # No test for cos_c/cos_r against this source: the 2007 tables do not
        # agree with the f**2/d**2 definition of the squared cosine (table 3
        # divides by the uncorrected inertia I = 1.2 instead). Our formula
        # does comport with the figures in (Abdi & Bera, 2014), tested next.

        # ... then without Benzecri correction
        mca_df_i = MCA(df.drop('oak_type', axis=1), ncols=10, benzecri=False)
        assert_allclose([0.8532, 0.2, 0.1151, 0.0317],
                        (mca_df_i.s**2)[:4], atol=1e-4)

        # check percentage of explained variance both with and without Benzecri
        # and Greenacre corrections
        true_expl_var_i = [.7110, .1667, .0959, .0264, 0., 0.]
        true_expl_var_z = [.9823, .0173, .0004, 0., 0., 0.]
        true_expl_var_c = [.9519, .0168, .0004, 0., 0., 0.]
        assert_allclose(mca_df_i.expl_var(False), true_expl_var_i, atol=1e-4)
        assert_allclose(mca_df_i.expl_var(), true_expl_var_c, atol=1e-4)
        assert_allclose(mca_df.expl_var(False), true_expl_var_z, atol=1e-4)
        assert_allclose(mca_df.expl_var(), true_expl_var_c, atol=1e-4)

    def test_abdi_bera(self):
        # Data taken from www.utdallas.edu/~herve/abdi-AB2014_CA.pdf
        # Correspondence Analysis, (Herve Abdi & Michel Bera, 2014)
        # Springer Encyclopedia of Social Networks and Mining.
        df = read_table('data/music_color.csv', skiprows=0, index_col=0, sep=',')
        mca_df = MCA(df, benzecri=False)

        # Table 1, page 13
        assert_allclose(mca_df.r, [.121, .091, .126, .116, .096, .066, .071,
                                   .146, .061, .106], atol=1e-3)
        assert_allclose(mca_df.c, [.11, .11, .11, .11, .11, .11, .11, .11, .11],
                        atol=1e-2)

        # Table 2, page 14
        assert_allclose(mca_df.fs_r(N=2), [[-0.026, 0.299], [-0.314, 0.232],
                                           [-0.348, 0.202], [-0.044, -0.490],
                                           [-0.082, -0.206], [-0.619, 0.475],
                                           [-0.328, 0.057], [1.195, 0.315],
                                           [-0.57, 0.3], [0.113, -0.997]],
                        atol=1e-3)
        assert_allclose(mca_df.cont_r(N=2)*1000, [[0, 56], [31, 25], [53, 27],
                                                  [1, 144], [2, 21], [87, 77],
                                                  [26, 1], [726, 75], [68, 28],
                                                  [5, 545]],
                        atol=1)
        assert_allclose(mca_df.cos_r(N=2)*1000,
                        [[3, 410], [295, 161], [267, 89], [5, 583], [13, 81],
                         [505, 298], [77, 2], [929, 65], [371, 103], [12, 973]],
                        atol=1)

        # Table 3, page 17
        assert_allclose(mca_df.fs_c(N=2),
                        [[-0.541, 0.386], [-.257, .275], [-.291, -.309],
                         [.991, .397], [-.122, -.637], [-.236, .326],
                         [.954, -.089], [-.427, .408], [-.072, -.757]],
                        atol=1e-3)
        assert_allclose(mca_df.cont_c(N=2)*1000,
                        [[113, 86], [25, 44], [33, 55],
                         [379, 91], [6, 234], [22, 61],
                         [351, 5], [70, 96], [2, 330]],
                        atol=1)
        assert_allclose(mca_df.cos_c(N=2)*1000,
                        [[454, 232], [105, 121], [142, 161], [822, 132],
                         [26, 709], [78, 149], [962, 8], [271, 249], [7, 759]],
                        atol=1)

        assert_allclose(mca_df.L[:2], [.287, .192], atol=2e-3)
        self.assertAlmostEqual(mca_df.inertia, 0.746, 3)

    def test_abdi_williams(self):
        # Data taken from www.utdallas.edu/~herve/abdi-CorrespondenceAnaysis2010-pretty.pdf
        # Correspondence Analysis (Hervé Abdi & Lynne J. Williams, 2010)
        # SAGE Encyclopedia of Research Design. Table 4, page 16.
        # (The previous comment credited "Michel Bera, 2010" — a copy-paste
        # slip from the 2014 reference above; the 2010 chapter is Abdi &
        # Williams, hence this test's name.)

        df = read_table('data/french_writers.csv', skiprows=0, index_col=0,
                        sep=',')
        mca_df = MCA(df, benzecri=False)

        assert_allclose(mca_df.c, [.2973, .5642, .1385], atol=1e-4)
        assert_allclose(mca_df.r, [.0189, .1393, .2522, .3966, .1094, .0835],
                        atol=1e-4)

        true_fs_row = [[0.2398, 0.1895, 0.1033, -0.0918, -0.2243, 0.0475],
                       [0.0741, 0.1071, -0.0297, 0.0017, 0.0631, -0.1963]]
        assert_allclose(mca_df.fs_r(N=2).T, true_fs_row, atol=1e-4)
        assert_allclose(mca_df.L, [.0178, .0056], atol=1e-4)

        true_fs_col = [[-0.0489, 0.0973, -0.2914], [.1115, -0.0367, -0.0901]]
        assert_allclose(-mca_df.fs_c(N=2).T, true_fs_col, atol=1e-4)

        true_cont_r = [[0.0611, 0.2807, 0.1511, 0.1876, 0.3089, 0.0106],
                       [0.0186, 0.2864, 0.0399, 0.0002, 0.0781, 0.5767]]
        assert_allclose(mca_df.cont_r(N=2).T, true_cont_r, atol=1e-4)

        true_cos_r = [[0.9128, 0.7579, 0.9236, 0.9997, 0.9266, 0.0554],
                      [0.0872, 0.2421, 0.0764, 0.0003, 0.0734, 0.9446]]
        assert_allclose(mca_df.cos_r(N=2).T, true_cos_r, atol=1e-4)

        assert_allclose(mca_df.cont_c(N=2).T, [[0.0399, 0.2999, 0.6601],
                                               [0.6628, 0.1359, 0.2014]],
                        atol=1e-4)
        assert_allclose(mca_df.cos_c(N=2).T, [[0.1614, 0.8758, 0.9128],
                                              [0.8386, 0.1242, 0.0872]],
                        atol=1e-4)

        assert_allclose(mca_df.dc, [0.0148, 0.0108, 0.0930], atol=1e-4)
        assert_allclose(mca_df.dr,
                        [0.0630, 0.0474, 0.0116, 0.0084, 0.0543, 0.0408],
                        atol=1e-4)

        abdi = DataFrame([216, 139, 26]).T
        assert_allclose(mca_df.fs_r_sup(abdi, 2), [[-0.0908, 0.5852]],
                        atol=1e-4)

        supp = read_table('data/french_writers_supp.csv',
                          skiprows=0, index_col=0, sep=',')

        true_fs_col_sup = [[-0.0596, -0.1991, -0.4695, -0.4008],
                           [0.2318, 0.2082, -0.2976, -0.4740]]
        assert_allclose(mca_df.fs_c_sup(supp).T, true_fs_col_sup, atol=1e-3)

    def test_invalid_inputs(self):
        df = read_table('data/burgundies.csv', skiprows=1, sep=',')
        self.assertRaises(ValueError, MCA, df.iloc[:, 2:], ncols=0)
        self.assertRaises(ValueError, MCA, df.iloc[:, 2:], ncols='')

    def test_sparse(self):
        # Seed the generator: the original unseeded randint made this test
        # non-deterministic from run to run.
        from numpy.random import RandomState
        df = DataFrame(RandomState(0).randint(0, 2, (100, 100)))
        mca1 = MCA(df, sparse=False)
        mca2 = MCA(df, sparse=True)
        # svds returns one fewer singular value than full svd.
        assert_allclose(mca1.s[:-1], mca2.s, atol=1e-12)
        # Singular vectors are only defined up to a sign flip.
        for row1, row2 in zip(mca1.P.T, mca2.P.T):
            assert_allclose(row1,
                            row2 if sign(row1[0])*sign(row2[0]) > 0 else -row2,
                            atol=1e-12)
        for row1, row2 in zip(mca1.Q, mca2.Q):
            assert_allclose(row1,
                            row2 if sign(row1[0])*sign(row2[0]) > 0 else -row2,
                            atol=1e-12)


if __name__ == '__main__':
    main()