From f2b79ecbf37629902ccdbad2e1a556977c53d370 Mon Sep 17 00:00:00 2001 From: Emre Safak Date: Wed, 10 Jan 2018 18:06:52 -0800 Subject: [PATCH] Add sparse matrix support for large data sets Closes #15 --- .travis.yml | 45 ++-- HISTORY.rst | 4 +- description | 1 - hooks/applypatch-msg.sample | 15 -- hooks/commit-msg.sample | 24 -- hooks/post-update.sample | 8 - hooks/pre-applypatch.sample | 14 -- hooks/pre-commit.sample | 50 ---- hooks/pre-rebase.sample | 169 ------------- hooks/prepare-commit-msg.sample | 36 --- hooks/update.sample | 128 ---------- info/exclude | 6 - setup.py | 7 +- src/__init__.py | 8 +- src/mca.py | 418 ++++++++++++++++---------------- tests/test_mca.py | 354 ++++++++++++++------------- 16 files changed, 425 insertions(+), 862 deletions(-) delete mode 100644 description delete mode 100644 hooks/applypatch-msg.sample delete mode 100644 hooks/commit-msg.sample delete mode 100644 hooks/post-update.sample delete mode 100644 hooks/pre-applypatch.sample delete mode 100644 hooks/pre-commit.sample delete mode 100644 hooks/pre-rebase.sample delete mode 100644 hooks/prepare-commit-msg.sample delete mode 100644 hooks/update.sample delete mode 100644 info/exclude diff --git a/.travis.yml b/.travis.yml index f505eab..3cd223f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,40 +3,39 @@ language: python cache: - - apt - - pip + - apt + - pip notifications: - email: false + email: false python: - - 3.6 - - 3.5 - # - "2.7_with_system_site_packages" - - "3.2_with_system_site_packages" - - 2.7 - # - "pypy" + - 3.6 + - 3.5 + - 2.7 + # - "pypy" + # - "pypy3" # virtualenv: - # system_site_packages: true +# system_site_packages: true before_install: - - if [ $TRAVIS_PYTHON_VERSION == "3.2_with_system_site_packages" ]; then - wget -O- http://neuro.debian.net/lists/precise.us-ca.libre | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list; sudo apt-key adv --recv-keys --keyserver pool.sks-keyservers.net 2649A5A9; echo | sudo add-apt-repository 
ppa:chris-lea/python-numpy; echo | sudo add-apt-repository ppa:pylab/stable; sudo apt-get update -q; sudo apt-get install -qq python3-numpy python3-scipy python3-pandas python3-pandas-lib; - else - sudo pip install conda; sudo conda init; conda create -p $HOME/py --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas; export PATH=$HOME/py/bin:$PATH; - fi + # - if [ $TRAVIS_PYTHON_VERSION == "3.2_with_system_site_packages" ]; then + # wget -O- http://neuro.debian.net/lists/precise.us-ca.libre | sudo tee /etc/apt/sources.list.d/neurodebian.sources.list; sudo apt-key adv --recv-keys --keyserver pool.sks-keyservers.net 2649A5A9; echo | sudo add-apt-repository ppa:chris-lea/python-numpy; echo | sudo add-apt-repository ppa:pylab/stable; sudo apt-get update -q; sudo apt-get install -qq python3-numpy python3-scipy python3-pandas python3-pandas-lib; + # else + sudo pip install conda; sudo conda init; conda create -p $HOME/py --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas; export PATH=$HOME/py/bin:$PATH; + # fi install: - - pip install -r requirements.txt - - pip install . + - pip install -r requirements.txt + - pip install . script: python setup.py test deploy: - provider: pypi - user: esafak - password: - secure: "ERTeFa4LZh+hiDqnv8Wf6qK6KNkFr+LFFsYx7bohM8iZsEDdciGcNis17r2H21WjIu2khIPUYarRKijVvvRMApVNgt4VLtuT5Z4WPVGS71vpJ5jZvVgZS/zo8MBZ8jbUAlMugdoWIxtSGRvJKZz6M2//eh3BtDhIz/gB4o1wGOg=" - on: - tags: true \ No newline at end of file + provider: pypi + user: esafak + password: + secure: "ERTeFa4LZh+hiDqnv8Wf6qK6KNkFr+LFFsYx7bohM8iZsEDdciGcNis17r2H21WjIu2khIPUYarRKijVvvRMApVNgt4VLtuT5Z4WPVGS71vpJ5jZvVgZS/zo8MBZ8jbUAlMugdoWIxtSGRvJKZz6M2//eh3BtDhIz/gB4o1wGOg=" + on: + tags: true \ No newline at end of file diff --git a/HISTORY.rst b/HISTORY.rst index 4e8b5c3..10b1bfe 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -8,4 +8,6 @@ History * **1.01** (2015-03-23) More documentation, in the form of an ipython notebook. 
Fixed bug #2 affecting python 2.x * **1.02** (2017-07-29) - Fixed division-by-zero bug (issue #14) \ No newline at end of file + Fixed division-by-zero bug (issue #14) +* **1.03** (2018-01-10) + Added sparse matrix support \ No newline at end of file diff --git a/description b/description deleted file mode 100644 index 498b267..0000000 --- a/description +++ /dev/null @@ -1 +0,0 @@ -Unnamed repository; edit this file 'description' to name the repository. diff --git a/hooks/applypatch-msg.sample b/hooks/applypatch-msg.sample deleted file mode 100644 index 8b2a2fe..0000000 --- a/hooks/applypatch-msg.sample +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message taken by -# applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. The hook is -# allowed to edit the commit message file. -# -# To enable this hook, rename this file to "applypatch-msg". - -. git-sh-setup -test -x "$GIT_DIR/hooks/commit-msg" && - exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} -: diff --git a/hooks/commit-msg.sample b/hooks/commit-msg.sample deleted file mode 100644 index b58d118..0000000 --- a/hooks/commit-msg.sample +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message. -# Called by "git commit" with one argument, the name of the file -# that has the commit message. The hook should exit with non-zero -# status after issuing an appropriate message if it wants to stop the -# commit. The hook is allowed to edit the commit message file. -# -# To enable this hook, rename this file to "commit-msg". - -# Uncomment the below to add a Signed-off-by line to the message. -# Doing this in a hook is a bad idea in general, but the prepare-commit-msg -# hook is more suited to it. 
-# -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" - -# This example catches duplicate Signed-off-by lines. - -test "" = "$(grep '^Signed-off-by: ' "$1" | - sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { - echo >&2 Duplicate Signed-off-by lines. - exit 1 -} diff --git a/hooks/post-update.sample b/hooks/post-update.sample deleted file mode 100644 index ec17ec1..0000000 --- a/hooks/post-update.sample +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare a packed repository for use over -# dumb transports. -# -# To enable this hook, rename this file to "post-update". - -exec git update-server-info diff --git a/hooks/pre-applypatch.sample b/hooks/pre-applypatch.sample deleted file mode 100644 index b1f187c..0000000 --- a/hooks/pre-applypatch.sample +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed -# by applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-applypatch". - -. git-sh-setup -test -x "$GIT_DIR/hooks/pre-commit" && - exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"} -: diff --git a/hooks/pre-commit.sample b/hooks/pre-commit.sample deleted file mode 100644 index 18c4829..0000000 --- a/hooks/pre-commit.sample +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed. -# Called by "git commit" with no arguments. The hook should -# exit with non-zero status after issuing an appropriate message if -# it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-commit". 
- -if git rev-parse --verify HEAD >/dev/null 2>&1 -then - against=HEAD -else - # Initial commit: diff against an empty tree object - against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 -fi - -# If you want to allow non-ascii filenames set this variable to true. -allownonascii=$(git config hooks.allownonascii) - -# Redirect output to stderr. -exec 1>&2 - -# Cross platform projects tend to avoid non-ascii filenames; prevent -# them from being added to the repository. We exploit the fact that the -# printable range starts at the space character and ends with tilde. -if [ "$allownonascii" != "true" ] && - # Note that the use of brackets around a tr range is ok here, (it's - # even required, for portability to Solaris 10's /usr/bin/tr), since - # the square bracket bytes happen to fall in the designated range. - test $(git diff --cached --name-only --diff-filter=A -z $against | - LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 -then - echo "Error: Attempt to add a non-ascii file name." - echo - echo "This can cause problems if you want to work" - echo "with people on other platforms." - echo - echo "To be portable it is advisable to rename the file ..." - echo - echo "If you know what you are doing you can disable this" - echo "check using:" - echo - echo " git config hooks.allownonascii true" - echo - exit 1 -fi - -# If there are whitespace errors, print the offending file names and fail. -exec git diff-index --check --cached $against -- diff --git a/hooks/pre-rebase.sample b/hooks/pre-rebase.sample deleted file mode 100644 index 33730ca..0000000 --- a/hooks/pre-rebase.sample +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/sh -# -# Copyright (c) 2006, 2008 Junio C Hamano -# -# The "pre-rebase" hook is run just before "git rebase" starts doing -# its job, and can prevent the command from running by exiting with -# non-zero status. -# -# The hook is called with the following parameters: -# -# $1 -- the upstream the series was forked from. 
-# $2 -- the branch being rebased (or empty when rebasing the current branch). -# -# This sample shows how to prevent topic branches that are already -# merged to 'next' branch from getting rebased, because allowing it -# would result in rebasing already published history. - -publish=next -basebranch="$1" -if test "$#" = 2 -then - topic="refs/heads/$2" -else - topic=`git symbolic-ref HEAD` || - exit 0 ;# we do not interrupt rebasing detached HEAD -fi - -case "$topic" in -refs/heads/??/*) - ;; -*) - exit 0 ;# we do not interrupt others. - ;; -esac - -# Now we are dealing with a topic branch being rebased -# on top of master. Is it OK to rebase it? - -# Does the topic really exist? -git show-ref -q "$topic" || { - echo >&2 "No such branch $topic" - exit 1 -} - -# Is topic fully merged to master? -not_in_master=`git rev-list --pretty=oneline ^master "$topic"` -if test -z "$not_in_master" -then - echo >&2 "$topic is fully merged to master; better remove it." - exit 1 ;# we could allow it, but there is no point. -fi - -# Is topic ever merged to next? If so you should not be rebasing it. -only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` -only_next_2=`git rev-list ^master ${publish} | sort` -if test "$only_next_1" = "$only_next_2" -then - not_in_topic=`git rev-list "^$topic" master` - if test -z "$not_in_topic" - then - echo >&2 "$topic is already up-to-date with master" - exit 1 ;# we could allow it, but there is no point. 
- else - exit 0 - fi -else - not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` - /usr/bin/perl -e ' - my $topic = $ARGV[0]; - my $msg = "* $topic has commits already merged to public branch:\n"; - my (%not_in_next) = map { - /^([0-9a-f]+) /; - ($1 => 1); - } split(/\n/, $ARGV[1]); - for my $elem (map { - /^([0-9a-f]+) (.*)$/; - [$1 => $2]; - } split(/\n/, $ARGV[2])) { - if (!exists $not_in_next{$elem->[0]}) { - if ($msg) { - print STDERR $msg; - undef $msg; - } - print STDERR " $elem->[1]\n"; - } - } - ' "$topic" "$not_in_next" "$not_in_master" - exit 1 -fi - -<<\DOC_END - -This sample hook safeguards topic branches that have been -published from being rewound. - -The workflow assumed here is: - - * Once a topic branch forks from "master", "master" is never - merged into it again (either directly or indirectly). - - * Once a topic branch is fully cooked and merged into "master", - it is deleted. If you need to build on top of it to correct - earlier mistakes, a new topic branch is created by forking at - the tip of the "master". This is not strictly necessary, but - it makes it easier to keep your history simple. - - * Whenever you need to test or publish your changes to topic - branches, merge them into "next" branch. - -The script, being an example, hardcodes the publish branch name -to be "next", but it is trivial to make it configurable via -$GIT_DIR/config mechanism. - -With this workflow, you would want to know: - -(1) ... if a topic branch has ever been merged to "next". Young - topic branches can have stupid mistakes you would rather - clean up before publishing, and things that have not been - merged into other branches can be easily rebased without - affecting other people. But once it is published, you would - not want to rewind it. - -(2) ... if a topic branch has been fully merged to "master". - Then you can delete it. 
More importantly, you should not - build on top of it -- other people may already want to - change things related to the topic as patches against your - "master", so if you need further changes, it is better to - fork the topic (perhaps with the same name) afresh from the - tip of "master". - -Let's look at this example: - - o---o---o---o---o---o---o---o---o---o "next" - / / / / - / a---a---b A / / - / / / / - / / c---c---c---c B / - / / / \ / - / / / b---b C \ / - / / / / \ / - ---o---o---o---o---o---o---o---o---o---o---o "master" - - -A, B and C are topic branches. - - * A has one fix since it was merged up to "next". - - * B has finished. It has been fully merged up to "master" and "next", - and is ready to be deleted. - - * C has not merged to "next" at all. - -We would want to allow C to be rebased, refuse A, and encourage -B to be deleted. - -To compute (1): - - git rev-list ^master ^topic next - git rev-list ^master next - - if these match, topic has not merged in next at all. - -To compute (2): - - git rev-list master..topic - - if this is empty, it is fully merged to "master". - -DOC_END diff --git a/hooks/prepare-commit-msg.sample b/hooks/prepare-commit-msg.sample deleted file mode 100644 index f093a02..0000000 --- a/hooks/prepare-commit-msg.sample +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare the commit log message. -# Called by "git commit" with the name of the file that has the -# commit message, followed by the description of the commit -# message's source. The hook's purpose is to edit the commit -# message file. If the hook fails with a non-zero status, -# the commit is aborted. -# -# To enable this hook, rename this file to "prepare-commit-msg". - -# This hook includes three examples. The first comments out the -# "Conflicts:" part of a merge commit. -# -# The second includes the output of "git diff --name-status -r" -# into the message, just before the "git status" output. 
It is -# commented because it doesn't cope with --amend or with squashed -# commits. -# -# The third example adds a Signed-off-by line to the message, that can -# still be edited. This is rarely a good idea. - -case "$2,$3" in - merge,) - /usr/bin/perl -i.bak -ne 's/^/# /, s/^# #/#/ if /^Conflicts/ .. /#/; print' "$1" ;; - -# ,|template,) -# /usr/bin/perl -i.bak -pe ' -# print "\n" . `git diff --cached --name-status -r` -# if /^#/ && $first++ == 0' "$1" ;; - - *) ;; -esac - -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" diff --git a/hooks/update.sample b/hooks/update.sample deleted file mode 100644 index 71ab04e..0000000 --- a/hooks/update.sample +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/sh -# -# An example hook script to blocks unannotated tags from entering. -# Called by "git receive-pack" with arguments: refname sha1-old sha1-new -# -# To enable this hook, rename this file to "update". -# -# Config -# ------ -# hooks.allowunannotated -# This boolean sets whether unannotated tags will be allowed into the -# repository. By default they won't be. -# hooks.allowdeletetag -# This boolean sets whether deleting tags will be allowed in the -# repository. By default they won't be. -# hooks.allowmodifytag -# This boolean sets whether a tag may be modified after creation. By default -# it won't be. -# hooks.allowdeletebranch -# This boolean sets whether deleting branches will be allowed in the -# repository. By default they won't be. -# hooks.denycreatebranch -# This boolean sets whether remotely creating branches will be denied -# in the repository. By default this is allowed. -# - -# --- Command line -refname="$1" -oldrev="$2" -newrev="$3" - -# --- Safety check -if [ -z "$GIT_DIR" ]; then - echo "Don't run this script from the command line." 
>&2 - echo " (if you want, you could supply GIT_DIR then run" >&2 - echo " $0 )" >&2 - exit 1 -fi - -if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -# --- Config -allowunannotated=$(git config --bool hooks.allowunannotated) -allowdeletebranch=$(git config --bool hooks.allowdeletebranch) -denycreatebranch=$(git config --bool hooks.denycreatebranch) -allowdeletetag=$(git config --bool hooks.allowdeletetag) -allowmodifytag=$(git config --bool hooks.allowmodifytag) - -# check for no description -projectdesc=$(sed -e '1q' "$GIT_DIR/description") -case "$projectdesc" in -"Unnamed repository"* | "") - echo "*** Project description file hasn't been set" >&2 - exit 1 - ;; -esac - -# --- Check types -# if $newrev is 0000...0000, it's a commit to delete a ref. -zero="0000000000000000000000000000000000000000" -if [ "$newrev" = "$zero" ]; then - newrev_type=delete -else - newrev_type=$(git cat-file -t $newrev) -fi - -case "$refname","$newrev_type" in - refs/tags/*,commit) - # un-annotated tag - short_refname=${refname##refs/tags/} - if [ "$allowunannotated" != "true" ]; then - echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2 - echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 - exit 1 - fi - ;; - refs/tags/*,delete) - # delete tag - if [ "$allowdeletetag" != "true" ]; then - echo "*** Deleting a tag is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/tags/*,tag) - # annotated tag - if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 - then - echo "*** Tag '$refname' already exists." >&2 - echo "*** Modifying a tag is not allowed in this repository." 
>&2 - exit 1 - fi - ;; - refs/heads/*,commit) - # branch - if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then - echo "*** Creating a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/heads/*,delete) - # delete branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/remotes/*,commit) - # tracking branch - ;; - refs/remotes/*,delete) - # delete tracking branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a tracking branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - *) - # Anything else (is there anything else?) - echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 - exit 1 - ;; -esac - -# --- Finished -exit 0 diff --git a/info/exclude b/info/exclude deleted file mode 100644 index a5196d1..0000000 --- a/info/exclude +++ /dev/null @@ -1,6 +0,0 @@ -# git ls-files --others --exclude-from=.git/info/exclude -# Lines that start with '#' are comments. -# For a project mostly in C, the following would be a good set of -# exclude patterns (uncomment them if you want to use them): -# *.[oa] -# *~ diff --git a/setup.py b/setup.py index a88d0b6..41691b4 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import os -import sys - +import os, sys try: from setuptools import setup except ImportError: from distutils.core import setup - readme = open('README.rst').read() history = open('HISTORY.rst').read().replace('.. 
:changelog:', '') @@ -24,7 +21,7 @@ setup( name='mca', - version='1.0.2', + version='1.0.3', description='Multiple correspondence analysis with pandas', long_description=readme + '\n\n' + history, author='Emre Safak', diff --git a/src/__init__.py b/src/__init__.py index 520f52c..609d605 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -__author__ = 'Emre Safak' -__email__ = 'misteremre@yahoo.com' -__version__ = '1.0' +from pkg_resources import get_distribution -# __all__ = ['mca'] -# import scipy.linalg, numpy, pandas, functools \ No newline at end of file +__author__ = 'Emre Safak' +__version__ = get_distribution('mca').version \ No newline at end of file diff --git a/src/mca.py b/src/mca.py index 8e5ed85..d01a93b 100644 --- a/src/mca.py +++ b/src/mca.py @@ -1,219 +1,231 @@ # -*- coding: utf-8 -*- from functools import reduce -from pandas import concat, get_dummies -from scipy.linalg import diagsvd + from numpy import apply_along_axis, argmax, array, cumsum, diag, dot, finfo, flatnonzero, int64, outer, sqrt from numpy.linalg import norm, svd +from pandas import concat, get_dummies +from scipy.linalg import diagsvd +from scipy.sparse import diags +from scipy.sparse.linalg import svds def process_df(DF, cols, ncols): - if cols: # if you want us to do the dummy coding - K = len(cols) # the number of categories - X = dummy(DF, cols) - else: # if you want to dummy code it yourself or do all the cols - K = ncols - if ncols is None: # be sure to pass K if you didn't multi-index - K = len(DF.columns) # ... 
it with mca.dummy() - if not K: - raise ValueError("Your DataFrame has no columns.") - elif not isinstance(ncols, int) or ncols <= 0 or \ - ncols > len(DF.columns): # if you dummy coded it yourself - raise ValueError("You must pass a valid number of columns.") - X = DF - J = X.shape[1] - return X, K, J + if cols: # if you want us to do the dummy coding + K = len(cols) # the number of categories + X = dummy(DF, cols) + else: # if you want to dummy code it yourself or do all the cols + K = ncols + if ncols is None: # be sure to pass K if you didn't multi-index + K = len(DF.columns) # ... it with mca.dummy() + if not K: + raise ValueError("Your DataFrame has no columns.") + elif not isinstance(ncols, int) or ncols <= 0 or \ + ncols > len(DF.columns): # if you dummy coded it yourself + raise ValueError("You must pass a valid number of columns.") + X = DF + J = X.shape[1] + return X, K, J def dummy(DF, cols=None): - """Dummy code select columns of a DataFrame.""" - return concat((get_dummies(DF[col]) - for col in (DF.columns if cols is None else cols)), - axis=1, keys=DF.columns) + """Dummy code select columns of a DataFrame.""" + dummies = (get_dummies(DF[col]) for col in + (DF.columns if cols is None else cols)) + return concat(dummies, axis=1, keys=DF.columns) def _mul(*args): - """An internal method to multiply matrices.""" - return reduce(dot, args) + """An internal method to multiply matrices.""" + return reduce(dot, args) class MCA(object): - """Run MCA on selected columns of a pd DataFrame. - - If the column are specified, assume that they hold - categorical variables that need to be replaced with - dummy indicators, otherwise process the DataFrame as is. - - 'cols': The columns of the DataFrame to process. - 'ncols': The number of columns before dummy coding. To be passed if cols isn't. 
- 'benzecri': Perform Benzécri correction (default: True) - 'TOL': value below which to round eigenvalues to zero (default: 1e-4) - """ - - def __init__(self, DF, cols=None, ncols=None, benzecri=True, TOL=1e-4): - - X, self.K, self.J = process_df(DF, cols, ncols) - S = X.sum().sum() - Z = X / S # correspondence matrix - self.r = Z.sum(axis=1) - self.c = Z.sum() - self._numitems = len(DF) - self.cor = benzecri - self.D_r = diag(1/sqrt(self.r)) - Z_c = Z - outer(self.r, self.c) # standardized residuals matrix - eps = finfo(float).eps # avoid division-by-zero - self.D_c = diag(1/(eps + sqrt(self.c))) - - # another option, not pursued here, is sklearn.decomposition.TruncatedSVD - self.P, self.s, self.Q = svd(_mul(self.D_r, Z_c, self.D_c)) - - self.E = None - E = self._benzecri() if self.cor else self.s**2 - self.inertia = sum(E) - self.rank = argmax(E < TOL) - self.L = E[:self.rank] - - def _benzecri(self): - if self.E is None: - self.E = array([(self.K/(self.K-1.)*(_ - 1./self.K))**2 - if _ > 1./self.K else 0 for _ in self.s**2]) - return self.E - - def fs_r(self, percent=0.9, N=None): - """Get the row factor scores (dimensionality-reduced representation), - choosing how many factors to retain, directly or based on the explained - variance. - - 'percent': The minimum variance that the retained factors are required - to explain (default: 90% = 0.9) - 'N': The number of factors to retain. Overrides 'percent'. - If the rank is less than N, N is ignored. - """ - if not 0 <= percent <= 1: - raise ValueError("Percent should be a real number between 0 and 1.") - if N: - if not isinstance(N, (int, int64)) or N <= 0: - raise ValueError("N should be a positive integer.") - N = min(N, self.rank) - # S = zeros((self._numitems, N)) - # else: - self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0] - # S = zeros((self._numitems, self.k)) - # the sign of the square root can be either way; singular value vs. 
eigenvalue - # fill_diagonal(S, -sqrt(self.E) if self.cor else self.s) - num2ret = N if N else self.k - s = -sqrt(self.L) if self.cor else self.s - S = diagsvd(s[:num2ret], self._numitems, num2ret) - self.F = _mul(self.D_r, self.P, S) - return self.F - - def fs_c(self, percent=0.9, N=None): - """Get the column factor scores (dimensionality-reduced representation), - choosing how many factors to retain, directly or based on the explained - variance. - - 'percent': The minimum variance that the retained factors are required - to explain (default: 90% = 0.9) - 'N': The number of factors to retain. Overrides 'percent'. - If the rank is less than N, N is ignored. - """ - if not 0 <= percent <= 1: - raise ValueError("Percent should be a real number between 0 and 1.") - if N: - if not isinstance(N, (int, int64)) or N <= 0: - raise ValueError("N should be a positive integer.") - N = min(N, self.rank) # maybe we should notify the user? - # S = zeros((self._numitems, N)) - # else: - self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0] - # S = zeros((self._numitems, self.k)) - # the sign of the square root can be either way; singular value vs. eigenvalue - # fill_diagonal(S, -sqrt(self.E) if self.cor else self.s) - num2ret = N if N else self.k - s = -sqrt(self.L) if self.cor else self.s - S = diagsvd(s[:num2ret], len(self.Q), num2ret) - self.G = _mul(self.D_c, self.Q.T, S) # important! note the transpose on Q - return self.G - - def cos_r(self, N=None): # percent=0.9 - """Return the squared cosines for each row.""" - - if not hasattr(self, 'F') or self.F.shape[1] < self.rank: - self.fs_r(N=self.rank) # generate F - self.dr = norm(self.F, axis=1)**2 - # cheaper than diag(self.F.dot(self.F.T))? 
- - return apply_along_axis(lambda _: _/self.dr, 0, self.F[:, :N]**2) - - def cos_c(self, N=None): # percent=0.9, - """Return the squared cosines for each column.""" - - if not hasattr(self, 'G') or self.G.shape[1] < self.rank: - self.fs_c(N=self.rank) # generate - self.dc = norm(self.G, axis=1)**2 - # cheaper than diag(self.G.dot(self.G.T))? - - return apply_along_axis(lambda _: _/self.dc, 0, self.G[:, :N]**2) - - def cont_r(self, percent=0.9, N=None): - """Return the contribution of each row.""" - - if not hasattr(self, 'F'): - self.fs_r(N=self.rank) # generate F - return apply_along_axis(lambda _: _/self.L[:N], 1, - apply_along_axis(lambda _: _*self.r, 0, self.F[:, :N]**2)) - - def cont_c(self, percent=0.9, N=None): # bug? check axis number 0 vs 1 here - """Return the contribution of each column.""" - - if not hasattr(self, 'G'): - self.fs_c(N=self.rank) # generate G - return apply_along_axis(lambda _: _/self.L[:N], 1, - apply_along_axis(lambda _: _*self.c, 0, self.G[:, :N]**2)) - - def expl_var(self, greenacre=True, N=None): - """ - Return proportion of explained inertia (variance) for each factor. - - :param greenacre: Perform Greenacre correction (default: True) - """ - if greenacre: - greenacre_inertia = (self.K / (self.K - 1.) * (sum(self.s**4) - - (self.J - self.K) / self.K**2.)) - return (self._benzecri() / greenacre_inertia)[:N] - else: - E = self._benzecri() if self.cor else self.s**2 - return (E / sum(E))[:N] - - def fs_r_sup(self, DF, N=None): - """Find the supplementary row factor scores. - - ncols: The number of singular vectors to retain. - If both are passed, cols is given preference. 
- """ - if not hasattr(self, 'G'): - self.fs_c(N=self.rank) # generate G - - if N and (not isinstance(N, int) or N <= 0): - raise ValueError("ncols should be a positive integer.") - s = -sqrt(self.E) if self.cor else self.s - N = min(N, self.rank) if N else self.rank - S_inv = diagsvd(-1/s[:N], len(self.G.T), N) - # S = scipy.linalg.diagsvd(s[:N], len(self.tau), N) - return _mul(DF.div(DF.sum(axis=1), axis=0), self.G, S_inv)[:, :N] - - def fs_c_sup(self, DF, N=None): - """Find the supplementary column factor scores. - - ncols: The number of singular vectors to retain. - If both are passed, cols is given preference. - """ - if not hasattr(self, 'F'): - self.fs_r(N=self.rank) # generate F - - if N and (not isinstance(N, int) or N <= 0): - raise ValueError("ncols should be a positive integer.") - s = -sqrt(self.E) if self.cor else self.s - N = min(N, self.rank) if N else self.rank - S_inv = diagsvd(-1/s[:N], len(self.F.T), N) - # S = scipy.linalg.diagsvd(s[:N], len(self.tau), N) - return _mul((DF/DF.sum()).T, self.F, S_inv)[:, :N] + """Run MCA on selected columns of a pd DataFrame. + + If the column are specified, assume that they hold + categorical variables that need to be replaced with + dummy indicators, otherwise process the DataFrame as is. + + 'cols': The columns of the DataFrame to process. + 'ncols': The number of columns before dummy coding. To be passed if cols isn't. 
class MCA(object):
    """Run a Multiple Correspondence Analysis (MCA) on an indicator matrix.

    'DF': pandas DataFrame to analyze
    'cols', 'ncols': forwarded to process_df() to build the indicator matrix
    'benzecri': Perform Benzécri correction (default: True)
    'TOL': value below which to round eigenvalues to zero (default: 1e-4)
    'sparse': use sparse linear algebra (scipy diags/svds) for large data sets
    'approximate': accepted but unused in the visible code  # TODO confirm
    """

    def __init__(self, DF, cols=None, ncols=None, benzecri=True, TOL=1e-4,
                 sparse=False, approximate=False):
        # process_df() (defined elsewhere in this module) expands DF into an
        # indicator matrix X with K categories spread over J columns.
        X, self.K, self.J = process_df(DF, cols, ncols)
        S = X.sum().sum()
        Z = X / S  # correspondence matrix
        self.r = Z.sum(axis=1)  # row masses
        self.c = Z.sum()        # column masses
        self.cor = benzecri

        # eps guards the divisions below against zero masses
        eps = finfo(float).eps
        self.D_r = (diags if sparse else diag)(1/(eps + sqrt(self.r)))
        self.D_c = diag(1/(eps + sqrt(self.c)))  # can't use diags here
        Z_c = Z - outer(self.r, self.c)  # standardized residuals matrix

        product = self.D_r.dot(Z_c).dot(self.D_c)
        if sparse:
            # svds cannot return the full spectrum; ask for all but one
            # singular triplet (or ncols of them when specified).
            P, s, Q = svds(product,
                           min(product.shape)-1 if ncols is None else ncols)
            # svds and svd use complementary orders
            self.P = P.T[::-1].T
            self.Q = Q[::-1]
            self.s = s[::-1]
            self._numitems = min(product.shape)-1
        else:
            self._numitems = len(DF)
            self.P, self.s, self.Q = svd(product)

        self.E = None
        E = self._benzecri() if self.cor else self.s**2
        self.inertia = sum(E)
        # rank = index of the first eigenvalue below TOL; argmax returns 0
        # when none is below TOL, in which case we keep the full length.
        self.rank = argmax(E < TOL)
        if not self.rank:
            self.rank = len(E)
        self.L = E[:self.rank]  # retained (possibly corrected) eigenvalues

    def _benzecri(self):
        """Return (caching in self.E) the Benzécri-corrected eigenvalues."""
        if self.E is None:
            self.E = array([(self.K/(self.K-1.)*(_ - 1./self.K))**2
                            if _ > 1./self.K else 0 for _ in self.s**2])
        return self.E

    def fs_r(self, percent=0.9, N=None):
        """Get the row factor scores (dimensionality-reduced representation),
        choosing how many factors to retain, directly or based on the explained
        variance.

        'percent': The minimum variance that the retained factors are required
                   to explain (default: 90% = 0.9)
        'N': The number of factors to retain. Overrides 'percent'.
             If the rank is less than N, N is ignored.
        """
        if not 0 <= percent <= 1:
            raise ValueError("Percent should be a real number between 0 and 1.")
        if N:
            if not isinstance(N, (int, int64)) or N <= 0:
                raise ValueError("N should be a positive integer.")
            N = min(N, self.rank)
        self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0]
        # the sign of the square root can be either way; singular value vs.
        # eigenvalue — the negative root matches the published references
        num2ret = N if N else self.k
        s = -sqrt(self.L) if self.cor else self.s
        S = diagsvd(s[:num2ret], self._numitems, num2ret)
        self.F = self.D_r.dot(self.P).dot(S)
        return self.F

    def fs_c(self, percent=0.9, N=None):
        """Get the column factor scores (dimensionality-reduced representation),
        choosing how many factors to retain, directly or based on the explained
        variance.

        'percent': The minimum variance that the retained factors are required
                   to explain (default: 90% = 0.9)
        'N': The number of factors to retain. Overrides 'percent'.
             If the rank is less than N, N is ignored.
        """
        if not 0 <= percent <= 1:
            raise ValueError("Percent should be a real number between 0 and 1.")
        if N:
            if not isinstance(N, (int, int64)) or N <= 0:
                raise ValueError("N should be a positive integer.")
            N = min(N, self.rank)  # maybe we should notify the user?
        self.k = 1 + flatnonzero(cumsum(self.L) >= sum(self.L)*percent)[0]
        # the sign of the square root can be either way; singular value vs.
        # eigenvalue
        num2ret = N if N else self.k
        s = -sqrt(self.L) if self.cor else self.s
        S = diagsvd(s[:num2ret], len(self.Q), num2ret)
        self.G = _mul(self.D_c, self.Q.T, S)  # important! note the transpose on Q
        return self.G

    def cos_r(self, N=None):
        """Return the squared cosines for each row."""
        if not hasattr(self, 'F') or self.F.shape[1] < self.rank:
            self.fs_r(N=self.rank)  # generate F
        self.dr = norm(self.F, axis=1)**2
        # cheaper than diag(self.F.dot(self.F.T))?
        return apply_along_axis(lambda _: _/self.dr, 0, self.F[:, :N]**2)

    def cos_c(self, N=None):
        """Return the squared cosines for each column."""
        if not hasattr(self, 'G') or self.G.shape[1] < self.rank:
            self.fs_c(N=self.rank)  # generate G
        self.dc = norm(self.G, axis=1)**2
        # cheaper than diag(self.G.dot(self.G.T))?
        return apply_along_axis(lambda _: _/self.dc, 0, self.G[:, :N]**2)

    def cont_r(self, percent=0.9, N=None):
        """Return the contribution of each row."""
        if not hasattr(self, 'F'):
            self.fs_r(N=self.rank)  # generate F
        return apply_along_axis(lambda _: _/self.L[:N], 1,
                                apply_along_axis(lambda _: _*self.r, 0,
                                                 self.F[:, :N]**2))

    def cont_c(self, percent=0.9, N=None):
        """Return the contribution of each column."""
        # NOTE(review): original carried "bug? check axis number 0 vs 1 here";
        # the axes mirror cont_r, but this has not been independently verified.
        if not hasattr(self, 'G'):
            self.fs_c(N=self.rank)  # generate G
        return apply_along_axis(lambda _: _/self.L[:N], 1,
                                apply_along_axis(lambda _: _*self.c, 0,
                                                 self.G[:, :N]**2))

    def expl_var(self, greenacre=True, N=None):
        """Return proportion of explained inertia (variance) for each factor.

        'greenacre': Perform Greenacre correction (default: True)
        'N': number of leading factors to report (default: all)
        """
        if greenacre:
            greenacre_inertia = (self.K / (self.K - 1.) * (sum(self.s**4)
                                 - (self.J - self.K) / self.K**2.))
            return (self._benzecri() / greenacre_inertia)[:N]
        else:
            E = self._benzecri() if self.cor else self.s**2
            return (E / sum(E))[:N]

    def fs_r_sup(self, DF, N=None):
        """Find the supplementary row factor scores.

        'N': The number of factors to retain.
             If the rank is less than N, N is ignored.
        """
        # The original docstring and error message referred to 'ncols'/'cols',
        # copy-pasted from process_df(); the parameter here is N. The original
        # also rejected numpy int64 N, unlike fs_r/fs_c.
        if not hasattr(self, 'G'):
            self.fs_c(N=self.rank)  # generate G

        if N and (not isinstance(N, (int, int64)) or N <= 0):
            raise ValueError("N should be a positive integer.")
        s = -sqrt(self.E) if self.cor else self.s
        N = min(N, self.rank) if N else self.rank
        S_inv = diagsvd(-1/s[:N], len(self.G.T), N)
        # project the row profiles (rows normalized to sum 1) onto the axes
        return _mul(DF.div(DF.sum(axis=1), axis=0), self.G, S_inv)[:, :N]

    def fs_c_sup(self, DF, N=None):
        """Find the supplementary column factor scores.

        'N': The number of factors to retain.
             If the rank is less than N, N is ignored.
        """
        if not hasattr(self, 'F'):
            self.fs_r(N=self.rank)  # generate F

        if N and (not isinstance(N, (int, int64)) or N <= 0):
            raise ValueError("N should be a positive integer.")
        s = -sqrt(self.E) if self.cor else self.s
        N = min(N, self.rank) if N else self.rank
        S_inv = diagsvd(-1/s[:N], len(self.F.T), N)
        # project the column profiles onto the axes
        return _mul((DF/DF.sum()).T, self.F, S_inv)[:, :N]
class TestMca(TestCase):
    """Validate MCA against worked examples from Abdi et al. papers."""

    def test_abdi_valentin(self):
        # Data taken from http://www.utdallas.edu/~herve/Abdi-MCA2007-pretty.pdf
        # Multiple Correspondence Analysis
        # (Hervé Abdi & Dominique Valentin, 2007)
        # See in particular Table 2,3,4.

        # first we check the eigenvalues and factor scores with Benzecri
        # correction
        df = read_table('data/burgundies.csv', skiprows=1, sep=',', index_col=0)
        mca_df = MCA(df.drop('oak_type', axis=1), ncols=10)
        assert_allclose([0.7004, 0.0123, 0.0003], mca_df.E[:3], atol=1e-4)
        true_fs_row = [[0.86, 0.08], [-0.71, -0.16], [-0.92, 0.08],
                       [-0.86, 0.08], [0.92, 0.08], [0.71, -0.16]]
        assert_allclose(true_fs_row, mca_df.fs_r(N=2), atol=1e-2)
        true_fs_col = [[.90, -.90, -.97, .00, .97, -.90, .90, .90, -.90, -.9,
                        .90, -.97, .00, .97, -.90, .90, .28, -.28, -.90, .90,
                        -.90, .9, .90, -.90],
                       [.00, .00, .18, -.35, .18, .00, .00, .00, .0, .00, .00,
                        .18, -.35, .18, .00, .00, .0, .00, .00, .00, .00, .00,
                        .00, .00]]
        assert_allclose(array(true_fs_col).T[:-2], mca_df.fs_c(N=2), atol=1e-2)

        true_cont_r = [[177, 121, 202, 177, 202, 121],
                       [83, 333, 83, 83, 83, 333]]
        assert_allclose(true_cont_r, 1000*mca_df.cont_r(N=2).T, atol=1)

        true_cont_c = [[58, 58, 44, 0, 44, 58, 58, 58, 58, 58, 58, 44, 0, 44,
                        58, 58, 6, 6, 58, 58, 58, 58],
                       [0, 0, 83, 333, 83, 0, 0, 0, 0, 0, 0, 83, 333, 83, 0,
                        0, 0, 0, 0, 0, 0, 0]]
        assert_allclose(true_cont_c, 1000*mca_df.cont_c(N=2).T, atol=1)

        # No test for cos_c/cos_r against this source: the 2007 tables do not
        # agree with the f**2/d**2 definition of the squared cosine (table 3
        # divides by the uncorrected inertia I = 1.2 instead). Our formula
        # does comport with the figures in (Abdi & Bera, 2014), tested next.

        # ... then without Benzecri correction
        mca_df_i = MCA(df.drop('oak_type', axis=1), ncols=10, benzecri=False)
        assert_allclose([0.8532, 0.2, 0.1151, 0.0317],
                        (mca_df_i.s**2)[:4], atol=1e-4)

        # check percentage of explained variance both with and without Benzecri
        # and Greenacre corrections
        true_expl_var_i = [.7110, .1667, .0959, .0264, 0., 0.]
        true_expl_var_z = [.9823, .0173, .0004, 0., 0., 0.]
        true_expl_var_c = [.9519, .0168, .0004, 0., 0., 0.]
        assert_allclose(mca_df_i.expl_var(False), true_expl_var_i, atol=1e-4)
        assert_allclose(mca_df_i.expl_var(), true_expl_var_c, atol=1e-4)
        assert_allclose(mca_df.expl_var(False), true_expl_var_z, atol=1e-4)
        assert_allclose(mca_df.expl_var(), true_expl_var_c, atol=1e-4)

    def test_abdi_bera(self):
        # Data taken from www.utdallas.edu/~herve/abdi-AB2014_CA.pdf
        # Correspondence Analysis, (Herve Abdi & Michel Bera, 2014)
        # Springer Encyclopedia of Social Networks and Mining.
        df = read_table('data/music_color.csv', skiprows=0, index_col=0, sep=',')
        mca_df = MCA(df, benzecri=False)

        # Table 1, page 13
        assert_allclose(mca_df.r, [.121, .091, .126, .116, .096, .066, .071,
                                   .146, .061, .106], atol=1e-3)
        assert_allclose(mca_df.c, [.11, .11, .11, .11, .11, .11, .11, .11, .11],
                        atol=1e-2)

        # Table 2, page 14
        assert_allclose(mca_df.fs_r(N=2), [[-0.026, 0.299], [-0.314, 0.232],
                                           [-0.348, 0.202], [-0.044, -0.490],
                                           [-0.082, -0.206], [-0.619, 0.475],
                                           [-0.328, 0.057], [1.195, 0.315],
                                           [-0.57, 0.3], [0.113, -0.997]],
                        atol=1e-3)
        assert_allclose(mca_df.cont_r(N=2)*1000, [[0, 56], [31, 25], [53, 27],
                                                  [1, 144], [2, 21], [87, 77],
                                                  [26, 1], [726, 75], [68, 28],
                                                  [5, 545]],
                        atol=1)
        assert_allclose(mca_df.cos_r(N=2)*1000,
                        [[3, 410], [295, 161], [267, 89], [5, 583], [13, 81],
                         [505, 298], [77, 2], [929, 65], [371, 103], [12, 973]],
                        atol=1)

        # Table 3, page 17
        assert_allclose(mca_df.fs_c(N=2),
                        [[-0.541, 0.386], [-.257, .275], [-.291, -.309],
                         [.991, .397], [-.122, -.637], [-.236, .326],
                         [.954, -.089], [-.427, .408], [-.072, -.757]],
                        atol=1e-3)
        assert_allclose(mca_df.cont_c(N=2)*1000,
                        [[113, 86], [25, 44], [33, 55],
                         [379, 91], [6, 234], [22, 61],
                         [351, 5], [70, 96], [2, 330]],
                        atol=1)
        assert_allclose(mca_df.cos_c(N=2)*1000,
                        [[454, 232], [105, 121], [142, 161], [822, 132],
                         [26, 709], [78, 149], [962, 8], [271, 249], [7, 759]],
                        atol=1)

        assert_allclose(mca_df.L[:2], [.287, .192], atol=2e-3)
        self.assertAlmostEqual(mca_df.inertia, 0.746, 3)

    def test_abdi_williams(self):
        # Data taken from www.utdallas.edu/~herve/abdi-CorrespondenceAnaysis2010-pretty.pdf
        # Correspondence Analysis (Hervé Abdi & Lynne J. Williams, 2010)
        # SAGE Encyclopedia of Research Design. Table 4, page 16.
        # (The previous comment credited "Michel Bera, 2010" — a copy-paste
        # slip from the 2014 reference above; the 2010 chapter is Abdi &
        # Williams, hence this test's name.)

        df = read_table('data/french_writers.csv', skiprows=0, index_col=0,
                        sep=',')
        mca_df = MCA(df, benzecri=False)

        assert_allclose(mca_df.c, [.2973, .5642, .1385], atol=1e-4)
        assert_allclose(mca_df.r, [.0189, .1393, .2522, .3966, .1094, .0835],
                        atol=1e-4)

        true_fs_row = [[0.2398, 0.1895, 0.1033, -0.0918, -0.2243, 0.0475],
                       [0.0741, 0.1071, -0.0297, 0.0017, 0.0631, -0.1963]]
        assert_allclose(mca_df.fs_r(N=2).T, true_fs_row, atol=1e-4)
        assert_allclose(mca_df.L, [.0178, .0056], atol=1e-4)

        true_fs_col = [[-0.0489, 0.0973, -0.2914], [.1115, -0.0367, -0.0901]]
        assert_allclose(-mca_df.fs_c(N=2).T, true_fs_col, atol=1e-4)

        true_cont_r = [[0.0611, 0.2807, 0.1511, 0.1876, 0.3089, 0.0106],
                       [0.0186, 0.2864, 0.0399, 0.0002, 0.0781, 0.5767]]
        assert_allclose(mca_df.cont_r(N=2).T, true_cont_r, atol=1e-4)

        true_cos_r = [[0.9128, 0.7579, 0.9236, 0.9997, 0.9266, 0.0554],
                      [0.0872, 0.2421, 0.0764, 0.0003, 0.0734, 0.9446]]
        assert_allclose(mca_df.cos_r(N=2).T, true_cos_r, atol=1e-4)

        assert_allclose(mca_df.cont_c(N=2).T, [[0.0399, 0.2999, 0.6601],
                                               [0.6628, 0.1359, 0.2014]],
                        atol=1e-4)
        assert_allclose(mca_df.cos_c(N=2).T, [[0.1614, 0.8758, 0.9128],
                                              [0.8386, 0.1242, 0.0872]],
                        atol=1e-4)

        assert_allclose(mca_df.dc, [0.0148, 0.0108, 0.0930], atol=1e-4)
        assert_allclose(mca_df.dr,
                        [0.0630, 0.0474, 0.0116, 0.0084, 0.0543, 0.0408],
                        atol=1e-4)

        abdi = DataFrame([216, 139, 26]).T
        assert_allclose(mca_df.fs_r_sup(abdi, 2), [[-0.0908, 0.5852]],
                        atol=1e-4)

        supp = read_table('data/french_writers_supp.csv',
                          skiprows=0, index_col=0, sep=',')

        true_fs_col_sup = [[-0.0596, -0.1991, -0.4695, -0.4008],
                           [0.2318, 0.2082, -0.2976, -0.4740]]
        assert_allclose(mca_df.fs_c_sup(supp).T, true_fs_col_sup, atol=1e-3)

    def test_invalid_inputs(self):
        df = read_table('data/burgundies.csv', skiprows=1, sep=',')
        self.assertRaises(ValueError, MCA, df.iloc[:, 2:], ncols=0)
        self.assertRaises(ValueError, MCA, df.iloc[:, 2:], ncols='')

    def test_sparse(self):
        # Seed the generator: the original unseeded randint made this test
        # non-deterministic from run to run.
        from numpy.random import RandomState
        df = DataFrame(RandomState(0).randint(0, 2, (100, 100)))
        mca1 = MCA(df, sparse=False)
        mca2 = MCA(df, sparse=True)
        # svds returns one fewer singular value than full svd.
        assert_allclose(mca1.s[:-1], mca2.s, atol=1e-12)
        # Singular vectors are only defined up to a sign flip.
        for row1, row2 in zip(mca1.P.T, mca2.P.T):
            assert_allclose(row1,
                            row2 if sign(row1[0])*sign(row2[0]) > 0 else -row2,
                            atol=1e-12)
        for row1, row2 in zip(mca1.Q, mca2.Q):
            assert_allclose(row1,
                            row2 if sign(row1[0])*sign(row2[0]) > 0 else -row2,
                            atol=1e-12)


if __name__ == '__main__':
    main()