- and remove all html tags of type &nbsp and return a cleaned Pandas Series.
+ Removes all html tags of the type `<.*?>` such as <html>, <p>,
+ <div class="hello">,
+ and removes all html tags of type &nbsp and returns a cleaned Pandas Series.
+
+ Parameters
+ ----------
+ s: Pandas Series
+
+ Returns
+ -------
+ Pandas Series
Examples
--------
+ >>> import texthero as hero
+ >>> import pandas as pd
>>> s = pd.Series("
Title
")
- >>> remove_html_tags(s)
+ >>> hero.remove_html_tags(s)
0 Title
dtype: object
@@ -589,14 +801,22 @@ def remove_html_tags(s: pd.Series) -> pd.Series:
def tokenize(s: pd.Series) -> pd.Series:
"""
- Tokenize each row of the given Series.
+ Tokenizes each row of the given Series.
- Tokenize each row of the given Pandas Series and return a Pandas Series where each row contains a list of tokens.
+ Tokenizes each row of the given Pandas Series and returns a Pandas Series where each row contains a list of tokens.
Algorithm: add a space around every punctuation symbol, except
when the symbol is between two alphanumeric characters, and then split.
+ Parameters
+ ----------
+ s: Pandas Series
+
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
>>> import texthero as hero
@@ -615,10 +835,12 @@ def tokenize(s: pd.Series) -> pd.Series:
return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
-def tokenize_with_phrases(s: pd.Series, min_count: int = 5, threshold: int = 10):
+def tokenize_with_phrases(
+ s: pd.Series, min_count: int = 5, threshold: int = 10
+) -> pd.Series:
r"""Tokenize and group up collocations words
- Tokenize the given pandas Series and group up bigrams where each tokens has at least min_count term frequrncy and where the threshold is larger than the underline formula.
+ Tokenizes the given Pandas Series and groups up bigrams in which each token has a term frequency of at least min_count and the score given by the formula below is larger than threshold.
:math:`\frac{(bigram\_a\_b\_count - min\_count)* len\_vocab }{ (word\_a\_count * word\_b\_count)}`.
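+
+ For illustration (hypothetical counts, not taken from the library): if the bigram "New York"
+ occurs 2 times, "New" and "York" each occur 2 times, the vocabulary has 10 tokens and
+ min_count=1, the score is (2 - 1) * 10 / (2 * 2) = 2.5, so the bigram is grouped up
+ whenever threshold is at most 2.5.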
@@ -626,15 +848,21 @@ def tokenize_with_phrases(s: pd.Series, min_count: int = 5, threshold: int = 10)
Parameters
----------
s : Pandas Series
+
min_count : Int, optional. Default is 5.
ignore tokens with frequency less than this
+
threshold : Int, optional. Default is 10.
ignore tokens with a score under that threshold
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
- >>> import pandas as pd
>>> import texthero as hero
+ >>> import pandas as pd
>>> s = pd.Series(["New York is a beautiful city", "Look: New York!"])
>>> hero.tokenize_with_phrases(s, min_count=1, threshold=1)
0 [New_York, is, a, beautiful, city]
@@ -661,6 +889,17 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:
`replace_urls` replaces all urls in the given Pandas Series with the given symbol.
+ Parameters
+ ----------
+ s: Pandas Series
+
+ symbol: String
+ The symbol that the URLs will be replaced with.
+
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
>>> import texthero as hero
@@ -682,9 +921,17 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:
def remove_urls(s: pd.Series) -> pd.Series:
- r"""Remove all urls from a given Pandas Series.
+ r"""Removes all urls from a given Pandas Series.
+
+ Removes all urls and replaces them with a single empty space.
+
+ Parameters
+ ----------
+ s: Pandas Series
- `remove_urls` remove any urls and replace it with a single empty space.
+ Returns
+ -------
+ Pandas Series
Examples
--------
@@ -712,9 +959,14 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:
Parameters
----------
s : Pandas Series
+
symbol : str
Symbol to replace the tags with
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
>>> import texthero as hero
@@ -735,6 +987,14 @@ def remove_tags(s: pd.Series) -> pd.Series:
A tag is a string formed by @ concatenated with a sequence of characters and digits. Example: @texthero123. Tags are replaced by an empty space ` `.
+ Parameters
+ ----------
+ s: Pandas Series
+
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
>>> import texthero as hero
@@ -759,8 +1019,13 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
Parameters
----------
s : Pandas Series
+
symbol : str
Symbol to replace the hashtags with
+
+ Returns
+ -------
+ Pandas Series
Examples
--------
@@ -781,6 +1046,14 @@ def remove_hashtags(s: pd.Series) -> pd.Series:
A hashtag is a string formed by # concatenated with a sequence of characters, digits and underscores. Example: #texthero_123.
+ Parameters
+ ----------
+ s: Pandas Series
+
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
>>> import texthero as hero
diff --git a/texthero/representation.py b/texthero/representation.py
index 9c27db97..14b64d50 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -43,14 +43,20 @@ def representation_series_to_flat_series(
----------
s : Sparse Pandas Series or Pandas Series
The multiindexed Pandas Series to flatten.
+
index : Pandas Index, optional, default to None
The index the flattened Series should have.
+
fill_missing_with : Any, default to np.nan
Value to fill the NaNs (missing values) with. This _does not_ mean
that existing values that are np.nan are replaced, but rather that
features that are not present in one document but present in others
are filled with fill_missing_with. See example below.
+ Returns
+ -------
+ Pandas Series
+
Examples
--------
@@ -101,22 +107,49 @@ def representation_series_to_flat_series(
def term_frequency(
- s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
-):
+ s: pd.Series,
+ max_features: Optional[int] = None,
+ return_feature_names=False,
+ min_df=1,
+ max_df=1.0,
+ binary=False,
+) -> pd.Series:
"""
- Represent a text-based Pandas Series using term_frequency.
+ Represents a text-based Pandas Series using term_frequency.
The input Series should already be tokenized. If not, it will
be tokenized before term_frequency is calculated.
+
Parameters
----------
s : Pandas Series
- max_features : int, optional
- Maximum number of features to keep.
- return_features_names : Boolean, False by Default
+
+ max_features : int, optional, default to None.
+ Maximum number of features to keep. Will keep all features if set to None.
+
+ return_feature_names : Boolean, default to False.
If True, return a tuple (*term_frequency_series*, *features_names*)
+ max_df : float in range [0.0, 1.0] or int, default=1.0
+ Ignore terms that have a document frequency (number of documents they appear in)
+ strictly higher than the given threshold.
+ If float, the parameter represents a proportion of documents; if integer,
+ absolute counts.
+
+ min_df : float in range [0.0, 1.0] or int, default=1
+ When building the vocabulary ignore terms that have a document
+ frequency (number of documents they appear in) strictly
+ lower than the given threshold.
+ If float, the parameter represents a proportion of documents; if integer,
+ absolute counts.
+
+ binary : bool, default=False
+ If True, all non zero counts are set to 1.
+
+ Returns
+ -------
+ Pandas Series
Examples
--------
@@ -130,7 +163,7 @@ def term_frequency(
dtype: object
To return the features_names:
-
+
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"])
@@ -149,7 +182,12 @@ def term_frequency(
s = preprocessing.tokenize(s)
tf = CountVectorizer(
- max_features=max_features, tokenizer=lambda x: x, preprocessor=lambda x: x,
+ max_features=max_features,
+ tokenizer=lambda x: x,
+ preprocessor=lambda x: x,
+ min_df=min_df,
+ max_df=max_df,
+ binary=binary,
)
s = pd.Series(tf.fit_transform(s).toarray().tolist(), index=s.index)
@@ -192,17 +230,26 @@ def tfidf(
Parameters
----------
s : Pandas Series (tokenized)
+
max_features : int, optional, default to None.
If not None, only the max_features most frequent tokens are used.
+
min_df : int, optional, default to 1.
- When building the vocabulary, ignore terms that have a document
+ When building the vocabulary, ignore terms that have a document
frequency (number of documents a term appears in) strictly lower than the given threshold.
+
max_df : int or double, optional, default to 1.0
When building the vocabulary, ignore terms that have a document
- frequency (number of documents a term appears in) strictly higher than the given threshold. This arguments basically permits to remove corpus-specific stop words. When the argument is a float [0.0, 1.0], the parameter represents a proportion of documents.
+ frequency (number of documents a term appears in) strictly higher than the given threshold.
+ This argument essentially allows removing corpus-specific stop words.
+ When the argument is a float [0.0, 1.0], the parameter represents a proportion of documents.
+
return_feature_names: Boolean, optional, default to False
Whether to return the feature (i.e. word) names with the output.
+ Returns
+ -------
+ Pandas Series
Examples
--------
@@ -210,11 +257,16 @@ def tfidf(
>>> import pandas as pd
>>> s = pd.Series(["Hi Bye", "Test Bye Bye"])
>>> s = hero.tokenize(s)
- >>> hero.tfidf(s, return_feature_names=True)
+ >>> hero.tfidf(s, return_feature_names=True) # doctest: +SKIP
(document
0 [1.0, 1.4054651081081644, 0.0]
1 [2.0, 0.0, 1.4054651081081644]
dtype: object, ['Bye', 'Hi', 'Test'])
+
+ See Also
+ --------
+ `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_
+
"""
# Check if input is tokenized. Else, print warning and tokenize.
@@ -262,34 +314,125 @@ def tfidf(
"""
-def pca(s, n_components=2):
+def pca(s: pd.Series, n_components=2, random_state=None) -> pd.Series:
"""
Perform principal component analysis on the given Pandas Series.
- In general, *pca* should be called after the text has already been represented.
+ Principal Component Analysis (PCA) is a statistical method that is used
+ to reveal where the variance in a dataset comes from. For textual data,
+ one could for example first represent a Series of documents using
+ :meth:`texthero.representation.tfidf` to get a vector representation
+ of each document. Then, PCA can generate new vectors from the tfidf representation
+ that showcase the differences among the documents most strongly in fewer dimensions.
+
+ For example, the tfidf vectors will have length 100 if hero.tfidf was called
+ on a large corpus with max_features=100. Visualizing 100 dimensions is hard!
+ Using PCA with n_components=3, every document will now get a vector of
+ length 3, and the vectors will be chosen so that the document differences
+ are easily visible. The corpus can now be visualized in 3D and we can
+ get a good first view of the data!
+
+ In general, *pca* should be called after the text has already been represented in matrix form.
Parameters
----------
s : Pandas Series
+
n_components : Int. Default is 2.
- Number of components to keep. If n_components is not set or None, all components are kept.
+ Number of components to keep (dimensionality of output vectors).
+ If n_components is not set or None, all components are kept.
+
+ random_state : int, RandomState instance, default=None
+ Pass an int for reproducible results across multiple function calls.
+
+
+ Returns
+ -------
+ Pandas Series with the vector calculated by PCA for the document in every cell.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
- >>> s = pd.Series(["Sentence one", "Sentence two"])
-
+ >>> s = pd.Series(["Football is great", "Hi, I'm Texthero, who are you? Tell me!"])
+ >>> s = hero.clean(s)
+ >>> s = hero.tokenize(s)
+ >>> s = hero.tfidf(s)
+ >>> hero.pca(s, random_state=42) # doctest: +SKIP
+ document
+ 0 [1.5713577608669735, 1.1102230246251565e-16]
+ 1 [-1.5713577608669729, 1.1102230246251568e-16]
+ dtype: object
+
+ See also
+ --------
+ `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_
+
+ :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
+
"""
- pca = PCA(n_components=n_components)
+ pca = PCA(n_components=n_components, random_state=random_state)
return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index)
-def nmf(s, n_components=2):
+def nmf(s, n_components=2) -> pd.Series:
"""
- Perform non-negative matrix factorization.
+ Performs non-negative matrix factorization.
+
+ Non-Negative Matrix Factorization (NMF) is often used in
+ natural language processing to find clusters of similar
+ texts (e.g. some texts in a corpus might be about sports
+ and some about music, so they will differ in the usage
+ of technical terms; see the example below).
+
+ Given a document-term matrix (so in
+ texthero usually a Series after applying :meth:`texthero.representation.tfidf`
+ or some other first representation function that assigns a scalar (a weight)
+ to each word), NMF will find n_components many topics (clusters)
+ and calculate a vector for each document that places it
+ correctly among the topics.
+
+
+ Parameters
+ ----------
+ s : Pandas Series
+
+ n_components : Int. Default is 2.
+ Number of components to keep (dimensionality of output vectors).
+ If n_components is not set or None, all components are kept.
+
+ Returns
+ -------
+ Pandas Series with the vector calculated by NMF for the document in every cell.
+
+ Examples
+ --------
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> doc1 = "Football, Sports, Soccer"
+ >>> doc2 = "Music, Violin, Orchestra"
+ >>> doc3 = "Football, Music"
+ >>> s = pd.Series([doc1, doc2, doc3])
+ >>> s = hero.clean(s)
+ >>> s = hero.tokenize(s)
+ >>> s = hero.term_frequency(s)
+ >>> hero.nmf(s) # doctest: +SKIP
+ 0 [0.9080190347553924, 0.0]
+ 1 [0.0, 0.771931061231598]
+ 2 [0.3725409073202516, 0.31656880119331093]
+ dtype: object
+ >>> # As we can see, the third document, which
+ >>> # is a mix of sports and music, is placed
+ >>> # between the two axes (the topics) while
+ >>> # the other documents are placed right on
+ >>> # one topic axis each.
+
+ See also
+ --------
+ `NMF on Wikipedia <https://en.wikipedia.org/wiki/Non-negative_matrix_factorization>`_
+
+ :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
-
"""
nmf = NMF(n_components=n_components, init="random", random_state=0)
return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index)
@@ -311,16 +454,145 @@ def tsne(
method="barnes_hut",
angle=0.5,
n_jobs=-1,
-):
+) -> pd.Series:
"""
- Perform TSNE on the given pandas series.
+ Performs t-SNE on the given Pandas Series.
+
+ t-distributed Stochastic Neighbor Embedding (t-SNE) is
+ a machine learning algorithm used to visualize high-dimensional data in fewer
+ dimensions. In natural language processing, the high-dimensional
+ data is usually a document-term matrix
+ (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf`
+ or some other first representation function that assigns a scalar (a weight)
+ to each word) that is hard to visualize as there
+ might be many terms. With t-SNE, every document
+ gets a new, low-dimensional (n_components entries)
+ vector in such a way that the differences / similarities between
+ documents are preserved.
+
Parameters
----------
s : Pandas Series
+
n_components : int, default is 2.
- Number of components to keep. If n_components is not set or None, all components are kept.
- perplexity : int, default is 30.0
+ Number of components to keep (dimensionality of output vectors).
+ If n_components is not set or None, all components are kept.
+
+ perplexity : float, optional (default: 30)
+ The perplexity is related to the number of nearest neighbors that
+ is used in other manifold learning algorithms. Larger datasets
+ usually require a larger perplexity. Consider selecting a value
+ between 5 and 50. Different values can result in significantly
+ different results.
+
+ early_exaggeration : float, optional (default: 12.0)
+ Controls how tight natural clusters in the original space are in
+ the embedded space and how much space will be between them. For
+ larger values, the space between natural clusters will be larger
+ in the embedded space. Again, the choice of this parameter is not
+ very critical. If the cost function increases during initial
+ optimization, the early exaggeration factor or the learning rate
+ might be too high.
+
+ learning_rate : float, optional (default: 200.0)
+ The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
+ the learning rate is too high, the data may look like a 'ball' with any
+ point approximately equidistant from its nearest neighbours. If the
+ learning rate is too low, most points may look compressed in a dense
+ cloud with few outliers. If the cost function gets stuck in a bad local
+ minimum increasing the learning rate may help.
+
+ n_iter : int, optional (default: 1000)
+ Maximum number of iterations for the optimization. Should be at
+ least 250.
+
+ n_iter_without_progress : int, optional (default: 300)
+ Maximum number of iterations without progress before we abort the
+ optimization, used after 250 initial iterations with early
+ exaggeration. Note that progress is only checked every 50 iterations so
+ this value is rounded to the next multiple of 50.
+
+ min_grad_norm : float, optional (default: 1e-7)
+ If the gradient norm is below this threshold, the optimization will
+ be stopped.
+
+ metric : string or callable, optional
+ The metric to use when calculating distance between instances in a
+ feature array. If metric is a string, it must be one of the options
+ allowed by scipy.spatial.distance.pdist for its metric parameter.
+
+ Alternatively, if metric is a callable function, it is called on each
+ pair of instances (rows) and the resulting value recorded. The callable
+ should take two arrays from X as input and return a value indicating
+ the distance between them. The default is "euclidean" which is
+ interpreted as squared euclidean distance.
+
+ init : string or numpy array, optional (default: "random")
+ Initialization of embedding. Possible options are 'random', 'pca',
+ and a numpy array of shape (n_samples, n_components).
+ PCA initialization cannot be used with precomputed distances and is
+ usually more globally stable than random initialization.
+
+ verbose : int, optional (default: 0)
+ Verbosity level.
+
+ random_state : int, RandomState instance, default=None
+ Determines the random number generator. Pass an int for reproducible
+ results across multiple function calls. Note that different
+ initializations might result in different local minima of the cost
+ function.
+
+ method : string (default: 'barnes_hut')
+ By default the gradient calculation algorithm uses Barnes-Hut
+ approximation running in O(NlogN) time. method='exact'
+ will run on the slower, but exact, algorithm in O(N^2) time. The
+ exact algorithm should be used when nearest-neighbor errors need
+ to be better than 3%. However, the exact method cannot scale to
+ millions of examples.
+
+ angle : float (default: 0.5)
+ Only used if method='barnes_hut'
+ This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
+ 'angle' is the angular size of a distant
+ node as measured from a point. If this size is below 'angle' then it is
+ used as a summary node of all points contained within it.
+ This method is not very sensitive to changes in this parameter
+ in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
+ computation time and angle greater than 0.8 has quickly increasing error.
+
+ n_jobs : int or None, optional (default=-1)
+ The number of parallel jobs to run for neighbors search. This parameter
+ has no impact when ``metric="precomputed"`` or
+ (``metric="euclidean"`` and ``method="exact"``).
+ ``-1`` means using all processors.
+
+ Returns
+ -------
+ Pandas Series with the vector calculated by t-SNE for the document in every cell.
+
+ Examples
+ --------
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> doc1 = "Football, Sports, Soccer"
+ >>> doc2 = "Music, Violin, Orchestra"
+ >>> doc3 = "Football, Music"
+ >>> s = pd.Series([doc1, doc2, doc3])
+ >>> s = hero.clean(s)
+ >>> s = hero.tokenize(s)
+ >>> s = hero.term_frequency(s)
+ >>> hero.tsne(s, random_state=42) # doctest: +SKIP
+ 0 [-18.833383560180664, -276.800537109375]
+ 1 [-210.60179138183594, 143.00535583496094]
+ 2 [-478.27984619140625, -232.97410583496094]
+ dtype: object
+
+ See also
+ --------
+ `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_
+
+ :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
"""
tsne = TSNE(
@@ -354,17 +626,112 @@ def kmeans(
n_init=10,
max_iter=300,
tol=0.0001,
- precompute_distances="auto",
verbose=0,
random_state=None,
copy_x=True,
n_jobs=-1,
algorithm="auto",
-):
+) -> pd.Series:
"""
- Perform K-means clustering algorithm.
+ Performs the K-means clustering algorithm.
+
+ K-means clustering is used in natural language processing
+ to separate texts into k clusters (groups)
+ (e.g. some texts in a corpus might be about sports
+ and some about music, so they will differ in the usage
+ of technical terms; the K-means algorithm uses this
+ to separate them into two clusters).
+
+ Given a document-term matrix (so in
+ texthero usually a Series after applying :meth:`texthero.representation.tfidf`
+ or some other first representation function that assigns a scalar (a weight)
+ to each word), K-means will find k topics (clusters)
+ and assign a topic to each document.
+
+ Parameters
+ ----------
+ s: Pandas Series
+
+ n_clusters: Int, default to 5.
+ The number of clusters to separate the data into.
+
+ init : {'k-means++', 'random', ndarray, callable}, default='k-means++'
+ Method for initialization:
+
+ 'k-means++' : selects initial cluster centers for k-means
+ clustering in a smart way to speed up convergence. See section
+ Notes in k_init for more details.
+
+ 'random': choose `n_clusters` observations (rows) at random from data
+ for the initial centroids.
+
+ If an ndarray is passed, it should be of shape (n_clusters, n_features)
+ and gives the initial centers.
+
+ If a callable is passed, it should take arguments X, n_clusters and a
+ random state and return an initialization.
+
+ n_init : int, default=10
+ Number of time the k-means algorithm will be run with different
+ centroid seeds. The final results will be the best output of
+ n_init consecutive runs in terms of inertia.
+
+ max_iter : int, default=300
+ Maximum number of iterations of the k-means algorithm for a
+ single run.
+
+ tol : float, default=1e-4
+ Relative tolerance with regards to Frobenius norm of the difference
+ in the cluster centers of two consecutive iterations to declare
+ convergence.
+ It's not advised to set `tol=0` since convergence might never be
+ declared due to rounding errors. Use a very small number instead.
+
+ verbose : int, default=0
+ Verbosity mode.
+
+ random_state : int, RandomState instance, default=None
+ Determines random number generation for centroid initialization. Use
+ an int to make the randomness deterministic.
+
+ algorithm : {"auto", "full", "elkan"}, default="auto"
+ K-means algorithm to use. The classical EM-style algorithm is "full".
+ The "elkan" variation is more efficient on data with well-defined
+ clusters, by using the triangle inequality. However it's more memory
+ intensive.
+
+ Returns
+ -------
+ Pandas Series with the cluster the document was assigned to in each cell.
+
+ Examples
+ --------
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> doc1 = "Football, Sports, Soccer"
+ >>> doc2 = "music, violin, orchestra"
+ >>> doc3 = "football, fun, sports"
+ >>> doc4 = "music, fun, guitar"
+ >>> s = pd.Series([doc1, doc2, doc3, doc4])
+ >>> s = hero.clean(s)
+ >>> s = hero.tokenize(s)
+ >>> s = hero.term_frequency(s)
+ >>> hero.kmeans(s, n_clusters=2, random_state=42)
+ 0 1
+ 1 0
+ 2 1
+ 3 0
+ dtype: category
+ Categories (2, int64): [0, 1]
+ >>> # As we can see, the documents are correctly
+ >>> # separated into topics / clusters by the algorithm.
+
+ See also
+ --------
+ `kmeans on Wikipedia <https://en.wikipedia.org/wiki/K-means_clustering>`_
+
+ :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
- Return a "category" Pandas Series.
"""
vectors = list(s)
kmeans = KMeans(
@@ -373,13 +740,13 @@ def kmeans(
n_init=n_init,
max_iter=max_iter,
tol=tol,
- precompute_distances=precompute_distances,
verbose=verbose,
random_state=random_state,
- copy_x=copy_x,
- n_jobs=n_jobs,
+ # We are using list(s) anyway, so we can safely modify that without changing the input.
+ copy_x=False,
algorithm=algorithm,
).fit(vectors)
+
return pd.Series(kmeans.predict(vectors), index=s.index).astype("category")
@@ -392,12 +759,103 @@ def dbscan(
algorithm="auto",
leaf_size=30,
p=None,
- n_jobs=None,
+ n_jobs=-1,
):
"""
Perform DBSCAN clustering.
- Return a "category" Pandas Series.
+ Density-based spatial clustering of applications with noise (DBSCAN)
+ is used in natural language processing
+ to separate texts into clusters (groups)
+ (e.g. some texts in a corpus might be about sports
+ and some about music, so they will differ in the usage
+ of technical terms; the DBSCAN algorithm uses this
+ to separate them into clusters). It chooses the
+ number of clusters on its own.
+
+ Given a document-term matrix (so in
+ texthero usually a Series after applying :meth:`texthero.representation.tfidf`
+ or some other first representation function that assigns a scalar (a weight)
+ to each word), DBSCAN will find topics (clusters)
+ and assign a topic to each document.
+
+ Parameters
+ ----------
+ s: Pandas Series
+
+ eps : float, default=0.5
+ The maximum distance between two samples for one to be considered
+ as in the neighborhood of the other. This is not a maximum bound
+ on the distances of points within a cluster. This is the most
+ important DBSCAN parameter to choose appropriately for your data set
+ and distance function.
+
+ min_samples : int, default=5
+ The number of samples (or total weight) in a neighborhood for a point
+ to be considered as a core point. This includes the point itself.
+
+ metric : string, or callable, default='euclidean'
+ The metric to use when calculating distance between instances in a
+ feature array. If metric is a string or callable, it must be one of
+ the options allowed by :func:`sklearn.metrics.pairwise_distances` for
+ its metric parameter.
+
+ metric_params : dict, default=None
+ Additional keyword arguments for the metric function.
+
+ algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
+ The algorithm to be used by the NearestNeighbors module
+ to compute pointwise distances and find nearest neighbors.
+ See NearestNeighbors module documentation for details.
+
+ leaf_size : int, default=30
+ Leaf size passed to BallTree or cKDTree. This can affect the speed
+ of the construction and query, as well as the memory required
+ to store the tree. The optimal value depends
+ on the nature of the problem.
+
+ p : float, default=None
+ The power of the Minkowski metric to be used to calculate distance
+ between points.
+
+ n_jobs : int, default=-1
+ The number of parallel jobs to run.
+ ``-1`` means using all processors.
+
+ Returns
+ -------
+ Pandas Series with the cluster the document was assigned to in each cell.
+
+ Examples
+ --------
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> doc1 = "Football, Sports, Soccer"
+ >>> doc2 = "music, violin, orchestra"
+ >>> doc3 = "football, fun, sports"
+ >>> doc4 = "music, enjoy, guitar"
+ >>> s = pd.Series([doc1, doc2, doc3, doc4])
+ >>> s = hero.clean(s)
+ >>> s = hero.tokenize(s)
+ >>> s = hero.tfidf(s)
+ >>> hero.dbscan(s, min_samples=1, eps=4)
+ document
+ 0 0
+ 1 1
+ 2 0
+ 3 1
+ dtype: category
+ Categories (2, int64): [0, 1]
+ >>> # As we can see, the documents are correctly
+ >>> # separated into topics / clusters by the algorithm
+ >>> # and we didn't even have to say how many topics there are!
+
+ See also
+ --------
+ `DBSCAN on Wikipedia <https://en.wikipedia.org/wiki/DBSCAN>`_
+
+ :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
+
"""
return pd.Series(
@@ -428,9 +886,87 @@ def meanshift(
"""
Perform mean shift clustering.
- Return a "category" Pandas Series.
- """
+ Mean shift clustering
+ is used in natural language processing
+ to separate texts into clusters (groups)
+ (e.g. some texts in a corpus might be about sports
+ and some about music, so they will differ in the usage
+ of technical terms; the mean shift algorithm uses this
+ to separate them into clusters). It chooses the
+ number of clusters on its own.
+
+ Given a document-term matrix (so in
+ texthero usually a Series after applying :meth:`texthero.representation.tfidf`
+ or some other first representation function that assigns a scalar (a weight)
+ to each word), mean shift will find topics (clusters)
+ and assign a topic to each document.
+
+ Parameters
+ ----------
+ s: Pandas Series
+
+ bandwidth : float, default=None
+ Bandwidth used in the RBF kernel.
+
+ If not given, the bandwidth is estimated using
+ sklearn.cluster.estimate_bandwidth; see the documentation for that
+ function for hints on scalability.
+
+ seeds : array-like of shape (n_samples, n_features), default=None
+ Seeds used to initialize kernels.
+
+ bin_seeding : bool, default=False
+ If true, initial kernel locations are not locations of all
+ points, but rather the location of the discretized version of
+ points, where points are binned onto a grid whose coarseness
+ corresponds to the bandwidth. Setting this option to True will speed
+ up the algorithm because fewer seeds will be initialized.
+ The default value is False.
+ Ignored if seeds argument is not None.
+
+ min_bin_freq : int, default=1
+ To speed up the algorithm, accept only those bins with at least
+ min_bin_freq points as seeds.
+
+ cluster_all : bool, default=True
+ If true, then all points are clustered, even those orphans that are
+ not within any kernel. Orphans are assigned to the nearest kernel.
+ If false, then orphans are given cluster label -1.
+
+ n_jobs : int, default=None
+ The number of jobs to use for the computation.
+ ``-1`` means using all processors
+
+ max_iter : int, default=300
+ Maximum number of iterations, per seed point before the clustering
+ operation terminates (for that seed point), if it has not converged yet.
+
+ Returns
+ -------
+ Pandas Series with the cluster the document was assigned to in each cell.
+
+ Examples
+ --------
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> s = pd.Series([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]])
+ >>> hero.meanshift(s, bandwidth=2)
+ 0 1
+ 1 1
+ 2 1
+ 3 0
+ 4 0
+ 5 0
+ dtype: category
+ Categories (2, int64): [0, 1]
+
+ See also
+ --------
+ `Mean-Shift on Wikipedia <https://en.wikipedia.org/wiki/Mean_shift>`_
+
+ :meth:`tfidf` to compute TF-IDF and :meth:`term_frequency` to compute term frequency
+
+ """
return pd.Series(
MeanShift(
bandwidth=bandwidth,
diff --git a/texthero/visualization.py b/texthero/visualization.py
index 507b83e5..a72f7921 100644
--- a/texthero/visualization.py
+++ b/texthero/visualization.py
@@ -20,31 +20,86 @@ def scatterplot(
df: pd.DataFrame,
col: str,
color: str = None,
+ hover_name: str = None,
hover_data: [] = None,
title="",
return_figure=False,
):
"""
- Show scatterplot using python plotly scatter.
+ Show a scatterplot of a DataFrame column using Plotly's scatter plot.
+
Parameters
----------
- df
- col
- The name of the column of the DataFrame used for x and y axis.
+ df: DataFrame with a column to be visualized.
+
+ col: str
+ The name of the column of the DataFrame to use for the x and y (and z) axes.
+
+ color: str, default to None.
+ Name of the column to use for coloring (rows with same value get same color).
+
+ title: str, default to "".
+ Title of the plot.
+
+ return_figure: optional, default to False.
+ Function returns the figure if set to True.
+
+ hover_data: List[str], default to None.
+ List of column names to supply data when hovering over a point.
+
+ hover_name: str, default to None
+ Name of the column whose values are shown as the title when hovering over a point.
+
+ Examples
+ --------
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> doc1 = "Football, Sports, Soccer"
+ >>> doc2 = "music, violin, orchestra"
+ >>> doc3 = "football, fun, sports"
+ >>> doc4 = "music, fun, guitar"
+ >>> df = pd.DataFrame([doc1, doc2, doc3, doc4], columns=["texts"])
+ >>> df["texts"] = hero.clean(df["texts"])
+ >>> df["texts"] = hero.tokenize(df["texts"])
+ >>> df["tfidf"] = hero.tfidf(df["texts"])
+ >>> df["topics"] = hero.kmeans(df["tfidf"], n_clusters=2)
+ >>> df["pca"] = hero.pca(df["tfidf"], n_components=3)
+ >>> hero.scatterplot(df, col="pca", color="topics", hover_name="texts") # doctest: +SKIP
"""
- pca0 = df[col].apply(lambda x: x[0])
- pca1 = df[col].apply(lambda x: x[1])
+ x = df[col].apply(lambda x: x[0])
+ y = df[col].apply(lambda x: x[1])
+
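+ # If the vectors in the column have three components, draw a 3D scatterplot;
+ # otherwise fall back to the usual 2D plot.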
+ if len(df[col][0]) == 3:
+ z = df[col].apply(lambda x: x[2])
+ fig = px.scatter_3d(
+ df,
+ x=x,
+ y=y,
+ z=z,
+ color=color,
+ hover_data=hover_data,
+ title=title,
+ hover_name=hover_name,
+ )
+ else:
+ fig = px.scatter(
+ df,
+ x=x,
+ y=y,
+ color=color,
+ hover_data=hover_data,
+ title=title,
+ hover_name=hover_name,
+ )
- fig = px.scatter(
- df, x=pca0, y=pca1, color=color, hover_data=hover_data, title=title
- )
# fig.show(config={'displayModeBar': False})
- fig.show()
if return_figure:
return fig
+ else:
+ fig.show()
"""
@@ -78,26 +133,42 @@ def wordcloud(
Parameters
----------
s : pd.Series
+
font_path : str
- Font path to the font that will be used (OTF or TTF). Defaults to DroidSansMono path on a Linux machine. If you are on another OS or don't have this font, you need to adjust this path.
+ Font path to the font that will be used (OTF or TTF).
+ Defaults to DroidSansMono path on a Linux machine.
+ If you are on another OS or don't have this font, you need to adjust this path.
+
width : int
Width of the canvas.
+
height : int
Height of the canvas.
+
max_words : number (default=200)
The maximum number of words.
+
mask : nd-array or None (default=None)
- When set, gives a binary mask on where to draw words. When set, width and height will be ignored and the shape of mask will be used instead. All white (#FF or #FFFFFF) entries will be considerd "masked out" while other entries will be free to draw on.
+ When set, gives a binary mask on where to draw words.
+ When set, width and height will be ignored and the shape of mask will be used instead.
+ All white (#FF or #FFFFFF) entries will be considered "masked out" while other
+ entries will be free to draw on.
+
contour_width: float (default=0)
If mask is not None and contour_width > 0, draw the mask contour.
+
contour_color: color value (default="PAPAYAWHIP")
Mask contour color.
+
min_font_size : int (default=4)
Smallest font size to use. Will stop when there is no more room in this size.
+
background_color : color value (default="PAPAYAWHIP")
Background color for the word cloud image.
+
max_font_size : int or None (default=None)
Maximum font size for the largest word. If None, height of the image is used.
+
relative_scaling : float (default='auto')
Importance of relative word frequencies for font-size. With
relative_scaling=0, only word-ranks are considered. With
@@ -106,8 +177,10 @@ def wordcloud(
their rank, relative_scaling around .5 often looks good.
If 'auto' it will be set to 0.5 unless repeat is true, in which
case it will be set to 0.
+
colormap : string or matplotlib colormap, default="viridis"
Matplotlib colormap to randomly draw colors from for each word.
+
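+ Examples
+ --------
+ A minimal usage sketch, assuming the function is exposed at the top level as ``hero.wordcloud``
+ and all styling parameters are left at their defaults:
+
+ >>> import texthero as hero
+ >>> import pandas as pd
+ >>> s = pd.Series(["Football is fun and football is popular", "Music is also fun"])
+ >>> hero.wordcloud(s) # doctest: +SKIP
+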
"""
text = s.str.cat(sep=" ")
@@ -162,12 +235,23 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
Return a Pandas Series with the top words as index and their counts as values.
Tokenization: split by space and remove all punctuation that is not between characters.
-
+
Parameters
----------
- normalize :
+ normalize : optional, default to False.
When set to true, return normalized values.
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import texthero as hero
+ >>> s = pd.Series("one two two three three three")
+ >>> hero.top_words(s)
+ three 3
+ two 2
+ one 1
+ dtype: int64
+
"""
# Replace all punctuation that is NOT in-between characters
diff --git a/website/docs/getting-started.md b/website/docs/getting-started.md
index e2b9419c..3f8dbc26 100644
--- a/website/docs/getting-started.md
+++ b/website/docs/getting-started.md
@@ -9,13 +9,13 @@ Texthero is a python package to let you work efficiently and quickly with text d
## Overview
-Given a dataset with structured data, it's easy to have a quick understanding of the underline data. Oppositely, given a dataset composed of text-only, it's harder to have a quick undertanding of the data. Texthero help you there, providing utility functions to quickly **clean the text data**, **map it into a vector space** and gather from it **primary insights**.
+Given a dataset with structured data, it's easy to get a quick understanding of the underlying data. In contrast, given a dataset composed only of text, it's harder to get a quick understanding of the data. Texthero helps you there, providing utility functions to quickly **clean the text data**, **tokenize it**, **map it into a vector space** and gather from it **primary insights**.
##### Pandas integration
One of the main pillars of texthero is that it is designed from the ground up to work with **Pandas Dataframe** and **Series**.
-Most of texthero methods, simply apply transformation to Pandas Series. As a rule of thumb, the first argument and the return ouputs of almost all texthero methods are either a Pandas Series or a Pandas DataFrame.
+Most of texthero's methods simply apply a transformation to a Pandas Series. As a rule of thumb, the first argument and the output of almost all texthero methods are either a Pandas Series or a Pandas DataFrame.
##### Pipeline
@@ -46,7 +46,7 @@ The five different areas are _athletics_, _cricket_, _football_, _rugby_ and _te
The original dataset comes as a zip file with five different folders containing the articles as text data for each topic.
-For convenience, we createdThis script simply read all text data and store it into a Pandas Dataframe.
+For convenience, we created this script that simply reads all the text data and stores it into a Pandas Dataframe.
Import texthero and pandas.
@@ -87,7 +87,7 @@ Recently, Pandas has introduced the pipe function. You can achieve the same resu
df['clean_text'] = df['text'].pipe(hero.clean)
```
-> Tips. When we need to define a new column returned from a function, we prepend the name of the function to the column name. Example: df['tsne_col'] = df['col'].pipe(hero.tsne). This keep the code simple to read and permit to construct complex pipeline.
+> Tip: when we need to define a new column returned from a function, we prepend the name of the function to the column name. Example: df['tsne_col'] = df['col'].pipe(hero.tsne). This keeps the code simple to read and allows us to construct complex pipelines.
The default pipeline for the `clean` method is the following:
@@ -120,46 +120,66 @@ or alternatively
df['clean_text'] = df['clean_text'].pipe(hero.clean, custom_pipeline)
```
+##### Tokenize
+
+Next, we usually want to tokenize the text (_tokenizing_ means splitting sentences/documents into separate words, the _tokens_). Of course, texthero provides an easy function for that!
+
+```python
+df['tokenized_text'] = hero.tokenize(df['clean_text'])
+```
+
+
##### Preprocessing API
-The complete preprocessing API can be found at the following address: [api preprocessing](/docs/api-preprocessing).
+The complete preprocessing API can be found here: [api preprocessing](/docs/api-preprocessing).
### Representation
-Once cleaned the data, the next natural is to map each document into a vector.
+Once the data is cleaned and tokenized, the next natural step is to map each document to a vector so we can compare documents with mathematical methods to derive insights.
##### TFIDF representation
+TFIDF is a formula to calculate the _relative importance_ of the words in a document, taking
+into account the words' occurrences in other documents.
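+
+A common form of the formula (different libraries apply slightly different smoothing) is
+`tfidf(t, d) = tf(t, d) * log(N / df(t))`, where `tf(t, d)` counts how often term `t`
+occurs in document `d`, `N` is the number of documents and `df(t)` is the number of
+documents that contain `t`.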
```python
-df['tfidf_clean_text'] = hero.tfidf(df['clean_text'])
+df['tfidf'] = hero.tfidf(df['tokenized_text'])
```
+Now, we have calculated a vector for each document that tells us what words are characteristic for the document.
+Usually, documents about similar topics use similar terms, so their tfidf-vectors will be similar too.
+
##### Dimensionality reduction with PCA
-To visualize the data, we map each point to a two-dimensional representation with PCA. The principal component analysis algorithms returns the combination of attributes that better account the variance in the data.
+We now want to visualize the data. However, the tfidf-vectors are very high-dimensional (i.e. every
+document might have a tfidf-vector of length 100). Visualizing 100 dimensions is hard!
+
+Thus, we perform dimensionality reduction (generating vectors with fewer entries from vectors with
+many entries). For that, we can use PCA. PCA generates new vectors from the tfidf representation
+that showcase the differences among the documents most strongly in fewer dimensions, often 2 or 3.
```python
-df['pca_tfidf_clean_text'] = hero.pca(df['tfidf_clean_text'])
+df['pca'] = hero.pca(df['tfidf'])
```
##### All in one step
-We can achieve all the three steps show above, _cleaning_, _tf-idf representation_ and _dimensionality reduction_ in a single step. Isn't fabulous?
+We can achieve all the steps shown above, _cleaning_, _tokenizing_, _tf-idf representation_ and _dimensionality reduction_ in a single step. Isn't that fabulous?
```python
df['pca'] = (
- df['text']
- .pipe(hero.clean)
- .pipe(hero.tfidf)
- .pipe(hero.pca)
- )
+ df['text']
+ .pipe(hero.clean)
+ .pipe(hero.tokenize)
+ .pipe(hero.tfidf)
+ .pipe(hero.pca)
+)
```
##### Representation API
-The complete representation module API can be found at the following address: [api representation](/docs/api-representation).
+The complete representation module API can be found here: [api representation](/docs/api-representation).
### Visualization
@@ -176,32 +196,43 @@ Also, we can "visualize" the most common words for each `topic` with `top_words`
```python
NUM_TOP_WORDS = 5
-df.groupby('topic')['text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS])
+df.groupby('topic')['clean_text'].apply(lambda x: hero.top_words(x, normalize=True)[:NUM_TOP_WORDS])
```
```
topic
-athletics said 0.010068
- world 0.008900
- year 0.008844
-cricket test 0.008250
- england 0.008001
- first 0.007787
-football said 0.009515
- chelsea 0.006110
- game 0.005950
-rugby england 0.012602
- said 0.008359
- wales 0.007880
-tennis 6 0.021047
- said 0.013012
- open 0.009834
+athletics said 0.010330
+ world 0.009132
+ year 0.009075
+ olympic 0.007819
+ race 0.006392
+cricket test 0.008492
+ england 0.008235
+ first 0.008016
+ cricket 0.007906
+ one 0.007760
+football said 0.009709
+ chelsea 0.006234
+ game 0.006071
+ would 0.005866
+ club 0.005601
+rugby england 0.012833
+ said 0.008512
+ wales 0.008025
+ ireland 0.007440
+ rugby 0.007245
+tennis said 0.013993
+ open 0.010575
+ first 0.009608
+ set 0.009028
+ year 0.008447
+Name: clean_text, dtype: float64
```
##### Visualization API
-The complete visualization module API can be found at the following address: [api visualization](/docs/api-visualization).
+The complete visualization module API can be found here: [api visualization](/docs/api-visualization).
## Summary
@@ -217,15 +248,19 @@ df = pd.read_csv(
df['pca'] = (
df['text']
.pipe(hero.clean)
+ .pipe(hero.tokenize)
.pipe(hero.tfidf)
- .pipe(hero.pca)
+ .pipe(hero.pca, n_components=3)
)
hero.scatterplot(df, col='pca', color='topic', title="PCA BBC Sport news")
```
+![](/img/scatterplot_bbcsport_3d.png)
+
+
##### Next section
By now, you should have understood the main building blocks of texthero.
-In the next sections, we will review each module, see how we can tune the default settings and we will show other application where Texthero might come in handy.
+In the next sections, we will review each module, see how we can tune the default settings, and show other applications where Texthero might come in handy.
diff --git a/website/static/img/scatterplot_bbcsport_3d.png b/website/static/img/scatterplot_bbcsport_3d.png
new file mode 100644
index 00000000..2642d992
Binary files /dev/null and b/website/static/img/scatterplot_bbcsport_3d.png differ