|
list | phi = self.phi[:,topic_indices] |
|
tuple | k_arr |
|
list | ind = [self.corpus.words_int[word] for word in self.corpus.words] |
|
list | table = [] |
|
string | ch = 'Topic ' |
|
list | ch = topic_labels[i] |
|
tuple | col = LabeledColumn(k_arr[i], col_header=ch, col_len=print_len) |
|
list | schc = ['Topic', 'Words'] |
|
list | schf = ['Word', 'Prob'] |
|
| subcolhdr_compact = schf) |
|
A class for viewing a topic model estimated by one of vsm's LDA
classes using CGS.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.__init__ |
( |
|
self, |
|
|
|
corpus, |
|
|
|
model |
|
) |
| |
Initialize LdaCgsViewer.
:param corpus: Source of observed data.
:type corpus: :class:`Corpus`
:param model: An LDA model estimated by a CGS.
:type model: LdaCgsSeq
def vsm.viewer.ldacgsviewer.LdaCgsViewer.aggregate_doc_topics |
( |
|
self, |
|
|
|
docs, |
|
|
|
normed_sum = False , |
|
|
|
print_len = 10 |
|
) |
| |
Takes a list of document identifiers and returns the sum of the
distributions over topics corresponding to these documents,
normalized to sum to 1. If normed_sum is True, the sum is
weighted by document lengths, so that documents contribute
uniformly to the aggregate distribution.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.dismat_doc |
( |
|
self, |
|
|
|
docs = [] , |
|
|
|
dist_fn = JS_dist |
|
) |
| |
Calculates the distance matrix for a given list of documents.
:param docs: A list of documents whose distance matrix is to be computed.
Default is all the documents in the model.
:type docs: list, optional
:param dist_fn: A distance function from functions in vsm.spatial.
Default is :meth:`JS_dist`.
:type dist_fn: function, optional
:returns: an instance of :class:`IndexedSymmArray`.
n x n matrix containing floats where n is the number of documents.
:See Also: :meth:`vsm.viewer.wrapper.dismat_documents`
def vsm.viewer.ldacgsviewer.LdaCgsViewer.dismat_top |
( |
|
self, |
|
|
|
topics = [] , |
|
|
|
dist_fn = JS_dist |
|
) |
| |
Calculates the distance matrix for a given list of topics.
:param topics: A list of topics whose distance matrix is to be
computed. Default is all topics in the model.
:type topics: list, optional
:param dist_fn: A distance function from functions in vsm.spatial.
Default is :meth:`JS_dist`.
:type dist_fn: function, optional
:returns: an instance of :class:`IndexedSymmArray`.
n x n matrix containing floats where n is the number of
topics considered.
:See Also: :meth:`vsm.viewer.wrapper.dismat_top`
def vsm.viewer.ldacgsviewer.LdaCgsViewer.dist_doc_doc |
( |
|
self, |
|
|
|
doc_or_docs, |
|
|
|
print_len = 10 , |
|
|
|
filter_nan = True , |
|
|
|
label_fn = def_label_fn , |
|
|
|
as_strings = True , |
|
|
|
dist_fn = JS_dist , |
|
|
|
order = 'i' |
|
) |
| |
Computes and sorts the distances between a document
or list of documents and every document in the topic space.
:param doc_or_docs: Query document(s) relative to which
distances are computed.
:type doc_or_docs: string/integer or list of strings/integer.
:param print_len: Number of documents printed by pretty-printing function.
Default is 10.
:type print_len: int, optional
:param filter_nan: If `True`, not-a-number (NaN) entries are filtered.
Default is `True`.
:type filter_nan: boolean, optional
:param label_fn: A function that defines how documents are represented.
Default is def_label_fn which retrieves the labels from corpus
metadata.
:type label_fn: function, optional
:param as_strings: If `True`, returns a list of documents as strings rather
than their integer representations. Default is `True`.
:type as_strings: boolean, optional
:param dist_fn: A distance function from functions in vsm.spatial.
Default is :meth:`JS_dist`.
:type dist_fn: function, optional
:param order: Order of sorting. 'i' for increasing and 'd' for
decreasing order. Default is 'i'.
:type order: string, optional
:returns: an instance of `LabeledColumn`.
A 2-dim array containing documents and their distances to
`doc_or_docs`.
:See Also: :meth:`vsm.viewer.wrapper.dist_doc_doc`
def vsm.viewer.ldacgsviewer.LdaCgsViewer.dist_top_doc |
( |
|
self, |
|
|
|
topic_or_topics, |
|
|
|
weights = [] , |
|
|
|
filter_words = [] , |
|
|
|
print_len = 10 , |
|
|
|
as_strings = True , |
|
|
|
label_fn = def_label_fn , |
|
|
|
filter_nan = True , |
|
|
|
dist_fn = JS_dist , |
|
|
|
order = 'i' |
|
) |
| |
Takes a topic or list of topics (by integer index) and returns
a list of documents sorted by distance.
:param topic_or_topics: Query topic(s) relative to which
distances are computed.
:type topic_or_topics: integer or list of integers
:param weights: Specify weights for each topic in `topic_or_topics`.
Default uses equal weights (i.e. arithmetic mean)
:type weights: list of floating point, optional
:param filter_words: The topics that include these words are considered.
If not provided, by default all topics are considered.
:type filter_words: list of words, optional
:param print_len: Number of documents printed by pretty-printing function.
Default is 10.
:type print_len: int, optional
:param as_strings: If `True`, returns a list of documents as strings rather
than their integer representations. Default is `True`.
:type as_strings: boolean, optional
:param label_fn: A function that defines how documents are represented.
Default is def_label_fn which retrieves the labels from corpus
metadata.
:type label_fn: function, optional
:param filter_nan: If `True`, not-a-number (NaN) entries are filtered.
Default is `True`.
:type filter_nan: boolean, optional
:param dist_fn: A distance function from functions in vsm.spatial.
Default is :meth:`JS_dist`.
:type dist_fn: function, optional
:param order: Order of sorting. 'i' for increasing and 'd' for
decreasing order. Default is 'i'.
:type order: string, optional
:returns: an instance of :class:`LabeledColumn`.
A 2-dim array containing documents and their posterior probabilities
to `topic_or_topics`.
:See Also: :meth:`def_label_fn`, :meth:`vsm.viewer.wrapper.dist_top_doc`
def vsm.viewer.ldacgsviewer.LdaCgsViewer.dist_top_top |
( |
|
self, |
|
|
|
topic_or_topics, |
|
|
|
weights = [] , |
|
|
|
dist_fn = JS_dist , |
|
|
|
order = 'i' , |
|
|
|
show_topics = True , |
|
|
|
print_len = 10 , |
|
|
|
filter_nan = True , |
|
|
|
as_strings = True , |
|
|
|
compact_view = True , |
|
|
|
topic_labels = None |
|
) |
| |
Takes a topic or list of topics (by integer index) and returns
a list of topics sorted by the distances between a given
topic and every topic.
:param topic_or_topics: Query topic(s) to which distances are calculated.
:type topic_or_topics: integer or list of integers
:param weights: Specify weights for each topic in `topic_or_topics`.
Default uses equal weights (i.e. arithmetic mean)
:type weights: list of floating point, optional
:param show_topics: If `True`, topics are represented by their number
and distribution over words. Otherwise only topic numbers
are shown. Default is `True`.
:type show_topics: boolean, optional
:param print_len: Number of topics to be shown. Default is 10.
:type print_len: int, optional
:param filter_nan: If `True`, not-a-number (NaN) entries are filtered.
Default is `True`.
:type filter_nan: boolean, optional
:param dist_fn: A distance function from functions in vsm.spatial.
Default is :meth:`JS_dist`.
:type dist_fn: function, optional
:param order: Order of sorting. 'i' for increasing and 'd' for
decreasing order. Default is 'i'.
:type order: string, optional
:param as_strings: If `True`, words of each topic are represented as
strings. Otherwise they are represented by their integer
representation. Default is `True`.
:type as_strings: boolean, optional
:param compact_view: If `True`, topics are simply represented as
their top `print_len` number of words. Otherwise, topics are
shown as words and their probabilities. Default is `True`.
:type compact_view: boolean, optional
:param topic_labels: List of strings that are names that correspond
to the topics in `topic_indices`.
:type topic_labels: list, optional
:returns: an instance of :class:`LabeledColumn`.
A 2-dim array containing topics and their distances to
`topic_or_topics`.
:See Also: :meth:`vsm.viewer.wrapper.dist_top_top`
def vsm.viewer.ldacgsviewer.LdaCgsViewer.dist_word_top |
( |
|
self, |
|
|
|
word_or_words, |
|
|
|
weights = [] , |
|
|
|
filter_nan = True , |
|
|
|
show_topics = True , |
|
|
|
print_len = 10 , |
|
|
|
as_strings = True , |
|
|
|
compact_view = True , |
|
|
|
dist_fn = JS_dist , |
|
|
|
order = 'i' , |
|
|
|
topic_labels = None |
|
) |
| |
Sorts topics according to their distance to the query
`word_or_words`.
A pseudo-topic is constructed from `word_or_words` as follows. If
weights are not provided, the word list is represented in the space of
topics as a topic which assigns equal non-zero probability to
each word in `word_or_words` and 0 to every other word in the
corpus. Otherwise, each word in `word_or_words` is assigned the
provided weight.
:param word_or_words: word(s) to which distances are calculated
:type word_or_words: string or list of strings
:param weights: Specify weights for each query word in `word_or_words`.
Default uses equal weights.
:type weights: list of floating point, optional
:param filter_nan: If `True`, not-a-number (NaN) entries are filtered.
Default is `True`.
:type filter_nan: boolean, optional
:param show_topics: If `True`, topics are represented by their number
and distribution over words. Otherwise, only topic numbers
are shown. Default is `True`.
:type show_topics: boolean, optional
:param print_len: Number of words printed by pretty-printing function.
Default is 10.
:type print_len: int, optional
:param as_strings: If `True`, words of each topic are represented as
strings. Otherwise they are represented by their integer
representation. Default is `True`.
:type as_strings: boolean, optional
:param compact_view: If `True`, topics are simply represented as
their top `print_len` number of words. Otherwise, topics are
shown as words and their probabilities. Default is `True`.
:type compact_view: boolean, optional
:param dist_fn: A distance function from functions in vsm.spatial.
Default is :meth:`JS_dist`.
:type dist_fn: function, optional
:param order: Order of sorting. 'i' for increasing and 'd' for
decreasing order. Default is 'i'.
:type order: string, optional
:param topic_labels: List of strings that are names that correspond
to the topics in `topic_indices`.
:type topic_labels: list, optional
:returns: an instance of :class:`LabeledColumn`.
A structured array of topics sorted by their distances
with `word_or_words`.
:See Also: :meth:`vsm.viewer.wrapper.dist_word_top`
def vsm.viewer.ldacgsviewer.LdaCgsViewer.doc_entropies |
( |
|
self, |
|
|
|
print_len = 10 , |
|
|
|
label_fn = def_label_fn , |
|
|
|
as_strings = True |
|
) |
| |
Returns the entropies of the distributions over topics as an
array sorted by entropy.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.doc_topics |
( |
|
self, |
|
|
|
doc_or_docs, |
|
|
|
compact_view = False , |
|
|
|
aggregate = False , |
|
|
|
print_len = 10 , |
|
|
|
topic_labels = None |
|
) |
| |
Returns the distribution over topics for the given documents.
:param doc_or_docs: Specifies the document(s) whose distribution over
topics is returned. Each document can be given either by its ID number
(integer) or by its name (string).
:type doc_or_docs: int or string, or list of ints or strings
:param print_len: Number of topics to be printed. Default is 10.
:type print_len: int, optional
:param compact_view: If `True`, topics are simply represented as
their top `print_len` number of words. Otherwise, topics are
shown as words and their probabilities. Default is `False`.
:type compact_view: boolean, optional
:param topic_labels: List of strings that are names that correspond
to the topics in `topic_indices`.
:type topic_labels: list, optional
:returns: an instance of :class:`LabeledColumn` or of :class:`DataTable`.
A structured array of topics (represented by their
number) and their corresponding probabilities or a list of
such arrays.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.H_phi |
( |
|
self | ) |
|
Returns the entropies of the columns of phi (i.e., topics)
def vsm.viewer.ldacgsviewer.LdaCgsViewer.H_theta |
( |
|
self | ) |
|
Returns the entropies of the columns of theta.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.logp_plot |
( |
|
self, |
|
|
|
range = [] , |
|
|
|
step = 1 , |
|
|
|
show = True , |
|
|
|
grid = True |
|
) |
| |
Returns a plot of log probabilities for the specified range of
the MCMC chain used to fit a topic model by `LDAGibbs`.
The function requires matplotlib package.
:param range: Specifies the range of the MCMC chain whose log probabilities
are to be plotted. For example, range = [0, 999] plots log
probabilities from the 1st to the 1000th iterations.
The length of the list must be exactly two, and the first
element must be smaller than the second, which cannot exceed
the total length of the MCMC chain.
Default produces the plot for the entire chain.
:type range: list of integers, optional
:param step: Steps by which points are plotted. Default is 1.
:type step: int, optional
:param show: If `True`, the function actually draws the plot
in addition to returning a plot object. Default is `True`.
:type show: boolean, optional
:param grid: If `True` draw a grid. Default is `True`.
:type grid: boolean, optional
:returns: an instance of matplotlib.pyplot object.
Contains the log probability plot.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.phi |
( |
|
self | ) |
|
Returns the word by topic matrix from the model as a column
(left) stochastic matrix (the columns phi_i are probability
distributions).
def vsm.viewer.ldacgsviewer.LdaCgsViewer.theta |
( |
|
self | ) |
|
Returns the topic by document matrix from the model as a column
(left) stochastic matrix (the columns theta_i are probability
distributions).
def vsm.viewer.ldacgsviewer.LdaCgsViewer.topic_entropies |
( |
|
self, |
|
|
|
print_len = 10 |
|
) |
| |
Returns the entropies of the topics of the model as an array sorted
by entropy.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.topic_hist |
( |
|
self, |
|
|
|
topic_indices = None , |
|
|
|
d_indices = [] , |
|
|
|
show = True |
|
) |
| |
Draws a histogram showing the proportion of topics within a set of
documents specified by d_indices.
:param topic_indices: Specifies the topics for which proportions are
calculated. Default is all topics.
:type topic_indices: list of integers, optional
:param d_indices: Specifies the documents for which topic proportions
are calculated. Default is all documents.
:type d_indices: list of integers, optional
:param show: Shows the plot if `True`. Default is `True`.
:type show: boolean, optional
:returns: an instance of matplotlib.pyplot object.
Contains the topic proportion histogram.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.topic_jsds |
( |
|
self, |
|
|
|
print_len = 10 |
|
) |
| |
Returns the partial N-way JSD of each topic, where N is the number of
documents in the model. This measure captures the extent to which an
individual topic is a reliable signal of a document's overall topic
distribution.
Returns an array sorted by descending partial JSD.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.topic_oscillations |
( |
|
self, |
|
|
|
print_len = 10 , |
|
|
|
div_fn = KL_div |
|
) |
| |
Returns the oscillation in the divergences of documents
from each topic k, represented as a categorical distribution
over topics with mass concentrated at index k.
Oscillation is computed as the difference between the maximum
and the minimum of the divergences.
Returns an array sorted by descending oscillation.
def vsm.viewer.ldacgsviewer.LdaCgsViewer.word_topics |
( |
|
self, |
|
|
|
word, |
|
|
|
as_strings = True |
|
) |
| |
Searches for every occurrence of `word` in the entire corpus and returns
a list each row of which contains the name or ID number of document,
the relative position in the document, and the assigned topic number
for each occurrence of `word`.
:param word: The word for which the search is performed.
:type word: string
:param as_strings: If `True`, returns document names rather than
ID numbers. Default is `True`.
:type as_strings: boolean, optional
:returns: an instance of :class:`LabeledColumn`.
A structured array consisting of three columns. Each column
is a list of:
(1) name/ID of document containing `word`
(2) relative position of `word` in the document
(3) Topic number assigned to the token.
tuple vsm.viewer.ldacgsviewer.LdaCgsViewer.k_arr |
|
static |
list vsm.viewer.ldacgsviewer.LdaCgsViewer.phi = self.phi[:,topic_indices] |
|
static |
Returns a list of topics estimated by the model.
Each topic is represented by a list of words and the corresponding
probabilities.
:param topic_indices: List of indices of topics to be
displayed. Default is all topics.
:type topic_indices: list of integers
:param sort: Topic sort function.
:type sort: string, values are "entropy", "oscillation", "index", "jsd",
"user" (default if topic_indices set), "index" (default)
:param print_len: Number of words shown for each topic. Default is 10.
:type print_len: int, optional
:param as_string: If `True`, each topic displays words rather than its
integer representation. Default is `True`.
:type as_string: boolean, optional
:param compact_view: If `True`, topics are simply represented as
their top `print_len` number of words. Otherwise, topics are
shown as words and their probabilities. Default is `True`.
:type compact_view: boolean, optional
:param topic_labels: List of strings that are names that correspond
to the topics in `topic_indices`.
:type topic_labels: list, optional
:returns: an instance of :class:`DataTable`.
A structured array of topics.
La documentación para esta clase fue generada a partir del siguiente fichero:
- vsm/vsm/viewer/ldacgsviewer.py