import numpy as np
import pandas as pd
import plotly.graph_objects as go
from umap import UMAP
from typing import List, Union
from bidi.algorithm import get_display
import arabic_reshaper


def visualize_documents(topic_model,
                        docs: List[str],
                        topics: List[int] = None,
                        embeddings: np.ndarray = None,
                        reduced_embeddings: np.ndarray = None,
                        sample: float = None,
                        hide_annotations: bool = False,
                        hide_document_hover: bool = False,
                        custom_labels: Union[bool, str] = False,
                        title: str = "<b>Documents and Topics</b>",
                        width: int = 1200,
                        height: int = 750):
""" Visualize documents and their topics in 2D
Arguments:
topic_model: A fitted BERTopic instance.
docs: The documents you used when calling either `fit` or `fit_transform`
topics: A selection of topics to visualize.
Not to be confused with the topics that you get from `.fit_transform`.
For example, if you want to visualize only topics 1 through 5:
`topics = [1, 2, 3, 4, 5]`.
embeddings: The embeddings of all documents in `docs`.
reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
sample: The percentage of documents in each topic that you would like to keep.
Value can be between 0 and 1. Setting this value to, for example,
0.1 (10% of documents in each topic) makes it easier to visualize
millions of documents as a subset is chosen.
hide_annotations: Hide the names of the traces on top of each cluster.
hide_document_hover: Hide the content of the documents when hovering over
specific points. Helps to speed up generation of visualization.
custom_labels: If bool, whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
If `str`, it uses labels from other aspects, e.g., "Aspect1".
title: Title of the plot.
width: The width of the figure.
height: The height of the figure.
    Examples:

    To visualize the topics simply run:

    ```python
    topic_model.visualize_documents(docs)
    ```

    Do note that this re-calculates the embeddings and reduces them to 2D.
    The advised and preferred pipeline for using this function is as follows:

    ```python
    from sklearn.datasets import fetch_20newsgroups
    from sentence_transformers import SentenceTransformer
    from bertopic import BERTopic
    from umap import UMAP

    # Prepare embeddings
    docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # Train BERTopic
    topic_model = BERTopic().fit(docs, embeddings)

    # Reduce dimensionality of embeddings; this step is optional
    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

    # Run the visualization with the original embeddings
    topic_model.visualize_documents(docs, embeddings=embeddings)

    # Or, if you have reduced the original embeddings already:
    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
    ```
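For completeness, a minimal sketch (not part of the original file) showing how the arguments documented above can be combined, assuming a fitted `topic_model` and precomputed `reduced_embeddings` as in the pipeline above:

```python
# Illustrative only: visualize a handful of topics with per-topic sampling.
topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings,  # assumed precomputed with UMAP as above
    topics=[1, 2, 3, 4, 5],                 # restrict the plot to these topics
    sample=0.1,                             # keep 10% of the documents per topic
    hide_document_hover=True,               # speeds up figure generation
)
```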
6 answers

hiz5n14c1#
Thank you for sharing this issue. Could you explain in more detail what exactly the problem is and how to solve it? I am not familiar with Arabic or similar languages, so I need some help understanding the problem.
jjjwad0x2#
Sure.
First of all, thank you for your work. You have really done a great job.
This is a text-encoding problem. I have already solved it and sent you a request; you can check the details there.
Arabic text has two important characteristics:
So when you try to print Arabic text in an application or library that does not support it, you will most likely get something like this:
We have two problems here. The first is that the characters are in their isolated forms, meaning each character is rendered without regard to its surroundings. The second is that the text is rendered left to right.
To solve the latter, we only need to apply the Unicode bidirectional algorithm, which is implemented entirely in Python in python-bidi. If you use it, you get something like this:
The remaining problem is how to reshape the characters, replacing each one with the correct contextual form based on its surroundings. Using the arabic_reshaper library handles the reshaping, so we get the correct result, as shown below:
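A minimal sketch of the two-step fix described above, using the two libraries imported at the top of the quoted file (the sample string is illustrative):

```python
import arabic_reshaper
from bidi.algorithm import get_display

text = "مرحبا بالعالم"  # illustrative Arabic string ("Hello, world")

# Step 1: reshape isolated letters into their correct contextual forms.
reshaped = arabic_reshaper.reshape(text)

# Step 2: apply the Unicode bidirectional algorithm so the string is ordered right-to-left.
print(get_display(reshaped))
```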
r7s23pms3#
Thank you for the detailed description! It really helps me understand how to render Arabic text correctly. The implementation itself is my main concern, since extra dependencies are needed to render the text correctly, and many of them are unnecessary for most users. The optional dependencies currently focus only on embeddings, but that may change in the future depending on further development and community needs. Perhaps there could be some kind of check that detects whether the relevant packages are installed and uses them if they are; such packages would typically only be present when a user has installed them manually.
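Such a check might look roughly like this (a sketch only; this is not how BERTopic currently handles it, and the helper name is hypothetical):

```python
# Illustrative optional-dependency check.
try:
    import arabic_reshaper
    from bidi.algorithm import get_display
    HAS_RTL_SUPPORT = True
except ImportError:
    HAS_RTL_SUPPORT = False

def maybe_fix_rtl(text: str) -> str:
    """Reshape right-to-left text only if the optional packages are installed."""
    if HAS_RTL_SUPPORT:
        return get_display(arabic_reshaper.reshape(text))
    return text
```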
egmofgnx4#
Hello, I have solved this problem as follows:
As @apoalquaary mentioned, I added the required libraries to render the text correctly. Note that this applies not only to Arabic, but also to other languages that are written right to left.
In addition, I would like to share my implementation for anyone interested in this solution (see the sketch below):
.env/lib/python3.8/site-packages/bertopic/plotting/_documents.py
Do note that this re-calculates the embeddings and reduces them to 2D.
The advised and preferred pipeline for using this function is as follows:
Or if you want to save the resulting figure:
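For readers interested in the approach, a rough sketch of where such reshaping could hook into a plotting function like `visualize_documents` (the helper name and hook points are assumptions for illustration, not the exact patch referenced above):

```python
from typing import List

import arabic_reshaper
from bidi.algorithm import get_display

def _reshape_rtl(texts: List[str]) -> List[str]:
    """Hypothetical helper: prepare right-to-left strings for display in Plotly."""
    return [get_display(arabic_reshaper.reshape(t)) for t in texts]

# Inside the plotting code, document hover texts and topic annotations could be
# passed through the helper before being added to the plotly.graph_objects traces:
# hover_texts = _reshape_rtl(docs) if not hide_document_hover else None
# annotations = _reshape_rtl(topic_names) if not hide_annotations else []
```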
au9on6nz6#
OK, I had not seen that repository before. Thanks :)