def count_tokens(text):
    """Tokenize *text* on whitespace and count each distinct token.

    Tokens are whitespace-separated and keep any attached punctuation
    (e.g. ``"clock."`` is a different token from ``"clock"``).

    Parameters
    ----------
    text : str
        The input string to tokenize.

    Returns
    -------
    dict
        Mapping of token -> number of occurrences; empty dict for
        empty/whitespace-only input.
    """
    # Local import: this function is called before the module-level import
    # section further down this file executes, so the import must live here.
    from collections import Counter

    # Counter replaces the manual membership-test-and-increment loop;
    # wrap in dict() so callers get a plain dict, as before.
    return dict(Counter(text.split()))
# Demo: tokenize a sample sentence and print the per-token counts.
text = "This is a clock. This is only a clock."
print(counts := count_tokens(text))
### Stopword-filtering variant of count_tokens (uses the NLTK stopwords corpus)
import nltk
from nltk.corpus import stopwords
def count_tokens(text):
    """Tokenize *text*, discard English stopwords, and count the rest.

    Stopword matching is case-insensitive (each token is lowercased for
    the membership test), but surviving tokens are counted in their
    original form. Tokens keep attached punctuation (e.g. ``"clock."``).

    Parameters
    ----------
    text : str
        The input string to tokenize.

    Returns
    -------
    dict
        Mapping of non-stopword token -> number of occurrences.
    """
    # NOTE: requires the NLTK stopwords corpus to be downloaded
    # (nltk.download('stopwords')).
    stop_words = set(stopwords.words('english'))
    token_counts = {}
    for token in text.split():
        # Skip stopwords; the lookup is lowercased so "This" matches "this".
        if token.lower() in stop_words:
            continue
        token_counts[token] = token_counts.get(token, 0) + 1
    return token_counts
# Demo again: common stopwords (e.g. "is", "a") should now be filtered out.
text = "This is a clock. This is only a clock."
print(counts := count_tokens(text))
3条答案

按热度/按时间排序 — 回答 1 (ijnw1ujt):

表达式 `'a' or 'z'` 总是产生 `'a'`;表达式 `'a' and 'z'` 总是产生 `'z'`。它不是某种用于在容器中进行查询的 DSL,而只是一个普通的布尔表达式(`find` 实际上是用这个表达式的求值结果来调用的)。如果你想表达"字符串中是否含有 `'a'` 或 `'z'`",你需要分别对两者进行检查;对于第二种情况("字符串中同时含有 `'a'` 和 `'z'`")也是同理。
回答 2 (5t7ly7z5):

这是因为 `find` 方法实际上并不支持 `or` 和 `and`,它只接受一个查询字符串。那么,到底发生了什么呢?原来 `or` 和 `and` 是可以作用于字符串的运算符:表达式会先被求值成其中一个字符串,然后你实际上只是像往常一样在搜索 `'a'` 或 `'z'` 这样的单个子串。

回答 3 (s2j5cfk0):