Update brand term removal with this code
Opened this issue · 1 comment
jroakes commented
self, texts: List[str], brand_regex: str, repl_term: str = "brandx"
) -> list:
"""Replaces top ngrams in a list of texts that match a given regex string"""
brand_queries = [t for t in texts if re.search(brand_regex, t)]
vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=3)
X1 = vectorizer.fit_transform(brand_queries)
features = [
f
for f in filter(
lambda x: re.search(brand_regex, x),
list(vectorizer.get_feature_names()),
)
]
# Replace longest phrases first
brand_terms = sorted(features, key=lambda x: len(x.split()), reverse=True)
return [self.repl_from_list(t, brand_terms, repl_term) for t in texts]
joejoinerr commented
You may see better performance if you pre-compile the regex once at the beginning of the function, like this:
self, texts: List[str], brand_regex: str, repl_term: str = "brandx"
) -> list:
"""Replaces top ngrams in a list of texts that match a given regex string"""
brand_regex = re.compile(brand_regex)
brand_queries = [t for t in texts if brand_regex.search(t)]
vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=3)
X1 = vectorizer.fit_transform(brand_queries)
features = [
f
for f in filter(
lambda x: brand_regex.search(x),
list(vectorizer.get_feature_names()),
)
]
# Replace longest phrases first
brand_terms = sorted(features, key=lambda x: len(x.split()), reverse=True)
return [self.repl_from_list(t, brand_terms, repl_term) for t in texts]