Please tell me: I run the code, but the CPU doesn't get loaded and there is no activity on the monitoring graphs. No result at all. Where is my mistake?
import os
import re
import string
import multiprocessing as mp

import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin

# MyStem below is a local wrapper around the mystem binary; its import is not shown here

stop = stopwords.words('russian')
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_jobs=-1):
        """
        Text preprocessing transformer:
        name   - name of the dataframe; files are saved according to this name
        n_jobs - number of parallel jobs to run
        """
        self.n_jobs = n_jobs

    def fit(self, X, name, y=None):
        self.name = name
        return self

    def transform(self, X, *_):
        # main transformer: index the texts, then preprocess them in parallel
        data = self._text_indexing(X)
        data = self._multi(self._proc_target, data)
        return data
    def _proc_target(self, task):
        # full cleaning chain for one task
        #task = task.reset_index(drop=True, inplace=True)
        data = self._preprocess_text(task)
        data = self._stemmer(data)
        data = self._punc(data)
        data = self._stopwords_remover(data)
        return data

    def _multi(self, target, tasks, workers=None):
        # run target over tasks in a process pool
        if workers is None:
            workers = max(2, mp.cpu_count() - 1)
        pool = mp.Pool(processes=workers)
        res = pool.map(target, tasks)
        pool.close()
        pool.join()
        return res
    def _preprocess_text(self, text):
        low_cased_text = self._low_case(text['text'])
        eng_cleaned = self._english(low_cased_text)
        stopwords_cleaned = self._stopwords(eng_cleaned)
        return self._numbers(stopwords_cleaned)

    def _stemmer(self, text):
        # lemmatize every entry with mystem
        mst = MyStem(mystem_path='/home/azubochenko/work/plagiat/baseline/pipeline/mystem')
        data = []
        for i in text:
            data.append(mst._make_mystem_lemma(i))
        return data
    def _text_indexing(self, data):
        # name the single text column so it can be addressed as data['text']
        data.rename(columns={0: 'text'}, inplace=True)
        return data
    def _low_case(self, text):
        # text to lower case
        return text.str.lower()

    def _english(self, text):
        # delete English words
        return text.apply(lambda x: re.sub(r'[a-z]+', '', x))

    def _stopwords(self, text):
        # delete stopwords from the Russian nltk vocabulary
        return text.apply(lambda x: " ".join(w for w in str(x).split() if w not in stop))

    def _numbers(self, text):
        # delete digits
        return text.apply(lambda x: re.sub(r'\d+', '', x))

    def _punc(self, text):
        # delete punctuation
        return [[i.translate(str.maketrans(dict.fromkeys(string.punctuation))) for i in j]
                for j in text]
    def _sentence_tok(self, data):
        # split texts into three-sentence paragraphs with paragraph and text indexes
        corp = []
        text_indexes = []
        indexes = []
        text_idx = 0
        for i in data.iloc[:, 0]:
            j = sent_tokenize(i)
            sentences = []
            idx = 1
            text_idx += 1
            for k in j:
                if len(sentences) < 3:
                    sentences.append(k)
                else:
                    corp.append(str(sentences).strip('[]'))
                    sentences = []
                    indexes.append(idx)
                    text_indexes.append(text_idx)
                    idx += 1
        return pd.DataFrame({'text': corp, 'paragraph_index': indexes, 'text_index': text_indexes})
    def _tokenize(self, data):
        # text tokenizing
        data.dropna(inplace=True)
        return data.apply(word_tokenize)

    def _get_text(self, url, encoding='utf-8', to_lower=True):
        # fetch a stopword list from GitHub (stopwords-iso) or a local file
        url = str(url)
        if url.startswith('http'):
            r = requests.get(url)
            if not r.ok:
                r.raise_for_status()
            return r.text.lower() if to_lower else r.text
        elif os.path.exists(url):
            with open(url, encoding=encoding) as f:
                return f.read().lower() if to_lower else f.read()
        else:
            raise Exception('parameter [url] must be either a URL or a filename')
    def _remove_stopwords(self, tokens, stopwords=None, min_length=4):
        # stopword removal using the stopwords-iso list
        if not stopwords:
            return tokens
        stopwords = set(stopwords)
        return [tok for tok in tokens
                if tok not in stopwords and len(tok) >= min_length]

    def _stopwords_remover(self, tokens):
        url_stopwords_ru = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        stopwords_ru = self._get_text(url_stopwords_ru).splitlines()
        return [self._remove_stopwords(x, stopwords=stopwords_ru) for x in tokens]
text_preprocessing = TextPreprocessor()
test_df = text_preprocessing.fit_transform(test, name='test')  # test is the input DataFrame
I suspect the error is in the _multi function itself, but I can't see where.
I figured it out myself: pandas and multiprocessing clash here. pool.map iterates over whatever it is given, and iterating a DataFrame yields its column labels rather than its rows, so the workers never receive the actual texts. To make this code work, you have to pass the data in as a list, or use another multiprocessing implementation for pandas, such as pandarallel. Question closed.
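
For anyone who lands here later, a minimal sketch of the list-based fix, under the assumption that the texts sit in a 'text' column. The multi_list and _proc_chunk names are illustrative stand-ins, not part of the original class; the point is only that pool.map receives a plain list of DataFrame chunks instead of the DataFrame itself.

import multiprocessing as mp

import pandas as pd

def _proc_chunk(chunk):
    # stand-in for the real _proc_target; chunk is an ordinary DataFrame slice
    return chunk['text'].str.lower().tolist()

def multi_list(target, df, workers=None):
    if workers is None:
        workers = max(2, mp.cpu_count() - 1)
    # build a LIST of row chunks; pool.map(f, df) would map over column labels
    size = max(1, len(df) // workers)
    chunks = [df.iloc[i:i + size] for i in range(0, len(df), size)]
    with mp.Pool(processes=workers) as pool:
        results = pool.map(target, chunks)
    # flatten the per-chunk results back into one list
    return [item for part in results for item in part]

if __name__ == '__main__':
    df = pd.DataFrame({'text': ['Первый текст', 'Второй ТЕКСТ', 'третий текст']})
    print(multi_list(_proc_chunk, df))

pandarallel hides the same chunking behind the usual pandas API, which is why it sidesteps the problem:

from pandarallel import pandarallel
pandarallel.initialize()
clean = df['text'].parallel_apply(str.lower)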