Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CalledProcessError: Command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' returned non-zero exit status 127. Bug report [BUG] #121

Open
w1973145618 opened this issue Mar 25, 2024 · 1 comment

Comments

@w1973145618
Copy link

Describe the bug
CalledProcessError: Command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' returned non-zero exit status 127.

To Reproduce
os.environ['MALLET_MEMORY'] = '200G'
from pycisTopic.lda_models import run_cgs_models_mallet

# Configure path Mallet

mallet_path="/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet"

# Run models

models=run_cgs_models_mallet(
cistopic_obj,
n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
n_cpu=12,
n_iter=500,
random_state=555,
alpha=50,
alpha_by_topic=True,
eta=0.1,
eta_by_topic=False,
tmp_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial",
save_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial",
mallet_path=mallet_path,
)

Error output
CalledProcessError Traceback (most recent call last)
File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:532, in LDAMallet.convert_input(self, corpus)
531 try:
--> 532 subprocess.check_output(args=cmd, shell=False, stderr=subprocess.STDOUT)
533 except subprocess.CalledProcessError as e:

File ~/anaconda3/envs/pycisTopic/lib/python3.11/subprocess.py:466, in check_output(timeout, *popenargs, **kwargs)
464 kwargs['input'] = empty
--> 466 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
467 **kwargs).stdout

File ~/anaconda3/envs/pycisTopic/lib/python3.11/subprocess.py:571, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
570 if check and retcode:
--> 571 raise CalledProcessError(retcode, process.args,
572 output=stdout, stderr=stderr)
573 return CompletedProcess(process.args, retcode, stdout, stderr)

CalledProcessError: Command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' returned non-zero exit status 127.

During handling of the above exception, another exception occurred:

RuntimeError Traceback (most recent call last)
Cell In[47], line 6
4 mallet_path="/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet"
5 # Run models
----> 6 models=run_cgs_models_mallet(
7 cistopic_obj,
8 n_topics=[2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
9 n_cpu=12,
10 n_iter=500,
11 random_state=555,
12 alpha=50,
13 alpha_by_topic=True,
14 eta=0.1,
15 eta_by_topic=False,
16 tmp_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial",
17 save_path="./scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial",
18 mallet_path=mallet_path,
19 )

File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:806, in run_cgs_models_mallet(cistopic_obj, n_topics, n_cpu, n_iter, random_state, alpha, alpha_by_topic, eta, eta_by_topic, top_topics_coh, tmp_path, save_path, reuse_corpus, mallet_path)
803 corpus = matutils.Sparse2Corpus(binary_matrix)
804 id2word = utils.FakeDict(len(region_names))
--> 806 model_list = [
807 run_cgs_model_mallet(
808 binary_matrix=binary_matrix,
809 corpus=corpus,
810 id2word=id2word,
811 n_topics=n_topic,
812 cell_names=cell_names,
813 region_names=region_names,
814 n_cpu=n_cpu,
815 n_iter=n_iter,
816 random_state=random_state,
817 alpha=alpha,
818 alpha_by_topic=alpha_by_topic,
819 eta=eta,
820 eta_by_topic=eta_by_topic,
821 top_topics_coh=top_topics_coh,
822 tmp_path=tmp_path,
823 save_path=save_path,
824 reuse_corpus=reuse_corpus,
825 mallet_path=mallet_path,
826 )
827 for n_topic in n_topics
828 ]
829 return model_list

File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:807, in <listcomp>(.0)
803 corpus = matutils.Sparse2Corpus(binary_matrix)
804 id2word = utils.FakeDict(len(region_names))
806 model_list = [
--> 807 run_cgs_model_mallet(
808 binary_matrix=binary_matrix,
809 corpus=corpus,
810 id2word=id2word,
811 n_topics=n_topic,
812 cell_names=cell_names,
813 region_names=region_names,
814 n_cpu=n_cpu,
815 n_iter=n_iter,
816 random_state=random_state,
817 alpha=alpha,
818 alpha_by_topic=alpha_by_topic,
819 eta=eta,
820 eta_by_topic=eta_by_topic,
821 top_topics_coh=top_topics_coh,
822 tmp_path=tmp_path,
823 save_path=save_path,
824 reuse_corpus=reuse_corpus,
825 mallet_path=mallet_path,
826 )
827 for n_topic in n_topics
828 ]
829 return model_list

File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:916, in run_cgs_model_mallet(binary_matrix, corpus, id2word, n_topics, cell_names, region_names, n_cpu, n_iter, random_state, alpha, alpha_by_topic, eta, eta_by_topic, top_topics_coh, tmp_path, save_path, reuse_corpus, mallet_path)
914 start = time.time()
915 log.info(f"Running model with {n_topics} topics")
--> 916 model = LDAMallet(
917 corpus=corpus,
918 id2word=id2word,
919 num_topics=n_topics,
920 iterations=n_iter,
921 alpha=alpha,
922 eta=eta,
923 n_cpu=n_cpu,
924 tmp_dir=tmp_path,
925 random_seed=random_state,
926 reuse_corpus=reuse_corpus,
927 mallet_path=mallet_path,
928 )
929 end_time = time.time() - start
931 # Get distributions

File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:467, in LDAMallet.__init__(self, num_topics, corpus, alpha, eta, id2word, n_cpu, tmp_dir, optimize_interval, iterations, topic_threshold, random_seed, reuse_corpus, mallet_path)
465 self.mallet_path = mallet_path
466 if corpus is not None:
--> 467 self.train(corpus, reuse_corpus)

File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:552, in LDAMallet.train(self, corpus, reuse_corpus)
550 logger = logging.getLogger("LDAMalletWrapper")
551 if os.path.isfile(self.fcorpusmallet()) is False or reuse_corpus is False:
--> 552 self.convert_input(corpus)
553 else:
554 logger.info("MALLET corpus already exists, training model")

File ~/anaconda3/Git/pycisTopic/src/pycisTopic/lda_models.py:534, in LDAMallet.convert_input(self, corpus)
532 subprocess.check_output(args=cmd, shell=False, stderr=subprocess.STDOUT)
533 except subprocess.CalledProcessError as e:
--> 534 raise RuntimeError(
535 f"command '{e.cmd}' return with error (code {e.returncode}): {e.output}"
536 )

RuntimeError: command '['/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet', 'import-file', '--preserve-case', '--keep-sequence', '--token-regex', '\S+', '--input', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.txt', '--output', './scratch/leuven/330/vsc33053/ray_spill/mallet/tutorial/corpus.mallet']' return with error (code 127): b'/home/taxue/mywork/pycisTopic/Mallet-202108/bin/mallet: \xe8\xa1\x8c 60: java: \xe6\x9c\xaa\xe6\x89\xbe\xe5\x88\xb0\xe5\x91\xbd\xe4\xbb\xa4\n'

@w1973145618
Copy link
Author

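“Mallet is based on Java, so make sure Java is installed in the environment.” (Exit status 127 together with the shell message `java: 未找到命令`, i.e. "java: command not found", shows that the mallet launcher script could not find a `java` executable.)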

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant