From e7fce728773c25ff87a5b7f53c54c535605a47e0 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Fri, 20 Feb 2026 11:27:03 -0500 Subject: [PATCH 1/7] recursive extract --- papers/__main__.py | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/papers/__main__.py b/papers/__main__.py index 870f03a..808368a 100644 --- a/papers/__main__.py +++ b/papers/__main__.py @@ -11,6 +11,7 @@ import itertools import fnmatch # unix-like match from slugify import slugify +import concurrent.futures import papers from papers import logger @@ -790,7 +791,40 @@ def fetchcmd(parser, o): print(fetch_bibtex_by_fulltext_crossref(field)) def extractcmd(parser, o): - print(extract_pdf_metadata(o.pdf, search_doi=not o.fulltext, search_fulltext=True, scholar=o.scholar, minwords=o.word_count, max_query_words=o.word_count, image=o.image)) + if os.path.isdir(o.pdf) and o.recursive: + pdf_files = Path(o.pdf).rglob('*.pdf') + futures = [] + with concurrent.futures.ProcessPoolExecutor() as executor: + for pdf in pdf_files: + future = executor.submit(extract_pdf_metadata, + pdf, + search_doi=not o.fulltext, + search_fulltext=True, + scholar=o.scholar, + minwords=o.word_count, + max_query_words=o.word_count, + image=o.image) + print(future.result()) + futures.append(future) + del pdf_files + # for future in futures: + # print(future.result()) + del futures + # OK, a note on the above: clearly, there's parallelization to + # be gained here from doing this all concurrently using futures. + # boyan.penkov saw this run locally on his machine; however + # the parallel writes to .cache/papers/crossref.json and + # crossref-bibtex.json have race conditions, and clobber + # the file format, leaving the base command papers unusable + # with json load failures. I'd be glad to fix it, but for now + # we have to do this serially. + # I'd rather leave the futures thing in there, since it does + # work and is a nice path to a clear speedup TODO. 
+ elif len(o.pdf) == 1 and o.pdf.endswith('.pdf'): + print(extract_pdf_metadata(o.pdf, search_doi=not o.fulltext, search_fulltext=True, scholar=o.scholar, minwords=o.word_count, max_query_words=o.word_count, image=o.image)) + else: + raise ValueError('extract requires a single pdf or a directory.') + # TODO trivially extend this for len(o.file) > 1, but no dir # print(fetch_bibtex_by_doi(o.doi)) @@ -1265,6 +1299,7 @@ def get_parser(config=None): extractp.add_argument('--fulltext', action='store_true', help='fulltext only (otherwise DOI-based)') extractp.add_argument('--scholar', action='store_true', help='use google scholar instead of default crossref for fulltext search') extractp.add_argument('--image', action='store_true', help='convert to image and use tesseract instead of pdftotext') + extractp.add_argument('--recursive', action='store_true', help='takes one directory as an argument; recursively descends into it and shows extracted bib info for each pdf') # *** Pure OS related file checks *** @@ -1396,4 +1431,4 @@ def main_clean_exit(args=None): if __name__ == "__main__": # we use try/except here to use a clean exit instead of trace # test and debugging may use main() directly for speed-up => better to avoid sys.exit there - main_clean_exit() \ No newline at end of file + main_clean_exit() From af9854ace348568f7787fcd4172a4c27bf720631 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Fri, 20 Feb 2026 11:36:23 -0500 Subject: [PATCH 2/7] clean flake8 imports --- tests/test_add.py | 2 -- tests/test_filecheck.py | 9 ++------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/test_add.py b/tests/test_add.py index 90ed7d6..0cde12a 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -2,10 +2,8 @@ import shutil import subprocess as sp import tempfile -import unittest from pathlib import Path -import bibtexparser from papers.entries import parse_file as bp_parse_file, parse_string, get_entry_val from papers.encoding import 
entry_to_unicode_dict diff --git a/tests/test_filecheck.py b/tests/test_filecheck.py index 9f12582..2a41af9 100644 --- a/tests/test_filecheck.py +++ b/tests/test_filecheck.py @@ -4,16 +4,11 @@ """ import os import shutil -import subprocess as sp import tempfile -import unittest -from pathlib import Path - -import bibtexparser from papers.bib import Biblio from papers.entries import get_entry_val -from tests.common import PAPERSCMD, paperscmd, prepare_paper, prepare_paper2, BibTest +from tests.common import paperscmd, prepare_paper, BibTest class TestFileCheck(BibTest): @@ -99,4 +94,4 @@ def test_filecheck_clean_filesdir(self): self.papers('uninstall') def tearDown(self): - self.temp_dir.cleanup() \ No newline at end of file + self.temp_dir.cleanup() From ed133521e4232a7540745f0779c1a029dd7e9fd7 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Fri, 20 Feb 2026 12:14:03 -0500 Subject: [PATCH 3/7] set file detection better --- papers/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/papers/__main__.py b/papers/__main__.py index 808368a..75853b7 100644 --- a/papers/__main__.py +++ b/papers/__main__.py @@ -820,10 +820,10 @@ def extractcmd(parser, o): # we have to do this serially. # I'd rather leave the futures thing in there, since it does # work and is a nice path to a clear speedup TODO. 
- elif len(o.pdf) == 1 and o.pdf.endswith('.pdf'): + elif os.path.isfile(o.pdf) == 1 and o.pdf.endswith('.pdf'): print(extract_pdf_metadata(o.pdf, search_doi=not o.fulltext, search_fulltext=True, scholar=o.scholar, minwords=o.word_count, max_query_words=o.word_count, image=o.image)) else: - raise ValueError('extract requires a single pdf or a directory.') + raise ValueError('extract requires a single pdf or a directory and --recursive.') # TODO trivially extend this for len(o.file) > 1, but no dir # print(fetch_bibtex_by_doi(o.doi)) From aca948e1ebc63931c949399750c581a33baf60d7 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Fri, 20 Feb 2026 12:26:11 -0500 Subject: [PATCH 4/7] attempt at doing this with copying from testaddrecursive --- tests/test_extract.py | 137 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 6 deletions(-) diff --git a/tests/test_extract.py b/tests/test_extract.py index a044d72..5da7f4f 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,25 +1,150 @@ import unittest import os +import tempfile +import shutil from papers.extract import extract_pdf_metadata from papers.entries import parse_string -from tests.common import paperscmd, prepare_paper + +from papers.bib import Biblio +from tests.common import paperscmd, prepare_paper, prepare_paper2, BibTest class TestSimple(unittest.TestCase): def setUp(self): - self.pdf, self.doi, self.key, self.newkey, self.year, self.bibtex, self.file_rename = prepare_paper() + ( + self.pdf, + self.doi, + self.key, + self.newkey, + self.year, + self.bibtex, + self.file_rename, + ) = prepare_paper() self.assertTrue(os.path.exists(self.pdf)) def test_doi(self): - self.assertEqual(paperscmd(f'doi {self.pdf}', sp_cmd='check_output').strip(), self.doi) + self.assertEqual( + paperscmd(f"doi {self.pdf}", sp_cmd="check_output").strip(), self.doi + ) def test_fetch(self): - bibtexs = paperscmd(f'fetch {self.doi}', sp_cmd='check_output').strip() + bibtexs = paperscmd(f"fetch 
{self.doi}", sp_cmd="check_output").strip() db1 = parse_string(bibtexs) db2 = parse_string(self.bibtex) - self.assertEqual([dict(e.items()) for e in db1.entries], [dict(e.items()) for e in db2.entries]) + self.assertEqual( + [dict(e.items()) for e in db1.entries], + [dict(e.items()) for e in db2.entries], + ) def test_fetch_scholar(self): - extract_pdf_metadata(self.pdf, scholar=True) \ No newline at end of file + extract_pdf_metadata(self.pdf, scholar=True) + + +class TestAddDir(BibTest): + # TODO delete this later + def setUp(self): + ( + self.pdf1, + self.doi, + self.key1, + self.newkey1, + self.year, + self.bibtex1, + self.file_rename1, + ) = prepare_paper() + ( + self.pdf2, + self.si, + self.doi, + self.key2, + self.newkey2, + self.year, + self.bibtex2, + self.file_rename2, + ) = prepare_paper2() + self.somedir = tempfile.mktemp(prefix="papers.somedir") + self.subdir = os.path.join(self.somedir, "subdir") + os.makedirs(self.somedir) + os.makedirs(self.subdir) + shutil.copy(self.pdf1, self.somedir) + shutil.copy(self.pdf2, self.subdir) + self.mybib = tempfile.mktemp(prefix="papers.bib") + paperscmd(f"install --local --no-prompt --bibtex {self.mybib}") + + def test_adddir_pdf(self): + self.my = Biblio.load(self.mybib, "") + self.my.scan_dir(self.somedir) + self.assertEqual(len(self.my.db.entries), 2) + keys = [self.my.db.entries[0]["ID"], self.my.db.entries[1]["ID"]] + self.assertEqual( + sorted(keys), sorted([self.newkey1, self.newkey2]) + ) # PDF: update key + + def test_adddir_pdf_cmd(self): + paperscmd(f"add --recursive --bibtex {self.mybib} {self.somedir}") + self.my = Biblio.load(self.mybib, "") + self.assertEqual(len(self.my.db.entries), 2) + keys = [self.my.db.entries[0]["ID"], self.my.db.entries[1]["ID"]] + self.assertEqual( + sorted(keys), sorted([self.newkey1, self.newkey2]) + ) # PDF: update key + + def tearDown(self): + os.remove(self.mybib) + shutil.rmtree(self.somedir) + paperscmd(f"uninstall") + + +class TestRecursiveExtract(unittest.TestCase): + 
+ def setUp(self): + ( + self.pdf1, + self.doi1, + self.key1, + self.newkey1, + self.year1, + self.bibtex1, + self.file_rename1, + ) = prepare_paper() + ( + self.pdf2, + self.si2, + self.doi2, + self.key2, + self.newkey2, + self.year2, + self.bibtex2, + self.file_rename2, + ) = prepare_paper2() + self.somedir = tempfile.mktemp(prefix="papers.somedir") + self.subdir = os.path.join(self.somedir, "subdir") + os.makedirs(self.somedir) + os.makedirs(self.subdir) + shutil.copy(self.pdf1, self.somedir) + shutil.copy(self.pdf2, self.subdir) + self.mybib = tempfile.mktemp(prefix="papers.bib") + paperscmd(f"install --local --no-prompt --bibtex {self.mybib}") + self.assertTrue(os.path.exists(self.pdf1)) + self.assertTrue(os.path.exists(self.pdf2)) + + def test_doi(self): + self.assertEqual( + paperscmd(f"doi {self.pdf1}", sp_cmd="check_output").strip(), self.doi1 + ) + + def test_fetch(self): + bibtexs = paperscmd(f"extract {self.pdf1}", sp_cmd="check_output").strip() + db1 = parse_string(bibtexs) + db2 = parse_string(self.bibtex1) + self.assertEqual( + [dict(e.items()) for e in db1.entries], + [dict(e.items()) for e in db2.entries], + ) + + def tearDown(self): + os.remove(self.mybib) + shutil.rmtree(self.somedir) + paperscmd(f"uninstall") From 78ca3142afdd0522053f1f3b72214a9d33fccc86 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Sat, 21 Feb 2026 13:06:47 -0500 Subject: [PATCH 5/7] get papers status -v to tell you bibtex file count --- papers/config.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/papers/config.py b/papers/config.py index 0b1e846..e88d51b 100644 --- a/papers/config.py +++ b/papers/config.py @@ -162,6 +162,20 @@ def _update_paths_to_absolute(self): def status(self, check_files=False, verbose=False): + def _count_files_in_bibtex(db): + """ + Given a bibtexparser database, return the file count + in it, over all the guys that have multiple files. 
+ """ + file_count = 0 + for entry in db.entries: + # assumes papers only sticks things in a file = {:whatever.pdf:pdf} line + if 'file' in entry: + # assumes papers has multiple files separated by ';' + files = entry['file'].split(';') + file_count += len(files) + return file_count + def _fmt_path(p): if self.local: return os.path.relpath(p, ".") @@ -210,7 +224,9 @@ def _fmt_path(p): bibtexstring = open(self.bibtex).read() db = parse_string(bibtexstring) if len(db.entries): - status = bcolors.OKBLUE+' ({} entries)'.format(len(db.entries))+bcolors.ENDC + file_count = _count_files_in_bibtex(db) + print(file_count) + status = bcolors.OKBLUE+' ({} files in {} entries)'.format(file_count, len(db.entries))+bcolors.ENDC else: status = bcolors.WARNING+' (empty)'+bcolors.ENDC except: From 395b1e3f4f48493535ecd10b38be5e1df321a930 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Sat, 21 Feb 2026 13:45:00 -0500 Subject: [PATCH 6/7] clean one print, del --- papers/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/papers/config.py b/papers/config.py index e88d51b..1c10745 100644 --- a/papers/config.py +++ b/papers/config.py @@ -225,8 +225,8 @@ def _fmt_path(p): db = parse_string(bibtexstring) if len(db.entries): file_count = _count_files_in_bibtex(db) - print(file_count) status = bcolors.OKBLUE+' ({} files in {} entries)'.format(file_count, len(db.entries))+bcolors.ENDC + del file_count else: status = bcolors.WARNING+' (empty)'+bcolors.ENDC except: From 3ad9b95c72010c79e0064c01e28e930696eca8e6 Mon Sep 17 00:00:00 2001 From: Boyan Penkov Date: Sun, 22 Feb 2026 11:35:04 -0500 Subject: [PATCH 7/7] make the split more explicit, since this fails on stuff like '&' --- papers/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/papers/config.py b/papers/config.py index 1c10745..37f0b2d 100644 --- a/papers/config.py +++ b/papers/config.py @@ -172,7 +172,7 @@ def _count_files_in_bibtex(db): # assumes papers only sticks things in a file = 
{:whatever.pdf:pdf} line if 'file' in entry: # assumes papers has multiple files separated by ';' - files = entry['file'].split(';') + files = entry['file'].split('.pdf:pdf;') file_count += len(files) return file_count