This commit is contained in:
commit
95a533c876
451 changed files with 18255 additions and 0 deletions
106
pkgs/marker-pdf/default.nix
Normal file
106
pkgs/marker-pdf/default.nix
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
# marker-pdf: command-line application that converts documents to markdown.
#
# Packaging notes:
#   * skip-font-download.patch turns marker's `download_font()` into a no-op,
#     so the font is prefetched here and installed in `postInstall`.
#   * fix-output-dir.patch moves the default OUTPUT_DIR away from BASE_DIR,
#     which would otherwise resolve to a path inside the installed package.
{
  lib,
  python3,
  fetchPypi,
  fetchurl,
}:

let
  # Single source of truth for the font file name; used for both the
  # download URL and the install destination.
  fontFileName = "GoNotoCurrent-Regular.ttf";

  # Prefetched font artifact (replaces the download removed by
  # skip-font-download.patch).
  fetchFont = fetchurl {
    url = "https://models.datalab.to/artifacts/${fontFileName}";
    hash = "sha256-iCr7q5ZWCMLSvGJ/2AFrliqlpr4tNY+d4kp7WWfFYy4=";
  };

  python = python3;

  # Locally packaged dependencies, built against the same interpreter.
  pdftext = import ./pdftext.nix { inherit lib python fetchPypi; };
  surya-ocr = import ./surya-ocr.nix { inherit lib python fetchPypi; };
in

python.pkgs.buildPythonApplication rec {
  pname = "marker-pdf";
  version = "1.8.2";
  pyproject = true;

  src = fetchPypi {
    # The sdist on PyPI uses the underscore spelling of the project name.
    pname = "marker_pdf";
    inherit version;
    hash = "sha256-k2mxOpBBtXdCzxP4hqfXnCEqUF69hQZWr/d9V/tITZ4=";
  };

  patches = [
    ./skip-font-download.patch
    ./fix-output-dir.patch
  ];

  # Drop the version constraints declared for these requirements.
  pythonRelaxDeps = [
    "click"
    "anthropic"
    "markdownify"
    "pillow"
  ];

  # Remove this requirement from the package metadata entirely; it is not
  # provided in `dependencies` below.
  pythonRemoveDeps = [
    "pre-commit"
  ];

  # The patched settings.py computes FONT_DIR as <BASE_DIR>/static/fonts with
  # BASE_DIR two levels above marker/settings.py, i.e. site-packages itself.
  # `install -D` creates the directory and copies the font in one step; the
  # destination is quoted so the expansion can never be word-split.
  postInstall = ''
    FONT_DEST_DIR="$out/${python.sitePackages}/static/fonts"
    install -Dm444 ${fetchFont} "$FONT_DEST_DIR/${fontFileName}"
    echo "Installed font to $FONT_DEST_DIR/${fontFileName}"
  '';

  build-system = [
    python.pkgs.poetry-core
  ];

  dependencies = [
    pdftext
    surya-ocr
  ]
  ++ (with python.pkgs; [
    anthropic
    click
    filetype
    ftfy
    google-genai
    markdown2
    markdownify
    openai
    pillow
    pydantic
    pydantic-settings
    python-dotenv
    rapidfuzz
    regex
    scikit-learn
    torch
    tqdm
    transformers
  ]);

  # Extra packages exposed under the `full` extra.
  optional-dependencies = with python.pkgs; {
    full = [
      ebooklib
      mammoth
      openpyxl
      python-pptx
      weasyprint
    ];
  };

  # Smoke test: the top-level module must be importable after the build.
  pythonImportsCheck = [
    "marker"
  ];

  meta = {
    description = "Convert documents to markdown with high speed and accuracy";
    homepage = "https://pypi.org/project/marker-pdf/";
    license = lib.licenses.gpl3Only;
    maintainers = with lib.maintainers; [ ];
  };

}
|
||||
11
pkgs/marker-pdf/fix-output-dir.patch
Normal file
11
pkgs/marker-pdf/fix-output-dir.patch
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
--- a/marker/settings.py
|
||||
+++ b/marker/settings.py
|
||||
@@ -6,7 +6,7 @@
|
||||
class Settings(BaseSettings):
|
||||
# Paths
|
||||
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
- OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
|
||||
+ OUTPUT_DIR: str = "/tmp/marker_conversion_results"
|
||||
FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
|
||||
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
|
||||
ARTIFACT_URL: str = "https://models.datalab.to/artifacts"
|
||||
43
pkgs/marker-pdf/pdftext.nix
Normal file
43
pkgs/marker-pdf/pdftext.nix
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
# pdftext: Python library that extracts structured text from PDFs.
#
# Imported by ./default.nix and listed in its `dependencies`, so it is built
# with `buildPythonPackage` (a Python module for the given interpreter)
# rather than `buildPythonApplication`, which is meant for end-user programs
# that nothing else imports.
{
  lib,
  python,
  fetchPypi,
}:

python.pkgs.buildPythonPackage rec {
  pname = "pdftext";
  version = "0.6.3";
  pyproject = true;

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
  };

  # Drop the version constraint declared for this requirement.
  pythonRelaxDeps = [
    "pypdfium2"
  ];

  build-system = [
    python.pkgs.poetry-core
  ];

  dependencies = with python.pkgs; [
    click
    numpy
    pydantic
    pydantic-settings
    pypdfium2
  ];

  # Smoke test: the top-level module must be importable after the build.
  pythonImportsCheck = [
    "pdftext"
  ];

  meta = {
    description = "Extract structured text from pdfs quickly";
    homepage = "https://pypi.org/project/pdftext/";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ ];
  };
}
|
||||
24
pkgs/marker-pdf/skip-font-download.patch
Normal file
24
pkgs/marker-pdf/skip-font-download.patch
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
--- a/marker/util.py
|
||||
+++ b/marker/util.py
|
||||
@@ -151,13 +151,7 @@
|
||||
return sorted_lines
|
||||
|
||||
def download_font():
|
||||
- if not os.path.exists(settings.FONT_PATH):
|
||||
- os.makedirs(os.path.dirname(settings.FONT_PATH), exist_ok=True)
|
||||
- font_dl_path = f"{settings.ARTIFACT_URL}/{settings.FONT_NAME}"
|
||||
- with requests.get(font_dl_path, stream=True) as r, open(settings.FONT_PATH, 'wb') as f:
|
||||
- r.raise_for_status()
|
||||
- for chunk in r.iter_content(chunk_size=8192):
|
||||
- f.write(chunk)
|
||||
+ pass
|
||||
|
||||
def get_opening_tag_type(tag):
|
||||
"""
|
||||
@@ -195,4 +189,4 @@
|
||||
if tag_type in TAG_MAPPING:
|
||||
return True, TAG_MAPPING[tag_type]
|
||||
|
||||
- return False, None
|
||||
\ No newline at end of file
|
||||
+ return False, None
|
||||
58
pkgs/marker-pdf/surya-ocr.nix
Normal file
58
pkgs/marker-pdf/surya-ocr.nix
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
# surya-ocr: Python library for OCR, layout, reading order and table
# recognition.
#
# Imported by ./default.nix and listed in its `dependencies`, so it is built
# with `buildPythonPackage` (a Python module for the given interpreter)
# rather than `buildPythonApplication`, which is meant for end-user programs
# that nothing else imports.
{
  lib,
  python,
  fetchPypi,
}:

python.pkgs.buildPythonPackage rec {
  pname = "surya-ocr";
  version = "0.14.6";
  pyproject = true;

  src = fetchPypi {
    # The sdist on PyPI uses the underscore spelling of the project name.
    pname = "surya_ocr";
    inherit version;
    hash = "sha256-yFoL2d0AyGq0TtJlwO0VYBEG268tDQoGf6e7UzE31fA=";
  };

  # Drop the version constraints declared for these requirements.
  pythonRelaxDeps = [
    "opencv-python-headless"
    "pillow"
    "pypdfium2"
    "einops"
  ];

  # Remove this requirement from the package metadata entirely; it is not
  # provided in `dependencies` below.
  pythonRemoveDeps = [
    "pre-commit"
  ];

  build-system = [
    python.pkgs.poetry-core
  ];

  dependencies = with python.pkgs; [
    click
    einops
    filetype
    opencv-python-headless
    pillow
    platformdirs
    pydantic
    pydantic-settings
    pypdfium2
    python-dotenv
    torch
    transformers
  ];

  # Smoke test: the top-level module must be importable after the build.
  pythonImportsCheck = [
    "surya"
  ];

  meta = {
    description = "OCR, layout, reading order, and table recognition in 90+ languages";
    homepage = "https://pypi.org/project/surya-ocr/";
    license = lib.licenses.gpl3Only;
    maintainers = with lib.maintainers; [ ];
  };
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue