initial commit
All checks were successful
Deploy docs / build-and-deploy (push) Successful in 3s

This commit is contained in:
sid 2026-02-23 20:34:35 +01:00
commit 95a533c876
451 changed files with 18255 additions and 0 deletions

106
pkgs/marker-pdf/default.nix Normal file
View file

@ -0,0 +1,106 @@
# marker-pdf: application that converts documents to Markdown.
#
# Arguments:
#   lib       - nixpkgs library (licenses).
#   python3   - interpreter whose package set is used for every dependency,
#               including the two locally packaged ones below.
#   fetchPypi - fetcher for the PyPI source distribution.
#   fetchurl  - fetcher for the font artifact bundled into the output.
{
  lib,
  python3,
  fetchPypi,
  fetchurl,
}:
let
  fontFileName = "GoNotoCurrent-Regular.ttf";

  # Font file fetched at build time and copied into the output by
  # postInstall, so it is already present in the installed package.
  fontFile = fetchurl {
    url = "https://models.datalab.to/artifacts/${fontFileName}";
    hash = "sha256-iCr7q5ZWCMLSvGJ/2AFrliqlpr4tNY+d4kp7WWfFYy4=";
  };

  python = python3;

  # Dependencies packaged next to this file; they receive the same `python`
  # so that everything is built against one interpreter.
  pdftext = import ./pdftext.nix { inherit lib python fetchPypi; };
  surya-ocr = import ./surya-ocr.nix { inherit lib python fetchPypi; };
in
python.pkgs.buildPythonApplication rec {
  pname = "marker-pdf";
  version = "1.8.2";
  pyproject = true;

  src = fetchPypi {
    # The sdist on PyPI uses the underscore spelling of the project name.
    pname = "marker_pdf";
    inherit version;
    hash = "sha256-k2mxOpBBtXdCzxP4hqfXnCEqUF69hQZWr/d9V/tITZ4=";
  };

  patches = [
    ./skip-font-download.patch
    ./fix-output-dir.patch
  ];

  # Drop the version constraints declared for these requirements.
  pythonRelaxDeps = [
    "click"
    "anthropic"
    "markdownify"
    "pillow"
  ];

  # Remove this requirement from the package metadata altogether.
  pythonRemoveDeps = [
    "pre-commit"
  ];

  # Place the font under <site-packages>/static/fonts. The site-packages
  # path comes from `python.sitePackages` rather than being assembled by
  # hand, and every shell expansion is quoted.
  postInstall = ''
    FONT_DEST_DIR="$out/${python.sitePackages}/static/fonts"
    mkdir -p "$FONT_DEST_DIR"
    cp ${fontFile} "$FONT_DEST_DIR/${fontFileName}"
    echo "Installed font to $FONT_DEST_DIR/${fontFileName}"
  '';

  build-system = [
    python.pkgs.poetry-core
  ];

  dependencies = [
    pdftext
    surya-ocr
  ]
  ++ (with python.pkgs; [
    anthropic
    click
    filetype
    ftfy
    google-genai
    markdown2
    markdownify
    openai
    pillow
    pydantic
    pydantic-settings
    python-dotenv
    rapidfuzz
    regex
    scikit-learn
    torch
    tqdm
    transformers
  ]);

  # Extras exposed to consumers; not part of `dependencies` above.
  optional-dependencies = with python.pkgs; {
    full = [
      ebooklib
      mammoth
      openpyxl
      python-pptx
      weasyprint
    ];
  };

  pythonImportsCheck = [
    "marker"
  ];

  meta = {
    description = "Convert documents to markdown with high speed and accuracy";
    homepage = "https://pypi.org/project/marker-pdf/";
    license = lib.licenses.gpl3Only;
    maintainers = [ ];
  };
}

View file

@ -0,0 +1,11 @@
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -6,7 +6,7 @@
class Settings(BaseSettings):
# Paths
BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
+ OUTPUT_DIR: str = "/tmp/marker_conversion_results"
FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
ARTIFACT_URL: str = "https://models.datalab.to/artifacts"

View file

@ -0,0 +1,43 @@
# pdftext: Python package imported as a dependency by ./default.nix.
#
# Arguments:
#   lib       - nixpkgs library (licenses).
#   python    - interpreter whose package set provides the dependencies.
#   fetchPypi - fetcher for the PyPI source distribution.
#
# Built with buildPythonPackage rather than buildPythonApplication: this
# derivation is listed in another package's `dependencies`, so it has to be
# a Python module (buildPythonApplication marks its result as a non-module,
# which hides it from `requiredPythonModules` and `python.withPackages`).
{
  lib,
  python,
  fetchPypi,
}:
python.pkgs.buildPythonPackage rec {
  pname = "pdftext";
  version = "0.6.3";
  pyproject = true;

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-q1xd/g8ft43h24N8ytrB6kGwfOGJD+rZc8moTNr1Tew=";
  };

  # Drop the version constraint declared for this requirement.
  pythonRelaxDeps = [
    "pypdfium2"
  ];

  build-system = [
    python.pkgs.poetry-core
  ];

  dependencies = with python.pkgs; [
    click
    numpy
    pydantic
    pydantic-settings
    pypdfium2
  ];

  pythonImportsCheck = [
    "pdftext"
  ];

  meta = {
    description = "Extract structured text from pdfs quickly";
    homepage = "https://pypi.org/project/pdftext/";
    license = lib.licenses.asl20;
    maintainers = [ ];
  };
}

View file

@ -0,0 +1,24 @@
--- a/marker/util.py
+++ b/marker/util.py
@@ -151,13 +151,7 @@
return sorted_lines
def download_font():
- if not os.path.exists(settings.FONT_PATH):
- os.makedirs(os.path.dirname(settings.FONT_PATH), exist_ok=True)
- font_dl_path = f"{settings.ARTIFACT_URL}/{settings.FONT_NAME}"
- with requests.get(font_dl_path, stream=True) as r, open(settings.FONT_PATH, 'wb') as f:
- r.raise_for_status()
- for chunk in r.iter_content(chunk_size=8192):
- f.write(chunk)
+ pass
def get_opening_tag_type(tag):
"""
@@ -195,4 +189,4 @@
if tag_type in TAG_MAPPING:
return True, TAG_MAPPING[tag_type]
- return False, None
\ No newline at end of file
+ return False, None

View file

@ -0,0 +1,58 @@
# surya-ocr: Python package imported as a dependency by ./default.nix.
#
# Arguments:
#   lib       - nixpkgs library (licenses).
#   python    - interpreter whose package set provides the dependencies.
#   fetchPypi - fetcher for the PyPI source distribution.
#
# Built with buildPythonPackage rather than buildPythonApplication: this
# derivation is listed in another package's `dependencies`, so it has to be
# a Python module (buildPythonApplication marks its result as a non-module,
# which hides it from `requiredPythonModules` and `python.withPackages`).
{
  lib,
  python,
  fetchPypi,
}:
python.pkgs.buildPythonPackage rec {
  pname = "surya-ocr";
  version = "0.14.6";
  pyproject = true;

  src = fetchPypi {
    # The sdist on PyPI uses the underscore spelling of the project name.
    pname = "surya_ocr";
    inherit version;
    hash = "sha256-yFoL2d0AyGq0TtJlwO0VYBEG268tDQoGf6e7UzE31fA=";
  };

  # Drop the version constraints declared for these requirements.
  pythonRelaxDeps = [
    "opencv-python-headless"
    "pillow"
    "pypdfium2"
    "einops"
  ];

  # Remove this requirement from the package metadata altogether.
  pythonRemoveDeps = [
    "pre-commit"
  ];

  build-system = [
    python.pkgs.poetry-core
  ];

  dependencies = with python.pkgs; [
    click
    einops
    filetype
    opencv-python-headless
    pillow
    platformdirs
    pydantic
    pydantic-settings
    pypdfium2
    python-dotenv
    torch
    transformers
  ];

  pythonImportsCheck = [
    "surya"
  ];

  meta = {
    description = "OCR, layout, reading order, and table recognition in 90+ languages";
    homepage = "https://pypi.org/project/surya-ocr/";
    license = lib.licenses.gpl3Only;
    maintainers = [ ];
  };
}