pdftract/scripts/fetch-shape-corpus.sh
jedarden dd2d3502c6 feat(glyph-shape): implement font corpus fetch script and shape DB generation
Implemented scripts/fetch-shape-corpus.sh for downloading open-licensed
font corpus and generating glyph shape database for L4 recognition.

- Script downloads fonts from build/shape-corpus-manifest.txt
- Copies LICENSE files to build/font-licenses/ for compliance
- Idempotent: skips already-present fonts
- Fixed xtask center_bitmap_32x32 overflow bug (width/height > 32)

Generated build/glyph-shapes.json with 9,141 glyphs (> 4500 target):
  - DejaVu Sans: 4,459 glyphs (Latin Extended, Greek, Cyrillic)
  - Roboto: 2,392 glyphs (Latin Basic, extended)
  - JetBrains Mono: 1,176 glyphs (monospace)
  - Source Code Pro: 1,124 glyphs (monospace)

build/font-licenses/COMPLIANCE.md documents OFL derivative-work analysis
for pHash data redistribution.

Closes: pdftract-1i8n
2026-05-24 09:48:29 -04:00

228 lines
7.3 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# fetch-shape-corpus.sh - Download open-licensed font corpus for glyph shape DB
#
# This script downloads fonts from the manifest file and copies their LICENSE
# files to build/font-licenses/. The script is idempotent - it skips downloads
# for fonts that are already present.
#
# Usage: bash scripts/fetch-shape-corpus.sh
#
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Escape sequences used to colour the log prefixes. They are kept as literal
# backslash text here (single quotes) and are expanded by the log helpers.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color (reset)
readonly GREEN YELLOW NC
# Print an informational message on stdout, prefixed with a green "[INFO]".
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim (defaults to empty)
log_info() {
  # Only the colour constants are escape-expanded (%b). The message itself is
  # printed with %s so backslashes in paths or URLs are not interpreted.
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}
# Print a "skipped" notice on stdout, prefixed with a yellow "[SKIP]".
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim (defaults to empty)
log_skip() {
  # Only the colour constants are escape-expanded (%b). The message itself is
  # printed with %s so backslashes in font or family names are not interpreted.
  printf '%b[SKIP]%b %s\n' "${YELLOW}" "${NC}" "${1-}"
}
# Download one font and install it, together with its license, into the corpus.
#
# Usage: download_font <family_name> <url> <target_file> <family_slug> <license_id>
#
# Globals:
#   CORPUS_DIR  (read) - root directory fonts are installed under
#   LICENSE_DIR (read) - directory that receives <family_slug>.txt
# Arguments:
#   family_name - display name, written into license placeholders
#   url         - download location; the file type is detected from the last
#                 path component (query string and fragment are ignored)
#   target_file - install path of the font, relative to CORPUS_DIR
#   family_slug - filesystem-safe family name used for the license file name
#   license_id  - license identifier written into license placeholders
# Returns:
#   0 on success; non-zero if the download fails, the file type is unknown,
#   or extraction/installation fails.
download_font() {
  local family_name="$1"
  local url="$2"
  local target_file="$3"
  local family_slug="$4"
  local license_id="$5"

  # Classify by the URL path only, so "Font.zip?raw=true" or "Font.ttf#v2"
  # are still recognised as .zip / .ttf.
  local url_path="${url%%[?#]*}"
  local filename
  filename=$(basename -- "${url_path}")
  local target_path="${CORPUS_DIR}/${target_file}"

  # Do the work in a subshell: its EXIT trap removes the temp directory on
  # every way out (normal end, explicit exit, or an errexit abort), and the
  # trap does not remain installed in the calling shell afterwards.
  (
    temp_dir=$(mktemp -d) || exit 1
    trap 'rm -rf -- "${temp_dir}"' EXIT
    downloaded_file="${temp_dir}/${filename}"

    log_info " Fetching ${filename}..."
    if ! curl -fsSL "${url}" -o "${downloaded_file}"; then
      echo " Error: Failed to download ${url}" >&2
      exit 1
    fi

    # The sibling helpers below are called without "|| ..." on purpose: that
    # would switch off errexit inside them.
    case "${filename}" in
      *.zip)
        # Unpack, then pick the font and the license out of the tree.
        unzip -q "${downloaded_file}" -d "${temp_dir}/extracted" || exit 1
        find_and_copy_font "${temp_dir}/extracted" "${target_file}" "${target_path}"
        extract_license_from_archive "${temp_dir}/extracted" "${family_slug}" "${family_name}" "${url}" "${license_id}"
        ;;
      *.tar.gz|*.tgz)
        # tar needs the destination directory to exist already.
        mkdir -p "${temp_dir}/extracted" || exit 1
        tar -xzf "${downloaded_file}" -C "${temp_dir}/extracted" || exit 1
        find_and_copy_font "${temp_dir}/extracted" "${target_file}" "${target_path}"
        extract_license_from_archive "${temp_dir}/extracted" "${family_slug}" "${family_name}" "${url}" "${license_id}"
        ;;
      *.ttf|*.otf)
        # Direct font file - just copy it into place.
        mkdir -p "$(dirname "${target_path}")" || exit 1
        cp "${downloaded_file}" "${target_path}" || exit 1
        log_info " Installed: ${target_file}"
        # A bare font download carries no license file, so record where it
        # came from and which license applies.
        cat > "${LICENSE_DIR}/${family_slug}.txt" <<EOF || exit 1
# ${family_name}
# Downloaded from: ${url}
# License: ${license_id}
# This font was downloaded directly as a pre-built binary file.
# For the full license text, please refer to the source repository
# and the license identifier specified above.
EOF
        log_info " License: ${LICENSE_DIR}/${family_slug}.txt"
        ;;
      *)
        echo " Error: Unknown file type: ${filename}" >&2
        exit 1
        ;;
    esac
  )
}
# Locate a font inside an extracted archive and install it.
#
# Arguments:
#   $1 search_dir  - root of the extracted archive
#   $2 target_file - wanted font file; only its last path component is used
#                    for matching, because find -name compares base names
#   $3 target_path - destination file; missing parent directories are created
# Outputs:
#   Progress via log_info on stdout; warnings and errors on stderr.
# Returns:
#   0 when a font was installed, 1 when the archive holds no .ttf/.otf file
#   or the copy fails.
find_and_copy_font() {
  local search_dir="$1"
  local target_file="$2"
  local target_path="$3"

  local wanted="${target_file##*/}"
  local candidate
  local found=""

  # Try the exact name first, then the same name with the other font
  # extension. "-print -quit" makes find stop at the first hit; piping find
  # into "head -1" can kill find with SIGPIPE, which "set -o pipefail" turns
  # into a failure of the whole assignment.
  for candidate in "${wanted}" "${wanted%.ttf}.otf" "${wanted%.otf}.ttf"; do
    found=$(find "${search_dir}" -type f -name "${candidate}" -print -quit)
    if [[ -n "${found}" ]]; then
      break
    fi
  done

  if [[ -z "${found}" ]]; then
    echo " Warning: ${target_file} not found in archive, searching for similar..." >&2
    # Fall back to the first font file of any name.
    found=$(find "${search_dir}" -type f \( -name "*.ttf" -o -name "*.otf" \) -print -quit)
    if [[ -z "${found}" ]]; then
      echo " Error: No font files found in archive" >&2
      return 1
    fi
    echo " Using alternative: $(basename "${found}")" >&2
  fi

  # Create target directory if needed
  mkdir -p "$(dirname "${target_path}")" || return 1
  cp "${found}" "${target_path}" || return 1
  log_info " Installed: ${target_file}"
}
# Copy the license file shipped in an extracted archive into LICENSE_DIR, or
# write a placeholder note when the archive contains none.
#
# Globals:
#   LICENSE_DIR (read) - directory that receives <family_slug>.txt
# Arguments:
#   $1 search_dir  - root of the extracted archive
#   $2 family_slug - filesystem-safe family name used for the output file
#   $3 family_name - display name, written into the placeholder
#   $4 url         - download URL, written into the placeholder
#   $5 license_id  - license identifier, written into the placeholder
# Outputs:
#   Progress via log_info on stdout.
extract_license_from_archive() {
  local search_dir="$1"
  local family_slug="$2"
  local family_name="$3"
  local url="$4"
  local license_id="$5"
  local license_out="${LICENSE_DIR}/${family_slug}.txt"

  # Look for common license file names. "-print -quit" stops at the first
  # hit; "find | head -1" could die from SIGPIPE and fail under pipefail.
  local license_file
  license_file=$(find "${search_dir}" -type f \
    \( -name "LICENSE" -o -name "LICENSE.txt" -o -name "LICENSE.md" \
    -o -name "OFL.txt" -o -name "OFL-*.txt" -o -name "COPYING" \) \
    -print -quit)

  if [[ -n "${license_file}" ]]; then
    cp "${license_file}" "${license_out}" || return 1
    log_info " License: ${license_out}"
  else
    # Create a placeholder if no license found
    cat > "${license_out}" <<EOF || return 1
# ${family_name}
# Downloaded from: ${url}
# License: ${license_id}
# License file not found in archive. Please refer to the source repository
# for the full license text corresponding to: ${license_id}
EOF
    log_info " License: Placeholder (${license_id})"
  fi
}
# Write a placeholder license note for a font whose download was skipped
# because the file was already in the corpus. A license file that already
# exists is left untouched.
#
# Globals:
#   LICENSE_DIR (read) - directory that receives <family_slug>.txt
# Arguments:
#   $1 target_file - path of the font that was already present
#   $2 family_slug - filesystem-safe family name used for the output file
#   $3 family_name - display name
#   $4 license_id  - license identifier
extract_license() {
  local target_file="$1"
  local family_slug="$2"
  local family_name="$3"
  local license_id="$4"
  local placeholder="${LICENSE_DIR}/${family_slug}.txt"

  # Nothing to do when a license file is already in place.
  [[ -f "${placeholder}" ]] && return 0

  printf '%s\n' \
    "# ${family_name}" \
    "# Source: ${target_file}" \
    "# License: ${license_id}" \
    "# This font file was already present in the corpus directory." \
    "# For the full license text, please refer to the source repository" \
    "# and the license identifier specified above." \
    > "${placeholder}"
}
# Main script
# =============
# Paths are resolved from this script's own location: the workspace root is
# the parent of the directory containing the script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKSPACE_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
MANIFEST_FILE="${WORKSPACE_ROOT}/build/shape-corpus-manifest.txt"
CORPUS_DIR="${WORKSPACE_ROOT}/build/shape-corpus"
LICENSE_DIR="${WORKSPACE_ROOT}/build/font-licenses"

# Create the output directories (no-op when they already exist).
mkdir -p "${CORPUS_DIR}" "${LICENSE_DIR}"

# Without a manifest there is nothing to do; report on stderr so the message
# is not mixed into captured stdout.
if [[ ! -f "${MANIFEST_FILE}" ]]; then
  echo "Error: Manifest file not found: ${MANIFEST_FILE}" >&2
  exit 1
fi
# Read the manifest and download every font that is not installed yet.
#
# Record format, one per line:  family_name|url|license_id|target_file
# Blank lines and lines whose first non-blank character is "#" are skipped.
#
# The "|| [[ -n ... ]]" keeps the final record when the file lacks a trailing
# newline: read returns non-zero there but has still filled the variable.
while IFS= read -r manifest_line || [[ -n "${manifest_line}" ]]; do
  # Tolerate CRLF line endings.
  manifest_line="${manifest_line%$'\r'}"

  IFS='|' read -r family_name url license_id target_file <<<"${manifest_line}"

  # Trim blanks around each field: read with a single variable strips leading
  # and trailing IFS whitespace and keeps inner spaces.
  IFS=$' \t' read -r family_name <<<"${family_name}"
  IFS=$' \t' read -r url <<<"${url}"
  IFS=$' \t' read -r license_id <<<"${license_id}"
  IFS=$' \t' read -r target_file <<<"${target_file}"

  # Skip comments and empty lines
  if [[ -z "${family_name}" || "${family_name}" == \#* ]]; then
    continue
  fi

  # An entry without URL or target cannot be downloaded or checked for
  # presence; fail with a clear message.
  if [[ -z "${url}" || -z "${target_file}" ]]; then
    echo "Error: Malformed manifest entry for '${family_name}'" \
      "(expected: family_name|url|license_id|target_file)" >&2
    exit 1
  fi

  # Normalize family name for filename (spaces become underscores, anything
  # that is not alphanumeric or "_" is dropped).
  family_slug=$(printf '%s' "${family_name}" | tr ' ' '_' | tr -cd '[:alnum:]_')
  target_path="${CORPUS_DIR}/${target_file}"

  # Skip if already downloaded
  if [[ -f "${target_path}" ]]; then
    log_skip "${family_name} - already present"
    # Still make sure a license note exists. This is best-effort, but a
    # failure is reported instead of being silently discarded.
    if [[ ! -f "${LICENSE_DIR}/${family_slug}.txt" ]]; then
      log_info "Extracting LICENSE for ${family_name}..."
      if ! extract_license "${target_path}" "${family_slug}" "${family_name}" "${license_id}"; then
        echo "Warning: could not write license note for ${family_name}" >&2
      fi
    fi
    continue
  fi

  log_info "Downloading ${family_name}..."
  # Detach stdin so nothing run by the download can consume the remaining
  # manifest lines that this loop reads from its own stdin.
  download_font "${family_name}" "${url}" "${target_file}" "${family_slug}" "${license_id}" </dev/null
done < "${MANIFEST_FILE}"
# Final report: where the fonts and licenses were placed, and the follow-up
# command that builds the shape database from the corpus directory.
printf '\n'
log_info "Font corpus download complete!"
printf '%s\n' " Corpus dir: ${CORPUS_DIR}" " License dir: ${LICENSE_DIR}" ""
log_info "To generate the shape database, run:"
printf '%s\n' " cargo xtask gen-shape-db ${CORPUS_DIR}"