feat: add PDF to Word conversion tool

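- Updated package.json to include new dependencies: docx and html-docx-js.
- Changed the pdfjs-dist requirement in package.json from ^5.2.133 to ^3.4.120 and bumped @types/react to ^18.3.23.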
- Added localization support for the new PDF to Word feature in multiple languages (de, en, es, fr, hi, ja, nl, pt, ru, zh).
- Implemented the PDF to Word conversion functionality in a new component.
- Created service logic to handle PDF processing and conversion to Word format.
- Added tests for the PDF to Word conversion service to ensure functionality and accuracy.
This commit is contained in:
Srivarshan-T 2025-07-18 11:52:14 +05:30
commit 4348fce589
17 changed files with 1461 additions and 927 deletions

1664
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -50,8 +50,10 @@
"cron-validator": "^1.3.1",
"cronstrue": "^3.0.0",
"dayjs": "^1.11.13",
"docx": "^9.5.1",
"fast-xml-parser": "^5.2.5",
"formik": "^2.4.6",
"html-docx-js": "^0.3.1",
"i18next": "^25.3.2",
"i18next-http-backend": "^3.0.2",
"jimp": "^0.22.12",
@ -66,7 +68,7 @@
"notistack": "^3.0.1",
"omggif": "^1.0.10",
"pdf-lib": "^1.17.1",
"pdfjs-dist": "^5.2.133",
"pdfjs-dist": "^3.4.120",
"playwright": "^1.45.0",
"qrcode": "^1.5.4",
"rc-slider": "^11.1.8",
@ -93,7 +95,7 @@
"@types/color-rgba": "^2.1.2",
"@types/node": "^20.12.12",
"@types/qrcode": "^1.5.5",
"@types/react": "^18.3.3",
"@types/react": "^18.3.23",
"@types/react-dom": "^18.3.0",
"@types/react-helmet": "^6.1.11",
"@typescript-eslint/eslint-plugin": "^6.21.0",

View file

@ -109,5 +109,11 @@
"description": "Mit diesem Tool können Sie bestimmte Seiten aus einem PDF-Dokument extrahieren. Sie können einzelne Seiten oder Seitenbereiche zum Extrahieren angeben.",
"title": "PDF teilen"
}
},
"pdfToWord": {
"name": "PDF zu Word",
"description": "Konvertieren Sie PDF-Dateien in Word-Dokumente zur einfachen Bearbeitung und Formatierung mit höherer Genauigkeit.",
"shortDescription": "PDF-Dateien in Word-Dokumente konvertieren",
"longDescription": "Konvertieren Sie PDF-Dokumente in bearbeitbare Word-Dateien (.docx) unter Beibehaltung von Formatierung, Bildern und Layout. Die gesamte Verarbeitung erfolgt sicher in Ihrem Browser, um die Privatsphäre zu gewährleisten."
}
}

View file

@ -109,5 +109,11 @@
"description": "This tool allows you to extract specific pages from a PDF document. You can specify individual pages or ranges of pages to extract.",
"title": "Split PDF"
}
},
"pdfToWord": {
"name": "PDF to Word",
"description": "Convert PDF files to Word documents for easy editing and formatting with more accuracy.",
"shortDescription": "Convert PDF files to Word documents",
"longDescription": "Convert PDF documents into editable Word (.docx) files while preserving formatting, images, and layout. All processing is done securely in your browser to ensure privacy."
}
}

View file

@ -108,6 +108,12 @@
"toolInfo": {
"description": "Esta herramienta permite extraer páginas específicas de un documento PDF. Puede especificar páginas individuales o un rango de páginas para extraer.",
"title": "PDF dividido"
}
},
"pdfToWord": {
"name": "PDF a Word",
"description": "Convierta archivos PDF a documentos de Word para facilitar la edición y el formato con mayor precisión.",
"shortDescription": "Convertir archivos PDF a documentos de Word",
"longDescription": "Convierta documentos PDF a archivos de Word (.docx) editables, conservando el formato, las imágenes y el diseño. Todo el procesamiento se realiza de forma segura en su navegador para garantizar la privacidad."
}
}

View file

@ -109,5 +109,11 @@
"description": "Cet outil vous permet d'extraire des pages spécifiques d'un document PDF. Vous pouvez spécifier des pages individuelles ou des plages de pages à extraire.",
"title": "Diviser le PDF"
}
},
"pdfToWord": {
"name": "PDF vers Word",
"description": "Convertissez les fichiers PDF en documents Word pour une édition et une mise en forme faciles avec plus de précision.",
"shortDescription": "Convertir les fichiers PDF en documents Word",
"longDescription": "Convertissez les documents PDF en fichiers Word (.docx) modifiables tout en préservant le formatage, les images et la mise en page. Tout le traitement est effectué en toute sécurité dans votre navigateur pour garantir la confidentialité."
}
}

View file

@ -155,5 +155,11 @@
"description": "यह टूल आपको किसी PDF दस्तावेज़ से विशिष्ट पृष्ठ निकालने की सुविधा देता है। आप निकालने के लिए अलग-अलग पृष्ठ या पृष्ठों की श्रेणी निर्दिष्ट कर सकते हैं।",
"title": "PDF विभाजित करें"
}
},
"pdfToWord": {
"name": "पीडीएफ से वर्ड",
"description": "आसानी से संपादन और अधिक सटीकता के साथ फ़ॉर्मेटिंग के लिए पीडीएफ फाइलों को वर्ड दस्तावेज़ों में बदलें।",
"shortDescription": "पीडीएफ फाइलों को वर्ड दस्तावेज़ों में बदलें",
"longDescription": "पीडीएफ दस्तावेज़ों को संपादन योग्य वर्ड (.docx) फाइलों में बदलें, जबकि फ़ॉर्मेटिंग, चित्र और लेआउट को संरक्षित रखें। गोपनीयता सुनिश्चित करने के लिए सभी प्रसंस्करण आपके ब्राउज़र में सुरक्षित रूप से किया जाता है।"
}
}

View file

@ -109,5 +109,11 @@
"description": "このツールを使うと、PDF文書から特定のページを抽出できます。抽出するページは、個々のページまたはページ範囲で指定できます。",
"title": "PDFを分割"
}
},
"pdfToWord": {
"name": "PDFからWordへ",
"description": "PDFファイルをWord文書に変換し、より正確に編集およびフォーマットを容易にします。",
"shortDescription": "PDFファイルをWord文書に変換",
"longDescription": "PDFドキュメントを編集可能なWord (.docx) ファイルに変換し、フォーマット、画像、レイアウトを保持します。すべての処理はプライバシーを確保するため、ブラウザ内で安全に行われます。"
}
}

View file

@ -109,5 +109,11 @@
"description": "Met deze tool kunt u specifieke pagina's uit een PDF-document extraheren. U kunt specifieke pagina's of paginareeksen opgeven die u wilt extraheren.",
"title": "PDF splitsen"
}
},
"pdfToWord": {
"name": "PDF naar Word",
"description": "Converteer PDF-bestanden naar Word-documenten voor eenvoudig bewerken en formatteren met meer nauwkeurigheid.",
"shortDescription": "Converteer PDF-bestanden naar Word-documenten",
"longDescription": "Converteer PDF-documenten naar bewerkbare Word-bestanden (.docx) met behoud van opmaak, afbeeldingen en lay-out. Alle verwerking gebeurt veilig in uw browser om privacy te waarborgen."
}
}

View file

@ -109,5 +109,11 @@
"description": "Esta ferramenta permite extrair páginas específicas de um documento PDF. Você pode especificar páginas individuais ou intervalos de páginas para extrair.",
"title": "Dividir PDF"
}
},
"pdfToWord": {
"name": "PDF para Word",
"description": "Converta arquivos PDF para documentos do Word para fácil edição e formatação com mais precisão.",
"shortDescription": "Converter arquivos PDF para documentos do Word",
"longDescription": "Converta documentos PDF em arquivos Word (.docx) editáveis, preservando a formatação, imagens e layout. Todo o processamento é feito de forma segura em seu navegador para garantir a privacidade."
}
}

View file

@ -109,5 +109,11 @@
"description": "Этот инструмент позволяет извлекать определённые страницы из PDF-документа. Вы можете указать отдельные страницы или диапазоны страниц для извлечения.",
"title": "Разделить PDF"
}
},
"pdfToWord": {
"name": "PDF в Word",
"description": "Конвертируйте файлы PDF в документы Word для удобного редактирования и форматирования с большей точностью.",
"shortDescription": "Конвертировать файлы PDF в документы Word",
"longDescription": "Конвертируйте документы PDF в редактируемые файлы Word (.docx), сохраняя форматирование, изображения и макет. Вся обработка выполняется безопасно в вашем браузере для обеспечения конфиденциальности."
}
}

View file

@ -109,5 +109,11 @@
"description": "此工具允许您从 PDF 文档中提取特定页面。您可以指定要提取的单个页面或页面范围。",
"title": "拆分 PDF"
}
},
"pdfToWord": {
"name": "PDF转Word",
"description": "将PDF文件转换为Word文档以便更准确地进行编辑和格式化。",
"shortDescription": "将PDF文件转换为Word文档",
"longDescription": "将PDF文档转换为可编辑的Word (.docx) 文件,同时保留格式、图像和布局。所有处理都在您的浏览器中安全进行,以确保隐私。"
}
}

View file

@ -1,3 +1,4 @@
import { tool as pdfPdfWord } from './pdf-word/meta';
import { tool as pdfPdfToPng } from './pdf-to-png/meta';
import { tool as pdfRotatePdf } from './rotate-pdf/meta';
import { meta as splitPdfMeta } from './split-pdf/meta';
@ -16,5 +17,6 @@ export const pdfTools: DefinedTool[] = [
protectPdfTool,
mergePdf,
pdfToEpub,
pdfPdfToPng
pdfPdfToPng,
pdfPdfWord
];

View file

@ -0,0 +1,66 @@
import { Box } from '@mui/material';
import React, { useState } from 'react';
import ToolContent from '@components/ToolContent';
import { ToolComponentProps } from '@tools/defineTool';
import ToolPdfInput from '@components/input/ToolPdfInput';
import ToolFileResult from '@components/result/ToolFileResult';
import { GetGroupsType } from '@components/options/ToolOptions';
import { convertPdfToWord } from './service';
/**
 * Tool page that converts an uploaded PDF into a Word (.docx) file.
 *
 * The conversion itself is delegated to `convertPdfToWord`; this component
 * only wires the file input, the loading state and the downloadable result
 * into `ToolContent`.
 *
 * @param title - Display name of the tool, forwarded to `ToolContent` and
 *   reused as the heading of the info panel.
 * @param longDescription - Long description shown in the info panel.
 *   NOTE(review): presumably resolved from the `pdf:pdfToWord.*` i18n keys
 *   declared in ./meta — confirm against `defineTool`.
 */
export default function PdfWord({
  title,
  longDescription
}: ToolComponentProps) {
  const [input, setInput] = useState<File | null>(null);
  const [result, setResult] = useState<File | null>(null);
  const [isProcessing, setIsProcessing] = useState<boolean>(false);

  // Invoked by ToolContent; the tool has no options, so `values` is unused.
  const compute = async (values: {}, pdfFile: File | null) => {
    if (!pdfFile) return;

    try {
      setIsProcessing(true);
      // Drop any previous result so a stale file is not offered for download
      // while the new conversion is running.
      setResult(null);
      const wordFile = await convertPdfToWord(pdfFile);
      setResult(wordFile);
    } catch (error) {
      // Failures are only logged; the result simply stays empty.
      console.error('PDF to Word conversion failed:', error);
    } finally {
      setIsProcessing(false);
    }
  };

  // No option groups: the conversion takes no user-configurable settings.
  const getGroups: GetGroupsType<{}> | null = null;

  return (
    <ToolContent
      title={title}
      input={input}
      inputComponent={
        <ToolPdfInput
          value={input}
          onChange={(file) => setInput(file)}
          accept={['application/pdf']}
          title={'Input PDF'}
        />
      }
      resultComponent={
        <ToolFileResult
          title={'Word Output'}
          value={result}
          extension={'docx'}
          loading={isProcessing}
          loadingText={'Converting PDF to Word...'}
        />
      }
      initialValues={{}}
      getGroups={getGroups}
      setInput={setInput}
      compute={compute}
      toolInfo={{
        // Use the props handed to the component instead of hard-coded English
        // text; the original English sentence remains as a fallback.
        title,
        description:
          longDescription ??
          `Convert PDF files to Word documents for easy editing and formatting`
      }}
    />
  );
}

View file

@ -0,0 +1,16 @@
import { defineTool } from '@tools/defineTool';
import { lazy } from 'react';
// The tool UI is wrapped in React.lazy so it is imported on demand.
const PdfWordPage = lazy(() => import('./index'));

// Search terms under which the tool is registered.
const searchKeywords = ['pdf', 'word', 'pdf to word', 'convert', 'document'];

/**
 * Registration of the PDF to Word tool in the `pdf` category.
 *
 * All user-facing texts are given as i18n keys under `pdf:pdfToWord`
 * rather than as literal strings.
 */
export const tool = defineTool('pdf', {
  i18n: {
    name: 'pdf:pdfToWord.name',
    description: 'pdf:pdfToWord.description',
    shortDescription: 'pdf:pdfToWord.shortDescription',
    longDescription: 'pdf:pdfToWord.longDescription'
  },
  path: 'pdf-word',
  icon: 'material-symbols:description',
  keywords: searchKeywords,
  component: PdfWordPage
});

View file

@ -0,0 +1,314 @@
import { expect, describe, it, vi, beforeEach } from 'vitest';
import { convertPdfToWord } from './service';
// Mock 1 — pdfjs-dist.
// Only the pieces the service touches are provided. `getDocument` starts out
// as a bare vi.fn(); each test (or beforeEach) installs the implementation
// it needs.
vi.mock('pdfjs-dist', () => {
  return {
    GlobalWorkerOptions: { workerSrc: '' },
    getDocument: vi.fn()
  };
});

// Mock 2 — docx.
// The constructors are replaced by spies that simply echo the options they
// were given, so tests can inspect what the service passed in. Packer.toBlob
// resolves to a small Blob carrying the .docx MIME type.
vi.mock('docx', () => {
  const docxMimeType =
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document';

  return {
    Document: vi.fn((options) => {
      return { options };
    }),
    Packer: {
      toBlob: vi.fn(() => {
        const fakeDocx = new Blob(['mock docx content'], {
          type: docxMimeType
        });
        return Promise.resolve(fakeDocx);
      })
    },
    Paragraph: vi.fn((options) => {
      return { type: 'Paragraph', options };
    }),
    TextRun: vi.fn((options) => {
      return { type: 'TextRun', options };
    }),
    AlignmentType: {
      LEFT: 'left',
      CENTER: 'center',
      RIGHT: 'right',
      JUSTIFIED: 'justified'
    }
  };
});
describe('convertPdfToWord', () => {
// Handle to the mocked pdfjs-dist `getDocument`. It is assigned in
// beforeEach, once the module mock is in place.
let mockedGetDocument: ReturnType<typeof vi.fn>;
beforeEach(async () => {
vi.clearAllMocks(); // Clears recorded calls/results only; mock implementations are kept (the docx spies rely on this)
// Import pdfjs-dist dynamically so the reference obtained here is the
// mocked module, not the real library.
const pdfjsLib = await import('pdfjs-dist');
mockedGetDocument = pdfjsLib.getDocument as ReturnType<typeof vi.fn>;
// Default fixture: a single 595x842 page with seven text items laid out on
// five lines. Individual tests override it with mockImplementationOnce.
mockedGetDocument.mockImplementation(() => ({
promise: Promise.resolve({
numPages: 1,
getPage: vi.fn((pageNum) => {
if (pageNum === 1) {
return {
getTextContent: vi.fn(() =>
Promise.resolve({
items: [
{
str: 'Hello',
dir: 'ltr',
width: 20,
height: 10,
transform: [1, 0, 0, 1, 100, 700],
fontName: 'g_d0_f1'
},
{
str: 'World!',
dir: 'ltr',
width: 25,
height: 10,
transform: [1, 0, 0, 1, 125, 700],
fontName: 'g_d0_f1'
},
{
str: 'This is a',
dir: 'ltr',
width: 40,
height: 10,
transform: [1, 0, 0, 1, 100, 680],
fontName: 'g_d0_f1'
},
{
str: 'test.',
dir: 'ltr',
width: 20,
height: 10,
transform: [1, 0, 0, 1, 145, 680],
fontName: 'g_d0_f1'
},
{
str: 'New paragraph.',
dir: 'ltr',
width: 60,
height: 12,
transform: [1, 0, 0, 1, 80, 600],
fontName: 'g_d0_f2_bold'
},
{
str: 'Right aligned text.',
dir: 'ltr',
width: 80,
height: 10,
transform: [1, 0, 0, 1, 550, 580],
fontName: 'g_d0_f1'
}, // X-coord adjusted for right alignment
{
str: 'Center.',
dir: 'ltr',
width: 30,
height: 10,
transform: [1, 0, 0, 1, 280, 560],
fontName: 'g_d0_f1'
}
],
lastItem: null
})
),
getViewport: vi.fn(() => ({
width: 595,
height: 842,
scale: 1
}))
};
}
return null;
})
})
}));
});
// Test case: Basic conversion of a single PDF page with various text elements
it('should convert a single PDF page to a Word document with inferred paragraphs and formatting', async () => {
const mockFile = new File(['mock pdf content'], 'document.pdf', {
type: 'application/pdf'
});
const resultFile = await convertPdfToWord(mockFile);
expect(resultFile).toBeInstanceOf(File);
expect(resultFile.name).toBe('document.docx');
expect(resultFile.type).toBe(
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
);
const { Packer } = await import('docx');
expect(Packer.toBlob).toHaveBeenCalledTimes(1);
const { Document } = await import('docx');
expect(Document).toHaveBeenCalledTimes(1);
// The mocked Document constructor records its options, which carry the
// paragraphs the service produced.
const docConstructorArgs = (Document as any).mock.calls[0][0];
expect(docConstructorArgs).toHaveProperty('sections');
expect(docConstructorArgs.sections[0]).toHaveProperty('children');
const paragraphs = docConstructorArgs.sections[0].children;
// Expected paragraphs:
// 1. "Hello World!"
// 2. "This is a test."
// 3. "New paragraph."
// 4. "Right aligned text."
// 5. "Center."
// 6. An empty paragraph for page spacing
expect(paragraphs.length).toBe(6);
// Paragraph 1: "Hello World!" (two items on the same line joined by an inserted space run)
expect(
paragraphs[0].options.children.map((c: any) => c.options.text).join('')
).toBe('Hello World!');
expect(paragraphs[0].options.alignment).toBe('left');
expect(paragraphs[0].options.children[0].options.bold).toBe(false);
// Paragraph 2: "This is a test."
expect(
paragraphs[1].options.children.map((c: any) => c.options.text).join('')
).toBe('This is a test.');
expect(paragraphs[1].options.alignment).toBe('left');
// Paragraph 3: "New paragraph." (bold, larger font)
expect(
paragraphs[2].options.children.map((c: any) => c.options.text).join('')
).toBe('New paragraph.');
expect(paragraphs[2].options.children[0].options.bold).toBe(true);
expect(paragraphs[2].options.children[0].options.size).toBe(24); // 12 * 2 half-points
// Paragraph 4: "Right aligned text."
expect(
paragraphs[3].options.children.map((c: any) => c.options.text).join('')
).toBe('Right aligned text.');
expect(paragraphs[3].options.alignment).toBe('right'); // x = 550 lies beyond the right-margin threshold (595 - 80 = 515)
// Paragraph 5: "Center."
expect(
paragraphs[4].options.children.map((c: any) => c.options.text).join('')
).toBe('Center.');
expect(paragraphs[4].options.alignment).toBe('center');
// Paragraph 6: Page separator
expect(paragraphs[5].options.text).toBe('');
expect(paragraphs[5].options.spacing.after).toBe(400);
});
// Test case: Handling an empty PDF page
it('should add a "[No text on this page]" paragraph for an empty PDF page', async () => {
// Arrange: Override the default mock implementation for this specific test
mockedGetDocument.mockImplementationOnce(() => ({
promise: Promise.resolve({
numPages: 1,
getPage: vi.fn(() => ({
getTextContent: vi.fn(() =>
Promise.resolve({ items: [], lastItem: null })
),
getViewport: vi.fn(() => ({ width: 595, height: 842, scale: 1 }))
}))
})
}));
const mockFile = new File(['empty pdf'], 'empty.pdf', {
type: 'application/pdf'
});
// Act
await convertPdfToWord(mockFile);
// Assert
const { Document } = await import('docx');
const docConstructorArgs = (Document as any).mock.calls[0][0];
const paragraphs = docConstructorArgs.sections[0].children;
expect(paragraphs.length).toBe(2); // placeholder paragraph + page separator
expect(paragraphs[0].options.children[0].options.text).toBe(
'[No text on this page]'
);
expect(paragraphs[0].options.children[0].options.italics).toBe(true);
expect(paragraphs[1].options.text).toBe('');
});
// Test case: PDF with multiple pages
it('should process multiple pages correctly', async () => {
// Arrange: Override the default mock implementation for this specific test
mockedGetDocument.mockImplementationOnce(() => ({
promise: Promise.resolve({
numPages: 2,
getPage: vi.fn((pageNum) => {
if (pageNum === 1) {
return {
getTextContent: vi.fn(() =>
Promise.resolve({
items: [
{
str: 'Page 1 content',
dir: 'ltr',
width: 50,
height: 10,
transform: [1, 0, 0, 1, 100, 700],
fontName: 'f1'
}
],
lastItem: null
})
),
getViewport: vi.fn(() => ({ width: 595, height: 842, scale: 1 }))
};
} else if (pageNum === 2) {
return {
getTextContent: vi.fn(() =>
Promise.resolve({
items: [
{
str: 'Page 2 content',
dir: 'ltr',
width: 50,
height: 10,
transform: [1, 0, 0, 1, 100, 700],
fontName: 'f1'
}
],
lastItem: null
})
),
getViewport: vi.fn(() => ({ width: 595, height: 842, scale: 1 }))
};
}
return null;
})
})
}));
const mockFile = new File(['multi-page pdf'], 'multi-page.pdf', {
type: 'application/pdf'
});
// Act
await convertPdfToWord(mockFile);
// Assert
const { Document } = await import('docx');
const docConstructorArgs = (Document as any).mock.calls[0][0];
const paragraphs = docConstructorArgs.sections[0].children;
expect(paragraphs.length).toBe(4); // per page: one content paragraph followed by one separator
expect(paragraphs[0].options.children[0].options.text).toBe(
'Page 1 content'
);
expect(paragraphs[1].options.text).toBe('');
expect(paragraphs[2].options.children[0].options.text).toBe(
'Page 2 content'
);
expect(paragraphs[3].options.text).toBe('');
});
});

View file

@ -0,0 +1,258 @@
import * as pdfjsLib from 'pdfjs-dist';
import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.min?url';
pdfjsLib.GlobalWorkerOptions.workerSrc = pdfjsWorker;
import { Document, Packer, Paragraph, TextRun, AlignmentType } from 'docx';
import type {
PDFPageProxy,
PDFDocumentProxy
} from 'pdfjs-dist/types/src/display/api';
import type { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
/**
 * Reference page width the margin/center thresholds below are expressed in.
 * `detectAlignment` rescales them by `actualWidth / PAGE_WIDTH_APPROX`.
 */
const PAGE_WIDTH_APPROX = 595;

/** A paragraph whose first item starts left of this x is left-aligned. */
const LEFT_MARGIN_THRESHOLD = 80;

/**
 * A paragraph whose first item starts right of this x is right-aligned.
 * Mirrors the left margin: 595 - 80 = 515.
 */
const RIGHT_MARGIN_THRESHOLD = PAGE_WIDTH_APPROX - LEFT_MARGIN_THRESHOLD;

/** Maximum distance between a paragraph's start x and the page center for it to count as centered. */
const CENTER_RANGE = 50;

/** Font-size multiplier; a vertical gap larger than `fontSize * factor` starts a new paragraph. */
const PARAGRAPH_GAP_FACTOR = 1.8;

/** A new line that starts more than this far to the LEFT of the previous item starts a new paragraph. */
const INDENTATION_THRESHOLD = 15;

/** Font-size difference between consecutive lines above which a new paragraph is started. */
const FONT_SIZE_CHANGE_THRESHOLD = 2;

/** Vertical distance up to which two text items are treated as sharing a line. */
const SAME_LINE_VERTICAL_TOLERANCE = 2;

/** Union of the values exposed by docx's `AlignmentType`. */
type DocxAlignmentValue = (typeof AlignmentType)[keyof typeof AlignmentType];

/** A docx run paired with the plain text it was built from, so the text can be inspected later. */
type FormattedTextRun = {
  run: TextRun;
  text: string;
};
/**
 * Converts the text of a PDF file into a Word (.docx) file.
 *
 * Every page is read with pdf.js, its text items are grouped into paragraphs
 * by `processTextContentForDocx`, and an empty spacer paragraph is appended
 * after each page. Pages that yield no paragraphs get an italic
 * "[No text on this page]" placeholder instead. Only text is carried over.
 *
 * @param file - The PDF to convert.
 * @returns A `File` with the .docx MIME type. Its name is the input name with
 *   a trailing `.pdf` (any case) replaced by `.docx`; if the input name has
 *   no `.pdf` suffix, `.docx` is appended so the result always carries the
 *   right extension.
 * @throws Propagates any rejection from pdf.js (e.g. unreadable PDF) or from
 *   `Packer.toBlob`.
 */
export async function convertPdfToWord(file: File): Promise<File> {
  const arrayBuffer = await file.arrayBuffer();
  const pdf: PDFDocumentProxy = await pdfjsLib.getDocument({
    data: arrayBuffer
  }).promise;

  const paragraphs: Paragraph[] = [];
  // pdf.js page numbers are 1-based.
  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
    const textContent: TextContent = await page.getTextContent();
    const pageViewport = page.getViewport({ scale: 1 });

    const processedDocxParagraphs = processTextContentForDocx(
      textContent,
      pageViewport.width
    );

    if (processedDocxParagraphs.length > 0) {
      paragraphs.push(...processedDocxParagraphs);
    } else {
      paragraphs.push(
        new Paragraph({
          children: [
            new TextRun({ text: '[No text on this page]', italics: true })
          ],
          spacing: { after: 100 }
        })
      );
    }

    // Spacer paragraph separating this page's content from the next page.
    paragraphs.push(new Paragraph({ text: '', spacing: { after: 400 } }));
  }

  const doc = new Document({ sections: [{ children: paragraphs }] });
  const blob = await Packer.toBlob(doc);

  // Previously a name without a ".pdf" suffix was passed through unchanged,
  // producing a .docx file without a .docx extension.
  const pdfExtension = /\.pdf$/i;
  const outputName = pdfExtension.test(file.name)
    ? file.name.replace(pdfExtension, '.docx')
    : `${file.name}.docx`;

  return new File([blob], outputName, {
    type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  });
}
/**
 * Groups the text items of one PDF page into docx paragraphs.
 *
 * Items are ordered top-to-bottom (descending `transform[5]`) and, within
 * `SAME_LINE_VERTICAL_TOLERANCE`, left-to-right (ascending `transform[4]`).
 * Walking that order, a new paragraph is started when any of these holds:
 *  - it is the first item;
 *  - the vertical gap to the previous item exceeds
 *    `previousFontSize * PARAGRAPH_GAP_FACTOR`;
 *  - the item is on a new line AND
 *      - starts more than `INDENTATION_THRESHOLD` to the left of the
 *        previous item, or
 *      - its font size differs by more than `FONT_SIZE_CHANGE_THRESHOLD`, or
 *      - the previous run ends with `. ? ! : ;` and the gap is below 80% of
 *        the estimated line height, or
 *      - it starts beyond 60% of the page width.
 *
 * `item.height` is used as the font size; bold/italic are inferred from the
 * font name containing "bold"/"italic". A single-space run is inserted
 * between neighbouring items on the same line when the horizontal gap is
 * between 0.2x and 2x the font size.
 *
 * @param textContent - Result of `page.getTextContent()`; entries without a
 *   `transform` array are ignored.
 * @param pageActualWidth - Page width at scale 1, used for the 60% rule and
 *   for alignment detection.
 * @returns The paragraphs in reading order; empty when the page has no
 *   usable text items.
 */
function processTextContentForDocx(
  textContent: TextContent,
  pageActualWidth: number
): Paragraph[] {
  const docxParagraphs: Paragraph[] = [];

  const items: TextItem[] = textContent.items
    .filter(
      (item): item is TextItem =>
        'transform' in item && Array.isArray(item.transform)
    )
    .sort((a, b) => {
      const yA = a.transform[5];
      const yB = b.transform[5];
      if (Math.abs(yB - yA) > SAME_LINE_VERTICAL_TOLERANCE) {
        // Different lines: larger y comes first.
        return yB - yA;
      }
      // Same line: left to right.
      return a.transform[4] - b.transform[4];
    });

  if (items.length === 0) {
    return [];
  }

  // Hoisted so the pattern is not rebuilt for every item.
  const sentenceEnd = /[.?!:;]$/;

  let currentParagraphTextRuns: FormattedTextRun[] = [];
  // x of the first item of the paragraph being built; drives alignment.
  let currentParagraphFirstItemX = 0;
  let lastItemY: number | null = null;
  let lastItemX: number | null = null;
  let lastItemFontSize: number | null = null;

  // Emits the paragraph collected so far (if any) and starts an empty one.
  const flushParagraph = (): void => {
    if (currentParagraphTextRuns.length === 0) {
      return;
    }
    docxParagraphs.push(
      new Paragraph({
        children: currentParagraphTextRuns.map((ftr) => ftr.run),
        alignment: detectAlignment(
          currentParagraphFirstItemX,
          pageActualWidth
        ),
        spacing: { after: 150 }
      })
    );
    currentParagraphTextRuns = [];
  };

  for (let i = 0; i < items.length; i++) {
    const item = items[i];
    const itemY = item.transform[5];
    const itemX = item.transform[4];
    const itemText = item.str.trimEnd();
    const itemFontSize = item.height;

    const fontNameLower = item.fontName?.toLowerCase();
    const isBold = fontNameLower?.includes('bold') ?? false;
    const isItalic = fontNameLower?.includes('italic') ?? false;

    // Based on the previous item's size when there is one.
    const estimatedLineHeight =
      (lastItemFontSize ?? itemFontSize) * PARAGRAPH_GAP_FACTOR;

    let shouldStartNewParagraph: boolean;
    if (lastItemY === null || lastItemX === null) {
      // First item on the page.
      shouldStartNewParagraph = true;
    } else {
      const verticalGap = lastItemY - itemY;
      // How far this item starts from the previous item's start.
      const horizontalShift = itemX - lastItemX;
      const movedToNewLine = verticalGap > SAME_LINE_VERTICAL_TOLERANCE;

      const previousRun =
        currentParagraphTextRuns[currentParagraphTextRuns.length - 1];
      const previousEndsSentence =
        previousRun !== undefined && sentenceEnd.test(previousRun.text.trim());

      shouldStartNewParagraph =
        verticalGap > estimatedLineHeight ||
        (movedToNewLine &&
          (horizontalShift < -INDENTATION_THRESHOLD ||
            (lastItemFontSize !== null &&
              Math.abs(itemFontSize - lastItemFontSize) >
                FONT_SIZE_CHANGE_THRESHOLD) ||
            (previousEndsSentence &&
              verticalGap < estimatedLineHeight * 0.8) ||
            itemX > pageActualWidth * 0.6));
    }

    if (shouldStartNewParagraph) {
      flushParagraph();
      currentParagraphFirstItemX = itemX;
    }

    currentParagraphTextRuns.push({
      run: new TextRun({
        text: itemText,
        bold: isBold,
        italics: isItalic,
        // The font size is doubled here; the spec in this file expects 24 for a 12-high item.
        size: Math.round(itemFontSize * 2)
      }),
      text: itemText
    });

    // Insert a space run when the next item sits on the same visual line and
    // is separated by a word-sized gap.
    if (i + 1 < items.length) {
      const nextItem = items[i + 1];
      const verticalDifference = itemY - nextItem.transform[5];
      // Inclusive bound, matching the sort and the paragraph-break checks,
      // which treat a difference of exactly the tolerance as "same line".
      const isSameVisualLine =
        Math.abs(verticalDifference) <= SAME_LINE_VERTICAL_TOLERANCE;
      const horizontalGap = nextItem.transform[4] - (itemX + item.width);

      if (
        isSameVisualLine &&
        horizontalGap > itemFontSize * 0.2 &&
        horizontalGap < itemFontSize * 2
      ) {
        currentParagraphTextRuns.push({
          run: new TextRun({ text: ' ' }),
          text: ' '
        });
      }
    }

    lastItemY = itemY;
    lastItemX = itemX;
    lastItemFontSize = itemFontSize;
  }

  // Emit whatever is left after the last item.
  flushParagraph();

  return docxParagraphs;
}
/**
 * Guesses a paragraph's alignment from the x position where it starts.
 *
 * The margin and center thresholds are defined for a page
 * `PAGE_WIDTH_APPROX` wide and are rescaled to the actual page width.
 * Checks run in this order: left margin, right margin, center band; anything
 * else falls back to left alignment.
 *
 * @param x_coord - x of the paragraph's first text item.
 * @param pageActualWidth - Width of the page the item came from.
 * @returns The matching `AlignmentType` value (LEFT, RIGHT or CENTER).
 */
function detectAlignment(
  x_coord: number,
  pageActualWidth: number
): DocxAlignmentValue {
  const widthRatio = pageActualWidth / PAGE_WIDTH_APPROX;
  const leftLimit = LEFT_MARGIN_THRESHOLD * widthRatio;
  const rightLimit = RIGHT_MARGIN_THRESHOLD * widthRatio;
  const centerBand = CENTER_RANGE * widthRatio;
  const pageCenter = pageActualWidth / 2;

  // The left-margin test has priority over every other rule.
  const startsInLeftMargin = x_coord < leftLimit;

  if (!startsInLeftMargin && x_coord > rightLimit) {
    return AlignmentType.RIGHT;
  }

  if (!startsInLeftMargin && Math.abs(x_coord - pageCenter) < centerBand) {
    return AlignmentType.CENTER;
  }

  return AlignmentType.LEFT;
}