Apply Adobe Document AI to OCR scanned PDF file
Adobe recently updated its PDF Services API. With the Free Tier, you can scan 500 PDF files each month.
You can use the following steps to sign up for the API and use it with Node.js.
Remember to select Node.js as the preferred language.
Once you finish, the website will prompt you to download a .zip file, which includes the credentials JSON file.
Create a script to run the OCR process
Since Adobe updated its service, it no longer provides private.key
in the .zip
file. You need to use the new method (client ID and client secret) to OCR your PDF.
Prepare the Credential details
Once you unzip the downloaded file, you can find pdfservices-api-credentials.json
in the folder root. Remember to replace the client-id
and client-secret
in the following script with your own details.
OCR single PDF file
If you only want to run the process on a single file, you can save the following code as ocr-pdf.js
:
/*
* Copyright 2019 Adobe
* All Rights Reserved.
*
* NOTICE: Adobe permits you to use, modify, and distribute this file in
* accordance with the terms of the Adobe license agreement accompanying
* it. If you have received this file from a source other than Adobe,
* then your use, modification, or distribution of it requires the prior
* written permission of Adobe.
*/
const PDFServicesSdk = require("@adobe/pdfservices-node-sdk");
const fileloc = `path/to/input/pdf`;
const fileloc_output = `path/to/output/location`;

/**
 * Runs the OCR operation on `fileloc` and writes the result to
 * `fileloc_output`. Any failure — whether thrown while setting the
 * operation up or rejected while executing/saving — is logged to the
 * console rather than propagated.
 */
const runOcr = async () => {
  try {
    // Build the credentials from the client id / secret pair.
    const credentialsBuilder =
      PDFServicesSdk.Credentials.servicePrincipalCredentialsBuilder();
    const credentials = credentialsBuilder
      .withClientId("replace-with-client-id")
      .withClientSecret("replace-with-client-secret")
      .build();

    // One execution context and one fresh OCR operation for this run.
    const executionContext = PDFServicesSdk.ExecutionContext.create(credentials);
    const ocrOperation = PDFServicesSdk.OCR.Operation.createNew();

    // Feed the local source PDF to the operation.
    ocrOperation.setInput(PDFServicesSdk.FileRef.createFromLocalFile(fileloc));

    // Execute, then persist the result at the requested location.
    const result = await ocrOperation.execute(executionContext);
    await result.saveAsFile(fileloc_output);
  } catch (err) {
    console.log("Exception encountered while executing operation", err);
  }
};

runOcr();
OCR multiple files or a folder
If you plan to run the OCR service on a whole folder, you can save the following code as ocr-pdf-multi.js
:
In my code, the folder template is saved in an Excel file; I read the file and use the values in the c1
column to map each entry to its folder.
/*
* Copyright 2019 Adobe
* All Rights Reserved.
*
* NOTICE: Adobe permits you to use, modify, and distribute this file in
* accordance with the terms of the Adobe license agreement accompanying
* it. If you have received this file from a source other than Adobe,
* then your use, modification, or distribution of it requires the prior
* written permission of Adobe.
*/
const dfd = require("danfojs-node")
const fs = require('fs');
const path = require('path');
const PDFServicesSdk = require('@adobe/pdfservices-node-sdk');
/**
* This sample illustrates how to perform OCR operation on a PDF file and convert it into a searchable PDF file.
* <p>
* Refer to README.md for instructions on how to run the samples.
*/
// Pause applied before each folder is processed.
const FOLDER_DELAY_MS = 10000;

/**
 * Runs OCR on a single PDF and saves the result.
 *
 * Never rejects: any error raised while building, executing or saving the
 * operation is logged so that one bad file does not abort the whole batch.
 *
 * @param {object} credentials - Credentials built with the SDK's credentials builder.
 * @param {string} inputPath - Path of the PDF to OCR.
 * @param {string} outputPath - Path where the OCR result is saved.
 * @returns {Promise<void>}
 */
async function ocrFile(credentials, inputPath, outputPath) {
  try {
    // Create an ExecutionContext using credentials and create a new operation instance.
    const executionContext = PDFServicesSdk.ExecutionContext.create(credentials);
    const ocrOperation = PDFServicesSdk.OCR.Operation.createNew();

    // Set operation input from a source file.
    ocrOperation.setInput(PDFServicesSdk.FileRef.createFromLocalFile(inputPath));

    // Execute the operation and save the result to the specified location.
    const result = await ocrOperation.execute(executionContext);
    await result.saveAsFile(outputPath);
  } catch (err) {
    console.log('Exception encountered while executing operation', err);
  }
}

/**
 * Runs OCR on every PDF directly inside one folder and resolves once all of
 * them have settled. A folder that cannot be read is logged and skipped.
 *
 * @param {object} credentials - Credentials shared by every operation.
 * @param {*} c1 - Value from the template's `c1` column (used for logging).
 * @param {string} pdfloc - Folder containing the source PDFs.
 * @param {string} pdfloc_output - Folder where results are written.
 * @returns {Promise<void>}
 */
async function ocrFolder(credentials, c1, pdfloc, pdfloc_output) {
  let files;
  try {
    files = await fs.promises.readdir(pdfloc);
  } catch (err) {
    console.error('Error reading directory:', err);
    return;
  }

  const jobs = files
    // Compare case-insensitively so `.PDF` files are not silently skipped.
    .filter(file => path.extname(file).toLowerCase() === '.pdf')
    .map(file => {
      const fileloc = path.join(pdfloc, file);
      const fileloc_output = path.join(pdfloc_output, file);
      console.log(c1, fileloc);
      console.log(fileloc_output);
      return ocrFile(credentials, fileloc, fileloc_output);
    });

  // ocrFile never rejects, so this simply waits for the whole folder.
  await Promise.all(jobs);
}

/**
 * Reads the Excel template, then OCRs each folder named in its `c1` column,
 * one folder at a time.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const df = await dfd.readExcel('path/to/template.xlsx');
  const sf = df['c1'];
  console.log(sf.values.length);

  // Initial setup, create credentials instance once; it is the same for every file.
  const credentials = PDFServicesSdk.Credentials
    .servicePrincipalCredentialsBuilder()
    .withClientId('client-id')
    .withClientSecret('client-secret')
    .build();

  // `for...of` already yields the cell values, so use each one directly
  // instead of re-indexing the column with it.
  for (const c1 of sf.values) {
    const pdfloc = `path/to/folder/${c1}`;
    const pdfloc_output = `path/to/touput/folder/${c1}`;
    await new Promise(resolve => setTimeout(resolve, FOLDER_DELAY_MS));
    // Wait for the folder to finish so the delay actually paces the work.
    await ocrFolder(credentials, c1, pdfloc, pdfloc_output);
  }
}

main().catch(err => {
  console.log('Exception encountered while executing operation', err);
});