Foxit PDF SDK
fs_ocr.h
Go to the documentation of this file.
1 #if (defined(_WIN32) || defined(_WIN64)) || defined(__linux__)
2 
16 #ifndef FS_OCR_H_
17 #define FS_OCR_H_
18 
19 #include "common/fs_common.h"
20 #include "pdf/fs_pdfdoc.h"
21 #include "pdf/fs_pdfpage.h"
22 
28 namespace foxit {
32 namespace addon {
36 namespace ocr {
// Abstract callback interface for the OCR engine. Both members are pure virtual, so a
// derived class must implement them. A pointer to such an object is accepted by
// OCREngine::SetOCRCallback().
43 class OCRCallback {
44  public:
// Callback function used to cancel the current OCR progress.
// NOTE(review): the meaning of `info` and of the returned bool (presumably true == cancel)
// is described in a doc comment that is elided from this listing -- confirm.
54  virtual bool NeedToCancelNow(const wchar_t* info) = 0;
55 
// Callback function used to determine whether an image object should be excluded from
// OCR text recognition.
// NOTE(review): presumably returning true means "ignore this image" -- confirm.
64  virtual bool IsImageIgnored(foxit::pdf::graphics::ImageObject* image_object) = 0;
65 };
66 
// NOTE(review): the class declaration line that precedes this `public:` is elided from the
// listing; presumably this is the OCRProgressCallback type taken by the OCR methods -- confirm.
72  public:
// Pure virtual callback used to report the current progress state to the user.
// NOTE(review): the unit/range of `current_rate` is not visible here -- confirm.
80  virtual void ProgressNotify(int current_rate) = 0;
81 };
82 
// Engine-level OCR controls. Every member visible here is static, so they are called on
// the class itself rather than on an instance.
94 class OCREngine FS_FINAL : public Object {
95  public:
// Initialize the OCR engine from the resources found at `ocr_resource_path`.
// Returns an ErrorCode value.
115  static ErrorCode Initialize(const wchar_t* ocr_resource_path);
116 
// Overload of Initialize() that additionally takes `is_shared_cpu_cores_mode`.
// NOTE(review): the effect of the shared-CPU-cores flag is documented in a comment that is
// elided from this listing -- confirm before relying on it.
145  static ErrorCode Initialize(const wchar_t* ocr_resource_path, bool is_shared_cpu_cores_mode);
146 
// Release the OCR engine.
155  static void Release();
156 
// Set the log file for the OCR engine (narrow-character path).
168  static void SetLogFile(const char* log_file_path);
169 
// Set the log file for the OCR engine (wide-character path).
181  static void SetLogFile(const wchar_t* log_file_path);
182 
// Set the name of the languages which would be included in the language database for OCR.
// NOTE(review): the expected format of `languages` (separator, valid names) is not shown here.
202  static void SetLanguages(const wchar_t* languages);
203 
// Set the callback object used to cancel OCR progress.
// NOTE(review): ownership/lifetime of `callback` is not visible in this listing -- confirm.
213  static void SetOCRCallback(OCRCallback* callback);
214 };
215 
// Configuration values passed to the OCR methods that take a `const OCRConfig&`.
// NOTE(review): this listing elides several lines of the class: the constructor and Set()
// signatures, part of the default initializer list, most of operator=, the condition of
// operator!=, and all data member declarations. Comments below describe only what is visible.
// Members referenced by the visible code: is_detect_pictures, is_remove_noise,
// is_correct_skew, is_enable_text_extraction_mode, is_sequentially_process,
// is_auto_overwrite_resolution, resolution_to_overwrite, confidence.
217 class OCRConfig FS_FINAL : public Object {
218  public:
// Default constructor (signature line elided): picture detection, noise removal and skew
// correction start as true and confidence starts as 0. Initializers on the elided lines
// (listing lines 226-229) are not shown.
223  :is_detect_pictures(true)
224  ,is_remove_noise(true)
225  ,is_correct_skew(true)
230  ,confidence(0){}
231 
// Constructor with parameters (signature line elided): copies each same-named parameter
// into the corresponding member; `this->` disambiguates member from parameter.
245  this->is_detect_pictures = is_detect_pictures;
246  this->is_remove_noise = is_remove_noise;
247  this->is_correct_skew = is_correct_skew;
248  this->is_enable_text_extraction_mode = is_enable_text_extraction_mode;
249  this->is_sequentially_process = is_sequentially_process;
250  this->is_auto_overwrite_resolution = is_auto_overwrite_resolution;
251  this->resolution_to_overwrite = resolution_to_overwrite;
252  this->confidence = confidence;
253  }
254 
// Set() (signature line elided): performs the same member-by-member assignment as the
// constructor with parameters.
271  this->is_detect_pictures = is_detect_pictures;
272  this->is_remove_noise = is_remove_noise;
273  this->is_correct_skew = is_correct_skew;
274  this->is_enable_text_extraction_mode = is_enable_text_extraction_mode;
275  this->is_sequentially_process = is_sequentially_process;
276  this->is_auto_overwrite_resolution = is_auto_overwrite_resolution;
277  this->resolution_to_overwrite = resolution_to_overwrite;
278  this->confidence = confidence;
279  }
280 
// Assign operator: only the `confidence` copy is visible; the other assignments are on
// elided lines. Returns a reference to this object.
288  OCRConfig& operator=(const OCRConfig& other) {
296  confidence = other.confidence;
297  return (*this);
298  }
299 
// Not-equal operator. The comparison condition guarding `return true` is on elided lines.
// NOTE(review): this operator is not const-qualified, so it cannot be called on a const
// OCRConfig -- confirm whether that is intended.
307  bool operator!=(const OCRConfig& other) {
312  return true;
313  return false;
314  }
315 
// Data member declarations (and their doc comments) are elided from the lines below.
323 
333 
342 
355 
367 
375 
383 
393 };
394 
// Bundles the settings for one OCR job: a PDF document, a page range, the editable flag
// and an OCRConfig (members pdf_doc, page_range, is_editable, ocr_config).
// NOTE(review): the default constructor, the signatures of the parameterized constructor,
// Set() and operator=, and the data member declarations are elided from this listing.
396 class OCRSettingData FS_FINAL : public Object {
397  public:
402 
// Constructor with parameters (signature line elided): stores each same-named parameter
// in the corresponding member.
412  this->pdf_doc = pdf_doc;
413  this->page_range = page_range;
414  this->is_editable = is_editable;
415  this->ocr_config = ocr_config;
416  }
417 
// Set() (signature line elided): same member-by-member assignment as the constructor.
429  this->pdf_doc = pdf_doc;
430  this->page_range = page_range;
431  this->is_editable = is_editable;
432  this->ocr_config = ocr_config;
433  }
434 
// Assign operator (signature line elided): copies all four members from `data` and
// returns a reference to this object.
443  pdf_doc = data.pdf_doc;
444  page_range = data.page_range;
445  is_editable = data.is_editable;
446  ocr_config = data.ocr_config;
447  return (*this);
448  }
449 
// Not-equal operator: true as soon as any one of the four members differs from `data`.
// NOTE(review): not const-qualified; it also relies on OCRConfig::operator!=, which is
// non-const as well, so both would need to change together.
457  bool operator!=(const OCRSettingData& data) {
458  if (pdf_doc != data.pdf_doc || page_range != data.page_range || is_editable != data.is_editable || ocr_config != data.ocr_config)
459  return true;
460  return false;
461  }
462 
// Data member declarations (and their doc comments) are elided from the lines below.
465 
468 
474 
477 };
478 
481 
482 
// Describes suspicious words found after OCR recognition.
// NOTE(review): the member declarations are elided from this listing. Per the page's
// definition index they are `int page_index` (the index of the page), `foxit::RectF words_rect`
// (box rectangle of the suspicious words, in PDF coordinate system) and
// `WString suspect_words` (the suspicious words) -- confirm against the full header.
487 class OCRSuspectInfo FS_FINAL : public Object {
488  public:
491 
494 
497 };
498 
501 
502 
// Class exposing the OCR operations on PDF pages and documents; derives from Base.
// Only declarations are visible here; the implementations live elsewhere.
508 class OCR FS_FINAL : public Base {
509  public:
// Output formats accepted by OCRConvertTo(); explicit values run from 0 to 6.
515  typedef enum _OCRConvertFormat {
517  e_OCRConvertFormatDOCX = 0,
519  e_OCRConvertFormatDOC = 1,
521  e_OCRConvertFormatRTF = 2,
523  e_OCRConvertFormatXLSX = 3,
525  e_OCRConvertFormatXLS = 4,
527  e_OCRConvertFormatPPTX = 5,
529  e_OCRConvertFormatHTML = 6
530  } OCRConvertFormat;
531 
// Default constructor.
535  OCR();
536 
// Copy constructor.
542  OCR(const OCR& other);
543 
544  // User is strongly recommended NOT to use this method; otherwise unknown situation may occur.
545  explicit OCR(FS_HANDLE handle);
// Destructor.
547  ~OCR();
548 
// Assignment and (in)equality operators; the comparisons are const-qualified.
556  OCR& operator = (const OCR& other);
564  bool operator == (const OCR& other) const;
572  bool operator != (const OCR& other) const;
573 
// NOTE(review): presumably reports whether this object is empty (wraps no valid handle) --
// the doc comment is elided from this listing, confirm.
581  bool IsEmpty() const;
582 
// OCR a single PDF page. `callback` is optional and defaults to NULL.
597  void OCRPDFPage(pdf::PDFPage pdf_page, bool is_editable, OCRProgressCallback* callback = NULL);
598 
// Overload of OCRPDFPage() that additionally takes an explicit OCRConfig.
614  void OCRPDFPage(pdf::PDFPage pdf_page, bool is_editable, const OCRConfig& config, OCRProgressCallback* callback = NULL);
615 
// OCR a PDF document. `callback` is optional and defaults to NULL.
630  void OCRPDFDocument(pdf::PDFDoc pdf_doc, bool is_editable, OCRProgressCallback* callback = NULL);
631 
// Overload of OCRPDFDocument() that additionally takes an explicit OCRConfig.
647  void OCRPDFDocument(pdf::PDFDoc pdf_doc, bool is_editable, const OCRConfig& config, OCRProgressCallback* callback = NULL);
648 
// Takes a target format, a source PDF path with its password, an output path, a page range
// and the `is_retain_flowing_text` flag. `callback` is optional and defaults to NULL.
// NOTE(review): presumably converts the source PDF to `format` via OCR and saves it to
// `saved_file_path`; the exact semantics are in an elided doc comment -- confirm.
668  void OCRConvertTo(OCRConvertFormat format, const wchar_t* src_pdf_path, const wchar_t* password, const wchar_t* saved_file_path, common::Range page_range, bool is_retain_flowing_text, OCRProgressCallback* callback = NULL);
669 
// Overload of OCRConvertTo() that additionally takes an explicit OCRConfig.
690  void OCRConvertTo(OCRConvertFormat format, const wchar_t* src_pdf_path, const wchar_t* password, const wchar_t* saved_file_path, common::Range page_range, bool is_retain_flowing_text, const OCRConfig& config, OCRProgressCallback* callback = NULL);
691 
// OCRPDFDocuments() is only declared when building for Windows or Linux.
692 #if (defined(_WIN32) || defined(_WIN64)) || defined(__linux__)
693 
// Takes an array of OCRSettingData entries; `callback` is optional and defaults to NULL.
// NOTE(review): the OCRSettingDataArray declaration is elided from this listing.
711  void OCRPDFDocuments(const ocr::OCRSettingDataArray& settingdata_array, OCRProgressCallback* callback = NULL);
712 #endif
713 
// Returns an OCRSuspectInfoArray for the given PDF document.
// NOTE(review): the parameter name suggests the document must already have been OCRed;
// the OCRSuspectInfoArray declaration is elided from this listing -- confirm both.
723  OCRSuspectInfoArray GetOCRSuspectsInfo(pdf::PDFDoc ocred_pdf_doc);
724 };
725 
726 } // namespace ocr
727 } // namespace addon
728 } // namespace foxit
729 
730 #endif // FS_OCR_H_
731 
732 #endif // #if (defined(_WIN32) || defined(_WIN64)) || defined(__linux__)
Definition: fs_ocr.h:217
bool is_detect_pictures
Decide whether to detect pictures. true means the pictures will be detected during analysis process....
Definition: fs_ocr.h:322
Definition: fs_common.h:1368
CFX_Object Object
Object type.
Definition: fs_basictypes.h:220
bool is_editable
Decide whether the OCR result is editable. true means the OCR result is editable. false means the OCR...
Definition: fs_ocr.h:473
Header file for PDF document related definitions and classes.
static void SetLogFile(const char *log_file_path)
Set log file for OCR engine.
OCRSettingData(pdf::PDFDoc pdf_doc, const common::Range &page_range, bool is_editable, const OCRConfig &ocr_config)
Constructor, with parameters.
Definition: fs_ocr.h:411
OCRConfig(bool is_detect_pictures, bool is_remove_noise, bool is_correct_skew, bool is_enable_text_extraction_mode, bool is_sequentially_process, bool is_auto_overwrite_resolution, int resolution_to_overwrite, int confidence)
Constructor, with parameters.
Definition: fs_ocr.h:243
virtual bool NeedToCancelNow(const wchar_t *info)=0
A callback function used to cancel current OCR progress.
WIDE STRING CLASS.
Definition: fx_string.h:1461
Definition: fs_pdfdoc.h:776
OCRConfig()
Constructor.
Definition: fs_ocr.h:222
virtual void ProgressNotify(int current_rate)=0
A callback function used to update current progress state data to user in order that user can update ...
void Set(pdf::PDFDoc pdf_doc, const common::Range &page_range, bool is_editable, const OCRConfig &ocr_config)
Set value.
Definition: fs_ocr.h:428
static ErrorCode Initialize(const wchar_t *ocr_resource_path)
Initialize OCR engine.
OCRSettingData()
Constructor.
Definition: fs_ocr.h:401
WString suspect_words
Suspicious words after OCR recognition.
Definition: fs_ocr.h:496
int resolution_to_overwrite
The resolution (DPI) used to overwrite the image resolution of PDF document.
Definition: fs_ocr.h:382
bool operator==(const char *str1, const CFX_ByteString &str2)
Check if two byte strings are equal.
Definition: fs_basictypes.h:128
OCRSettingData & operator=(const OCRSettingData &data)
Assign operator.
Definition: fs_ocr.h:442
Definition: fs_ocr.h:94
virtual bool IsImageIgnored(foxit::pdf::graphics::ImageObject *image_object)=0
A callback function used to determine whether an image object should be excluded from OCR text recogn...
Definition: fs_ocr.h:508
bool is_sequentially_process
Decide whether the OCR engine will process pages sequentially on one process.
Definition: fs_ocr.h:366
ErrorCode
Enumeration for error code.
Definition: fs_basictypes.h:236
bool is_auto_overwrite_resolution
Decide whether to set the resolution automatically.
Definition: fs_ocr.h:374
OCRConfig & operator=(const OCRConfig &other)
Assign operator.
Definition: fs_ocr.h:288
foxit::RectF words_rect
The box rectangle, in PDF coordinate system for suspicious words.
Definition: fs_ocr.h:493
pdf::PDFDoc pdf_doc
A valid PDF document that needs to be processed by OCR.
Definition: fs_ocr.h:464
Definition: fs_pdfpage.h:412
void * FS_HANDLE
Handle type.
Definition: fs_basictypes.h:213
int page_index
The index of page.
Definition: fs_ocr.h:490
Header file for common definitions and classes.
bool operator!=(const OCRSettingData &data)
Not equal operator.
Definition: fs_ocr.h:457
Definition: fs_ocr.h:487
Definition: fs_basictypes.h:451
bool operator!=(const OCRConfig &other)
Not equal operator.
Definition: fs_ocr.h:307
Header file for PDF page related definitions and classes.
static void SetLanguages(const wchar_t *languages)
Set the name of languages which would be included in the language database for doing OCR.
bool is_correct_skew
Decide whether to enable skew correction. true means to enable skew correction. false means not to en...
Definition: fs_ocr.h:341
Foxit namespace.
Definition: fs_taggedpdf.h:27
int confidence
The confidence threshold used to determine whether the recognized text is reliable.
Definition: fs_ocr.h:392
Definition: fs_ocr.h:43
bool is_remove_noise
Decide whether to remove noise of the image of PDF. It can be useful if the image of the PDF contains...
Definition: fs_ocr.h:332
#define NULL
The null-pointer value.
Definition: fx_system.h:792
bool is_enable_text_extraction_mode
Decide whether to enable text extraction mode.
Definition: fs_ocr.h:354
static void SetOCRCallback(OCRCallback *callback)
Set the callback object used to cancel OCR progress.
Definition: fs_ocr.h:396
void Set(bool is_detect_pictures, bool is_remove_noise, bool is_correct_skew, bool is_enable_text_extraction_mode, bool is_sequentially_process, bool is_auto_overwrite_resolution, int resolution_to_overwrite, int confidence)
Set value.
Definition: fs_ocr.h:269
common::Range page_range
The range of pages that need to be processed by OCR.
Definition: fs_ocr.h:467
Definition: fs_pdfgraphicsobject.h:1118
OCRConfig ocr_config
The OCRConfig object.
Definition: fs_ocr.h:476
static void Release()
Release OCR engine.
Definition: fx_coordinates.h:771