Files changed (2)
  1. app.py +213 -0
  2. requirements.txt +19 -0
app.py ADDED
@@ -0,0 +1,213 @@
+import gradio as gr
+import torch
+import os
+import sys
+from PIL import Image, ImageDraw
+from transformers import AutoModel, AutoProcessor, AutoTokenizer, GenerationConfig
+from huggingface_hub import snapshot_download
+import spaces
+from typing import Optional, Tuple, Dict, Any, Iterable
+from gradio.themes import Soft
+from gradio.themes.utils import colors, fonts, sizes
+
+print("Downloading model snapshot to ensure all scripts are present...")
+model_dir = snapshot_download(repo_id="nvidia/NVIDIA-Nemotron-Parse-v1.1")
+print(f"Model downloaded to: {model_dir}")
+
+sys.path.append(model_dir)
+
+try:
+    from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
+    print("Successfully imported postprocessing functions.")
+except ImportError as e:
+    print(f"Error importing postprocessing: {e}")
+    raise e
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
+colors.steel_blue = colors.Color(
+    name="steel_blue",
+    c50="#EBF3F8",
+    c100="#D3E5F0",
+    c200="#A8CCE1",
+    c300="#7DB3D2",
+    c400="#529AC3",
+    c500="#4682B4",
+    c600="#3E72A0",
+    c700="#36638C",
+    c800="#2E5378",
+    c900="#264364",
+    c950="#1E3450",
+)
+
+class SteelBlueTheme(Soft):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.gray,
+        secondary_hue: colors.Color | str = colors.steel_blue,
+        neutral_hue: colors.Color | str = colors.slate,
+        text_size: sizes.Size | str = sizes.text_lg,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+        ),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        super().set(
+            background_fill_primary="*primary_50",
+            background_fill_primary_dark="*primary_900",
+            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
+            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
+            button_primary_text_color="white",
+            button_primary_text_color_hover="white",
+            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
+            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
+            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
+            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
+            slider_color="*secondary_500",
+            slider_color_dark="*secondary_600",
+            block_title_text_weight="600",
+            block_border_width="3px",
+            block_shadow="*shadow_drop_lg",
+            button_primary_shadow="*shadow_drop_lg",
+            button_large_padding="11px",
+            color_accent_soft="*primary_100",
+            block_label_background_fill="*primary_200",
+        )
+
+steel_blue_theme = SteelBlueTheme()
+css = """
+#main-title h1 { font-size: 2.3em !important; }
+#output-title h2 { font-size: 2.1em !important; }
+"""
+
+print("Loading Model components...")
+
+processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    model_dir,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+).to(device).eval()
+
+try:
+    generation_config = GenerationConfig.from_pretrained(model_dir, trust_remote_code=True)
+except Exception as e:
+    print(f"Warning: Could not load GenerationConfig: {e}. Using default.")
+    generation_config = GenerationConfig(max_new_tokens=4096)
+
+print("Model loaded successfully.")
+
+@spaces.GPU
+def process_ocr_task(image):
+    """
+    Processes an image with NVIDIA-Nemotron-Parse-v1.1.
+    """
+    if image is None:
+        return "Please upload an image first.", None
+
+    task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
+
+    inputs = processor(images=[image], text=task_prompt, return_tensors="pt").to(device)
+
+    if device.type == 'cuda':
+        inputs = {k: v.to(torch.bfloat16) if v.dtype == torch.float32 else v for k, v in inputs.items()}
+
+    print("Running inference...")
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            generation_config=generation_config
+        )
+
+    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+    try:
+        classes, bboxes, texts = extract_classes_bboxes(generated_text)
+    except Exception as e:
+        print(f"Error extracting boxes: {e}")
+        return generated_text, image
+
+    bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
+
+    table_format = 'latex'
+    text_format = 'markdown'
+    blank_text_in_figures = False
+
+    processed_texts = [
+        postprocess_text(
+            text,
+            cls=cls,
+            table_format=table_format,
+            text_format=text_format,
+            blank_text_in_figures=blank_text_in_figures
+        )
+        for text, cls in zip(texts, classes)
+    ]
+
+    result_image = image.copy()
+    draw = ImageDraw.Draw(result_image)
+
+    color_map = {
+        "Table": "red",
+        "Figure": "blue",
+        "Text": "green",
+        "Title": "purple"
+    }
+
+    final_output_text = ""
+
+    for cls, bbox, txt in zip(classes, bboxes, processed_texts):
+        color = color_map.get(cls, "red")
+        draw.rectangle([bbox[0], bbox[1], bbox[2], bbox[3]], outline=color, width=3)
+
+        if cls == "Table":
+            final_output_text += f"\n\n--- [Table] ---\n{txt}\n-----------------\n"
+        elif cls == "Figure":
+            final_output_text += "\n\n--- [Figure] ---\n(Figure Detected)\n-----------------\n"
+        else:
+            final_output_text += f"{txt}\n"
+
+    if not final_output_text.strip() and generated_text:
+        final_output_text = generated_text
+
+    return final_output_text, result_image
+
+with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **NVIDIA Nemotron Parse v1.1 [OCR/Parsing]**", elem_id="main-title")
+    gr.Markdown("Upload a document image to extract text, tables, and layout structures using NVIDIA's state-of-the-art Parse model.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"])
+            submit_btn = gr.Button("Process Document", variant="primary")
+
+            examples = gr.Examples(
+                examples=["examples/1.jpg"],
+                inputs=image_input,
+                label="Examples"
+            )
+
+        with gr.Column(scale=2):
+            output_text = gr.Textbox(label="Parsed Content (Markdown/LaTeX)", lines=8, show_copy_button=True)
+            output_image = gr.Image(label="Detected Layout & Bounding Boxes", type="pil")
+
+    submit_btn.click(
+        fn=process_ocr_task,
+        inputs=[image_input],
+        outputs=[output_text, output_image]
+    )
+
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True, mcp_server=True, ssr_mode=False)
requirements.txt ADDED
@@ -0,0 +1,19 @@
+opencv_python_headless==4.9.0.80
+opencv_python==4.8.0.74
+transformers==4.51.3
+huggingface_hub
+open-clip-torch
+beautifulsoup4
+albumentations
+sentencepiece
+numpy==1.26.4
+torchmetrics
+torchvision
+mdtex2html
+html2text
+spaces
+einops
+gradio
+pillow
+torch
+timm
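
For a quick local sanity check of the handler added in app.py, one could call `process_ocr_task` directly. This is a minimal sketch, not part of the diff: it assumes the dependencies above are installed, that importing app.py succeeds (the module downloads the Nemotron-Parse snapshot and loads the model at import time), and that `spaces.GPU` degrades gracefully outside a Space; the image path is a placeholder.

```python
# Minimal local smoke test for the handler in app.py (sketch, not part of the PR).
from PIL import Image

from app import process_ocr_task  # importing app.py downloads and loads the model

# "sample_page.png" is a placeholder path; substitute any document image.
page = Image.open("sample_page.png").convert("RGB")
parsed_text, annotated = process_ocr_task(page)

print(parsed_text[:500])        # start of the parsed Markdown/LaTeX output
annotated.save("layout.png")    # page with detected bounding boxes drawn
```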