Skip to content

Commit 710f11e

Browse files
joeirimpan (Joe Paul)
authored and committed
feat: Freeze pdf dep + copy local pdf_tools
1 parent 72b779a commit 710f11e

File tree

3 files changed

+295
-2
lines changed

3 files changed

+295
-2
lines changed

Cargo.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ edition = "2021"
1313
anyhow = "1.0.68"
1414
chrono = "0.4.23"
1515
csv = "1.1.6"
16-
pdf = { git = "https://github.com/pdf-rs/pdf", features = [ "euclid" ] }
17-
pdf_tools = { git = "https://github.com/pdf-rs/pdf_tools"}
16+
pdf = { git = "https://github.com/pdf-rs/pdf", features = [ "euclid" ], rev = "5cf56b7" }
1817
regex = "1.7.1"
18+
pdf_encoding = "0.3.0"
19+
euclid = "0.22.6"
20+
log = "*"

src/main.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
mod pdf_tools;
2+
13
use anyhow::{Context, Error};
24
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
35
use csv::Writer;

src/pdf_tools.rs

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
use log::warn;
2+
use pdf::primitive::Name;
3+
4+
use std::collections::HashMap;
5+
use std::convert::TryInto;
6+
use std::rc::Rc;
7+
8+
use pdf::content::*;
9+
use pdf::encoding::BaseEncoding;
10+
use pdf::error::{PdfError, Result};
11+
use pdf::font::*;
12+
use pdf::object::*;
13+
use pdf_encoding::{self, DifferenceForwardMap};
14+
15+
use euclid::Transform2D;
16+
17+
#[derive(Clone)]
18+
enum Decoder {
19+
Map(DifferenceForwardMap),
20+
Cmap(ToUnicodeMap),
21+
None,
22+
}
23+
24+
impl Default for Decoder {
25+
fn default() -> Self {
26+
Decoder::None
27+
}
28+
}
29+
30+
#[derive(Default, Clone)]
31+
pub struct FontInfo {
32+
decoder: Decoder,
33+
}
34+
35+
impl FontInfo {
36+
pub fn decode(&self, data: &[u8], out: &mut String) -> Result<()> {
37+
match &self.decoder {
38+
Decoder::Cmap(ref cmap) => {
39+
// FIXME: not sure the BOM is obligatory
40+
if data.starts_with(&[0xfe, 0xff]) {
41+
// FIXME: really windows not chunks!?
42+
for w in data.windows(2) {
43+
let cp = u16::from_be_bytes(w.try_into().unwrap());
44+
if let Some(s) = cmap.get(cp) {
45+
out.push_str(s);
46+
}
47+
}
48+
} else {
49+
out.extend(
50+
data.iter()
51+
.filter_map(|&b| cmap.get(b.into()).map(|v| v.to_owned())),
52+
);
53+
}
54+
Ok(())
55+
}
56+
Decoder::Map(map) => {
57+
out.extend(
58+
data.iter()
59+
.filter_map(|&b| map.get(b).map(|v| v.to_owned())),
60+
);
61+
Ok(())
62+
}
63+
Decoder::None => {
64+
if data.starts_with(&[0xfe, 0xff]) {
65+
utf16be_to_char(&data[2..]).try_for_each(|r| {
66+
r.map_or(Err(PdfError::Utf16Decode), |c| {
67+
out.push(c);
68+
Ok(())
69+
})
70+
})
71+
} else if let Ok(text) = std::str::from_utf8(data) {
72+
out.push_str(text);
73+
Ok(())
74+
} else {
75+
Err(PdfError::Utf16Decode)
76+
}
77+
}
78+
}
79+
}
80+
}
81+
82+
struct FontCache<'src, T: Resolve> {
83+
fonts: HashMap<Name, Rc<FontInfo>>,
84+
page: &'src Page,
85+
resolve: &'src T,
86+
default_font: Rc<FontInfo>,
87+
}
88+
89+
impl<'src, T: Resolve> FontCache<'src, T> {
90+
fn new(page: &'src Page, resolve: &'src T) -> Self {
91+
let mut cache = FontCache {
92+
fonts: HashMap::new(),
93+
page,
94+
resolve,
95+
default_font: Rc::new(FontInfo::default()),
96+
};
97+
98+
cache.populate();
99+
100+
cache
101+
}
102+
103+
fn populate(&mut self) {
104+
if let Ok(resources) = self.page.resources() {
105+
for (name, font) in resources.fonts.iter() {
106+
if let Some(font) = font.as_ref() {
107+
if let Ok(font) = self.resolve.get(font) {
108+
self.add_font(name.clone(), font);
109+
}
110+
}
111+
}
112+
113+
for (font, _) in resources.graphics_states.values().filter_map(|gs| gs.font) {
114+
if let Ok(font) = self.resolve.get(font) {
115+
if let Some(name) = &font.name {
116+
self.add_font(name.clone(), font);
117+
}
118+
}
119+
}
120+
}
121+
}
122+
123+
fn add_font(&mut self, name: impl Into<Name>, font: RcRef<Font>) {
124+
let decoder = if let Some(to_unicode) = font.to_unicode(self.resolve) {
125+
let cmap = to_unicode.unwrap();
126+
Decoder::Cmap(cmap)
127+
} else if let Some(encoding) = font.encoding() {
128+
let map = match encoding.base {
129+
BaseEncoding::StandardEncoding => Some(&pdf_encoding::STANDARD),
130+
BaseEncoding::SymbolEncoding => Some(&pdf_encoding::SYMBOL),
131+
BaseEncoding::WinAnsiEncoding => Some(&pdf_encoding::WINANSI),
132+
BaseEncoding::MacRomanEncoding => Some(&pdf_encoding::MACROMAN),
133+
BaseEncoding::None => None,
134+
ref e => {
135+
warn!("unsupported pdf encoding {:?}", e);
136+
return;
137+
}
138+
};
139+
140+
Decoder::Map(DifferenceForwardMap::new(
141+
map,
142+
encoding
143+
.differences
144+
.iter()
145+
.map(|(k, v)| (*k, v.to_string()))
146+
.collect(),
147+
))
148+
} else {
149+
return;
150+
};
151+
152+
self.fonts
153+
.insert(name.into(), Rc::new(FontInfo { decoder }));
154+
}
155+
156+
fn get_by_font_name(&self, name: &Name) -> Rc<FontInfo> {
157+
self.fonts.get(name).unwrap_or(&self.default_font).clone()
158+
}
159+
160+
fn get_by_graphic_state_name(&self, name: &Name) -> Option<(Rc<FontInfo>, f32)> {
161+
self.page
162+
.resources()
163+
.ok()
164+
.and_then(|resources| resources.graphics_states.get(name))
165+
.and_then(|gs| gs.font)
166+
.map(|(font, font_size)| {
167+
let font = self
168+
.resolve
169+
.get(font)
170+
.ok()
171+
.and_then(|font| Some(self.get_by_font_name(font.name.as_ref()?)))
172+
.unwrap_or_else(|| self.default_font.clone());
173+
174+
(font, font_size)
175+
})
176+
}
177+
}
178+
179+
#[derive(Clone, Default)]
180+
pub struct TextState {
181+
pub font: Rc<FontInfo>,
182+
pub font_size: f32,
183+
pub text_leading: f32,
184+
pub text_matrix: Transform2D<f32, PdfSpace, PdfSpace>,
185+
}
186+
187+
pub fn ops_with_text_state<'src, T: Resolve>(
188+
page: &'src Page,
189+
resolve: &'src T,
190+
) -> impl Iterator<Item = (Op, Rc<TextState>)> + 'src {
191+
page.contents.iter().flat_map(move |contents| {
192+
contents.operations(resolve).unwrap().into_iter().scan(
193+
(Rc::new(TextState::default()), FontCache::new(page, resolve)),
194+
|(state, font_cache), op| {
195+
let mut update_state = |update_fn: &dyn Fn(&mut TextState)| {
196+
let old_state: &TextState = state;
197+
let mut new_state = old_state.clone();
198+
199+
update_fn(&mut new_state);
200+
201+
*state = Rc::new(new_state);
202+
};
203+
204+
match op {
205+
Op::BeginText => {
206+
*state = Default::default();
207+
}
208+
Op::GraphicsState { ref name } => {
209+
update_state(&|state: &mut TextState| {
210+
if let Some((font, font_size)) =
211+
font_cache.get_by_graphic_state_name(name)
212+
{
213+
state.font = font;
214+
state.font_size = font_size;
215+
}
216+
});
217+
}
218+
Op::TextFont { ref name, size } => {
219+
update_state(&|state: &mut TextState| {
220+
state.font = font_cache.get_by_font_name(name);
221+
state.font_size = size;
222+
});
223+
}
224+
Op::Leading { leading } => {
225+
update_state(&|state: &mut TextState| state.text_leading = leading);
226+
}
227+
Op::TextNewline => {
228+
update_state(&|state: &mut TextState| {
229+
state.text_matrix = state.text_matrix.pre_translate(
230+
Point {
231+
x: 0.0f32,
232+
y: state.text_leading,
233+
}
234+
.into(),
235+
);
236+
});
237+
}
238+
Op::MoveTextPosition { translation } => {
239+
update_state(&|state: &mut TextState| {
240+
state.text_matrix = state.text_matrix.pre_translate(translation.into());
241+
});
242+
}
243+
Op::SetTextMatrix { matrix } => {
244+
update_state(&|state: &mut TextState| {
245+
state.text_matrix = matrix.into();
246+
});
247+
}
248+
_ => {}
249+
}
250+
251+
Some((op, state.clone()))
252+
},
253+
)
254+
})
255+
}
256+
257+
pub fn page_text(page: &Page, resolve: &impl Resolve) -> Result<String, PdfError> {
258+
let mut out = String::new();
259+
260+
for (op, text_state) in ops_with_text_state(page, resolve) {
261+
match op {
262+
Op::TextDraw { ref text } => text_state.font.decode(&text.data, &mut out)?,
263+
Op::TextDrawAdjusted { ref array } => {
264+
for data in array {
265+
if let TextDrawAdjusted::Text(text) = data {
266+
text_state.font.decode(&text.data, &mut out)?;
267+
}
268+
}
269+
}
270+
Op::TextNewline => {
271+
out.push('\n');
272+
}
273+
Op::MoveTextPosition { translation } => {
274+
if translation.y.abs() < f32::EPSILON {
275+
out.push('\n');
276+
}
277+
}
278+
Op::SetTextMatrix { matrix } => {
279+
if (matrix.f - text_state.text_matrix.m32).abs() < f32::EPSILON {
280+
out.push('\n');
281+
} else {
282+
out.push('\t');
283+
}
284+
}
285+
_ => {}
286+
}
287+
}
288+
Ok(out)
289+
}

0 commit comments

Comments
 (0)