Skip to content

Commit e164982

Browse files
author
Laurent Franceschetti
committed
Improve tests (#244)
- Extracted fixture utilities into `fixture_util.py` - Created a ./test_fixture.sh to specifically test the fixture, with options (also called when invoking `pytest`) - Specifically made a test in that context, for the `find_page()` function.
1 parent 9186fe6 commit e164982

File tree

4 files changed

+364
-298
lines changed

4 files changed

+364
-298
lines changed

test/fixture.py

Lines changed: 21 additions & 285 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,35 @@
22
Fixtures for the testing of Mkdocs-Macros (pytest)
33
This program must be in the test directory.
44
5+
It provides two classes:
6+
7+
- DocProject
8+
- TestMarkdownPage
9+
10+
511
(C) Laurent Franceschetti 2024
612
"""
713

814
import os
9-
from io import StringIO
1015
import yaml
1116
import subprocess
1217
import re
1318
from dataclasses import dataclass, field
1419
from typing import List
1520
import json
1621
from typing import Any, List
17-
import inspect
22+
1823

1924

2025
# from rich import print
21-
import markdown
2226
from bs4 import BeautifulSoup
23-
import pandas as pd
24-
import rich
25-
from rich.table import Table
2627

2728

2829
"A dictionary where the keys are also accessible with the dot notation"
2930
from mkdocs_macros.util import SuperDict
31+
from .fixture_util import (get_frontmatter, markdown_to_html, get_first_h1,
32+
find_in_html, find_after, list_markdown_files, find_page,
33+
run_command)
3034

3135
# ---------------------------
3236
# Initialization
@@ -63,269 +67,6 @@ def list_doc_projects(directory:str):
6367
"The error string"
6468
MACRO_ERROR_STRING = '# _Macro Rendering Error_'
6569

66-
67-
# ---------------------------
68-
# Print functions
69-
# ---------------------------
70-
std_print = print
71-
from rich import print
72-
from rich.panel import Panel
73-
74-
TITLE_COLOR = 'green'
75-
def h1(s: str, color: str = TITLE_COLOR):
    """
    Color print a 1st level title to the console.

    Arguments
    ---------
    - s: the title text
    - color: the color name, used both for the text markup and the panel style
    """
    # blank line first, to separate the title from what precedes it
    print()
    print(Panel(f"[{color} bold]{s}", style=color, width=80))
79-
80-
def h2(s: str, color: str = TITLE_COLOR):
    """
    Color print a 2nd level title to the console.

    Arguments
    ---------
    - s: the title text
    - color: the color name used in the markup (bold, underlined)
    """
    print()
    # honor the `color` argument (it used to be silently ignored)
    print(f"[{color} bold underline]{s}")
84-
85-
def h3(s: str, color: str = TITLE_COLOR):
    """
    Color print a 3rd level title to the console.

    Arguments
    ---------
    - s: the title text
    - color: the color name used in the markup (underlined, not bold)
    """
    print()
    # honor the `color` argument (it used to be silently ignored)
    print(f"[{color} underline]{s}")
89-
90-
# ---------------------------
91-
# Low-level functions
92-
# ---------------------------
93-
94-
def find_after(s:str, word:str, pattern:str):
95-
"""
96-
Find the the first occurence of a pattern after a word
97-
(Both word and pattern can be regex, and the matching
98-
is case insensitive.)
99-
"""
100-
word_pattern = re.compile(word, re.IGNORECASE)
101-
parts = word_pattern.split(s, maxsplit=1)
102-
# parts = s.split(word, 1)
103-
104-
if len(parts) > 1:
105-
# Strip the remainder and search for the pattern
106-
remainder = parts[1].strip()
107-
match = re.search(pattern, remainder, flags=re.IGNORECASE)
108-
return match.group(0) if match else None
109-
else:
110-
return None
111-
112-
def list_markdown_files(directory: str) -> list[str]:
    """
    Make a list of the markdown files in a directory (recursively).

    Arguments
    ---------
    - directory: the root directory to walk

    Returns
    -------
    The paths of the files ending in `.md` or `.markdown`,
    relative to `directory`, sorted so that the result
    does not depend on the order of the filesystem listing.
    """
    markdown_files = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # endswith() accepts a tuple of suffixes
            if file.endswith(('.md', '.markdown')):
                full_path = os.path.join(root, file)
                markdown_files.append(os.path.relpath(full_path, directory))
    return sorted(markdown_files)
123-
124-
125-
def markdown_to_html(markdown_text: str) -> str:
    """
    Convert markdown text to HTML.

    The conversion is done by `markdown.markdown()`,
    with the "tables" extension enabled.
    """
    return markdown.markdown(markdown_text, extensions=["tables"])
131-
132-
133-
def style_dataframe(df: pd.DataFrame):
    """
    Apply beautiful and colorful styling to any dataframe
    (patches the dataframe).

    A `__rich__` method is attached to this dataframe instance;
    it builds a rich `Table` from the columns and rows.
    Nothing is returned: the dataframe is modified in place.
    """
    def _rich_str(self):
        table = Table(show_header=True, header_style="bold magenta")

        # Add columns
        for col in self.columns:
            # column labels are not necessarily strings (e.g. integers)
            table.add_column(str(col), style="dim", width=12)

        # Add rows
        for row in self.itertuples(index=False):
            table.add_row(*map(str, row))

        return table

    # Bind the function to this instance as `__rich__`
    # (presumably so that rich's print renders the table, while the
    # ordinary str() of the dataframe is left untouched).
    df.__rich__ = _rich_str.__get__(df)
153-
154-
def extract_tables_from_html(html: str, formatter: callable = None):
    """
    Extract tables from a HTML source and convert them into dataframes.

    Arguments
    ---------
    - html: the html string
    - formatter: optional callable, applied to each dataframe
      (its return value is ignored)

    Returns
    -------
    A dictionary of dataframes; the key is the text of the nearest
    header (h1 to h6) preceding the table, or "Unnamed Table N"
    if there is none.

    NOTE: if several tables share the same nearest header,
    only the last one is kept under that key.
    """
    soup = BeautifulSoup(html, 'html.parser')

    dataframes = {}
    unnamed_table_count = 0
    for table in soup.find_all('table'):
        # Find the nearest header
        header = table.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        if header:
            header_text = header.get_text()
        else:
            unnamed_table_count += 1
            header_text = f"Unnamed Table {unnamed_table_count}"

        # Convert HTML table to DataFrame
        # (the markup is wrapped in StringIO before being given to read_html)
        df = pd.read_html(StringIO(str(table)))[0]
        if formatter:
            formatter(df)
        # Add DataFrame to dictionary with header as key
        dataframes[header_text] = df

    return dataframes
181-
182-
183-
def get_frontmatter(text: str) -> tuple[str, str, dict]:
    """
    Get the front matter from a markdown file.

    Arguments
    ---------
    - text: the full source of the markdown file

    Returns
    -------
    A tuple (markdown, frontmatter, metadata):
    - markdown: the body of the page, stripped
    - frontmatter: the raw YAML text between the `---` delimiters
    - metadata: the parsed YAML, as a SuperDict

    If the text does not start with a front matter,
    the result is (text, '', {}).
    """
    # A front matter is only valid at the very top of the document.
    # A `---` found further down (e.g. a horizontal rule) must not be
    # mistaken for one, or the text before it would be lost.
    head = text.lstrip('\ufeff').lstrip()
    if not head.startswith('---'):
        return (text, '', {})

    # parts[0] is the (empty) text before the opening delimiter
    parts = head.split('---', maxsplit=2)
    frontmatter = parts[1]
    # safe_load() returns None for an empty front matter
    metadata = SuperDict(yaml.safe_load(frontmatter) or {})
    # the closing delimiter (and hence the body) may be missing
    body = parts[2] if len(parts) > 2 else ''
    return (body.strip(), frontmatter, metadata)
197-
198-
def find_in_html(html: str,
                 pattern: str,
                 header: str | None = None,
                 header_level: int | None = None) -> str | None:
    """
    Find a text or regex pattern in a HTML document (case-insensitive)

    Arguments
    ---------
    - html: the html string
    - pattern: the text or regex
    - header (text or regex): if specified, it finds it first,
      and then looks for the text between that header and the next one
      (any level).
    - header_level: you can specify it, if there is a risk of ambiguity.

    Returns
    -------
    The line where the pattern was found, or None
    """
    if not isinstance(pattern, str):
        pattern = str(pattern)

    soup = BeautifulSoup(html, 'html.parser')

    # Compile regex patterns with case-insensitive flag
    pattern_regex = re.compile(pattern, re.IGNORECASE)

    if not header:
        # No header given: search the text of the whole document
        full_text = soup.get_text(separator='\n', strip=True)
        return _first_matching_line(pattern_regex, full_text)

    header_regex = re.compile(header, re.IGNORECASE)
    heading_tag = re.compile('^h[1-6]$', re.IGNORECASE)

    # Examine all headers (h1 to h6), in document order
    for hdr in soup.find_all(heading_tag):
        if not header_regex.search(hdr.text):
            continue
        # Check if header level is specified and matches
        if header_level and hdr.name != f'h{header_level}':
            continue

        # Extract the text of the following siblings, until the next header
        text = []
        for sibling in hdr.find_next_siblings():
            if sibling.name and heading_tag.match(sibling.name):
                break
            text.append(sibling.get_text(separator='\n', strip=True))

        line = _first_matching_line(pattern_regex, '\n'.join(text))
        if line is not None:
            return line
        # otherwise, try the next header that matches

    return None


def _first_matching_line(pattern_regex, full_text: str) -> str | None:
    "Return the first line of the text matching the regex, or None"
    # The pattern must first match the text taken as a whole
    # (same pre-check as before the refactoring)
    if not pattern_regex.search(full_text):
        return None
    for line in full_text.split('\n'):
        if pattern_regex.search(line):
            return line
    return None
268-
269-
270-
271-
272-
273-
274-
def get_first_h1(markdown_text: str):
275-
"""
276-
Get the first h1 in a markdown file,
277-
ignoring YAML frontmatter and comments.
278-
"""
279-
# Remove YAML frontmatter
280-
yaml_frontmatter_pattern = re.compile(r'^---\s*\n(.*?\n)?---\s*\n',
281-
re.DOTALL)
282-
markdown_text = yaml_frontmatter_pattern.sub('', markdown_text)
283-
# Regular expression to match both syntaxes for level 1 headers
284-
h1_pattern = re.compile(r'^(# .+|.+\n=+)', re.MULTILINE)
285-
match = h1_pattern.search(markdown_text)
286-
if match:
287-
header = match.group(0)
288-
# Remove formatting
289-
if header.startswith('#'):
290-
return header.lstrip('# ').strip()
291-
else:
292-
return header.split('\n')[0].strip()
293-
return None
294-
295-
296-
297-
def get_tables(markdown_text: str) -> dict[str, pd.DataFrame]:
    """
    Convert markdown text to HTML, extract tables,
    and convert them to dataframes.

    Returns
    -------
    The dictionary produced by `extract_tables_from_html()`,
    with `style_dataframe()` applied to each dataframe.
    """
    html = markdown_to_html(markdown_text)
    return extract_tables_from_html(html, formatter=style_dataframe)
306-
307-
308-
309-
# ---------------------------
310-
# OS Functions
311-
# ---------------------------
312-
def run_command(command, *args) -> subprocess.CompletedProcess:
    """
    Execute a command.

    Arguments
    ---------
    - command: the program to run
    - args: its arguments (passed as a list, without a shell)

    Returns
    -------
    The completed process; stdout and stderr are captured as text.
    """
    return subprocess.run([command, *args], capture_output=True, text=True)
316-
317-
def get_caller_directory() -> str:
    """
    Get the caller's directory name (to be called from a function).

    Returns
    -------
    The absolute path of the directory containing the file
    of the function that called this one.
    """
    current_frame = inspect.currentframe()
    if current_frame is None or current_frame.f_back is None:
        raise RuntimeError("Cannot determine the caller's frame")
    try:
        # Only the direct caller is needed: inspect that single frame,
        # without source context, rather than the whole stack.
        caller_file = inspect.getframeinfo(current_frame.f_back,
                                           context=0).filename
    finally:
        # drop the frame reference, to avoid a reference cycle
        del current_frame
    return os.path.abspath(os.path.dirname(caller_file))
328-
32970
# ---------------------------
33071
# Log parsing
33172
# ---------------------------
@@ -824,24 +565,19 @@ def pages(self) -> List[TestMarkdownPage]:
824565
return self._pages
825566

826567
def get_page(self, name: str):
    """
    Find a name in the list of Markdown pages (filenames)
    using a name (full or partial, with or without extension).

    The choice of the filename is delegated to `find_page()`
    (imported from `.fixture_util`).

    Returns
    -------
    The page whose filename was selected, or None if no page has it.
    """
    # get all the filenames of pages:
    filenames = [page.filename for page in self.pages]
    # get the filename we want, from that list:
    filename = find_page(name, filenames)
    # return the corresponding page:
    for page in self.pages:
        if page.filename == filename:
            return page
    return None
845581

846582
def get_plugin(self, name:str) -> SuperDict:
847583
"Get the plugin by its plugin name"

0 commit comments

Comments
 (0)