Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions examples/python-magic/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# python-magic Examples

Each sub-directory contains a self-contained example. The order in
which the examples are to appear is specified in `order.json` (an
array of directory names in the expected order).

In each example directory you'll find:

* `config.toml` - must conform to the specification outlined here:
<https://docs.pyscript.net/latest/user-guide/configuration/>. It is
parsed and ultimately turned into a JSON representation as part of
the package's API object.
* `setup.py` - Python code for contextual and environmental setup. It is
**not seen by the end user**, but is run before the `code.py` code is
evaluated. This lets us create useful (IPython) shims and avoid
repeating boilerplate.
* `code.py` - the actual code added to the editor which forms the
practical example of using the package.
54 changes: 54 additions & 0 deletions examples/python-magic/identify_file_types/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""
A first look at python-magic.

python-magic is a thin Python wrapper around libmagic, the same engine
behind the Unix `file` command. Given some bytes (or a file path), it
guesses the file type by inspecting the content's signature -- not the
filename or extension.

Docs: https://github.com/ahupp/python-magic
"""
from IPython.core.display import display, HTML

import magic


# Representative leading bytes for a handful of well-known formats, keyed
# by the filename such content might arrive under. bytes(n) is n zero
# bytes, used purely as padding after a header.
file_samples = {
    "report.pdf": (
        b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
        b"1 0 obj\n<< /Type /Catalog >>\nendobj\n"
    ),
    "logo.png": (
        b"\x89PNG\r\n\x1a\n"
        b"\x00\x00\x00\rIHDR\x00\x00\x00\x10\x00\x00\x00\x10\x08\x06\x00\x00\x00"
    ),
    "photo.jpg": (
        b"\xff\xd8\xff\xe0\x00\x10JFIF\x00"
        b"\x01\x01\x00\x00\x01\x00\x01\x00\x00"
    ),
    "archive.zip": b"PK\x03\x04\x14\x00\x00\x00\x08\x00" + bytes(20),
    "notes.txt": b"Dear diary,\nToday I learned about libmagic.\n",
    "song.mp3": b"ID3\x04\x00\x00\x00\x00\x00\x00" + bytes(64),
}

heading("Guessing file types from raw bytes")

intro_text = (
    "We pass the first chunk of each file's bytes to "
    "<code>magic.from_buffer</code> and let libmagic identify it. "
    "Notice that we never look at the filename -- the bytes alone are enough."
)
note(intro_text)

# One table row per sample: the filename, libmagic's human-readable
# description, and the MIME type obtained from a second call with mime=True.
header_row = "<tr><th>Filename</th><th>Description</th><th>MIME type</th></tr>"
rows = [
    header_row,
    *(
        f"<tr><td><code>{name}</code></td>"
        f"<td>{magic.from_buffer(data)}</td>"
        f"<td><code>{magic.from_buffer(data, mime=True)}</code></td></tr>"
        for name, data in file_samples.items()
    ),
]

table_markup = "<table>" + "".join(rows) + "</table>"
display(HTML(table_markup), append=True)

closing_text = (
    "The recommendation from the docs is to feed at least the first 2048 "
    "bytes for reliable identification. Shorter buffers can confuse the "
    "detection."
)
note(closing_text)
1 change: 1 addition & 0 deletions examples/python-magic/identify_file_types/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["python-magic"]
44 changes: 44 additions & 0 deletions examples/python-magic/identify_file_types/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Shim IPython's display API onto PyScript so example code written in a
Jupyter/IPython idiom runs unmodified in the browser.
"""

import sys
import types
import js
from pyscript import window, HTML, display as _display

# Rebind js.alert to the alert function exposed by pyscript's `window`.
# NOTE(review): presumably so example code calling js.alert() reaches the
# page's alert dialog -- confirm why the default binding is insufficient.
js.alert = window.alert


def display(*args, **kwargs):
    """Forward to pyscript's display, aimed at this example's output target.

    All positional and keyword arguments are passed through unchanged;
    ``target`` is always supplied from ``__pyscript_display_target__``.
    """
    destination = __pyscript_display_target__
    return _display(*args, target=destination, **kwargs)


# Build stand-in IPython modules, leaf first, so that
# `from IPython.core.display import display, HTML` (and the
# `IPython.display` spelling) resolve to the PyScript-backed names.
core_display = types.ModuleType("IPython.core.display")
core_display.display = display
core_display.HTML = HTML

core = types.ModuleType("IPython.core")
core.display = core_display

ipython = types.ModuleType("IPython")
ipython.core = core
ipython.display = core_display
ipython.version_info = (9, 0, 2, '')
# There is no interactive shell here, so get_ipython() yields None.
ipython.get_ipython = lambda: None

# Register every dotted path; both display paths share one module object.
sys.modules.update({
    "IPython": ipython,
    "IPython.core": core,
    "IPython.core.display": core_display,
    "IPython.display": core_display,
})


def heading(text, level=2):
    """Append ``text`` to the output wrapped in an ``<h{level}>`` element."""
    tag = f"h{level}"
    display(HTML(f"<{tag}>{text}</{tag}>"), append=True)


def note(text):
    """Append ``text`` to the output as a single ``<p>`` paragraph."""
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)
87 changes: 87 additions & 0 deletions examples/python-magic/mime_router/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# ---------------------------------------------------------------------
# Building a tiny content-aware upload router with python-magic.
# ---------------------------------------------------------------------
#
# A common real-world use of python-magic: an "upload handler" that
# decides what to do with a file based on its true type, regardless
# of what the user named it. This protects against mislabeled or
# disguised files (think: a script renamed to look like an image).

import html

import magic


heading("A MIME-based upload router")

router_overview = (
    "Each incoming upload is a tuple of (claimed filename, bytes). "
    "We detect the real MIME type and dispatch to the appropriate "
    "handler. A mismatch between the filename's extension and the "
    "detected type is flagged as suspicious."
)
note(router_overview)

# Simulated form submissions: (claimed filename, raw content) pairs.
# bytes(n) is n zero bytes, used purely as padding after each header.
incoming_uploads = [
    (
        "vacation.jpg",
        b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00"
        + bytes(200),
    ),
    (
        "budget.pdf",
        b"%PDF-1.5\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<<>>\nendobj\n" + bytes(100),
    ),
    ("backup.zip", b"PK\x03\x04\x14\x00\x00\x00\x08\x00" + bytes(200)),
    # Disguised upload: the name says PNG, the content is a shell script.
    ("avatar.png", 20 * b"#!/bin/sh\necho 'definitely not an image'\n"),
    (
        "readme.txt",
        10 * b"Welcome to the project!\n\nThis is a friendly readme.\n",
    ),
]


# Map MIME prefixes to handler descriptions. In a real app these would
# be functions; here we just describe what would happen.
def route(mime_type):
    """Pick the handler for a detected MIME type.

    Returns a ``(handler_name, action)`` tuple of strings. Image and text
    types are matched by prefix, PDF and ZIP types by exact value, and
    anything else goes to the quarantine handler.
    """
    by_prefix = (
        ("image/", ("ImageProcessor", "resize and store in /uploads/images")),
        ("text/", ("TextStore", "save to /uploads/text")),
    )
    for prefix, destination in by_prefix:
        if mime_type.startswith(prefix):
            return destination

    archive = ("ArchiveScanner", "scan contents before unpacking")
    by_exact_type = {
        "application/pdf": (
            "DocumentIndexer",
            "extract text and add to search index",
        ),
        "application/zip": archive,
        "application/x-zip-compressed": archive,
    }
    fallback = ("QuarantineBin", "unknown type, hold for review")
    return by_exact_type.get(mime_type, fallback)


# Map common extensions to the MIME type (or MIME prefix) we expect
# libmagic to report for them. A value is either a single string or a
# tuple of alternatives; str.startswith accepts both forms.
expected_prefix = {
    ".jpg": "image/", ".jpeg": "image/", ".png": "image/",
    ".pdf": "application/pdf",
    # Keep in step with the ZIP aliases route() sends to ArchiveScanner,
    # so a genuine archive is never reported as a mismatch.
    ".zip": ("application/zip", "application/x-zip-compressed"),
    ".txt": "text/",
}

rows = [
    "<tr><th>Filename</th><th>Detected MIME</th>"
    "<th>Handler</th><th>Status</th></tr>"
]
for filename, data in incoming_uploads:
    mime_type = magic.from_buffer(data, mime=True)
    handler, action = route(mime_type)

    # Cross-check the extension against the detected type. A name with
    # no dot has no extension, so it is left unchecked rather than having
    # the whole name treated as one.
    _, dot, suffix = filename.rpartition(".")
    extension = f".{suffix.lower()}" if dot else ""
    expected = expected_prefix.get(extension, "")
    if expected and not mime_type.startswith(expected):
        status = "⚠️ extension/content mismatch"
    else:
        status = "✓ ok"

    # The filename is chosen by the uploader, so escape it (and the
    # detected type) before interpolating into markup. handler, action
    # and status are our own constants and already contain intended HTML.
    rows.append(
        f"<tr><td><code>{html.escape(filename)}</code></td>"
        f"<td><code>{html.escape(mime_type)}</code></td>"
        f"<td>{handler} &mdash; <em>{action}</em></td>"
        f"<td>{status}</td></tr>"
    )

display(HTML("<table>" + "".join(rows) + "</table>"), append=True)

note(
    "Notice how <code>avatar.png</code> is correctly identified as a "
    "shell script (<code>text/x-shellscript</code> or similar) and "
    "flagged. This is exactly the kind of check a libmagic-based router "
    "buys you for free."
)
1 change: 1 addition & 0 deletions examples/python-magic/mime_router/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["python-magic"]
20 changes: 20 additions & 0 deletions examples/python-magic/mime_router/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Lightweight setup for cell 2 -- no IPython shim, just the names
the first cell already established."""
import js
from pyscript import window, HTML, display as _display

# Rebind js.alert to the alert function exposed by pyscript's `window`.
# NOTE(review): presumably so example code calling js.alert() reaches the
# page's alert dialog -- confirm why the default binding is insufficient.
js.alert = window.alert


def display(*args, **kwargs):
    """Forward to pyscript's display, aimed at this example's output target.

    All positional and keyword arguments are passed through unchanged;
    ``target`` is always supplied from ``__pyscript_display_target__``.
    """
    destination = __pyscript_display_target__
    return _display(*args, target=destination, **kwargs)


def heading(text, level=2):
    """Append ``text`` to the output wrapped in an ``<h{level}>`` element."""
    tag = f"h{level}"
    display(HTML(f"<{tag}>{text}</{tag}>"), append=True)


def note(text):
    """Append ``text`` to the output as a single ``<p>`` paragraph."""
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)
4 changes: 4 additions & 0 deletions examples/python-magic/order.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[
"identify_file_types",
"mime_router"
]