Skip to content

Commit

Permalink
add csv sniffing & encoding detection
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromedockes committed Jul 15, 2024
1 parent 1258032 commit b443179
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 73 deletions.
Binary file not shown.
10 changes: 6 additions & 4 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
href="https://cdn.jsdelivr.net/npm/purecss@3.0.0/build/pure-min.css"
integrity="sha384-X38yfunGUhNzHpBaEBsWLO+A0HDYOQi8ufWDkZ0k9e0eXz/tH3II7uKZ9msv++Ls"
crossorigin="anonymous">
<link rel="stylesheet" href="skrub-online-report.css?versionb528">
<link rel="stylesheet" href="skrub-online-report.css?version9a8b">
<script src="https://cdn.jsdelivr.net/pyodide/v0.26.1/full/pyodide.js"></script>
<script src="skrub-online-report.js?versionb528" defer></script>
<script src="skrub-online-report.js?version9a8b" defer></script>
</head>

<body>
Expand Down Expand Up @@ -62,18 +62,20 @@ <h1>Skrub table report generator</h1>
</fieldset>
</form>

<div>
<div id="csv-raw-text-section">
<h2>File contents</h2>
<div class="scroll">
<div id="csv-raw-text"></div>
</div>
</div>
<div>
<div id="csv-preview-section">
<h2>Table preview</h2>
<div class="scroll">
<div id="csv-preview"></div>
</div>
</div>
<div id="csv-error-section" class="error">
</div>
</details>
</div>
<div id="report">
Expand Down
2 changes: 1 addition & 1 deletion skrub-online-report.css
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fieldset {
align-items: center;
}

[data-hidden] {
[data-hidden], [data-hidden-force] {
display: none;
}

Expand Down
56 changes: 41 additions & 15 deletions skrub-online-report.js
Original file line number Diff line number Diff line change
Expand Up @@ -158,25 +158,51 @@ function updateState(data) {
setElementStates({
disableControls: false
});
if (data.decodingError !== undefined) {
let content =
`<div class="error"><p><strong>Failed to decode '${data.fileName}'. ` +
`Please select the file's encoding in the menu above. ` +
`The error message is shown below</strong></p><pre>${data.decodingError}</pre></div>`;
document.getElementById("csv-raw-text").innerHTML = content;

if (data.sniffedCsvParams) {
const form = document.getElementById("csv-dialog-form");
for (let param of ["delimiter", "quote", "escape", "encoding"]) {
form.elements[`${param}-input`].value = data[param];
}
}

const textSec = document.getElementById("csv-raw-text-section");
const textContentDiv = document.getElementById("csv-raw-text");
textContentDiv.innerHTML = "";
if (data.raw !== undefined) {
textSec.removeAttribute("data-hidden-force");
const pre = document.createElement("pre");
pre.textContent = data.raw;
textContentDiv.appendChild(pre);
} else {
document.getElementById("csv-raw-text").innerHTML =
`<pre>${data.raw}</pre>`;
textSec.setAttribute("data-hidden-force", "");
}
if (data.previewError !== undefined) {
let content =
`<div class="error"><p><strong>Failed to parse '${data.fileName}'. ` +
`The error message is shown below</strong></p><pre>${data.previewError}</pre></div>`;
document.getElementById("csv-preview").innerHTML = content;

const previewSec = document.getElementById("csv-preview-section");
const previewContentDiv = document.getElementById("csv-preview");
previewContentDiv.innerHTML = "";
if (data.preview !== undefined) {
previewSec.removeAttribute("data-hidden-force");
previewContentDiv.innerHTML = data.preview;
} else {
previewSec.setAttribute("data-hidden-force", "");
}

document.getElementById("csv-preview").innerHTML = data.preview;
const errorSec = document.getElementById("csv-error-section");
errorSec.innerHTML = "";
if (data.error !== undefined) {
errorSec.removeAttribute("data-hidden-force");
const summary = document.createElement("strong");
summary.textContent =
`There was an error processing ${data.fileName}. ` +
`You may need to adjust the encoding or CSV formatting information above. ` +
`The error is shown below`;
errorSec.appendChild(summary);
const errDetails = document.createElement("p");
errDetails.textContent = data.error;
errorSec.appendChild(errDetails);
} else {
errorSec.setAttribute("data-hidden-force", "");
}
break;
}
Expand Down Expand Up @@ -257,7 +283,7 @@ setElementStates({
disableControls: false,
hide: ["csv-dialog", "status-container"]
});
const worker = new Worker("skrub-worker.js?versionb528");
const worker = new Worker("skrub-worker.js?version9a8b");
worker.onmessage = (e) => {
console.log(e.data.kind);
updateState(e.data);
Expand Down
137 changes: 84 additions & 53 deletions skrub-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,34 @@ async function startPyodide() {
self.pyodide = await loadPyodide();
await self.pyodide.loadPackage("micropip");
const micropip = self.pyodide.pyimport("micropip");
await micropip.install(["pandas", "fastparquet"]);
await micropip.install("skrub-0.3.0.dev0-py3-none-any.whl?68a7t9");
await self.pyodide.loadPackage(["pandas", "matplotlib"]);
self.postMessage({
kind: "DONE_LOADING_PYODIDE"
});
}

async function computeReport() {
await micropip.install(["pandas", "fastparquet", "matplotlib",
"skrub-0.3.0.dev0-py3-none-any.whl?68a7t9",
"clevercsv-0.8.2-cp312-cp312-pyodide_2024_0_wasm32.whl"
]);
await pyodide.runPython(`
import io
import os
import traceback
import gc
os.environ["MPLBACKEND"] = "AGG"
import pandas as pd
import clevercsv
import chardet
import skrub
import js
`);
self.postMessage({
kind: "DONE_LOADING_PYODIDE"
});
}

report_error = None
async function computeReport() {
await pyodide.runPython(`
report_error = report = None
try:
def get_report():
global report
data = io.BytesIO(js.data.to_py())
if js.fileName.endswith(".parquet"):
df = pd.read_parquet(data)
Expand All @@ -47,16 +50,12 @@ try:
na_values=["?"],
keep_default_na=True,
)
report = skrub.TableReport(df, title=js.fileName).html_snippet()
try:
get_report()
except Exception as e:
report_error = traceback.format_exc()
report = None
data = None
df = None
gc.collect()
`);
return {
report: pyodide.globals.get("report"),
Expand Down Expand Up @@ -113,56 +112,86 @@ async function csvPreview() {
import io
import codecs
import traceback
import gc
import pandas as pd
import clevercsv
from chardet.universaldetector import UniversalDetector
import js
data_bytes = js.data.to_py()
decoding_error, preview_error = None, None
try:
decoder = codecs.getincrementaldecoder(js.encoding)()
decoded_text = decoder.decode(data_bytes[:4000])
except Exception as e:
decoding_error = str(e)
decoded_text = ""
data = io.BytesIO(data_bytes)
try:
error = error_type = decoded_text = preview = None
encoding = delimiter = quote = escape = None
def get_preview():
global encoding, delimiter, quote, escape, decoded_text, preview
data_bytes = bytes(js.data.to_py())
if js.sniff:
detector = UniversalDetector()
detector.feed(data_bytes[:65536])
detector.close()
encoding = detector.result["encoding"]
if encoding == "ascii":
encoding = "utf-8"
decoder = codecs.getincrementaldecoder(encoding)()
decoded_text = decoder.decode(data_bytes[:65536])
dialect = clevercsv.Sniffer().sniff(decoded_text)
decoded_text = decoded_text[:4000]
delimiter = dialect.delimiter or ","
quote = dialect.quotechar or '"'
escape = dialect.escapechar or ("\\u005c" if quote == "'" else quote)
else:
encoding = js.encoding
delimiter = js.delimiter
escape = js.escape
quote = js.quote
decoder = codecs.getincrementaldecoder(encoding)()
decoded_text = decoder.decode(data_bytes[:4000])
data = io.BytesIO(data_bytes)
df = pd.read_csv(
data,
encoding=js.encoding,
sep=js.delimiter,
escapechar=js.escape,
quotechar=js.quote,
encoding=encoding,
sep=delimiter,
escapechar=escape,
quotechar=quote,
nrows=5,
na_values=["?"],
keep_default_na=True,
)
preview = df.to_html().replace(
'class="dataframe"', 'class="pure-table pure-table-striped"'
)
except Exception as e:
preview = ""
preview_error = traceback.format_exc()
data_bytes = None
data = None
df = None
gc.collect()
try:
get_preview()
except Exception as e:
error = traceback.format_exc()
error_type = e.__class__.__name__
`);
self.postMessage({
kind: "DONE_COMPUTING_CSV_PREVIEW",
fileName: self.fileName
});
self.postMessage({
const data = {
kind: "CSV_PREVIEW",
raw: pyodide.globals.get("decoded_text"),
preview: pyodide.globals.get("preview"),
decodingError: pyodide.globals.get("decoding_error"),
previewError: pyodide.globals.get("preview_error"),
errorType: pyodide.globals.get("error_type"),
error: pyodide.globals.get("error"),
sniffedCsvParams: false,
fileName: self.fileName
});

};
if (self.sniff) {
data.sniffedCsvParams = true;
self.encoding = pyodide.globals.get("encoding");
data.encoding = self.encoding;
self.delimiter = pyodide.globals.get("delimiter");
data.delimiter = self.delimiter;
self.quote = pyodide.globals.get("quote");
data.quote = self.quote;
self.escape = pyodide.globals.get("escape");
data.escape = self.escape;
}
self.postMessage(data);
}

const pyodideLoaded = startPyodide();
Expand All @@ -183,20 +212,22 @@ self.onmessage = async (e) => {
if (self.fileName.endsWith(".parquet")) {
getReport();
} else {
self.sniff = true;
csvPreview();
}
break;
case "CSV_PARAMS":
if (self.encoding !== e.data.encoding || self.quote !== e.data
.quote || self.delimiter !== e.data.delimiter || self.escape !==
e.data.escape) {
self.sniff = false;
self.encoding = e.data.encoding;
self.quote = e.data.quote;
self.delimiter = e.data.delimiter;
self.escape = e.data.escape;
csvPreview();
}
break;
case "CSV_PARAMS":
self.encoding = e.data.encoding;
self.quote = e.data.quote;
self.delimiter = e.data.delimiter;
self.escape = e.data.escape;
csvPreview();
break;
case "CSV_COMMIT":
getReport();
break;
Expand Down

0 comments on commit b443179

Please sign in to comment.