Skip to content

Commit

Permalink
refactor(ts): kwic_download
Browse files Browse the repository at this point in the history
  • Loading branch information
arildm committed Jun 13, 2024
1 parent 25c9d3f commit 37f3d92
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 70 deletions.
10 changes: 6 additions & 4 deletions app/scripts/backend/kwic-proxy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ const kwicProxyFactory = new Factory(KwicProxy)
export default kwicProxyFactory

/** @see https://ws.spraakbanken.gu.se/docs/korp#tag/Concordance/paths/~1query/get */
type KorpQueryParams = {
export type KorpQueryParams = {
corpus: string
cqp: string
start?: number
Expand Down Expand Up @@ -164,7 +164,7 @@ type MakeRequestOptions = {
type Interval = { start: number; end: number }

/** @see https://ws.spraakbanken.gu.se/docs/korp#tag/Concordance/paths/~1query/get */
type KorpQueryResponse = {
export type KorpQueryResponse = {
/** Search hits */
kwic: ApiKwic[]
/** Total number of hits */
Expand All @@ -178,15 +178,17 @@ type KorpQueryResponse = {
}

/** Search hits */
type ApiKwic = {
export type ApiKwic = {
/** An object for each token in the context, with attribute values for that token */
tokens: Record<string, any>[]
/** Attribute values for the context (e.g. sentence) */
structs: Record<string, any>
/** Specifies the position of the match in the context. If `in_order` is false, `match` will consist of a list of match objects, one per highlighted word */
match: KwicMatch | KwicMatch[]
/** Hits from aligned corpora if available, otherwise omitted */
aligned: Record<string, any[]>
aligned: {
[linkedCorpusId: `${string}-${string}`]: Record<string, any>[]
}
}

/** Specifies the position of a match in a context */
Expand Down
138 changes: 72 additions & 66 deletions app/scripts/kwic_download.js → app/scripts/kwic_download.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,61 +3,68 @@ import _ from "lodash"
import moment from "moment"
import CSV from "comma-separated-values/csv"
import { locObj } from "@/i18n"
import { type ApiKwic, type KorpQueryParams } from "@/backend/kwic-proxy"
import { LangMap } from "./i18n/types"

// This is what is returned by massageData in kwic.js
type Row = ApiKwic | LinkedKwic | CorpusHeading
// The annotations option is not available for parallel
type AnnotationsRow = ApiKwic | CorpusHeading

/**
 * A row holding the tokens of a linked sentence in parallel mode.
 * Unlike `ApiKwic` it declares no `match` or `structs`; the `isLinked` property is what `isLinkedKwic` looks for.
 */
type LinkedKwic = {
/** Same token shape as `ApiKwic["tokens"]` */
tokens: ApiKwic["tokens"]
/** Marker property; its literal type is always `true` */
isLinked: true
/** NOTE(review): presumably the id of the linked corpus — confirm against massageData in kwic.js */
corpus: string
}

/**
 * A row that announces a corpus. In the KWIC export its `newCorpus` value is localized with `locObj`
 * and used as the corpus column for the hit rows that follow.
 */
type CorpusHeading = {
/** Corpus title, either a plain string or a per-language map */
newCorpus: LangMap | string
/** NOTE(review): not read by any code visible here — confirm its purpose with the producer of these rows */
noContext?: boolean
}

const emptyRow = (length) => _.fill(new Array(length), "")
/** True for rows that carry the `isLinked` marker, i.e. linked sentences. */
const isLinkedKwic = (row: Row): row is LinkedKwic => {
    return "isLinked" in row
}

/** True for rows that announce a corpus (they have a `newCorpus` property). */
const isCorpusHeading = (row: Row): row is CorpusHeading => {
    return "newCorpus" in row
}

/** True for ordinary hit rows: they have `tokens` and are not linked sentences. */
const isKwic = (row: Row): row is ApiKwic => {
    if (isLinkedKwic(row)) return false
    return "tokens" in row
}

const padRows = (data, length) => _.map(data, (row) => [row, ...emptyRow(length - 1)])
/** One row of the exported table; each cell is a string or a number. */
type TableRow = (string | number)[]

function createFile(dataType, fileType, content) {
/** Builds an array of `length` empty strings, used as blank cells or as a blank table row. */
const emptyRow = (length: number): string[] => {
    const cells = new Array<string>(length)
    return cells.fill("")
}

/**
 * Turns each string into a table row of `length` cells: the string itself first,
 * followed by empty strings for the remaining cells.
 */
const padRows = (data: string[], length: number) =>
    _.map(data, (firstCell) => {
        // Start from the blank tail and put the actual value in front of it
        const cells = emptyRow(length - 1)
        cells.unshift(firstCell)
        return cells
    })

/**
 * Wraps the serialized table in a Blob and pairs it with a timestamped filename.
 *
 * @param dataType Kind of export; becomes part of the filename
 * @param fileType File extension; also decides the media type given to the Blob
 * @param content The serialized file content
 * @returns The filename and an object URL for the Blob, in that order.
 *   NOTE(review): the object URL is not revoked here, so presumably the caller releases it — confirm.
 */
function createFile(dataType: string, fileType: string, content: string): [string, string] {
    const date = moment().format("YYYYMMDD_HHmmss")
    const filename = `korp_${dataType}_${date}.${fileType}`
    // "text/tsv" is not a registered media type; tab-separated files are "text/tab-separated-values"
    const mimeType = fileType === "tsv" ? "text/tab-separated-values" : `text/${fileType}`
    const blobURL = window.URL.createObjectURL(new Blob([content], { type: mimeType }))
    return [filename, blobURL]
}

function createSearchInfo(requestInfo, totalHits) {
const rows = []
const fields = ["cqp", "context", "within", "sorting", "start", "end", "hits"]
for (let field of fields) {
var row
if (field === "cqp") {
row = `## CQP query: ${requestInfo.cqp}`
}
if (field === "context") {
row = `## context: ${requestInfo.default_context}`
}
if (field === "within") {
row = `## within: ${requestInfo.default_within}`
}
if (field === "sorting") {
const sorting = requestInfo.sort || "none"
row = `## sorting: ${sorting}`
}
if (field === "start") {
row = `## start: ${requestInfo.start}`
}
if (field === "end") {
row = `## end: ${requestInfo.end}`
}
if (field === "hits") {
row = `## Total hits: ${totalHits}`
}
rows.push(row)
}
return rows
/**
 * Describes the search as a list of `## label: value` lines for inclusion in the exported file.
 *
 * @param requestInfo Parameters of the search request
 * @param totalHits Total number of hits for the search
 * @returns Seven lines: query, context, within, sorting, start, end and total hits, in that order
 */
function createSearchInfo(requestInfo: KorpQueryParams, totalHits: number): string[] {
    // A missing or empty sort setting is reported as "none"
    const sorting = requestInfo.sort || "none"
    const fields: [string, unknown][] = [
        ["CQP query", requestInfo.cqp],
        ["context", requestInfo.default_context],
        ["within", requestInfo.default_within],
        ["sorting", sorting],
        ["start", requestInfo.start],
        ["end", requestInfo.end],
        ["Total hits", totalHits],
    ]
    return fields.map(([label, value]) => `## ${label}: ${value}`)
}

function transformDataToAnnotations(data, searchInfo) {
const headers = _.filter(
_.keys(data[1].tokens[0]),
function transformDataToAnnotations(data: AnnotationsRow[], searchInfo: string[]) {
const firstTokensRow: ApiKwic = data.find((row) => isKwic(row)) as ApiKwic | undefined
if (!firstTokensRow) return undefined

const headers = Object.keys(firstTokensRow.tokens[0]).filter(
(val) => val.indexOf("_") !== 0 && val !== "structs" && val !== "$$hashKey" && val !== "position"
)

const columnCount = headers.length + 1
let corpus
const res = padRows(searchInfo, columnCount)
res.push(["match"].concat(headers))
for (let row of data) {
if (row.tokens) {
for (const row of data) {
if (isKwic(row)) {
const textAttributes = []
for (let attrName in row.structs) {
const attrValue = row.structs[attrName]
Expand Down Expand Up @@ -89,20 +96,14 @@ function transformDataToAnnotations(data, searchInfo) {
return res
}

function transformDataToKWIC(data, searchInfo) {
let row
let corpus
const structHeaders = []
let res = []
for (row of data) {
if (row.tokens) {
if (row.isLinked) {
// parallel mode does not have matches or structs for the linked sentences
// current workaround is to add all tokens to the left context
res.push(["", "", row.tokens.map((token) => token.word).join(" "), "", ""])
continue
}

function transformDataToKWIC(data: Row[], searchInfo: string[]) {
let corpus: string
const structHeaders: string[] = []
let res: TableRow[] = []
for (const row of data) {
if (isCorpusHeading(row)) {
corpus = locObj(row.newCorpus)
} else if (isKwic(row)) {
var attrName, token
const leftContext = []
const match = []
Expand Down Expand Up @@ -139,21 +140,23 @@ function transformDataToKWIC(data, searchInfo) {
structs.push("")
}
}
const newRow = [
const newRow: TableRow = [
corpus,
row.match instanceof Array ? row.match.map((match) => match.position).join(", ") : row.match.position,
leftContext.join(" "),
match.join(" "),
rightContext.join(" "),
].concat(structs)
res.push(newRow)
} else if (row.newCorpus) {
corpus = locObj(row.newCorpus)
} else {
// parallel mode does not have matches or structs for the linked sentences
// current workaround is to add all tokens to the left context
res.push(["", "", row.tokens.map((token) => token.word).join(" "), "", ""])
}
}

const headers = ["corpus", "match_position", "left context", "match", "right_context"].concat(structHeaders)
res = [headers].concat(res)
res = [headers, ...res]

res.push(emptyRow(headers.length))
for (let row of padRows(searchInfo, headers.length)) {
Expand All @@ -163,17 +166,17 @@ function transformDataToKWIC(data, searchInfo) {
return res
}

function transformData(dataType, data, requestInfo, totalHits) {
/**
 * Builds the search-info lines and hands the rows to the transformer for the requested export type.
 * Returns whatever that transformer returns; if `dataType` matches neither branch, execution falls
 * off the end and the result is undefined.
 */
function transformData(dataType: "annotations" | "kwic", data: Row[], requestInfo: KorpQueryParams, totalHits: number) {
const searchInfo = createSearchInfo(requestInfo, totalHits)
if (dataType === "annotations") {
// NOTE(review): this line and the next are two variants of the same return; only the first can ever execute.
return transformDataToAnnotations(data, searchInfo)
// The cast is a compile-time assertion only; it does not filter linked rows out of `data`.
return transformDataToAnnotations(data as AnnotationsRow[], searchInfo)
}
if (dataType === "kwic") {
return transformDataToKWIC(data, searchInfo)
}
}

function makeContent(fileType, transformedData) {
function makeContent(fileType: "csv" | "tsv", transformedData: TableRow[]): string {
let dataDelimiter
if (fileType === "csv") {
dataDelimiter = ","
Expand All @@ -189,11 +192,14 @@ function makeContent(fileType, transformedData) {
return csv.encode()
}

// dataType: either "kwic" or "annotations"
// fileType: either "csv" or "tsv"
// data: json from the backend
export function makeDownload(dataType, fileType, data, requestInfo, totalHits) {
const tmp = transformData(dataType, data, requestInfo, totalHits)
const tmp2 = makeContent(fileType, tmp)
return createFile(dataType, fileType, tmp2)
/**
 * Produces a downloadable file from search results: rows are turned into a table,
 * the table is serialized, and the result is handed to `createFile`.
 *
 * @param dataType Which table layout to export
 * @param fileType Output format
 * @param data Rows to export
 * @param requestInfo Parameters of the search request; written into the file as `##` info lines
 * @param totalHits Total number of hits; written into the file as well
 * @returns Whatever `createFile` returns for the serialized content
 */
export function makeDownload(
    dataType: "annotations" | "kwic",
    fileType: "csv" | "tsv",
    data: Row[],
    requestInfo: KorpQueryParams,
    totalHits: number
) {
    // NOTE(review): transformData can return undefined (e.g. annotations export with no hit rows);
    // that value is passed straight on to makeContent — confirm this is handled there.
    const rows = transformData(dataType, data, requestInfo, totalHits)
    return createFile(dataType, fileType, makeContent(fileType, rows))
}

0 comments on commit 37f3d92

Please sign in to comment.