IMN-705-datalake-data-export (#973)
Co-authored-by: Eric Camellini <[email protected]>
Viktor-K and ecamellini authored Oct 7, 2024
1 parent b39576f commit ce8d346
Showing 25 changed files with 994 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-push.yaml
@@ -85,6 +85,8 @@ jobs:
dockerfile_path: packages/ivass-certified-attributes-importer
- image_name: one-trust-notices
dockerfile_path: packages/one-trust-notices
- image_name: datalake-data-exporter
dockerfile_path: packages/datalake-data-export

steps:
- name: Checkout repository
1 change: 1 addition & 0 deletions docker/docker-compose.yml
@@ -181,6 +181,7 @@ services:
mc ready minio;
echo 'MinIO is ready. Seeding data...';
mc cp --recursive data/ minio/interop-local-bucket/;
mc mb minio/interop-datalake-bucket || true;
"
volumes:
- ./minio-seed:/data
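The new `mc mb minio/interop-datalake-bucket || true` line creates the data lake bucket during MinIO seeding; `mc mb` fails when the bucket already exists, so the `|| true` keeps the seed script idempotent across container restarts.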
1 change: 1 addition & 0 deletions package.json
@@ -27,6 +27,7 @@
"start:anac-certified-attributes-importer": "turbo start --filter pagopa-interop-anac-certified-attributes-importer",
"start:ivass-certified-attributes-importer": "turbo start --filter pagopa-interop-ivass-certified-attributes-importer",
"start:one-trust-notices": "turbo start --filter pagopa-interop-one-trust-notices",
"start:datalake-data-export": "turbo start --filter pagopa-interop-datalake-data-export",
"test": "turbo test",
"build": "turbo build",
"check": "turbo check",
14 changes: 10 additions & 4 deletions packages/commons-test/src/testUtils.ts
@@ -47,6 +47,7 @@ import {
AgreementId,
PurposeVersionId,
ProducerKeychain,
DescriptorState,
} from "pagopa-interop-models";
import { AuthData } from "pagopa-interop-commons";
import { z } from "zod";
@@ -199,11 +200,11 @@ export const getMockAttribute = (
origin: undefined,
});

export const getMockPurpose = (): Purpose => ({
export const getMockPurpose = (versions?: PurposeVersion[]): Purpose => ({
id: generateId(),
eserviceId: generateId(),
consumerId: generateId(),
versions: [],
versions: versions ?? [],
title: "Purpose 1 - test",
description: "Test purpose - description",
createdAt: new Date(),
@@ -242,11 +243,11 @@ export const getMockPurposeVersionDocument = (): PurposeVersionDocument => ({
createdAt: new Date(),
});

export const getMockDescriptor = (): Descriptor => ({
export const getMockDescriptor = (state?: DescriptorState): Descriptor => ({
id: generateId(),
version: "1",
docs: [],
state: descriptorState.draft,
state: state ?? descriptorState.draft,
audience: [],
voucherLifespan: 60,
dailyCallsPerConsumer: 10,
@@ -261,6 +262,11 @@ export const getMockDescriptor = (): Descriptor => ({
},
});

export const getMockDescriptorList = (length?: number): Descriptor[] => {
const arrayLength = length ?? Math.floor(Math.random() * 10) + 1;
return Array.from({ length: arrayLength }, () => getMockDescriptor());
};

export const getMockDocument = (): Document => ({
name: "fileName",
path: "filePath",
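The mock helpers above are now parameterizable instead of hard-coding a draft descriptor and an empty version list. A minimal usage sketch — assuming `getMockPurposeVersion` is an existing helper in the same module and that these helpers are exported from `pagopa-interop-commons-test`:

```ts
import { descriptorState } from "pagopa-interop-models";
import {
  getMockDescriptor,
  getMockDescriptorList,
  getMockPurpose,
  getMockPurposeVersion, // assumed to exist alongside the helpers above
} from "pagopa-interop-commons-test";

// A published descriptor instead of the default draft one.
const publishedDescriptor = getMockDescriptor(descriptorState.published);

// A purpose pre-populated with one version instead of an empty list.
const purpose = getMockPurpose([getMockPurposeVersion()]);

// A list of 1-10 draft descriptors (random length when none is given).
const descriptors = getMockDescriptorList();
```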
14 changes: 14 additions & 0 deletions packages/datalake-data-export/.env
@@ -0,0 +1,14 @@
LOG_LEVEL=info

READMODEL_DB_HOST=localhost
READMODEL_DB_NAME=readmodel
READMODEL_DB_USERNAME=root
READMODEL_DB_PASSWORD=example
READMODEL_DB_PORT=27017

DATALAKE_STORAGE_BUCKET=interop-datalake-bucket
S3_CUSTOM_SERVER=true
S3_SERVER_HOST=http://localhost
S3_SERVER_PORT=9000

AWS_CONFIG_FILE=aws.config.local
41 changes: 41 additions & 0 deletions packages/datalake-data-export/DockerFile
@@ -0,0 +1,41 @@
FROM node:20.14.0@sha256:02cd2205818f121c13612721876f28c18bd50148bb8af532ea121c96ffcb59bf as build

RUN corepack enable

WORKDIR /app
COPY package.json /app/
COPY pnpm-lock.yaml /app/
COPY pnpm-workspace.yaml /app/

COPY ./packages/commons/package.json /app/packages/commons/package.json
COPY ./packages/models/package.json /app/packages/models/package.json
COPY ./packages/datalake-data-export/package.json /app/packages/datalake-data-export/package.json

RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile

COPY tsconfig.json /app/
COPY turbo.json /app/

COPY ./packages/commons /app/packages/commons
COPY ./packages/models /app/packages/models
COPY ./packages/datalake-data-export /app/packages/datalake-data-export

RUN pnpm build && \
rm -rf /app/node_modules/.modules.yaml && \
rm -rf /app/node_modules/.cache && \
mkdir /out && \
cp -a --parents -t /out \
node_modules packages/datalake-data-export/node_modules \
package*.json packages/datalake-data-export/package*.json \
packages/commons/ \
packages/models/ \
packages/datalake-data-export/dist && \
find /out -exec touch -h --date=@0 {} \;

FROM node:20.14.0-slim@sha256:5e8ac65a0231d76a388683d07ca36a9769ab019a85d85169fe28e206f7a3208e as final

COPY --from=build /out /app

WORKDIR /app/packages/datalake-data-export

CMD [ "node", "." ]
1 change: 1 addition & 0 deletions packages/datalake-data-export/README.md
@@ -0,0 +1 @@
This job exports the Interop read model data (tenants, e-services, agreements, and purposes) to the data lake.
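Locally it can be run from the repository root with the `start:datalake-data-export` script added to the root `package.json`.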
4 changes: 4 additions & 0 deletions packages/datalake-data-export/aws.config.local
@@ -0,0 +1,4 @@
[default]
aws_access_key_id=testawskey
aws_secret_access_key=testawssecret
region=eu-central-1
39 changes: 39 additions & 0 deletions packages/datalake-data-export/package.json
@@ -0,0 +1,39 @@
{
"name": "pagopa-interop-datalake-data-export",
"version": "1.0.0",
"description": "PagoPA Interoperability Data Lake Data Export",
"main": "dist",
"type": "module",
"scripts": {
"test": "vitest",
"lint": "eslint . --ext .ts,.tsx",
"lint:autofix": "eslint . --ext .ts,.tsx --fix",
"format:check": "prettier --check src",
"format:write": "prettier --write src",
"start": "node --loader ts-node/esm -r 'dotenv-flow/config' --watch ./src/index.ts",
"build": "tsc",
"check": "tsc --project tsconfig.check.json"
},
"keywords": [],
"author": "",
"license": "Apache-2.0",
"devDependencies": {
"@types/node": "20.14.6",
"@types/uuid": "9.0.8",
"@pagopa/eslint-config": "3.0.0",
"pagopa-interop-commons-test": "workspace:*",
"prettier": "2.8.8",
"ts-node": "10.9.2",
"typescript": "5.4.5",
"vitest": "1.6.0"
},
"dependencies": {
"pagopa-interop-commons": "workspace:*",
"pagopa-interop-models": "workspace:*",
"dotenv-flow": "4.1.0",
"date-fns": "3.6.0",
"mongodb": "6.7.0",
"uuid": "10.0.0",
"zod": "3.23.8"
}
}
24 changes: 24 additions & 0 deletions packages/datalake-data-export/src/config/config.ts
@@ -0,0 +1,24 @@
import {
FileManagerConfig,
LoggerConfig,
ReadModelDbConfig,
} from "pagopa-interop-commons";
import { z } from "zod";

export const DatalakeStorageConfig = z
.object({
DATALAKE_STORAGE_BUCKET: z.string(),
})
.transform((c) => ({
dataLakeStorageBucket: c.DATALAKE_STORAGE_BUCKET,
}));
export type DatalakeStorageConfig = z.infer<typeof DatalakeStorageConfig>;

export const DatalakeExporterConfig = DatalakeStorageConfig.and(LoggerConfig)
.and(FileManagerConfig)
.and(ReadModelDbConfig);

export type DatalakeExporterConfig = z.infer<typeof DatalakeExporterConfig>;
export const config: DatalakeExporterConfig = DatalakeExporterConfig.parse(
process.env
);
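`DatalakeExporterConfig` is built by intersecting four zod schemas, each of which validates SCREAMING_CASE environment variables and renames them to camelCase fields. A minimal self-contained sketch of the pattern — only `DATALAKE_STORAGE_BUCKET` is defined in this diff; the `LOG_LEVEL` schema below stands in for the `LoggerConfig` imported from `pagopa-interop-commons` and is an assumption:

```ts
import { z } from "zod";

// Each schema validates raw env vars and renames them to camelCase.
const StorageConfig = z
  .object({ DATALAKE_STORAGE_BUCKET: z.string() })
  .transform((c) => ({ dataLakeStorageBucket: c.DATALAKE_STORAGE_BUCKET }));

// Stand-in for LoggerConfig from pagopa-interop-commons (assumed shape).
const LogConfig = z
  .object({ LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]) })
  .transform((c) => ({ logLevel: c.LOG_LEVEL }));

// .and() parses the same input with both schemas and merges the outputs.
const ExporterConfig = StorageConfig.and(LogConfig);
type ExporterConfig = z.infer<typeof ExporterConfig>;

const config: ExporterConfig = ExporterConfig.parse({
  DATALAKE_STORAGE_BUCKET: "interop-datalake-bucket",
  LOG_LEVEL: "info",
});
// config is { dataLakeStorageBucket: "interop-datalake-bucket", logLevel: "info" }
```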
111 changes: 111 additions & 0 deletions packages/datalake-data-export/src/config/models/models.ts
@@ -0,0 +1,111 @@
import {
Agreement,
Descriptor,
Document,
EService,
Purpose,
PurposeVersion,
Tenant,
} from "pagopa-interop-models";
import { z } from "zod";

export type ExportedCollection =
| "tenants"
| "eservices"
| "agreements"
| "purposes";

/**
 * The pick method used to extract fields from the original type is not
 * type-safe: a field that does not exist in the original type can be passed
 * in the mask and the result will still type-check.
 * This type utility makes the pick mask type-safe by accepting only the
 * fields that exist in the original type.
 */
type StrictPick<T> = Partial<Record<keyof T, boolean>>;

export const ExportedTenant = Tenant.pick({
id: true,
kind: true,
selfcareId: true,
externalId: true,
createdAt: true,
onboardedAt: true,
name: true,
} satisfies StrictPick<Tenant>);
export type ExportedTenant = z.infer<typeof ExportedTenant>;

const ExportedCatalogDocument = Document.pick({
checksum: true,
} satisfies StrictPick<Document>);

const ExportedDescriptor = Descriptor.pick({
id: true,
description: true,
version: true,
voucherLifespan: true,
dailyCallsPerConsumer: true,
dailyCallsTotal: true,
state: true,
publishedAt: true,
suspendedAt: true,
deprecatedAt: true,
archivedAt: true,
} satisfies StrictPick<Descriptor>).and(
z.object({
interface: ExportedCatalogDocument.optional(),
})
);
export const ExportedEService = EService.pick({
id: true,
producerId: true,
name: true,
description: true,
mode: true,
createdAt: true,
} satisfies StrictPick<EService>).and(
z.object({
descriptors: z.array(ExportedDescriptor),
})
);
export type ExportedEService = z.infer<typeof ExportedEService>;

export const ExportedAgreement = Agreement.pick({
id: true,
eserviceId: true,
descriptorId: true,
producerId: true,
consumerId: true,
state: true,
suspendedByConsumer: true,
suspendedByProducer: true,
suspendedByPlatform: true,
createdAt: true,
suspendedAt: true,
stamps: true,
} satisfies StrictPick<Agreement>);
export type ExportedAgreement = z.infer<typeof ExportedAgreement>;

const ExportedPurposeVersion = PurposeVersion.pick({
id: true,
state: true,
createdAt: true,
suspendedAt: true,
firstActivationAt: true,
dailyCalls: true,
} satisfies StrictPick<PurposeVersion>);
export const ExportedPurpose = Purpose.pick({
id: true,
eserviceId: true,
consumerId: true,
suspendedByConsumer: true,
suspendedByProducer: true,
title: true,
description: true,
createdAt: true,
isFreeOfCharge: true,
freeOfChargeReason: true,
} satisfies StrictPick<Purpose>).and(
z.object({ versions: z.array(ExportedPurposeVersion) })
);
export type ExportedPurpose = z.infer<typeof ExportedPurpose>;
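A minimal sketch of what the `satisfies StrictPick<...>` annotation buys, with an illustrative `Tenant` shape (the real one lives in `pagopa-interop-models`):

```ts
import { z } from "zod";

const Tenant = z.object({
  id: z.string(),
  name: z.string(),
  createdAt: z.date(),
});
type Tenant = z.infer<typeof Tenant>;

type StrictPick<T> = Partial<Record<keyof T, boolean>>;

// OK: every key in the mask exists on Tenant.
const ExportedTenant = Tenant.pick({
  id: true,
  name: true,
} satisfies StrictPick<Tenant>);

// Compile error: 'nmae' is not a key of Tenant, so the typo is caught
// at the mask itself instead of slipping through the pick.
// const Broken = Tenant.pick({ nmae: true } satisfies StrictPick<Tenant>);
```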
29 changes: 29 additions & 0 deletions packages/datalake-data-export/src/index.ts
@@ -0,0 +1,29 @@
import {
initFileManager,
logger,
ReadModelRepository,
} from "pagopa-interop-commons";
import { v4 as uuidv4 } from "uuid";
import { datalakeServiceBuilder } from "./services/datalakeService.js";
import { readModelServiceBuilder } from "./services/readModelService.js";
import { config } from "./config/config.js";

const log = logger({
serviceName: "datalake-data-export",
correlationId: uuidv4(),
});

const fileManager = initFileManager(config);
const readModelService = readModelServiceBuilder(
ReadModelRepository.init(config)
);

export const dataLakeService = datalakeServiceBuilder(
readModelService,
fileManager,
log
);

log.info("Datalake Data Exporter job started");
await dataLakeService.exportData();
log.info("Done!");
47 changes: 47 additions & 0 deletions packages/datalake-data-export/src/services/dataBuilder.ts
@@ -0,0 +1,47 @@
import {
ExportedAgreement,
ExportedCollection,
ExportedEService,
ExportedPurpose,
ExportedTenant,
} from "../config/models/models.js";
import { arrayToNdjson, splitArrayIntoChunks } from "../utils/helperUtils.js";

export type ExportedData = [
collection: ExportedCollection,
ndjsonFiles: string[],
count: number
];

const generateNdjsonFiles = (
data: Array<
ExportedAgreement | ExportedEService | ExportedPurpose | ExportedTenant
>,
exportTimestamp: Date
): string[] => {
const dataWithTimestamp = data.map((item) => ({ ...item, exportTimestamp }));
const dataChunks = splitArrayIntoChunks(dataWithTimestamp, 1000);

return dataChunks.map(arrayToNdjson);
};

export const buildDataToExport = (
tenants: ExportedTenant[],
eservices: ExportedEService[],
agreements: ExportedAgreement[],
purposes: ExportedPurpose[],
exportTimestamp: Date
): ExportedData[] => [
["tenants", generateNdjsonFiles(tenants, exportTimestamp), tenants.length],
[
"eservices",
generateNdjsonFiles(eservices, exportTimestamp),
eservices.length,
],
[
"agreements",
generateNdjsonFiles(agreements, exportTimestamp),
agreements.length,
],
["purposes", generateNdjsonFiles(purposes, exportTimestamp), purposes.length],
];
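`arrayToNdjson` and `splitArrayIntoChunks` are imported from `utils/helperUtils.ts`, which is not part of the excerpt above. A sketch of plausible implementations (assumptions, not the committed code); note that `JSON.stringify` serializes the injected `exportTimestamp` Date as an ISO-8601 string:

```ts
// One JSON document per line, newline-terminated (NDJSON).
export const arrayToNdjson = (array: unknown[]): string =>
  array.map((item) => JSON.stringify(item)).join("\n") + "\n";

// Consecutive chunks of at most chunkSize elements, so each NDJSON
// file holds 1000 records or fewer.
export const splitArrayIntoChunks = <T>(
  array: T[],
  chunkSize: number
): T[][] => {
  const chunks: T[][] = [];
  for (let i = 0; i < array.length; i += chunkSize) {
    chunks.push(array.slice(i, i + chunkSize));
  }
  return chunks;
};
```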