feat: check mime type of urls before adding to request queue (#377)
* feat: check mime type of urls before adding to request queue

* fix: add typesafety to crawlDomain

* feat: check magic number of urls ending with .zip

- use a fork of @crawlee/core which supports async
  transformRequestFunction
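A minimal sketch of the kind of check these bullets describe, assuming Node 18+ (global `fetch`) and the forked @crawlee/core that accepts an async `transformRequestFunction`; `isScannableContent` is an illustrative name, not the actual helper in this commit:

```ts
import type { RequestOptions } from '@crawlee/core';

// ZIP local-file-header magic number: "PK\x03\x04".
const ZIP_MAGIC = Buffer.from([0x50, 0x4b, 0x03, 0x04]);

// Hypothetical helper: decide whether a URL points at scannable HTML.
const isScannableContent = async (
  url: string,
  headers: Record<string, string> = {},
): Promise<boolean> => {
  if (new URL(url).pathname.toLowerCase().endsWith('.zip')) {
    // Content-Type is often missing or wrong for .zip downloads, so fetch the
    // first four bytes and compare them against the ZIP magic number instead.
    const res = await fetch(url, { headers: { ...headers, Range: 'bytes=0-3' } });
    const firstBytes = Buffer.from(await res.arrayBuffer()).subarray(0, 4);
    return !firstBytes.equals(ZIP_MAGIC); // a real ZIP archive is not scannable
  }
  const head = await fetch(url, { method: 'HEAD', headers });
  return (head.headers.get('content-type') ?? '').includes('text/html');
};

// Only possible because the forked @crawlee/core awaits transformRequestFunction;
// returning null drops the URL from the request queue.
const transformRequestFunction = async (
  req: RequestOptions,
): Promise<RequestOptions | null> =>
  (await isScannableContent(req.url)) ? req : null;
```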

* fix: include auth header in request

* fix: let crawlee handle requests when mime check fails

- catch errors when checking mime type, and return true
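A sketch of how these two fixes could wrap that probe: the user-supplied auth headers (the `-m` option) ride along on the probe request, and any error during the probe returns the request untouched so Crawlee handles it exactly as it did before this change. Again illustrative, with the probe passed in as a parameter rather than copied from the commit:

```ts
import type { RequestOptions } from '@crawlee/core';

// `probe` stands in for the MIME/magic-number check from the previous sketch.
const makeTransformRequestFunction = (
  probe: (url: string, headers: Record<string, string>) => Promise<boolean>,
  extraHTTPHeaders: Record<string, string>,
) =>
  async (req: RequestOptions): Promise<RequestOptions | null> => {
    try {
      // Include the auth headers so protected pages can answer the probe at all.
      return (await probe(req.url, extraHTTPHeaders)) ? req : null;
    } catch {
      // If the check itself fails (network error, server rejects HEAD, ...),
      // keep the request and let Crawlee deal with it as usual.
      return req;
    }
  };
```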

* Fork crawlee-core to GovTechSG org

* fix: type errors

- extract function for parsing the --header option

* Fix type errors

* Bump package version

---------

Co-authored-by: younglim <[email protected]>
shioju and younglim authored Aug 5, 2024
1 parent ece0e47 commit 4c1297b
Showing 11 changed files with 937 additions and 450 deletions.
816 changes: 619 additions & 197 deletions package-lock.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion package.json
@@ -1,12 +1,13 @@
{
"name": "@govtechsg/purple-hats",
"main": "dist/npmIndex.js",
"version": "0.10.7",
"version": "0.10.8",
"type": "module",
"imports": {
"#root/*.js": "./dist/*.js"
},
"dependencies": {
"@crawlee/core": "github:GovTechSG/crawlee-core",
"@json2csv/node": "^7.0.3",
"@napi-rs/canvas": "^0.1.53",
"axe-core": "^4.9.1",
@@ -25,6 +26,7 @@
"prettier": "^3.1.0",
"print-message": "^3.0.1",
"safe-regex": "^2.1.1",
"sync-request-curl": "^3.0.0",
"typescript": "^5.4.5",
"url": "^0.11.3",
"validator": "^13.11.0",
25 changes: 6 additions & 19 deletions src/cli.ts
@@ -19,6 +19,7 @@ import {
validateDirPath,
validateFilePath,
validateCustomFlowLabel,
parseHeaders,
} from './constants/common.js';
import constants, { ScannerTypes } from './constants/constants.js';
import { cliOptions, messageOptions } from './constants/cliFunctions.js';
@@ -178,27 +179,13 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
return option;
})
.coerce('m', option => {
const headerValues = option.split(', ');
const allHeaders = {};

headerValues.map((headerValue: string) => {
const headerValuePair = headerValue.split(/ (.*)/s);
if (headerValuePair.length < 2) {
printMessage(
[
`Invalid value for authorisation request header. Please provide valid keywords in the format: "<header> <value>". For multiple authentication headers, please provide the keywords in the format: "<header> <value>, <header2> <value2>, ..." .`,
],
messageOptions,
);
process.exit(1);
}
allHeaders[headerValuePair[0]] = headerValuePair[1]; // {"header": "value", "header2": "value2", ...}
});

return allHeaders;
return parseHeaders(option);
})
.check(argvs => {
if ((argvs.scanner === ScannerTypes.CUSTOM || argvs.scanner === ScannerTypes.LOCALFILE) && argvs.maxpages) {
if (
(argvs.scanner === ScannerTypes.CUSTOM || argvs.scanner === ScannerTypes.LOCALFILE) &&
argvs.maxpages
) {
throw new Error('-p or --maxpages is only available in website and sitemap scans.');
}
return true;
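The inline parsing removed above presumably became the new `parseHeaders` helper imported from src/constants/common.js. Reconstructed from the deleted lines (the real helper may report errors via `printMessage` and use different typing), it would look roughly like:

```ts
// Reconstructed from the inline logic deleted above; the actual helper in
// src/constants/common.ts may differ in error reporting and typing.
export const parseHeaders = (option: string): Record<string, string> => {
  const allHeaders: Record<string, string> = {};

  option.split(', ').forEach((headerValue: string) => {
    // Split on the first space only, giving ["<header>", "<value>", ...].
    const headerValuePair = headerValue.split(/ (.*)/s);
    if (headerValuePair.length < 2) {
      console.error(
        'Invalid value for authorisation request header. Please provide valid keywords in the format: "<header> <value>". For multiple authentication headers, please provide the keywords in the format: "<header> <value>, <header2> <value2>, ..." .',
      );
      process.exit(1);
    }
    allHeaders[headerValuePair[0]] = headerValuePair[1]; // {"header": "value", "header2": "value2", ...}
  });

  return allHeaders;
};
```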
140 changes: 71 additions & 69 deletions src/combine.ts
@@ -5,14 +5,13 @@ import crawlLocalFile from './crawlers/crawlLocalFile.js';
import crawlIntelligentSitemap from './crawlers/crawlIntelligentSitemap.js';
import { generateArtifacts } from './mergeAxeResults.js';
import { getHost, createAndUpdateResultsFolders, createDetailsAndLogs } from './utils.js';
import { ScannerTypes,UrlsCrawled} from './constants/constants.js';
import { ScannerTypes, UrlsCrawled } from './constants/constants.js';
import { getBlackListedPatterns, submitForm, urlWithoutAuth } from './constants/common.js';
import { consoleLogger, silentLogger } from './logs.js';
import runCustom from './crawlers/runCustom.js';
import { alertMessageOptions } from './constants/cliFunctions.js';
import { Data } from './index.js';
import { fileURLToPath, pathToFileURL } from 'url';

import { pathToFileURL } from 'url';

// Class exports
export class ViewportSettingsClass {
@@ -21,16 +20,20 @@ export class ViewportSettingsClass {
viewportWidth: number;
playwrightDeviceDetailsObject: any; // You can replace 'any' with a more specific type if possible

constructor(deviceChosen: string, customDevice: string, viewportWidth: number, playwrightDeviceDetailsObject: any) {
constructor(
deviceChosen: string,
customDevice: string,
viewportWidth: number,
playwrightDeviceDetailsObject: any,
) {
this.deviceChosen = deviceChosen;
this.customDevice = customDevice;
this.viewportWidth = viewportWidth;
this.playwrightDeviceDetailsObject = playwrightDeviceDetailsObject;
}
}


const combineRun = async (details:Data, deviceToScan:string) => {
const combineRun = async (details: Data, deviceToScan: string) => {
const envDetails = { ...details };

const {
@@ -57,18 +60,15 @@ const combineRun = async (details:Data, deviceToScan:string) => {
customFlowLabel = 'Custom Flow',
extraHTTPHeaders,
safeMode,
zip
zip,
} = envDetails;

process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
process.env.CRAWLEE_STORAGE_DIR = randomToken;

const host =
(type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE)
? ''
: getHost(url);
const host = type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE ? '' : getHost(url);

let blacklistedPatterns:string[] | null = null;
let blacklistedPatterns: string[] | null = null;
try {
blacklistedPatterns = getBlackListedPatterns(blacklistedPatternsFilename);
} catch (error) {
@@ -78,8 +78,10 @@
}

// remove basic-auth credentials from URL
let finalUrl = (!(type === ScannerTypes.SITEMAP|| type === ScannerTypes.LOCALFILE)) ? urlWithoutAuth(url) : new URL(pathToFileURL(url));

let finalUrl = !(type === ScannerTypes.SITEMAP || type === ScannerTypes.LOCALFILE)
? urlWithoutAuth(url)
: new URL(pathToFileURL(url));

//Use the string version of finalUrl to reduce logic at submitForm
let finalUrlString = finalUrl.toString();

@@ -91,14 +93,14 @@ const combineRun = async (details:Data, deviceToScan:string) => {
urlsCrawled: new UrlsCrawled(),
};

const viewportSettings:ViewportSettingsClass = new ViewportSettingsClass(
const viewportSettings: ViewportSettingsClass = new ViewportSettingsClass(
deviceChosen,
customDevice,
viewportWidth,
playwrightDeviceDetailsObject,
);

let urlsCrawledObj;
let urlsCrawledObj: UrlsCrawled;
switch (type) {
case ScannerTypes.CUSTOM:
urlsCrawledObj = await runCustom(
@@ -127,22 +129,22 @@
);
break;

case ScannerTypes.LOCALFILE:
urlsCrawledObj = await crawlLocalFile(
url,
randomToken,
host,
viewportSettings,
maxRequestsPerCrawl,
browser,
userDataDirectory,
specifiedMaxConcurrency,
fileTypes,
blacklistedPatterns,
includeScreenshots,
extraHTTPHeaders,
);
break;
case ScannerTypes.LOCALFILE:
urlsCrawledObj = await crawlLocalFile(
url,
randomToken,
host,
viewportSettings,
maxRequestsPerCrawl,
browser,
userDataDirectory,
specifiedMaxConcurrency,
fileTypes,
blacklistedPatterns,
includeScreenshots,
extraHTTPHeaders,
);
break;

case ScannerTypes.INTELLIGENT:
urlsCrawledObj = await crawlIntelligentSitemap(
@@ -194,43 +196,43 @@
scanDetails.urlsCrawled = urlsCrawledObj;
await createDetailsAndLogs(randomToken);
if (scanDetails.urlsCrawled) {
if (scanDetails.urlsCrawled.scanned.length > 0) {
await createAndUpdateResultsFolders(randomToken);
const pagesNotScanned = [
...urlsCrawledObj.error,
...urlsCrawledObj.invalid,
...urlsCrawledObj.forbidden,
];
const basicFormHTMLSnippet = await generateArtifacts(
randomToken,
url,
type,
deviceToScan,
urlsCrawledObj.scanned,
pagesNotScanned,
customFlowLabel,
undefined,
scanDetails,
zip
);
const [name, email] = nameEmail.split(':');

await submitForm(
browser,
userDataDirectory,
url, // scannedUrl
new URL(finalUrlString).href, //entryUrl
type,
email,
name,
JSON.stringify(basicFormHTMLSnippet),
urlsCrawledObj.scanned.length,
urlsCrawledObj.scannedRedirects.length,
pagesNotScanned.length,
metadata,
);
}
}else {
if (scanDetails.urlsCrawled.scanned.length > 0) {
await createAndUpdateResultsFolders(randomToken);
const pagesNotScanned = [
...urlsCrawledObj.error,
...urlsCrawledObj.invalid,
...urlsCrawledObj.forbidden,
];
const basicFormHTMLSnippet = await generateArtifacts(
randomToken,
url,
type,
deviceToScan,
urlsCrawledObj.scanned,
pagesNotScanned,
customFlowLabel,
undefined,
scanDetails,
zip,
);
const [name, email] = nameEmail.split(':');

await submitForm(
browser,
userDataDirectory,
url, // scannedUrl
new URL(finalUrlString).href, //entryUrl
type,
email,
name,
JSON.stringify(basicFormHTMLSnippet),
urlsCrawledObj.scanned.length,
urlsCrawledObj.scannedRedirects.length,
pagesNotScanned.length,
metadata,
);
}
} else {
printMessage([`No pages were scanned.`], alertMessageOptions);
}
};
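For readers following the new `urlsCrawledObj: UrlsCrawled` annotation, the buckets this file touches suggest a shape roughly like the interface below. This is inferred from usage in the diff, not copied from src/constants/constants.ts:

```ts
// Sketch only: the real UrlsCrawled class in src/constants/constants.ts may
// track more buckets and richer per-page records than shown here.
interface UrlsCrawledSketch {
  scanned: unknown[];          // pages that were scanned successfully
  scannedRedirects: unknown[]; // pages reached through redirects
  error: unknown[];            // pages that errored while crawling
  invalid: unknown[];          // URLs rejected as invalid
  forbidden: unknown[];        // URLs the crawler was not allowed to fetch
}
```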
(The rest of the src/combine.ts diff and the diffs for the remaining changed files are not rendered here.)