diff --git a/.gitmodules b/.gitmodules index 05a2491880..f07b3dfb30 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1401,3 +1401,6 @@ [submodule "vendor/grammars/zephir-sublime"] path = vendor/grammars/zephir-sublime url = https://github.com/phalcon/zephir-sublime +[submodule "vendor/grammars/Octave-language-grammar"] + path = vendor/grammars/Octave-language-grammar + url = https://github.com/gnu-octave/Octave-language-grammar diff --git a/grammars.yml b/grammars.yml index 23c95ad2f7..2f123c29b3 100644 --- a/grammars.yml +++ b/grammars.yml @@ -87,6 +87,8 @@ vendor/grammars/ObjectScript.tmBundle: - source.objectscript_class - source.objectscript_csp - source.objectscript_macros +vendor/grammars/Octave-language-grammar: +- source.octave vendor/grammars/PHP-Twig.tmbundle: - text.html.twig vendor/grammars/PogoScript.tmbundle: diff --git a/lib/linguist/heuristics.yml b/lib/linguist/heuristics.yml index 8a6fcea078..ea2a069513 100644 --- a/lib/linguist/heuristics.yml +++ b/lib/linguist/heuristics.yml @@ -417,6 +417,14 @@ disambiguations: rules: - language: Objective-C named_pattern: objectivec + - language: Octave + pattern: + - '^\s*##' + - '^\s*endfunction' + - '^\s*endclassdef' + - '^\s*endif' + - '^\s*endfor' + - '^\s*endwhile' - language: Mercury pattern: ':- module' - language: MUF diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index aae547bd83..db6949e46c 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -943,11 +943,11 @@ Caddyfile: type: data color: "#22b638" aliases: - - Caddy + - Caddy extensions: - - ".caddyfile" + - ".caddyfile" filenames: - - Caddyfile + - Caddyfile ace_mode: text tm_scope: source.Caddyfile language_id: 615465151 @@ -4020,8 +4020,6 @@ M4Sugar: MATLAB: type: programming color: "#e16737" - aliases: - - octave extensions: - ".matlab" - ".m" @@ -4884,6 +4882,16 @@ Objective-J: tm_scope: source.js.objj ace_mode: text language_id: 259 +Octave: + type: programming + color: "#2299c4" + extensions: + - ".m" + tm_scope: source.octave + ace_mode: text + codemirror_mode: octave + codemirror_mime_type: text/x-octave + language_id: 854759768 Odin: type: programming color: "#60AFFE" @@ -5359,7 +5367,7 @@ Pkl: extensions: - ".pkl" interpreters: - - pkl + - pkl tm_scope: source.pkl ace_mode: text language_id: 288822799 diff --git a/samples/Octave/ClusterCriterion.m b/samples/Octave/ClusterCriterion.m new file mode 100644 index 0000000000..1dd31b243a --- /dev/null +++ b/samples/Octave/ClusterCriterion.m @@ -0,0 +1,245 @@ +## Copyright (C) 2021 Stefano Guidoni +## Copyright (C) 2024 Andreas Bertsatos +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef ClusterCriterion < handle + ## -*- texinfo -*- + ## @deftypefn {statistics} {@var{obj} =} ClusterCriterion (@var{x}, @var{clust}, @var{criterion}) + ## + ## A clustering evaluation object as created by @code{evalclusters}. + ## + ## @code{ClusterCriterion} is a superclass for clustering evaluation objects + ## as created by @code{evalclusters}. + ## + ## List of public properties: + ## @table @code + ## @item @qcode{ClusteringFunction} + ## a valid clustering funtion name or function handle. It can be empty if + ## the clustering solutions are passed as an input matric. + ## + ## @item @qcode{CriterionName} + ## a valid criterion name to evaluate the clustering solutions. + ## + ## @item @qcode{CriterionValues} + ## a vector of values as generated by the evaluation criterion for each + ## clustering solution. + ## + ## @item @qcode{InspectedK} + ## the list of proposed cluster numbers. + ## + ## @item @qcode{Missing} + ## a logical vector of missing observations. When there are @code{NaN} + ## values in the data matrix, the corresponding observation is excluded. + ## + ## @item @qcode{NumObservations} + ## the number of non-missing observations in the data matrix. + ## + ## @item @qcode{OptimalK} + ## the optimal number of clusters. + ## + ## @item @qcode{OptimalY} + ## the clustering solution corresponding to @code{OptimalK}. + ## + ## @item @qcode{X} + ## the data matrix. + ## + ## @end table + ## + ## List of public methods: + ## @table @code + ## @item @qcode{addK} + ## add a list of numbers of clusters to evaluate. + ## + ## @item @qcode{compact} + ## return a compact clustering evaluation object. Not implemented + ## + ## @item @qcode{plot} + ## plot the clustering evaluation values against the corresponding number of + ## clusters. + ## + ## @end table + ## + ## @seealso{evalclusters, CalinskiHarabaszEvaluation, DaviesBouldinEvaluation, + ## GapEvaluation, SilhouetteEvaluation} + ## @end deftypefn + + properties (Access = public) + ## public properties + endproperties + + properties (GetAccess = public, SetAccess = protected) + ClusteringFunction = ""; + CriterionName = ""; + CriterionValues = []; + InspectedK = []; + Missing = []; + NumObservations = 0; + OptimalK = 0; + OptimalY = []; + X = []; + endproperties + + properties (Access = protected) + N = 0; # number of observations + P = 0; # number of variables + ClusteringSolutions = []; # + OptimalIndex = 0; # index of the optimal K + endproperties + + methods (Access = public) + + ## constructor + function this = ClusterCriterion (x, clust, KList) + ## parsing input data + if ((! ismatrix (x)) || (! isnumeric (x))) + error ("ClusterCriterion: 'x' must be a numeric matrix"); + endif + this.X = x; + this.N = rows (this.X); + this.P = columns (this.X); + ## look for missing values + for iter = 1 : this.N + if (any (find (x(iter, :) == NaN))) + this.Missing(iter) = true; + else + this.Missing(iter) = false; + endif + endfor + ## number of usable observations + this.NumObservations = sum (this.Missing == false); + + ## parsing the clustering algorithm + if (ischar (clust)) + if (any (strcmpi (clust, {"kmeans", "linkage", "gmdistribution"}))) + this.ClusteringFunction = lower (clust); + else + error ("ClusterCriterion: unknown clustering algorithm '%s'", clust); + endif + elseif (isa (clust, "function_handle")) + this.ClusteringFunction = clust; + elseif (ismatrix (clust)) + if (isnumeric (clust) && (length (size (clust)) == 2) && ... + (rows (clust) == this.N)) + this.ClusteringFunction = ""; + this.ClusteringSolutions = clust(find (this.Missing == false), :); + else + error ("ClusterCriterion: invalid matrix of clustering solutions"); + endif + else + error ("ClusterCriterion: invalid argument"); + endif + + ## parsing the list of cluster sizes to inspect + this.InspectedK = parseKList (this, KList); + endfunction + + ## -*- texinfo -*- + ## @deftypefn {ClusterCriterion} {@var{obj} =} addK (@var{obj}, @var{K}) + ## + ## Add a new cluster array to inspect the ClusterCriterion object. + ## + ## @end deftypefn + function this = addK (this, k) + + ## if there is not a clustering function, then we are using a predefined + ## set of clustering solutions, hence we cannot redefine the number of + ## solutions + if (isempty (this.ClusteringFunction)) + warning (["ClusterCriterion.addK: cannot redefine the list of cluster"... + "numbers to evaluate when there is not a clustering function"]); + return; + endif + + ## otherwise go on + newList = this.parseKList ([this.InspectedK k]); + + ## check if the list has changed + if (length (newList) == length (this.InspectedK)) + warning ("ClusterCriterion.addK: the list has not changed"); + else + ## update ClusteringSolutions and CriterionValues + ClusteringSolutions_tmp = zeros (this.NumObservations, ... + length (newList)); + CriterionValues_tmp = zeros (length (newList), 1); + for iter = 1 : length (this.InspectedK) + idx = find (newList == this.InspectedK(iter)); + + if (! isempty (idx)) + ClusteringSolutions_tmp(:, idx) = this.ClusteringSolutions(:, iter); + CriterionValues_tmp(idx) = this.CriterionValues(iter); + endif + endfor + this.ClusteringSolutions = ClusteringSolutions_tmp; + this.CriterionValues = CriterionValues_tmp; + + ## reset the old results + this.OptimalK = 0; + this.OptimalY = []; + this.OptimalIndex = 0; + + ## update the list of cluster numbers to evaluate + this.InspectedK = newList; + endif + endfunction + + ## -*- texinfo -*- + ## @deftypefn {ClusterCriterion} {} plot (@var{obj}) + ## @deftypefnx {ClusterCriterion} {@var{h} =} plot (@var{obj}) + ## + ## Plot the evaluation results. + ## + ## Plot the CriterionValues against InspectedK from the ClusterCriterion, + ## @var{obj}, to the current plot. It can also return a handle to the + ## current plot. + ## + ## @end deftypefn + function h = plot (this) + yLabel = sprintf ("%s value", this.CriterionName); + h = gca (); + hold on; + plot (this.InspectedK, this.CriterionValues, "bo-"); + plot (this.OptimalK, this.CriterionValues(this.OptimalIndex), "b*"); + xlabel ("number of clusters"); + ylabel (yLabel); + hold off; + endfunction + + ## -*- texinfo -*- + ## @deftypefn {ClusterCriterion} {@var{obj} =} compact (@var{obj}) + ## + ## Return a compact ClusterCriterion object (not implemented yet). + ## + ## @end deftypefn + function this = compact (this) + warning ("ClusterCriterion.compact: this method is not yet implemented."); + endfunction + + endmethods + + methods (Access = private) + ## check if a list of cluster sizes is correct + function retList = parseKList (this, KList) + if (isnumeric (KList) && isvector (KList) && all (find (KList > 0)) && ... + all (floor (KList) == KList)) + retList = unique (KList); + else + error (["ClusterCriterion: the list of cluster sizes must be an " ... + "array of positive integer numbers"]); + endif + endfunction + endmethods +endclassdef diff --git a/samples/Octave/anova1.m b/samples/Octave/anova1.m new file mode 100644 index 0000000000..fe799c9339 --- /dev/null +++ b/samples/Octave/anova1.m @@ -0,0 +1,351 @@ +## Copyright (C) 2021-2022 Andreas Bertsatos +## Copyright (C) 2022 Andrew Penn +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {statistics} {@var{p} =} anova1 (@var{x}) +## @deftypefnx {statistics} {@var{p} =} anova1 (@var{x}, @var{group}) +## @deftypefnx {statistics} {@var{p} =} anova1 (@var{x}, @var{group}, @var{displayopt}) +## @deftypefnx {statistics} {@var{p} =} anova1 (@var{x}, @var{group}, @var{displayopt}, @var{vartype}) +## @deftypefnx {statistics} {[@var{p}, @var{atab}] =} anova1 (@var{x}, @dots{}) +## @deftypefnx {statistics} {[@var{p}, @var{atab}, @var{stats}] =} anova1 (@var{x}, @dots{}) +## +## Perform a one-way analysis of variance (ANOVA) for comparing the means of two +## or more groups of data under the null hypothesis that the groups are drawn +## from distributions with the same mean. For planned contrasts and/or +## diagnostic plots, use @qcode{anovan} instead. +## +## anova1 can take up to three input arguments: +## +## @itemize +## @item +## @var{x} contains the data and it can either be a vector or matrix. +## If @var{x} is a matrix, then each column is treated as a separate group. +## If @var{x} is a vector, then the @var{group} argument is mandatory. +## +## @item +## @var{group} contains the names for each group. If @var{x} is a matrix, then +## @var{group} can either be a cell array of strings of a character array, with +## one row per column of @var{x}. If you want to omit this argument, enter an +## empty array ([]). If @var{x} is a vector, then @var{group} must be a vector +## of the same length, or a string array or cell array of strings with one row +## for each element of @var{x}. @var{x} values corresponding to the same value +## of @var{group} are placed in the same group. +## +## @item +## @var{displayopt} is an optional parameter for displaying the groups contained +## in the data in a boxplot. If omitted, it is 'on' by default. If group names +## are defined in @var{group}, these are used to identify the groups in the +## boxplot. Use 'off' to omit displaying this figure. +## +## @item +## @var{vartype} is an optional parameter to used to indicate whether the +## groups can be assumed to come from populations with equal variance. When +## @qcode{vartype} is @qcode{"equal"} the variances are assumed to be equal +## (this is the default). When @qcode{vartype} is @qcode{"unequal"} the +## population variances are not assumed to be equal and Welch's ANOVA test is +## used instead. +## @end itemize +## +## anova1 can return up to three output arguments: +## +## @itemize +## @item +## @var{p} is the p-value of the null hypothesis that all group means are equal. +## +## @item +## @var{atab} is a cell array containing the results in a standard ANOVA table. +## +## @item +## @var{stats} is a structure containing statistics useful for performing +## a multiple comparison of means with the MULTCOMPARE function. +## @end itemize +## +## If anova1 is called without any output arguments, then it prints the results +## in a one-way ANOVA table to the standard output. It is also printed when +## @var{displayopt} is 'on'. +## +## +## Examples: +## +## @example +## x = meshgrid (1:6); +## x = x + normrnd (0, 1, 6, 6); +## anova1 (x, [], 'off'); +## [p, atab] = anova1(x); +## @end example +## +## +## @example +## x = ones (50, 4) .* [-2, 0, 1, 5]; +## x = x + normrnd (0, 2, 50, 4); +## groups = @{"A", "B", "C", "D"@}; +## anova1 (x, groups); +## @end example +## +## @seealso{anova2, anovan, multcompare} +## @end deftypefn + +function [p, anovatab, stats] = anova1 (x, group, displayopt, vartype) + + ## Check for valid number of input arguments + if (nargin < 1 || nargin > 4) + error ("anova1: invalid number of input arguments."); + endif + ## Add defaults + if (nargin < 2) + group = []; + endif + if (nargin < 3) + displayopt = "on"; + endif + if (nargin < 4) + vartype = "equal"; + endif + plotdata = ! (strcmp (displayopt, 'off')); + + ## Convert group to cell array from character array, make it a column + if (! isempty (group) && ischar (group)) + group = cellstr (group); + endif + if (size (group, 1) == 1) + group = group'; + endif + + ## If x is a matrix, convert it to column vector and create a + ## corresponging column vector for groups + if (length (x) < prod (size (x))) + [n, m] = size (x); + x = x(:); + gi = reshape (repmat ((1:m), n, 1), n*m, 1); + if (length (group) == 0) ## no group names are provided + group = gi; + elseif (size (group, 1) == m) ## group names exist and match columns + group = group(gi,:); + else + error ("anova1: columns in X and GROUP length do not match."); + endif + endif + + ## Check that x and group are the same size + if (! all (numel (x) == numel (group))) + error ("anova1: GROUP must be a vector with the same number of rows as x."); + endif + + ## Identify NaN values (if any) and remove them from X along with + ## their corresponding values from group vector + nonan = ! isnan (x); + x = x(nonan); + group = group(nonan, :); + + ## Convert group to indices and separate names + [group_id, group_names] = grp2idx (group); + group_id = group_id(:); + named = 1; + + ## Center data to improve accuracy and keep uncentered data for ploting + xorig = x; + mu = mean(x); + x = x - mu; + xr = x; + + ## Get group size and mean for each group + groups = size (group_names, 1); + xs = zeros (1, groups); + xm = xs; + xv = xs; + for j = 1:groups + group_size = find (group_id == j); + xs(j) = length (group_size); + xm(j) = mean (xr(group_size)); + xv(j) = var (xr(group_size), 0); + endfor + + ## Calculate statistics + lx = length (xr); ## Number of samples in groups + gm = mean (xr); ## Grand mean of groups + dfm = length (xm) - 1; ## degrees of freedom for model + dfe = lx - dfm - 1; ## degrees of freedom for error + SSM = xs .* (xm - gm) * (xm - gm)'; ## Sum of Squares for Model + SST = (xr(:) - gm)' * (xr(:) - gm); ## Sum of Squares Total + SSE = SST - SSM; ## Sum of Squares Error + if (dfm > 0) + MSM = SSM / dfm; ## Mean Square for Model + else + MSM = NaN; + endif + if (dfe > 0) + MSE = SSE / dfe; ## Mean Square for Error + else + MSE = NaN; + endif + ## Calculate F statistic + if (SSE != 0) ## Regular Matrix case. + switch (lower (vartype)) + case "equal" + ## Assume equal variances (Fisher's One-way ANOVA) + F = (SSM / dfm) / MSE; + case "unequal" + ## Accomodate for unequal variances (Welch's One-way ANOVA) + ## Calculate the sampling variance for each group (i.e. the square of the SEM) + sv = xv ./ xs; + ## Calculate weights as the reciprocal of the sampling variance + w = 1 ./ sv; + ## Calculate the origin + ori = sum (w .* xm) ./ sum (w); + ## Calculate Welch's F statistic + F = (groups - 1)^-1 * sum (w .* (xm - ori).^2) /... + (1 + ((2 * (groups - 2)/(groups^2 - 1)) * ... + sum ((1 - w / sum (w)).^2 .* (xs - 1).^-1))); + ## Welch's test does not use a pooled error term + MSE = NaN; + ## Correct the error degrees of freedom + dfe = (3 /(groups^2 - 1) * sum ((1 - w / sum (w)).^2 .* (xs-1).^-1))^-1; + otherwise + error ("anova1: invalid fourth (vartype) argument to anova1."); + endswitch + p = 1 - fcdf (F, dfm, dfe); ## Probability of F given equal means. + elseif (SSM == 0) ## Constant Matrix case. + F = 0; + p = 1; + else ## Perfect fit case. + F = Inf; + p = 0; + end + + ## Create results table (if requested) + if (nargout > 1) + switch (lower (vartype)) + case "equal" + anovatab = {"Source", "SS", "df", "MS", "F", "Prob>F"; ... + "Groups", SSM, dfm, MSM, F, p; ... + "Error", SSE, dfe, MSE, "", ""; ... + "Total", SST, dfm + dfe, "", "", ""}; + case "unequal" + anovatab = {"Source", "F", "df", "dfe", "F", "Prob>F"; ... + "Groups", SSM, dfm, dfe, F, p}; + endswitch + endif + ## Create stats structure (if requested) for MULTCOMPARE + if (nargout > 2) + if (length (group_names) > 0) + stats.gnames = group_names; + else + stats.gnames = strjust (num2str ((1:length (xm))'), 'left'); + end + stats.n = xs; + stats.source = 'anova1'; + stats.vartype = vartype; + stats.means = xm + mu; + stats.vars = xv; + stats.df = dfe; + stats.s = sqrt (MSE); + endif + ## Print results table on screen if no output argument was requested + if (nargout == 0 || plotdata) + switch (lower (vartype)) + case "equal" + printf("\n ANOVA Table\n\n"); + printf("Source SS df MS F Prob>F\n"); + printf("------------------------------------------------------\n"); + printf("Groups %10.4f %5.0f %10.4f %8.2f %9.4f\n", SSM, dfm, MSM, F, p); + printf("Error %10.4f %5.0f %10.4f\n", SSE, dfe, MSE); + printf("Total %10.4f %5.0f\n\n", SST, dfm + dfe); + case "unequal" + printf("\n Welch's ANOVA Table\n\n"); + printf("Source F df dfe Prob>F\n"); + printf("-----------------------------------------\n"); + printf("Groups %8.2f %5.0f %7.2f %10.4f\n\n", F, dfm, dfe, p); + endswitch + endif + ## Plot data using BOXPLOT (unless opted out) + if (plotdata) + boxplot (x, group_id, "Notch", "on", "Labels", group_names); + endif +endfunction + + +%!demo +%! x = meshgrid (1:6); +%! randn ("seed", 15); # for reproducibility +%! x = x + normrnd (0, 1, 6, 6); +%! anova1 (x, [], 'off'); + +%!demo +%! x = meshgrid (1:6); +%! randn ("seed", 15); # for reproducibility +%! x = x + normrnd (0, 1, 6, 6); +%! [p, atab] = anova1(x); + +%!demo +%! x = ones (50, 4) .* [-2, 0, 1, 5]; +%! randn ("seed", 13); # for reproducibility +%! x = x + normrnd (0, 2, 50, 4); +%! groups = {"A", "B", "C", "D"}; +%! anova1 (x, groups); + +%!demo +%! y = [54 87 45; 23 98 39; 45 64 51; 54 77 49; 45 89 50; 47 NaN 55]; +%! g = [1 2 3 ; 1 2 3 ; 1 2 3 ; 1 2 3 ; 1 2 3 ; 1 2 3 ]; +%! anova1 (y(:), g(:), "on", "unequal"); + +## testing against GEAR.DAT data file and results for one-factor ANOVA from +## https://www.itl.nist.gov/div898/handbook/eda/section3/eda354.htm +%!test +%! data = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000, ... +%! 0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988, ... +%! 0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996, ... +%! 1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996, ... +%! 0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996, ... +%! 1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996, ... +%! 0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002, ... +%! 0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006, ... +%! 1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991, ... +%! 0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997]; +%! group = [1:10] .* ones (10,10); +%! group = group(:); +%! [p, tbl] = anova1 (data, group, "off"); +%! assert (p, 0.022661, 1e-6); +%! assert (tbl{2,5}, 2.2969, 1e-4); +%! assert (tbl{2,3}, 9, 0); +%! assert (tbl{4,2}, 0.003903, 1e-6); +%! data = reshape (data, 10, 10); +%! [p, tbl, stats] = anova1 (data, [], "off"); +%! assert (p, 0.022661, 1e-6); +%! assert (tbl{2,5}, 2.2969, 1e-4); +%! assert (tbl{2,3}, 9, 0); +%! assert (tbl{4,2}, 0.003903, 1e-6); +%! means = [0.998, 0.9991, 0.9954, 0.9982, 0.9919, 0.9988, 1.0015, 1.0004, 0.9983, 0.9948]; +%! N = 10 * ones (1, 10); +%! assert (stats.means, means, 1e-6); +%! assert (length (stats.gnames), 10, 0); +%! assert (stats.n, N, 0); + +## testing against one-way ANOVA example dataset from GraphPad Prism 8 +%!test +%! y = [54 87 45; 23 98 39; 45 64 51; 54 77 49; 45 89 50; 47 NaN 55]; +%! g = [1 2 3 ; 1 2 3 ; 1 2 3 ; 1 2 3 ; 1 2 3 ; 1 2 3 ]; +%! [p, tbl] = anova1 (y(:), g(:), "off", "equal"); +%! assert (p, 0.00004163, 1e-6); +%! assert (tbl{2,5}, 22.573418, 1e-6); +%! assert (tbl{2,3}, 2, 0); +%! assert (tbl{3,3}, 14, 0); +%! [p, tbl] = anova1 (y(:), g(:), "off", "unequal"); +%! assert (p, 0.00208877, 1e-8); +%! assert (tbl{2,5}, 15.523192, 1e-6); +%! assert (tbl{2,3}, 2, 0); +%! assert (tbl{2,4}, 7.5786897, 1e-6); diff --git a/samples/Octave/silhouette.m b/samples/Octave/silhouette.m new file mode 100644 index 0000000000..e3c15c4713 --- /dev/null +++ b/samples/Octave/silhouette.m @@ -0,0 +1,206 @@ +## Copyright (C) 2016 Nan Zhou +## Copyright (C) 2021 Stefano Guidoni +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {statistics} {} silhouette (@var{X}, @var{clust}) +## @deftypefnx {statistics} {[@var{si}, @var{h}] =} silhouette (@var{X}, @var{clust}) +## @deftypefnx {statistics} {[@var{si}, @var{h}] =} silhouette (@dots{}, @var{Metric}, @var{MetricArg}) +## +## Compute the silhouette values of clustered data and show them on a plot. +## +## @var{X} is a n-by-p matrix of n data points in a p-dimensional space. Each +## datapoint is assigned to a cluster using @var{clust}, a vector of n elements, +## one cluster assignment for each data point. +## +## Each silhouette value of @var{si}, a vector of size n, is a measure of the +## likelihood that a data point is accurately classified to the right cluster. +## Defining "a" as the mean distance between a point and the other points from +## its cluster, and "b" as the mean distance between that point and the points +## from other clusters, the silhouette value of the i-th point is: +## +## @tex +## \def\frac#1#2{{\begingroup#1\endgroup\over#2}} +## $$ S_i = \frac{b_i - a_i}{max(a_1,b_i)} $$ +## @end tex +## @ifnottex +## @verbatim +## bi - ai +## Si = ------------ +## max(ai,bi) +## @end verbatim +## @end ifnottex +## +## Each element of @var{si} ranges from -1, minimum likelihood of a correct +## classification, to 1, maximum likelihood. +## +## Optional input value @var{Metric} is the metric used to compute the distances +## between data points. Since @code{silhouette} uses @code{pdist} to compute +## these distances, @var{Metric} is quite similar to the option @var{Metric} of +## pdist and it can be: +## @itemize @bullet +## @item A known distance metric defined as a string: @qcode{Euclidean}, +## @qcode{sqEuclidean} (default), @qcode{cityblock}, @qcode{cosine}, +## @qcode{correlation}, @qcode{Hamming}, @qcode{Jaccard}. +## +## @item A vector as those created by @code{pdist}. In this case @var{X} does +## nothing. +## +## @item A function handle that is passed to @code{pdist} with @var{MetricArg} +## as optional inputs. +## @end itemize +## +## Optional return value @var{h} is a handle to the silhouette plot. +## +## @strong{Reference} +## Peter J. Rousseeuw, Silhouettes: a Graphical Aid to the Interpretation and +## Validation of Cluster Analysis. 1987. doi:10.1016/0377-0427(87)90125-7 +## @end deftypefn +## +## @seealso{dendrogram, evalclusters, kmeans, linkage, pdist} + +function [si, h] = silhouette (X, clust, metric = "sqeuclidean", varargin) + ## check the input parameters + if (nargin < 2) + print_usage (); + endif + + n = size (clust, 1); + + ## check size + if (! isempty (X)) + if (size (X, 1) != n) + error ("First dimension of X <%d> doesn't match that of clust <%d>",... + size (X, 1), n); + endif + endif + + ## check metric + if (ischar (metric)) + metric = lower (metric); + switch (metric) + case "sqeuclidean" + metric = "squaredeuclidean"; + case { "euclidean", "cityblock", "cosine", ... + "correlation", "hamming", "jaccard" } + ; + otherwise + error ("silhouette: invalid metric '%s'", metric); + endswitch + elseif (isnumeric (metric) && isvector (metric)) + ## X can be omitted when using this + distMatrix = squareform (metric); + if (size (distMatrix, 1) != n) + error ("First dimension of X <%d> doesn't match that of clust <%d>",... + size (distMatrix, 1), n); + endif + endif + + ## main + si = zeros(n, 1); + clusterIDs = unique (clust); # eg [1; 2; 3; 4] + m = length (clusterIDs); + + ## if only one cluster is defined, the silhouette value is not defined + if (m == 1) + si = NaN * ones (n, 1); + return; + endif + + ## distance matrix showing the distance for any two rows of X + if (! exist ('distMatrix', 'var')) + distMatrix = squareform (pdist (X, metric, varargin{:})); + endif + + ## calculate values of si one by one + for iii = 1 : length (si) + + ## allocate values to clusters + groupedValues = {}; + for jjj = 1 : m + groupedValues{clusterIDs(jjj)} = [distMatrix(iii, ... + clust == clusterIDs(jjj))]; + endfor + ## end allocation + + ## calculate a(i) + ## average distance of iii to all other objects in the same cluster + if (length (groupedValues{clust(iii)}) == 1) + si(iii) = 1; + continue; + else + a_i = (sum (groupedValues{clust(iii)})) / ... + (length (groupedValues{clust(iii)}) - 1); + endif + ## end a(i) + + + ## calculate b(i) + clusterIDs_new = clusterIDs; + ## remove the cluster iii in + clusterIDs_new(find (clusterIDs_new == clust(iii))) = []; + ## average distance of iii to all objects of another cluster + a_iii_2others = zeros (length (clusterIDs_new), 1); + for jjj = 1 : length (clusterIDs_new) + a_iii_2others(jjj) = mean (groupedValues{clusterIDs_new(jjj)}); + endfor + b_i = min (a_iii_2others); + ## end b(i) + + + ## calculate s(i) + si(iii) = (b_i - a_i) / (max ([a_i; b_i])); + ## end s(i) + endfor + + ## plot + ## a poor man silhouette graph + vBarsc = zeros (m, 1); + vPadding = [0; 0; 0; 0]; + Bars = vPadding; + + for i = 1 : m + vBar = si(find (clust == clusterIDs(i))); + vBarsc(i) = length (Bars) + (length (vBar) / 2); + Bars = [Bars; (sort (vBar, "descend")); vPadding]; + endfor + + figure(); + h = barh (Bars, "hist", "facecolor", [0 0.4471 0.7412]); + + xlabel ("Silhouette Value"); + ylabel ("Cluster"); + set (gca, "ytick", vBarsc, "yticklabel", clusterIDs); + ylim ([0 (length (Bars))]); + axis ("ij"); +endfunction + + +%!demo +%! load fisheriris; +%! X = meas(:,3:4); +%! cidcs = kmeans (X, 3, "Replicates", 5); +%! silhouette (X, cidcs); +%! y_labels(cidcs([1 51 101])) = unique (species); +%! set (gca, "yticklabel", y_labels); +%! title ("Fisher's iris data"); + +## Test input validation +%!error silhouette (); +%!error silhouette ([1 2; 1 1]); +%!error silhouette ([1 2; 1 1], [1 2 3]'); +%!error silhouette ([1 2; 1 1], [1 2]', "xxx"); diff --git a/test/test_heuristics.rb b/test/test_heuristics.rb index 884a039d9b..41c879ee66 100755 --- a/test/test_heuristics.rb +++ b/test/test_heuristics.rb @@ -633,6 +633,7 @@ def test_m_by_heuristics "M" => all_fixtures("M", "MDB.m"), "Mathematica" => all_fixtures("Mathematica", "*.m") - all_fixtures("Mathematica", "Problem12.m"), "MATLAB" => all_fixtures("MATLAB", "create_ieee_paper_plots.m"), + "Octave" => all_fixtures("Octave", "*.m"), "Limbo" => all_fixtures("Limbo", "*.m"), nil => ambiguous }) diff --git a/test/test_language.rb b/test/test_language.rb index 0f781beeb0..287f9465b4 100644 --- a/test/test_language.rb +++ b/test/test_language.rb @@ -166,7 +166,7 @@ def test_find_all_by_alias def test_find_by_extension assert_equal [], Language.find_by_extension('.factor-rc') - assert_equal [Language['Limbo'], Language['M'], Language['MATLAB'], Language['MUF'], Language['Mathematica'], Language['Mercury'], Language['Objective-C']], Language.find_by_extension('foo.m') + assert_equal [Language['Limbo'], Language['M'], Language['MATLAB'], Language['MUF'], Language['Mathematica'], Language['Mercury'], Language['Objective-C'], Language['Octave']], Language.find_by_extension('foo.m') assert_equal [Language['Ruby']], Language.find_by_extension('foo.rb') assert_equal [Language['Ruby']], Language.find_by_extension('foo/bar.rb') assert_equal [Language['Ruby']], Language.find_by_extension('PKGBUILD.rb') diff --git a/vendor/grammars/Octave-language-grammar b/vendor/grammars/Octave-language-grammar new file mode 160000 index 0000000000..0f9f17f60e --- /dev/null +++ b/vendor/grammars/Octave-language-grammar @@ -0,0 +1 @@ +Subproject commit 0f9f17f60e4668c7aaaee6d9a6ad59ce6e773908 diff --git a/vendor/licenses/git_submodule/Octave-language-grammar.dep.yml b/vendor/licenses/git_submodule/Octave-language-grammar.dep.yml new file mode 100644 index 0000000000..fe96bdce46 --- /dev/null +++ b/vendor/licenses/git_submodule/Octave-language-grammar.dep.yml @@ -0,0 +1,31 @@ +--- +name: Octave-language-grammar +version: 0f9f17f60e4668c7aaaee6d9a6ad59ce6e773908 +type: git_submodule +homepage: https://github.com/gnu-octave/Octave-language-grammar +license: mit +licenses: +- sources: LICENSE + text: | + MIT License + + Copyright (C) 2024 The Octave Project Developers + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +notices: []