-
Notifications
You must be signed in to change notification settings - Fork 275
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
LLM_EXTRACT_TEXT implementation #18435
base: main
Are you sure you want to change the base?
Changes from all commits
8a76a0b
76f2764
b805a99
f619224
373d39d
11b1dcc
f02d9be
f12a9b1
4b487dd
227e8e2
5498d07
16580c7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
package function | ||
|
||
import ( | ||
"context" | ||
"github.com/ledongthuc/pdf" | ||
"github.com/matrixorigin/matrixone/pkg/common/moerr" | ||
"github.com/matrixorigin/matrixone/pkg/common/util" | ||
"github.com/matrixorigin/matrixone/pkg/container/types" | ||
"github.com/matrixorigin/matrixone/pkg/container/vector" | ||
"github.com/matrixorigin/matrixone/pkg/fileservice" | ||
"github.com/matrixorigin/matrixone/pkg/vm/process" | ||
"strings" | ||
) | ||
|
||
// LLMExtractText function | ||
func LLMExtractText(parameters []*vector.Vector, result vector.FunctionResultWrapper, proc *process.Process, length int, selectList *FunctionSelectList) error { | ||
input := vector.GenerateFunctionStrParameter(parameters[0]) | ||
output := vector.GenerateFunctionStrParameter(parameters[1]) | ||
extractorType := vector.GenerateFunctionStrParameter(parameters[2]) | ||
rs := vector.MustFunctionResult[bool](result) | ||
|
||
rowCount := uint64(length) | ||
|
||
for i := uint64(0); i < rowCount; i++ { | ||
inputBytes, nullInput := input.GetStrValue(i) | ||
if nullInput { | ||
if err := rs.AppendMustNullForBytesResult(); err != nil { | ||
return err | ||
} | ||
continue | ||
} | ||
|
||
outputBytes, nullInput2 := output.GetStrValue(i) | ||
if nullInput2 { | ||
if err := rs.AppendMustNullForBytesResult(); err != nil { | ||
return err | ||
} | ||
continue | ||
} | ||
|
||
extractorTypeBytes, nullInput3 := extractorType.GetStrValue(i) | ||
if nullInput3 { | ||
if err := rs.AppendMustNullForBytesResult(); err != nil { | ||
return err | ||
} | ||
continue | ||
} | ||
|
||
inputPath := util.UnsafeBytesToString(inputBytes) | ||
outputPath := util.UnsafeBytesToString(outputBytes) | ||
extractorTypeString := util.UnsafeBytesToString(extractorTypeBytes) | ||
|
||
moUrl, _, ext, err := types.ParseDatalink(inputPath) | ||
if err != nil { | ||
return err | ||
} | ||
if "."+extractorTypeString != ext { | ||
return moerr.NewInvalidInputNoCtxf("File type and extractor type are not equal.") | ||
} | ||
if ext != ".pdf" { | ||
return moerr.NewInvalidInputNoCtxf("Only pdf file supported.") | ||
} | ||
|
||
outputPathUrl, _, _, err := types.ParseDatalink(outputPath) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
success, err := extractTextFromPdfAndWriteToFile(moUrl, outputPathUrl, proc) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// return whether the process completes successfully | ||
if success { | ||
if err := rs.Append(true, false); err != nil { | ||
return err | ||
} | ||
} else { | ||
if err := rs.Append(false, false); err != nil { | ||
return err | ||
} | ||
} | ||
|
||
} | ||
|
||
return nil | ||
} | ||
|
||
func extractTextFromPdfAndWriteToFile(pdfPath string, txtPath string, proc *process.Process) (bool, error) { | ||
// read PDF to string | ||
content, err := readPdfToString(pdfPath) | ||
if err != nil { | ||
return false, moerr.NewInvalidInputNoCtxf("Invalid PDF input.") | ||
} | ||
|
||
// file service and write file | ||
ctx := context.TODO() | ||
fs, readPath, err := fileservice.GetForETL(ctx, proc.Base.FileService, txtPath) | ||
|
||
// delete the file if txt file exist because Write() only works when a file does not exist | ||
_, err = fs.StatFile(ctx, readPath) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just error out if file exists. You may delete the customer file in the cloud if you use stage URL. |
||
if err == nil { | ||
err1 := fs.Delete(ctx, readPath) | ||
if err1 != nil { | ||
return false, moerr.NewInvalidInputNoCtxf("Cannot remove file %s", readPath) | ||
} | ||
} | ||
|
||
_, err = fileservice.DoWithRetry( | ||
"BackupWrite", | ||
func() (int, error) { | ||
return 0, fs.Write(ctx, fileservice.IOVector{ | ||
FilePath: readPath, | ||
Entries: []fileservice.IOEntry{ | ||
{ | ||
Offset: 0, | ||
Size: int64(len(content)), | ||
Data: []byte(content), | ||
}, | ||
}, | ||
}) | ||
}, | ||
64, | ||
fileservice.IsRetryableError, | ||
) | ||
if err != nil { | ||
return false, err | ||
} | ||
return true, nil | ||
} | ||
|
||
func isSameSentence(current, last pdf.Text) bool { | ||
return strings.TrimSpace(current.S) != "" && | ||
last.Font == current.Font && | ||
last.FontSize == current.FontSize && | ||
last.X == current.X && | ||
last.Y == current.Y | ||
} | ||
|
||
func readPdfToString(path string) (string, error) { | ||
f, r, err := pdf.Open(path) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use fileservice to open a pdf file? |
||
if err != nil { | ||
return "", err | ||
} | ||
defer func() { | ||
if f != nil { | ||
f.Close() | ||
} | ||
}() | ||
|
||
var textBuilder strings.Builder | ||
totalPage := r.NumPage() | ||
|
||
for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { | ||
p := r.Page(pageIndex) | ||
if p.V.IsNull() { | ||
continue | ||
} | ||
var lastTextStyle pdf.Text | ||
texts := p.Content().Text | ||
for _, text := range texts { | ||
if isSameSentence(text, lastTextStyle) { | ||
lastTextStyle.S += text.S | ||
} else { | ||
if lastTextStyle.S != "" { | ||
textBuilder.WriteString(lastTextStyle.S) | ||
} | ||
lastTextStyle = text | ||
} | ||
} | ||
if lastTextStyle.S != "" { | ||
textBuilder.WriteString(lastTextStyle.S + " ") | ||
} | ||
} | ||
|
||
return textBuilder.String(), nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package function | ||
|
||
import ( | ||
"fmt" | ||
"github.com/matrixorigin/matrixone/pkg/container/types" | ||
"github.com/matrixorigin/matrixone/pkg/testutil" | ||
"github.com/stretchr/testify/require" | ||
"os" | ||
"testing" | ||
) | ||
|
||
func TestLLMExtractText(t *testing.T) { | ||
testCases := initLLMExtractTextCase() | ||
wrongTestCases := initLLMExtractWrongTextCase() | ||
|
||
proc := testutil.NewProcess() | ||
for _, tc := range testCases { | ||
fcTC := NewFunctionTestCase(proc, tc.inputs, tc.expect, LLMExtractText) | ||
s, info := fcTC.Run() | ||
require.True(t, s, fmt.Sprintf("case is '%s', err info is '%s'", tc.info, info)) | ||
} | ||
|
||
for _, tc := range wrongTestCases { | ||
fcTC := NewFunctionTestCase(proc, tc.inputs, tc.expect, LLMExtractText) | ||
s, info := fcTC.Run() | ||
require.True(t, s, fmt.Sprintf("case is '%s', err info is '%s'", tc.info, info)) | ||
} | ||
|
||
} | ||
|
||
func initLLMExtractTextCase() []tcTemp { | ||
regularCases := []struct { | ||
info string | ||
input []string | ||
output []string | ||
extractorType []string | ||
wants []bool | ||
}{ | ||
{ | ||
info: "test encode - simple text", | ||
input: []string{ | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/MODocs1.pdf?offset=0&size=4", GetFilePath()), | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/example.pdf?offset=0&size=4", GetFilePath()), | ||
}, | ||
output: []string{ | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/MODocs1.txt", GetFilePath()), | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/example.txt", GetFilePath()), | ||
}, | ||
extractorType: []string{ | ||
"pdf", | ||
"pdf", | ||
}, | ||
wants: []bool{ | ||
true, | ||
true, | ||
}, | ||
}, | ||
} | ||
|
||
var testInputs = make([]tcTemp, 0, len(regularCases)) | ||
for _, c := range regularCases { | ||
testInputs = append(testInputs, tcTemp{ | ||
info: c.info, | ||
inputs: []FunctionTestInput{ | ||
NewFunctionTestInput(types.T_datalink.ToType(), c.input, []bool{}), | ||
NewFunctionTestInput(types.T_datalink.ToType(), c.output, []bool{}), | ||
NewFunctionTestInput(types.T_varchar.ToType(), c.extractorType, []bool{}), | ||
}, | ||
expect: NewFunctionTestResult(types.T_bool.ToType(), false, c.wants, []bool{}), | ||
}) | ||
} | ||
|
||
return testInputs | ||
} | ||
|
||
func initLLMExtractWrongTextCase() []tcTemp { | ||
regularCases := []struct { | ||
info string | ||
input []string | ||
output []string | ||
extractorType []string | ||
wants []bool | ||
}{ | ||
{ | ||
info: "test encode - simple text", | ||
input: []string{ | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/MODocs1.txt?offset=0&size=4", GetFilePath()), | ||
"", | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/example.pdf?offset=0&size=4", GetFilePath()), | ||
}, | ||
output: []string{ | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/MODocs1.txt", GetFilePath()), | ||
"", | ||
fmt.Sprintf("file://%s/../../../../test/distributed/resources/llm_test/extract_text/example.txt", GetFilePath()), | ||
}, | ||
extractorType: []string{ | ||
"pdf", | ||
"", | ||
"txt", | ||
}, | ||
wants: []bool{ | ||
true, | ||
true, | ||
}, | ||
}, | ||
} | ||
|
||
var testInputs = make([]tcTemp, 0, len(regularCases)) | ||
for _, c := range regularCases { | ||
testInputs = append(testInputs, tcTemp{ | ||
info: c.info, | ||
inputs: []FunctionTestInput{ | ||
NewFunctionTestInput(types.T_datalink.ToType(), c.input, []bool{}), | ||
NewFunctionTestInput(types.T_datalink.ToType(), c.output, []bool{}), | ||
NewFunctionTestInput(types.T_varchar.ToType(), c.extractorType, []bool{}), | ||
}, | ||
expect: NewFunctionTestResult(types.T_bool.ToType(), true, c.wants, []bool{}), | ||
}) | ||
} | ||
|
||
return testInputs | ||
} | ||
|
||
// GetFilePath returns the current working directory as reported by os.Getwd.
// The error from os.Getwd is deliberately discarded; whatever directory string
// accompanies it is returned unchanged.
func GetFilePath() string {
	workingDir, _ := os.Getwd()
	return workingDir
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
select llm_extract_text(cast('file://$resources/llm_test/extract_text/MODocs1.pdf?offset=0&size=4' as datalink), cast('file://$resources/llm_test/extract_text/MODocs1.txt' as datalink), "pdf"); | ||
llm_extract_text(cast(file:///Users/charles/Desktop/codes/matrixone/matrixone/test/distributed/resources/llm_test/extract_text/MODocs1.pdf?offset=0&size=4 as datalink), cast(file:///Users/charles/Desktop/codes/matrixone/matrixone/test/distributed/resources/llm_test/extract_text/MODocs1.txt as datalink), pdf) | ||
true | ||
select llm_extract_text(cast('file://$resources/llm_test/extract_text/example.pdf?offset=0&size=4' as datalink), cast('file://$resources/llm_test/extract_text/example.txt' as datalink), "pdf"); | ||
llm_extract_text(cast(file:///Users/charles/Desktop/codes/matrixone/matrixone/test/distributed/resources/llm_test/extract_text/example.pdf?offset=0&size=4 as datalink), cast(file:///Users/charles/Desktop/codes/matrixone/matrixone/test/distributed/resources/llm_test/extract_text/example.txt as datalink), pdf) | ||
true |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
select llm_extract_text(cast('file://$resources/llm_test/extract_text/MODocs1.pdf?offset=0&size=4' as datalink), cast('file://$resources/llm_test/extract_text/MODocs1.txt' as datalink), "pdf"); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need offset and size here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I included them here to demonstrate that this format of datalink is recognized by the function. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, just remove those. |
||
select llm_extract_text(cast('file://$resources/llm_test/extract_text/example.pdf?offset=0&size=4' as datalink), cast('file://$resources/llm_test/extract_text/example.txt' as datalink), "pdf"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
try to get the context from proc. Only use context.TODO() when you have no choice.