Skip to content

Commit

Permalink
Merge pull request #43 from CodeReviewerAi/abstract-syntax-trees
Browse files Browse the repository at this point in the history
Abstract syntax trees
  • Loading branch information
johan-t authored Jan 3, 2024
2 parents 8bbea07 + d09a2a5 commit 0fc338b
Show file tree
Hide file tree
Showing 8 changed files with 277 additions and 101 deletions.
30 changes: 26 additions & 4 deletions .github/workflows/get_function_data_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,41 @@ on:

jobs:
test:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
with:
path: 'functionRetriever' # Specify a path for the main repo

- name: Clone testRepo repository
run: |
mkdir -p ${{ github.workspace }}/inputData # Create inputData directory
git clone https://github.com/RapidReview-ai/testRepo ${{ github.workspace }}/inputData/testRepo
- name: Create outputData directory
run: mkdir -p ${{ github.workspace }}/functionRetriever/outputData

- name: Set up Python 3.x
uses: actions/setup-python@v2
with:
python-version: '3.11.1'
- name: Install dependencies

- name: Set up Node.js
uses: actions/setup-node@v2
with:
node-version: '18.12.1'

- name: Install npm dependencies
run: npm install
working-directory: ${{ github.workspace }}/functionRetriever

- name: Install Python dependencies
run: |
pip install --upgrade pip
# Install any other dependencies your project requires:
# pip install -r requirements.txt
pip install -r requirements.txt
working-directory: ${{ github.workspace }}/functionRetriever

- name: Run tests
run: python test_get_function_data.py
working-directory: ${{ github.workspace }}/functionRetriever
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
.env
**.json
function_changes.json
test_function_changes.json
package-lock.json
path
.DS_Store
**/__pycache__
**/__pycache__
node_modules
temp.js
14 changes: 14 additions & 0 deletions babelParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Parse the JavaScript file named by the first command-line argument with
// @babel/parser and print the resulting AST to stdout as one JSON document.
// On a parse failure the error is written to stderr and the process ends with
// a non-zero exit status, so callers can detect the failure through either
// channel.
const babel = require('@babel/parser');
const fs = require('fs');

// Read failures (missing argument, unreadable file) are left uncaught on
// purpose: Node reports them on stderr and exits non-zero by itself.
const code = fs.readFileSync(process.argv[2], 'utf8');

try {
  const ast = babel.parse(code, {
    sourceType: "module",
    plugins: [],
  });
  console.log(JSON.stringify(ast));
} catch (error) {
  console.error("Parsing error:", error);
  // Set the exit code rather than calling process.exit() so that pending
  // stdout/stderr writes are flushed before the process terminates.
  process.exitCode = 1;
}
4 changes: 2 additions & 2 deletions createEmbeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from qdrant_client import QdrantClient
from qdrant_client.models import CollectionDescription, Distance, VectorParams, Record

def embed_sample_functions():
def embed_sample_functions(repo_path):
# Initialize Qdrant Client
client = QdrantClient(host='localhost', port=6333)
# client = QdrantClient(":memory:")
Expand All @@ -17,7 +17,7 @@ def embed_sample_functions():
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the JSON data from the file
json_file_path = 'outputData/function_changes.json' # depends on how you run the file, should be changed to be global and not local path
json_file_path = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json'
with open(json_file_path, 'r') as file:
json_data = json.load(file)

Expand Down
244 changes: 179 additions & 65 deletions getFunctionData.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,202 @@
import os
import git
import json
import re
import os
import time
import subprocess

def get_function_data(repo_path='../inputData/testRepo2'):
# Determine the output file based on the original repo_path
output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo2') else 'outputData/function_changes.json'

# Determine the directory where this script is located
def get_function_data(repo_path='../inputData/testRepo'):
output_file = 'outputData/test_function_changes.json' if repo_path.endswith('testRepo') else 'outputData/function_changes.json'
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the path to your repository relative to the script's location
repo_path = os.path.join(script_dir, repo_path)
repo = git.Repo(repo_path)

# Pull the latest changes from the main branch
repo = git.Repo(repo_path)

repo.git.checkout('main')
repo.git.pull()

merge_commits = [commit for commit in repo.iter_commits('main') if commit.parents and len(commit.parents) > 1]
merge_commits.reverse() # Reverse the list to get the oldest merge commit first
merge_commits.reverse()

def create_temp_file_and_get_ast(file_content, temp_file_path='temp.js'):
    """Parse JavaScript source into an AST via a temporary file.

    Args:
        file_content: JavaScript source text to parse.
        temp_file_path: Path of the scratch file handed to the parser.

    Returns:
        Whatever ``get_ast_from_js`` returns for the content: the parsed AST,
        or ``None`` when the parser reported an error.

    The scratch file is removed even when parsing raises, so a failed run
    does not leave ``temp.js`` behind.
    """
    try:
        # The parser script reads the file as UTF-8, so write it as UTF-8
        # instead of relying on the platform's default encoding.
        with open(temp_file_path, 'w', encoding='utf-8') as f:
            f.write(file_content)
        return get_ast_from_js(file_content, temp_file_path)
    finally:
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)  # Clean up the temporary file

def get_ast_from_js(file_content, temp_file_path):
    """Run ``babelParser.js`` on *file_content* and return the decoded AST.

    Args:
        file_content: JavaScript source text to parse.
        temp_file_path: File the source is written to before Node is invoked.
            The file is left in place; removing it is the caller's job.

    Returns:
        The AST as decoded from the JSON the parser script prints on stdout,
        or ``None`` when the script wrote anything to stderr.

    Raises:
        json.JSONDecodeError: if stderr is empty but stdout is not valid JSON.

    ``babelParser.js`` is resolved relative to the current working directory.
    """
    # babelParser.js reads the file as UTF-8; write it with the same encoding
    # rather than the locale default so non-ASCII source survives the trip.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(file_content)

    # Decode the child's output as UTF-8 for the same reason: Node emits UTF-8
    # regardless of the locale Python would otherwise assume.
    result = subprocess.run(
        ['node', 'babelParser.js', temp_file_path],
        capture_output=True,
        text=True,
        encoding='utf-8',
    )
    if result.stderr:
        print("Error in parsing:", result.stderr)
        return None
    return json.loads(result.stdout)

def get_functions_from_file(file_content):
    """Return the function names found in a JavaScript source string.

    The source is parsed with ``create_temp_file_and_get_ast`` and the
    resulting tree is walked depth-first. A name is recorded for:

    * a ``VariableDeclarator`` whose initialiser is a function expression or
      an arrow function (the variable's name is used);
    * a ``FunctionDeclaration`` / ``FunctionExpression`` /
      ``ArrowFunctionExpression`` that carries its own ``id`` (or a ``key``
      with a ``name``);
    * a ``MethodDefinition`` whose key is an ``Identifier``.

    Args:
        file_content: JavaScript source text.

    Returns:
        A list of names in traversal order; a name may appear more than once.
        The list is empty when parsing failed, and holds whatever was
        collected so far if the walk itself raises.
    """
    function_types = ['FunctionDeclaration', 'FunctionExpression', 'ArrowFunctionExpression']
    assigned_function_types = ['FunctionExpression', 'ArrowFunctionExpression']

    # create ast from file content
    ast = create_temp_file_and_get_ast(file_content)

    functions = []
    if ast is None:
        # The parser already reported the failure; there is nothing to walk.
        return functions

    def traverse(node):
        if not isinstance(node, dict):
            return

        node_type = node.get('type')
        if 'type' in node:
            # Arrow functions or function expressions assigned to variables
            if node_type in ['VariableDeclarator'] and 'init' in node:
                init_node = node['init']
                if init_node and 'type' in init_node and init_node['type'] in assigned_function_types:
                    function_name = None
                    if 'name' in node['id']:
                        function_name = node['id']['name']
                    if function_name:
                        functions.append(function_name)

            # Functions that carry their own name
            elif node_type in function_types:
                function_name = None
                if 'id' in node and node['id'] is not None:
                    function_name = node['id']['name']
                elif 'key' in node and 'name' in node['key']:
                    function_name = node['key']['name']
                if function_name:
                    functions.append(function_name)

            # Methods in classes.
            # NOTE(review): @babel/parser without the `estree` plugin appears
            # to emit `ClassMethod` rather than `MethodDefinition` - confirm
            # whether this branch can fire with the current parser options.
            if node_type == 'MethodDefinition' and 'key' in node and node['key']['type'] == 'Identifier':
                functions.append(node['key']['name'])

        # Recursively traverse child nodes
        for value in node.values():
            if isinstance(value, dict):
                traverse(value)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        traverse(item)

    try:
        traverse(ast['program'])
    except Exception as e:
        # Best effort: keep the names gathered before the failure.
        print(f"Error processing AST: {e}")
    return functions

def normalize_change_counts(functions):
    """Attach a min-max normalised ``score`` in [-1, 1] to every entry.

    Args:
        functions: Mapping of function key to an info dict that contains a
            numeric ``'changes_after_merge'`` value.

    Returns:
        The same mapping, mutated in place: each info dict gains a ``'score'``
        key. The entry with the fewest changes scores -1, the one with the
        most scores 1, and every entry scores 0 when all counts are equal.
        An empty mapping is returned unchanged.
    """
    if not functions:
        # min()/max() raise ValueError on an empty sequence.
        return functions

    counts = [info['changes_after_merge'] for info in functions.values()]
    min_changes = min(counts)
    max_changes = max(counts)
    spread = max_changes - min_changes

    # Normalize the change counts between -1 and 1
    for func_info in functions.values():
        if spread:
            func_info['score'] = 2 * ((func_info['changes_after_merge'] - min_changes) / spread) - 1
        else:
            func_info['score'] = 0

    return functions

def get_func_name(diff):
    """Find ``function name(params) {`` headers in a piece of diff text.

    Args:
        diff: Text to scan, e.g. the patch body of a git diff.

    Returns:
        A list of ``(name, params)`` tuples, one per match, in order of
        appearance. ``params`` is the raw text between the parentheses.
    """
    # The name group is restricted to a JavaScript identifier so that
    # whitespace between the name and "(" (as in "function foo (a) {") does
    # not become part of the captured name.
    pattern = re.compile(r'function\s+([A-Za-z_$][\w$]*)\s*\(([^)]*)\)\s*{', re.MULTILINE)
    return pattern.findall(diff)

def get_full_function_at_commit(repo, commit_hash, function_name, file_path):
    """Return the source text of a named function as of a given commit.

    The file is read from the commit's tree, parsed into an AST with
    ``create_temp_file_and_get_ast``, and searched depth-first for either a
    ``FunctionDeclaration`` named *function_name* or a ``VariableDeclarator``
    named *function_name* whose initialiser is a function expression or an
    arrow function. The first match wins.

    Args:
        repo: Repository object; must provide ``commit(commit_hash)``.
        commit_hash: Commit to read the file from.
        function_name: Name of the function to extract.
        file_path: Path of the file inside the commit's tree.

    Returns:
        The slice of the file covered by the matching node, or ``None`` when
        the file could not be parsed, the function is not found, or the
        search fails.
    """
    commit = repo.commit(commit_hash)
    blob = commit.tree / file_path
    file_content = blob.data_stream.read().decode('utf-8')

    # create ast from file content
    ast = create_temp_file_and_get_ast(file_content)
    if ast is None:
        # The parser already reported the failure.
        return None

    def find_function(node):
        if not isinstance(node, dict):
            return None

        node_type = node.get('type')
        # `id` can be present but None (anonymous declarations such as
        # `export default function () {}`), so a `.get('id', {})` default is
        # not enough to make the name lookup safe.
        id_node = node.get('id')
        declared_name = id_node.get('name') if isinstance(id_node, dict) else None

        if node_type == 'FunctionDeclaration' and declared_name == function_name:
            return node.get('start'), node.get('end')

        if node_type == 'VariableDeclarator':
            init_node = node.get('init')
            if isinstance(init_node, dict) and init_node.get('type') in ['FunctionExpression', 'ArrowFunctionExpression']:
                if declared_name == function_name:
                    return node.get('start'), node.get('end')

        # Recursive traversal
        for value in node.values():
            if isinstance(value, dict):
                result = find_function(value)
                if result:
                    return result
            elif isinstance(value, list):
                for item in value:
                    result = find_function(item)
                    if result:
                        return result
        return None

    try:
        start_end = find_function(ast['program'])
        if start_end:
            start, end = start_end
            # The parser's offsets index a JavaScript string, i.e. UTF-16 code
            # units, while Python strings index code points. Slice in UTF-16
            # so characters outside the BMP earlier in the file do not shift
            # the extracted range.
            utf16 = file_content.encode('utf-16-le')
            return utf16[2 * start:2 * end].decode('utf-16-le')
    except Exception as e:
        print(f"Error processing AST: {e}")

    return None

functions = {}




for commit in merge_commits:
parent_commit = commit.parents[0]
diffs = commit.diff(parent_commit, create_patch=True)

for diff in diffs:
diff_content = diff.diff.decode('utf-8')
for func_name, _ in get_func_name(diff_content):
full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, diff.a_path)
if full_function:
func_key = f"{diff.a_path}::{func_name}"
if func_key not in functions:
functions[func_key] = {
'function_name': func_name,
'merged_function': full_function,
'commit': commit.hexsha,
'changes_after_merge': 0,
'latest_function': full_function,
'time_first_merged': commit.authored_datetime,
'file_path': diff.a_path
}


for func_key, func_info in functions.items():
for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest
if commit.authored_datetime > func_info['time_first_merged']:
for file_path in commit.stats.files:
if file_path.endswith('.js'):
try:
blob = commit.tree / file_path
file_content = blob.data_stream.read().decode('utf-8')
for func_name in get_functions_from_file(file_content):
full_function = get_full_function_at_commit(repo, commit.hexsha, func_name, file_path)
if full_function:
func_key = f"{file_path}::{func_name}"
if func_key not in functions:
functions[func_key] = {
'function_name': func_name,
'merged_function': full_function,
'commit': commit.hexsha,
'changes_after_merge': 0,
'latest_function': full_function,
'time_first_merged': commit.authored_datetime,
'file_path': file_path
}
except Exception as e:
print(f"Error processing commit {commit.hexsha}: {e}")
continue

for commit in repo.iter_commits('main', reverse=True): # Iterate from the oldest to newest commit
for file_path in commit.stats.files:
if file_path.endswith('.js'):
try:
blob = commit.tree / func_info['file_path']
blob = commit.tree / file_path
file_content = blob.data_stream.read().decode('utf-8')
new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], func_info['file_path'])
if new_content and new_content.strip() != func_info['latest_function'].strip():
func_info['changes_after_merge'] += 1
func_info['latest_function'] = new_content
except KeyError:
current_functions = get_functions_from_file(file_content)

for func_key, func_info in functions.items():
if func_info['file_path'] == file_path:
if func_info['function_name'] in current_functions:
new_content = get_full_function_at_commit(repo, commit.hexsha, func_info['function_name'], file_path)
if new_content and new_content.strip() != func_info['latest_function'].strip() and commit.authored_datetime > func_info['time_first_merged']:
func_info['changes_after_merge'] += 1
func_info['latest_function'] = new_content
except Exception as e:
print(f"Error processing commit {commit.hexsha}: {e}")
continue

# Find the min and max changes after merge
min_changes = min(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge']
max_changes = max(functions.values(), key=lambda x: x['changes_after_merge'])['changes_after_merge']

# Normalize the change counts between -1 and 1
for func_key, func_info in functions.items():
if max_changes != min_changes:
normalized_score = 2 * ((func_info['changes_after_merge'] - min_changes) / (max_changes - min_changes)) - 1
else:
normalized_score = 0
func_info['score'] = normalized_score
# Normalize the change counts to a score between -1 and 1
functions = normalize_change_counts(functions)

# Convert datetime objects to string before saving
for func in functions.values():
Expand All @@ -97,5 +207,9 @@ def get_full_function_at_commit(repo, commit_hash, function_name, file_path):
json.dump(functions, f, indent=4)

# Script entry point: run get_function_data() with its default repo_path and
# report the elapsed wall-clock time in minutes.
if __name__ == '__main__':
# Pass the repo_path argument to analyse a repository other than the default one.
get_function_data()
start_time = time.time()
get_function_data() # Pass repo_path to analyse a repository other than testRepo, e.g. repo_path='../inputData/elixirsolutions'
end_time = time.time()
elapsed_time = round((end_time - start_time) / 60, 2) # convert to minutes and round to 2 decimal places
print('✅ Printed function data to outputData/test_function_changes.json ✅')
print(f'⏰ The program took {elapsed_time} minutes to run. ⏰')
Loading

0 comments on commit 0fc338b

Please sign in to comment.