Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/test data02 #22

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Classifier/Preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from SampleReader import statistics, extractSamples, shuffle_data, rebalance_data
from Persistence import write


# Input file: opened for reading and passed to statistics() in the __main__ block.
RAW_DATA_PATH = "..//result//codevectors//codevectors_labeled.txt"
# Output location handed to write_rebalanced_shuffled_data as `save_path`.
PROCESSED_DATA_PATH = "..//result//codevectors//codevectors_labeled_shuffled_test02.txt"
# Passed as `step` (samples requested per read); overrides the function default of 60000.
STEP_SIZE = 6000000


def write_rebalanced_shuffled_data(file_in, save_path: str, stop: int = -1, balance: float = -1, step: int = 60000):
    """Read samples from ``file_in`` in chunks, shuffle or rebalance each chunk, and write it out.

    Each chunk is obtained from ``extractSamples``, transformed, and then
    handed to ``write`` together with ``save_path``.

    :param file_in: input handle forwarded unchanged to ``extractSamples``.
    :param save_path: destination forwarded unchanged to ``write``.
    :param stop: sample budget; a negative value means "process until the
        input is exhausted". The budget is checked before every read.
    :param balance: when greater than zero, each chunk goes through
        ``rebalance_data(chunk, balance)``; otherwise ``shuffle_data(chunk)``.
    :param step: number of samples requested per read.
    """
    processed = 0
    reached_eof = False

    while True:
        if reached_eof:
            break
        # A non-negative budget that is already used up ends the run.
        if stop >= 0 and processed >= stop:
            break

        if stop > 0:
            # Ask only for what is left of the budget, but never for fewer
            # than 100 samples and never for more than one full step. The
            # last read may therefore request more than the remaining budget.
            chunk_size = min(step, max(stop - processed, 100))
        else:
            chunk_size = step

        chunk, reached_eof = extractSamples(file_in, chunk_size)
        chunk = rebalance_data(chunk, balance) if balance > 0 else shuffle_data(chunk)

        # Count what is actually written (after the transform), not what was read.
        processed += len(chunk)
        write(chunk, save_path)


if __name__ == '__main__':
    # Collect the counts for the raw data set first; the three values are
    # not used any further in this script.
    total_samples, positives, negatives = statistics(RAW_DATA_PATH)

    # Stream the raw file through the chunked shuffle/rebalance writer.
    with open(RAW_DATA_PATH, mode="r") as raw_file:
        write_rebalanced_shuffled_data(
            raw_file,
            PROCESSED_DATA_PATH,
            step=STEP_SIZE,
        )

12 changes: 0 additions & 12 deletions Classifier/SampleReader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from Sample import Sample
from Persistence import write
import random

# Read the code2vec representation from a txt file into samples iteratively, which is more RAM-efficient.
Expand Down Expand Up @@ -125,15 +124,4 @@ def statistics_samples(samples: [Sample]):
negative_count = conditionalCount(samples, filterNegative)
return (negative_count + positive_count), positive_count, negative_count

def write_rebalanced_shuffled_data(file_in, save_path: str, stop: int = -1, balance: float = -1, step: int = 60000):
    """Process ``file_in`` chunk by chunk and write every transformed chunk to ``save_path``.

    A chunk is fetched with ``extractSamples``, passed through
    ``rebalance_data`` (when ``balance`` is greater than zero) or
    ``shuffle_data`` (otherwise), and then given to ``write``.

    :param file_in: input handle passed through to ``extractSamples``.
    :param save_path: destination passed through to ``write``.
    :param stop: sample budget; negative disables the limit.
    :param balance: ratio forwarded to ``rebalance_data`` when positive.
    :param step: number of samples requested per read.
    """
    total = 0
    done = False
    while not done:
        # Stop once a non-negative budget has been reached.
        if 0 <= stop <= total:
            break

        # With a positive budget, request the remainder (at least 100,
        # at most one step); otherwise request a full step.
        request = min(step, max(stop - total, 100)) if stop > 0 else step
        batch, done = extractSamples(file_in, request)

        if balance > 0:
            batch = rebalance_data(batch, balance)
        else:
            batch = shuffle_data(batch)

        # The running total reflects the transformed batch size.
        total += len(batch)
        write(batch, save_path)

6 changes: 4 additions & 2 deletions DataPreprocessor/DataMiner.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from py4j.java_gateway import JavaGateway, GatewayParameters
from LogLabeler import label
from organizeDataForCode2Vec import prepare
from IOHelper import create_directory


INPUT_PATH = "C://Users//Jan//Desktop//Repositories"
METHODS_SAVE_PATH = "C://Users//Jan//Desktop//log-strategy//DataPreprocessor//data"
INPUT_PATH = "..//..//Repositories"
METHODS_SAVE_PATH = "..//result"


if __name__ == '__main__':
Expand All @@ -13,6 +14,7 @@

# 0.
print("Please run the TermExtractor.App class with VM arguments: -Xmx4g -Xmx8g -XX:+UseG1GC.")
create_directory(METHODS_SAVE_PATH)

# 1. extract java methods from git repositories
print("Extracting java methods with JavaExtractor from", INPUT_PATH)
Expand Down
22 changes: 0 additions & 22 deletions DataPreprocessor/TermExtactor/TermExtactor.iml

This file was deleted.

23 changes: 0 additions & 23 deletions DataPreprocessor/TermExtactor/TermExtactor1.iml

This file was deleted.

12 changes: 12 additions & 0 deletions DataPreprocessor/TermExtactor/term-extractor.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_5">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
3 changes: 3 additions & 0 deletions DataPreprocessor/script/selected_repositories_test02.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Apache projects cloned for the test02 repository selection
git clone https://github.com/apache/metron.git
git clone https://github.com/apache/pdfbox.git
11 changes: 11 additions & 0 deletions result/FUNCTION_STATISTICS
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,14 @@ elasticsearch
Functions with log: 2973
Functions without log: 104096
Functions without log: 104096

metron
Functions with log: 413
Functions without log: 9378
Functions without log: 9378

pdfbox
Functions with log: 369
Functions without log: 9839
Functions without log: 9839

Loading