Skip to content

Commit 2f00a44

Browse files
committed
Change in summary generation
1 parent 8402c16 commit 2f00a44

File tree

3 files changed

+42
-30
lines changed

3 files changed

+42
-30
lines changed

LLM/generate.py

Lines changed: 36 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,9 @@
33
from langchain.chat_models import ChatOpenAI
44
from langchain.prompts import PromptTemplate
55
from langchain.docstore.document import Document
6-
from langchain.chains.summarize import load_summarize_chain
6+
from langchain.text_splitter import CharacterTextSplitter
7+
from langchain.chains import LLMChain, ReduceDocumentsChain, MapReduceDocumentsChain
8+
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
79

810
logging.basicConfig(level=logging.INFO)
911
logger = logging.getLogger("GenerateSummary")
@@ -14,18 +16,19 @@ class LLM_Summarize:
1416

1517
def __init__(self, llm_token):
1618
self.llm = ChatOpenAI(temperature=0.1, openai_api_key=llm_token)
17-
self.code_summmary_prompt = """You are an elite programmer who can understand Github Repository code give to you in text very
19+
self.code_summary_prompt = """You are an elite programmer who can understand Github Repository code give to you in text very
1820
well and summarize what is written in it.
1921
20-
Code : {text}
22+
Code : {codes}
2123
22-
Summarize the above code present between delimiters in 50-70 words and in paragraph"""
24+
Summarize the above list of codes present between delimiters in 50-70 words each and in paragraph.
25+
Store it in a list."""
2326
self.all_summary_prompt = """You are great at understanding bigger picture of a codebase by looking at summary of different code
2427
files. Given the following summaries and you have to tell in detail what does the project do.
2528
2629
Summaries : {summary_list}
2730
28-
Limit final summary to 2000 words. Provide an elegant answer highlighting its purpose,
31+
Limit final summary to 2000-3000 words. Provide an elegant answer highlighting its purpose,
2932
main features, and key technologies used. Include 2-3 emojis."""
3033

3134
def summarize_repo(self, code_list):
@@ -36,28 +39,40 @@ def summarize_repo(self, code_list):
3639

3740
code_list = [Document(page_content=code) for code in code_list]
3841

39-
# Prompt to use in map and reduce stages
40-
CODE_SUMMARY = PromptTemplate(
41-
template=self.code_summmary_prompt, input_variables=["text"]
42+
# Map
43+
MAP_PROMPT = PromptTemplate.from_template(template=self.code_summary_prompt)
44+
map_chain = LLMChain(llm=self.llm, prompt=MAP_PROMPT)
45+
46+
# Reduce
47+
REDUCE_PROMPT = PromptTemplate.from_template(template=self.all_summary_prompt)
48+
reduce_chain = LLMChain(llm=self.llm, prompt=REDUCE_PROMPT)
49+
50+
logger.info("Prompt Ready")
51+
52+
combine_documents_chain = StuffDocumentsChain(
53+
llm_chain=reduce_chain, document_variable_name="summary_list"
4254
)
43-
ALL_SUMMARY = PromptTemplate(
44-
template=self.all_summary_prompt, input_variables=["summary_list"]
55+
reduce_documents_chain = ReduceDocumentsChain(
56+
combine_documents_chain=combine_documents_chain,
57+
collapse_documents_chain=combine_documents_chain,
58+
token_max=4000,
4559
)
4660

47-
logger.info("Prompt Ready")
61+
map_reduce_chain = MapReduceDocumentsChain(
62+
llm_chain=map_chain,
63+
reduce_documents_chain=reduce_documents_chain,
64+
document_variable_name="codes",
65+
return_intermediate_steps=False,
66+
)
4867

49-
chain = load_summarize_chain(
50-
self.llm,
51-
chain_type="map_reduce",
52-
map_prompt=CODE_SUMMARY,
53-
combine_prompt=ALL_SUMMARY,
54-
combine_document_variable_name="summary_list",
68+
# Split text
69+
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
70+
chunk_size=1000, chunk_overlap=0
5571
)
56-
logger.info("Running LLM")
72+
split_docs = text_splitter.split_documents(code_list)
5773

58-
result = chain({"input_documents": code_list}, return_only_outputs=True)[
59-
"output_text"
60-
]
74+
logger.info("Running LLM")
75+
result = map_reduce_chain.run(split_docs)
6176

6277
# Configuring according to HTML page
6378
result = result.replace("\n", "<br>")

LLM/scrap.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
"jpg",
1111
"jpeg",
1212
"svg",
13-
"md",
13+
"pkl",
14+
"gitignore",
15+
"txt",
1416
] # exclude files with these extensions
15-
no_decode_extensions = ["md", "html"] # No ASCII decoding
1617

1718

1819
def scrap_repo(github_owner, github_repo_name, llm_token):
@@ -40,12 +41,7 @@ def scrap_repo(github_owner, github_repo_name, llm_token):
4041
extension = file_content.path.split(".")[-1] # Get file extension
4142
if extension in exclude_extensions:
4243
continue
43-
elif extension in no_decode_extensions:
44-
text = (
45-
file_content.decoded_content.decode()
46-
) # No ASCII decoding, get raw content
47-
else:
48-
text = file_content.decoded_content.decode("ASCII")
44+
text = file_content.decoded_content.decode()
4945

5046
code_list.append(text)
5147

frontend/summary_style.css

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ body {
1010
font-size: 15px;
1111
color: #fff;
1212
text-align: center;
13-
word-wrap: break-word;
13+
/* word-wrap: break-word; */
14+
overflow-wrap: normal;
1415
margin-bottom: 30px;
1516
width: 700px;
1617
/* width: 100%; */

0 commit comments

Comments
 (0)