Return to BSD News archive
Newsgroups: comp.unix.bsd
Path: sserve!manuel.anu.edu.au!munnari.oz.au!uunet!pmafire!news.dell.com!swrinde!gatech!hubcap!ncrcae!ncr-sd!crash!fpm
From: fpm@crash.cts.com (Frank Maclachlan)
Subject: Functions are not word-aligned in 386BSD 0.1 locore.s (w/ patch)
Organization: CTS Network Services (crash, ctsnet), El Cajon, CA
Date: 20 Oct 92 13:32:56 PDT
Message-ID: <1992Oct20.133257.29726@crash>
Keywords: patch 386BSD
Lines: 738
I've wanted to post this for a long time, but resisted since it doesn't
*fix* anything; it simply makes certain kernel routines a tiny bit
faster.
I noticed that branch/function call addresses and some data items in
'/sys/i386/i386/locore.s' and '/sys/i386/isa/icu.s' are not aligned on
32-bit word boundaries (0 modulo 4). This causes a small performance
hit on 386DX/486-based machines. The 486, in fact, performs best when
branch addresses are aligned on 16-byte boundaries (0 modulo 16) due to
its burst cache fill capability; this is one of the reasons why the gcc
2.2.2 -486 option often results in larger binaries.
Anyway, I modified '/sys/i386/i386/locore.s' and '/sys/i386/isa/icu.s'
to align things on 4-byte boundaries. I used a macro, ALIGN32, to
perform the alignment since the new version of the GNU assembler will
probably change the meaning of .align 2 (currently 2^2 = 4 bytes). I
also eliminated an unnecessary instruction in ___divsi3 (the xorl that
cleared %edx just before cltd overwrote it). I don't claim a profound
performance boost, but every little bit helps!
To apply the patch, change directory to /sys and type the following:
patch -p0 <patch_file_name
Here's the patch:
diff -rc i386.ORIG/i386/locore.s i386/i386/locore.s
*** i386.ORIG/i386/locore.s Wed Aug 12 21:03:05 1992
--- i386/i386/locore.s Tue Oct 20 12:56:10 1992
***************
*** 63,68 ****
--- 63,69 ----
.set SYSPDROFF,0x3F8 # Page dir index of System Base
#define NOP inb $0x84, %al ; inb $0x84, %al
+ #define ALIGN32 .align 2 /* 2^2 = 4 */
/*
* PTmap is recursive pagemap at top of virtual address space.
***************
*** 372,377 ****
--- 373,379 ----
* Support routines for GCC
*/
.globl ___udivsi3
+ ALIGN32
___udivsi3:
movl 4(%esp),%eax
xorl %edx,%edx
***************
*** 379,387 ****
ret
.globl ___divsi3
___divsi3:
movl 4(%esp),%eax
! xorl %edx,%edx
cltd
idivl 8(%esp)
ret
--- 381,390 ----
ret
.globl ___divsi3
+ ALIGN32
___divsi3:
movl 4(%esp),%eax
! #xorl %edx,%edx /* not needed - cltd sign extends into %edx */
cltd
idivl 8(%esp)
ret
***************
*** 390,395 ****
--- 393,399 ----
* I/O bus instructions via C
*/
.globl _inb
+ ALIGN32
_inb: movl 4(%esp),%edx
subl %eax,%eax # clr eax
NOP
***************
*** 398,403 ****
--- 402,408 ----
.globl _inw
+ ALIGN32
_inw: movl 4(%esp),%edx
subl %eax,%eax # clr eax
NOP
***************
*** 406,411 ****
--- 411,417 ----
.globl _rtcin
+ ALIGN32
_rtcin: movl 4(%esp),%eax
outb %al,$0x70
subl %eax,%eax # clr eax
***************
*** 413,418 ****
--- 419,425 ----
ret
.globl _outb
+ ALIGN32
_outb: movl 4(%esp),%edx
NOP
movl 8(%esp),%eax
***************
*** 421,426 ****
--- 428,434 ----
ret
.globl _outw
+ ALIGN32
_outw: movl 4(%esp),%edx
NOP
movl 8(%esp),%eax
***************
*** 433,438 ****
--- 441,447 ----
*/
.globl _bzero
+ ALIGN32
_bzero:
pushl %edi
movl 8(%esp),%edi
***************
*** 454,459 ****
--- 463,469 ----
*/
.globl _fillw
+ ALIGN32
_fillw:
pushl %edi
movl 8(%esp),%eax
***************
*** 466,471 ****
--- 476,482 ----
ret
.globl _bcopyb
+ ALIGN32
_bcopyb:
pushl %esi
pushl %edi
***************
*** 486,491 ****
--- 497,503 ----
*/
.globl _bcopy,_ovbcopy
+ ALIGN32
_ovbcopy:
_bcopy:
pushl %esi
***************
*** 507,512 ****
--- 519,525 ----
popl %esi
xorl %eax,%eax
ret
+ ALIGN32
1:
addl %ecx,%edi /* copy backwards. */
addl %ecx,%esi
***************
*** 530,535 ****
--- 543,549 ----
#ifdef notdef
.globl _copyout
+ ALIGN32
_copyout:
movl _curpcb, %eax
movl $cpyflt, PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 596,601 ****
--- 610,616 ----
ret
.globl _copyin
+ ALIGN32
_copyin:
movl _curpcb,%eax
movl $cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 621,626 ****
--- 636,642 ----
movl %eax,PCB_ONFAULT(%edx)
ret
+ ALIGN32
cpyflt:
popl %ebx
popl %edi
***************
*** 631,636 ****
--- 647,653 ----
ret
#else
.globl _copyout
+ ALIGN32
_copyout:
movl _curpcb,%eax
movl $cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 655,660 ****
--- 672,678 ----
ret
.globl _copyin
+ ALIGN32
_copyin:
movl _curpcb,%eax
movl $cpyflt,PCB_ONFAULT(%eax) # in case we page/protection violate
***************
*** 678,683 ****
--- 696,702 ----
movl %eax,PCB_ONFAULT(%edx)
ret
+ ALIGN32
cpyflt: popl %edi
popl %esi
movl _curpcb,%edx
***************
*** 689,694 ****
--- 708,714 ----
# insb(port,addr,cnt)
.globl _insb
+ ALIGN32
_insb:
pushl %edi
movw 8(%esp),%dx
***************
*** 705,710 ****
--- 725,731 ----
# insw(port,addr,cnt)
.globl _insw
+ ALIGN32
_insw:
pushl %edi
movw 8(%esp),%dx
***************
*** 720,725 ****
--- 741,747 ----
# outsw(port,addr,cnt)
.globl _outsw
+ ALIGN32
_outsw:
pushl %esi
movw 8(%esp),%dx
***************
*** 735,740 ****
--- 757,763 ----
# outsb(port,addr,cnt)
.globl _outsb
+ ALIGN32
_outsb:
pushl %esi
movw 8(%esp),%dx
***************
*** 753,758 ****
--- 776,782 ----
* void lgdt(struct region_descriptor *rdp);
*/
.globl _lgdt
+ ALIGN32
_lgdt:
/* reload the descriptor table */
movl 4(%esp),%eax
***************
*** 779,784 ****
--- 803,809 ----
* void lidt(struct region_descriptor *rdp);
*/
.globl _lidt
+ ALIGN32
_lidt:
movl 4(%esp),%eax
lidt (%eax)
***************
*** 788,793 ****
--- 813,819 ----
* void lldt(u_short sel)
*/
.globl _lldt
+ ALIGN32
_lldt:
lldt 4(%esp)
ret
***************
*** 796,801 ****
--- 822,828 ----
* void ltr(u_short sel)
*/
.globl _ltr
+ ALIGN32
_ltr:
ltr 4(%esp)
ret
***************
*** 805,810 ****
--- 832,838 ----
*/
.globl _lcr3
.globl _load_cr3
+ ALIGN32
_load_cr3:
_lcr3:
inb $0x84,%al # check wristwatch
***************
*** 816,821 ****
--- 844,850 ----
# tlbflush()
.globl _tlbflush
+ ALIGN32
_tlbflush:
inb $0x84,%al # check wristwatch
movl %cr3,%eax
***************
*** 826,831 ****
--- 855,861 ----
# lcr0(cr0)
.globl _lcr0,_load_cr0
+ ALIGN32
_lcr0:
_load_cr0:
movl 4(%esp),%eax
***************
*** 834,839 ****
--- 864,870 ----
# rcr0()
.globl _rcr0
+ ALIGN32
_rcr0:
movl %cr0,%eax
ret
***************
*** 840,845 ****
--- 871,877 ----
# rcr2()
.globl _rcr2
+ ALIGN32
_rcr2:
movl %cr2,%eax
ret
***************
*** 847,852 ****
--- 879,885 ----
# rcr3()
.globl _rcr3
.globl __cr3
+ ALIGN32
__cr3:
_rcr3:
movl %cr3,%eax
***************
*** 854,859 ****
--- 887,893 ----
# ssdtosd(*ssdp,*sdp)
.globl _ssdtosd
+ ALIGN32
_ssdtosd:
pushl %ebx
movl 8(%esp),%ecx
***************
*** 877,882 ****
--- 911,917 ----
/*
* {fu,su},{byte,word}
*/
+ ALIGN32
ALTENTRY(fuiword)
ENTRY(fuword)
movl _curpcb,%ecx
***************
*** 887,892 ****
--- 922,928 ----
movl $0,PCB_ONFAULT(%ecx)
ret
+ ALIGN32
ENTRY(fusword)
movl _curpcb,%ecx
movl $fusufault,PCB_ONFAULT(%ecx) #in case we page/protection violate
***************
*** 896,901 ****
--- 932,938 ----
movl $0,PCB_ONFAULT(%ecx)
ret
+ ALIGN32
ALTENTRY(fuibyte)
ENTRY(fubyte)
movl _curpcb,%ecx
***************
*** 906,911 ****
--- 943,949 ----
movl $0,PCB_ONFAULT(%ecx)
ret
+ ALIGN32
fusufault:
movl _curpcb,%ecx
xorl %eax,%eax
***************
*** 913,918 ****
--- 951,957 ----
decl %eax
ret
+ ALIGN32
ALTENTRY(suiword)
ENTRY(suword)
movl _curpcb,%ecx
***************
*** 944,949 ****
--- 983,989 ----
movl %eax,PCB_ONFAULT(%ecx) #in case we page/protection violate
ret
+ ALIGN32
ENTRY(susword)
movl _curpcb,%ecx
movl $fusufault,PCB_ONFAULT(%ecx) #in case we page/protection violate
***************
*** 972,977 ****
--- 1012,1018 ----
movl %eax,PCB_ONFAULT(%ecx) #in case we page/protection violate
ret
+ ALIGN32
ALTENTRY(suibyte)
ENTRY(subyte)
movl _curpcb,%ecx
***************
*** 1001,1006 ****
--- 1042,1048 ----
movl %eax,PCB_ONFAULT(%ecx) #in case we page/protection violate
ret
+ ALIGN32
ENTRY(setjmp)
movl 4(%esp),%eax
movl %ebx, 0(%eax) # save ebx
***************
*** 1013,1018 ****
--- 1055,1061 ----
xorl %eax,%eax # return (0);
ret
+ ALIGN32
ENTRY(longjmp)
movl 4(%esp),%eax
movl 0(%eax),%ebx # restore ebx
***************
*** 1044,1049 ****
--- 1087,1093 ----
*
* Call should be made at spl6(), and p->p_stat should be SRUN
*/
+ ALIGN32
ENTRY(setrq)
movl 4(%esp),%eax
cmpl $0,P_RLINK(%eax) # should not be on q already
***************
*** 1070,1075 ****
--- 1114,1120 ----
*
* Call should be made at spl6().
*/
+ ALIGN32
ENTRY(remrq)
movl 4(%esp),%eax
movzbl P_PRI(%eax),%edx
***************
*** 1106,1111 ****
--- 1151,1157 ----
* to wait for something to come ready.
*/
.globl Idle
+ ALIGN32
Idle:
idle:
call _spl0
***************
*** 1123,1128 ****
--- 1169,1175 ----
/*
* Swtch()
*/
+ ALIGN32
ENTRY(swtch)
incl _cnt+V_SWTCH
***************
*** 1254,1259 ****
--- 1301,1307 ----
ret
.globl _mvesp
+ ALIGN32
_mvesp: movl %esp,%eax
ret
/*
***************
*** 1265,1270 ****
--- 1313,1319 ----
* Since this code requires a parameter from the "old" stack,
* pass it back as a return value.
*/
+ ALIGN32
ENTRY(swtch_to_inactive)
popl %edx # old pc
popl %eax # arg, our return value
***************
*** 1279,1284 ****
--- 1328,1334 ----
* Update pcb, saving current processor state and arranging
* for alternate return ala longjmp in swtch if altreturn is true.
*/
+ ALIGN32
ENTRY(savectx)
movl 4(%esp), %ecx
movw _cpl, %ax
***************
*** 1329,1334 ****
--- 1379,1385 ----
* update profiling information for the user process.
*/
+ ALIGN32
ENTRY(addupc)
pushl %ebp
movl %esp,%ebp
***************
*** 1358,1363 ****
--- 1409,1415 ----
leave
ret
+ ALIGN32
proffault:
/* if we get a fault, then kill profiling all together */
movl $0,PCB_ONFAULT(%edx) /* squish the fault handler */
***************
*** 1367,1372 ****
--- 1419,1425 ----
ret
.data
+ ALIGN32
.globl _cyloffset, _curpcb
_cyloffset: .long 0
.globl _proc0paddr
***************
*** 1466,1471 ****
--- 1519,1525 ----
IDTVEC(rsvd14)
pushl $0; TRAP(31)
+ ALIGN32
alltraps:
pushal
nop
***************
*** 1491,1496 ****
--- 1545,1551 ----
* This code checks for a kgdb trap, then falls through
* to the regular trap code.
*/
+ ALIGN32
bpttraps:
pushal
nop
***************
*** 1511,1516 ****
--- 1566,1572 ----
* Call gate entry for syscall
*/
+ ALIGN32
IDTVEC(syscall)
pushfl # only for stupid carry bit and more stupid wait3 cc kludge
pushal # only need eax,ecx,edx - trap resaves others
***************
*** 1529,1534 ****
--- 1585,1591 ----
popfl
lret
+ ALIGN32
ENTRY(htonl)
ENTRY(ntohl)
movl 4(%esp),%eax
***************
*** 1537,1542 ****
--- 1594,1600 ----
xchgb %al,%ah
ret
+ ALIGN32
ENTRY(htons)
ENTRY(ntohs)
movzwl 4(%esp),%eax
diff -rc i386.ORIG/isa/icu.s i386/isa/icu.s
*** i386.ORIG/isa/icu.s Tue May 12 20:21:27 1992
--- i386/isa/icu.s Mon Oct 19 15:27:54 1992
***************
*** 43,48 ****
--- 43,49 ----
*/
.data
+ ALIGN32
.globl _imen
.globl _cpl
_cpl: .long 0xffff # current priority level (all off)
***************
*** 62,67 ****
--- 63,69 ----
/*
* Handle return from interrupt after device handler finishes
*/
+ ALIGN32
doreti:
cli
popl %ebx # remove intr number
***************
*** 89,94 ****
--- 91,97 ----
addl $8,%esp
iret
+ ALIGN32
1: cmpl $0,_netisr # check for softint s/traps
jne 1f
cmpl $0,_want_resched
***************
*** 102,107 ****
--- 105,111 ----
#include "../net/netisr.h"
+ ALIGN32
1:
#define DONET(s, c) ; \
***************
*** 171,176 ****
--- 175,181 ----
.globl _splhigh
.globl _splclock
+ ALIGN32
_splhigh:
_splclock:
cli # disable interrupts
***************
*** 190,195 ****
--- 195,201 ----
ret
.globl _spltty # block clists
+ ALIGN32
_spltty:
cli # disable interrupts
NOP
***************
*** 210,215 ****
--- 216,222 ----
.globl _splimp
.globl _splnet
+ ALIGN32
_splimp:
_splnet:
cli # disable interrupts
***************
*** 230,235 ****
--- 237,243 ----
ret
.globl _splbio
+ ALIGN32
_splbio:
cli # disable interrupts
NOP
***************
*** 249,254 ****
--- 257,263 ----
ret
.globl _splsoftclock
+ ALIGN32
_splsoftclock:
cli # disable interrupts
NOP
***************
*** 269,274 ****
--- 278,284 ----
.globl _splnone
.globl _spl0
+ ALIGN32
_splnone:
_spl0:
cli # disable interrupts
***************
*** 307,312 ****
--- 317,323 ----
ret
.globl _splx
+ ALIGN32
_splx:
cli # disable interrupts
NOP
--
UUCP: {hplabs!hp-sdd ucsd nosc}!crash!fpm
ARPA: crash!fpm@nosc.mil
INET: fpm@crash.cts.com